# Install Rsubread only when it is missing, then attach it.
# (An unconditional BiocManager::install() would hit the network and may
# reinstall or upgrade packages on every run of the script.)
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("Rsubread", quietly = TRUE)) {
  BiocManager::install("Rsubread")
}
library(Rsubread)
# Provide the location of your sorted BAM files, the GTF annotation file and
# the number of threads appropriate for your system.
# `pattern` is a regular expression (not a glob), so match a literal ".bam"
# suffix explicitly.
bam_files <- list.files(
  "/mnt/md0/LanshakovKAPA26102021_3/sorted_bam",
  pattern = "\\.bam$",
  full.names = TRUE
)
if (length(bam_files) == 0) {
  stop("No BAM files found in the sorted_bam directory", call. = FALSE)
}
counts <- featureCounts(
  bam_files,
  annot.ext = '~/data_nvme/rat_gen/Rattus_norvegicus.mRatBN7.2.108.chr.gtf',
  isGTFAnnotationFile = TRUE,
  nthreads = 27
)
# Save the full featureCounts result for later reuse.
saveRDS(counts, '/home/ssdd/data_nvme/trimmer_seq/counts/counts1.Rdata')
# Extract the gene-by-sample count matrix from the result list.
counts2 <- counts$counts
# Keep only genes whose mean count across samples is at least 1.
counts2 <- counts2[rowMeans(counts2) >= 1, , drop = FALSE]
# Build the sample annotation table df1: sample name, experimental
# conditions, group label and plotting colours.
library(tidyverse)
df <- colnames(counts2)
# Experimental conditions, listed in the same order as the columns of counts2.
stress <- c("noFST","FST","noFST","FST","FST","noFST", "FST", "noFST","FST", "FST")
treat <- c("notrim","notrim", "notrim","notrim","notrim","trim","trim","trim","trim","trim")
# Guard against silent recycling if the number of BAM files ever changes.
stopifnot(length(stress) == length(df), length(treat) == length(df))
df1 <- data.frame(df, stress, treat)
df1$group <- factor(paste0(df1$stress, ".", df1$treat))
# Colour lookup tables, keyed by order of first appearance in df1.
df2 <- data.frame(stress = unique(df1$stress), color = c("red", "green"))
df3 <- data.frame(group = unique(df1$group), col = c("red", "green", "blue", "brown"))
df1 <- left_join(df1, df2, by = "stress")
df1 <- left_join(df1, df3, by = "group")
# dplyr joins reset row names, so assign the sample names only after joining;
# otherwise group_1 below would be built from "1", "2", ... instead.
rownames(df1) <- df1$df
# Everything needed downstream is now in df1.
df1$group_1 <- factor(paste0(df1$group, "_", rownames(df1)))
# Sample-to-sample correlation matrices.
cors <- cor(counts2, method = "spearman")            # Spearman
corp <- cor(counts2, method = "pearson")             # Pearson
corlp <- cor(log(1 + counts2), method = "pearson")   # Pearson on log scale
cors
# Colour palette for the heatmap.
library(RColorBrewer)
pal <- colorRampPalette(brewer.pal(11, "Spectral"))(100)

# Draw the clustered heatmap of correlation distances (1 - correlation).
# cor_mat: square sample correlation matrix; samples: annotation table with
# `stress`, `treat` and `color` columns; palette: colour vector; cex: label size.
draw_cor_heatmap <- function(cor_mat, samples, palette, cex) {
  invisible(heatmap(
    1 - cor_mat,                          # correlation distance matrix
    symm = TRUE,                          # matrix is symmetric
    # Use the matrix cells themselves as distances between samples (the
    # default would be the Euclidean distance between matrix columns).
    distfun = function(x) as.dist(x),
    col = palette,
    labRow = samples$stress,
    labCol = samples$treat,
    ColSideColors = samples$color,
    cexRow = cex,
    cexCol = cex
  ))
}

# On-screen version of fig. 2b from the paper.
draw_cor_heatmap(cors, df1, pal, cex = 2)

# EPS version of the same figure.
cairo_ps(filename =  "heatmap.eps",
         width = 7, height = 6, pointsize = 12,
         onefile = FALSE, family = "sans",
         antialias = c("default"),
         fallback_resolution = 300)
draw_cor_heatmap(cors, df1, pal, cex = 1.8)
dev.off()
dev.set()
# Estimate library sizes and plot them.
library(edgeR)
# DGEList object with the experimental group as grouping variable.
y <- DGEList(counts = counts2, group = df1$group)
# Library sizes before normalization. With horiz = TRUE the bar values run
# along the x axis, so the size label belongs on xlab, not ylab.
barplot(y$samples$lib.size * 1e-6, horiz = TRUE,
        xlab = "Library size (millions)",
        cex.axis = 2, cex.names = 2, cex.lab = 2)
# EPS image for fig. 2a of the paper.
cairo_ps(filename =  "library_size.eps",
         width = 4, height = 3, pointsize = 12,
         onefile = FALSE, family = "sans",
         antialias = c("default"),
         fallback_resolution = 300)
par(mar = c(5, 5, 5, 1) + .1)
barplot(y$samples$lib.size * 1e-6, horiz = TRUE,
        xlab = "Library size (millions)",
        cex.axis = 2, cex.names = 1.8, cex.lab = 1.8)
dev.off()
dev.set()

# Annotate genes, filter low-expression genes and apply edgeR library
# normalization.
# library() (unlike require()) stops immediately if the package is missing.
library(org.Rn.eg.db)
# NOTE(review): column = "ALIAS" returns one alias per gene, which is not
# necessarily the official symbol; confirm whether "SYMBOL" was intended.
Symbol <- mapIds(org.Rn.eg.db, keys = rownames(y), column = "ALIAS", keytype = "ENSEMBL")
y$genes <- data.frame(Symbol = Symbol)
keep <- filterByExpr(y)
table(keep)
y <- y[keep, , keep.lib.sizes = FALSE]
y <- calcNormFactors(y)
y$samples

# MDS plots of the samples. One colour per sample: first coloured by stress
# condition, then by the colours stored in df1.
plotMDS(y, col = as.integer(factor(df1$stress)))
plotMDS(y, col = df1$color)

#Code for making fig2b of the paper.
cairo_ps(filename =  "PCA.eps",
         width = 7, height = 6, pointsize = 12,
         onefile = FALSE, family = "sans",
         antialias = c("default"),
         fallback_resolution = 300)
par(mar = c(5, 5, 5, 1) + .1)
# Only the MDS coordinates are needed here; plot = FALSE avoids drawing an
# extra page on the EPS device before the custom plot below.
a <- plotMDS(y, plot = FALSE)
# NOTE(review): the percentages in the axis labels are hard-coded; confirm
# they still match the current data.
plot(a$x, a$y, pch = as.numeric(df1$group),
     xlab = 'Leading logFC dim 1 (35%)', ylab = 'Leading logFC dim 2 (23%)',
     cex.axis = 1.5, cex.lab = 2)
# Label every point with its group, coloured by group.
text(a$x, a$y, labels = df1$group, col = df1$col)
dev.off()
dev.set()
# EPS images for fig. 2e: count correlation between pairs of samples.
library(ggplot2)
library(ggpubr)
counts2_df <- as.data.frame(counts2)

# Scatter plot of (count + 1) for two samples on log2 axes with the Pearson
# correlation annotated, written to an EPS file.
#   data             data frame with one column of counts per sample
#   x_col, y_col     names of the sample columns to compare
#   file             output EPS file name
#   label_x, label_y position of the correlation label, in count units
# Returns the ggplot object invisibly.
save_count_scatter <- function(data, x_col, y_col, file, label_x, label_y) {
  p <- ggplot(data, aes(.data[[x_col]] + 1, .data[[y_col]] + 1)) +
    geom_point() +
    scale_x_continuous(trans = "log2") +
    scale_y_continuous(trans = "log2") +
    stat_cor(method = "pearson", label.x = log2(label_x),
             label.y = log2(label_y), size = 5) +
    geom_abline(colour = "brown") +
    theme_classic() +
    theme(axis.text = element_text(size = 12),
          axis.title = element_text(size = 0))
  cairo_ps(filename = file,
           width = 4, height = 3, pointsize = 12,
           onefile = FALSE, family = "sans",
           antialias = c("default"),
           fallback_resolution = 300)
  # Close the device even if rendering fails.
  on.exit(dev.off(), add = TRUE)
  # Explicit print(): auto-printing does not happen inside functions or
  # when the script is run through source().
  print(p)
  invisible(p)
}

p <- save_count_scatter(counts2_df, "n1_1_S5_R1_001_sorted.bam",
                        "n3_S4_R1_001_sorted.bam", "n1_vs_n3.eps",
                        label_x = 5, label_y = 4194304)
p <- save_count_scatter(counts2_df, "n1_1_S5_R1_001_sorted.bam",
                        "n34_S2_R1_001_sorted.bam", "n1_vs_n34.eps",
                        label_x = 5, label_y = 4194304)
p <- save_count_scatter(counts2_df, "n1_1_S5_R1_001_sorted.bam",
                        "tr_1_S1_R1_001_sorted.bam", "n1_vs_tr1.eps",
                        label_x = 5, label_y = 520000)
p <- save_count_scatter(counts2_df, "tr_1_S1_R1_001_sorted.bam",
                        "tr_3_S5_R1_001_sorted.bam", "tr1_vs_tr3.eps",
                        label_x = 5, label_y = 520000)
p <- save_count_scatter(counts2_df, "tr_1_S1_R1_001_sorted.bam",
                        "tr_34_S11_R1_001_sorted.bam", "tr1_vs_tr34.eps",
                        label_x = 2, label_y = 17000)

# Differential expression with the edgeR quasi-likelihood GLM.
# Design without intercept: one coefficient per group, named by group level.
design <- model.matrix(~ 0 + df1$group)
colnames(design) <- levels(df1$group)
design

# Dispersion estimation and its diagnostic plot.
y <- estimateDisp(y, design, robust = TRUE)
y$common.dispersion
plotBCV(y)

# Quasi-likelihood fit and its diagnostic plot.
fit <- glmQLFit(y, design, robust = TRUE)
head(fit$coefficients)
plotQLDisp(fit)

# Four pairwise contrasts between the group coefficients.
con <- makeContrasts(
  F.nT_vs_nF.nT = FST.notrim - noFST.notrim,
  F.T_vs_nF.T = FST.trim - noFST.trim,
  F.T_vs_F.nT = FST.trim - FST.notrim,
  nF.T_vs_nF.nT = noFST.trim - noFST.notrim,
  levels = design
)

# All contrasts are passed to a single test call, so the result table holds
# one logFC column per contrast and a single PValue column.
qlf <- glmQLFTest(fit, contrast = con)
topTags(qlf)
summary(decideTests(qlf))
plotMD(qlf)

# Benjamini-Hochberg adjustment of the test p-values.
qlf$table$p.adj <- p.adjust(qlf$table$PValue, method = "BH")

# Save the test result.
saveRDS(qlf, '/home/ssdd/data_nvme/trimmer_seq/counts/qlf.Rdata')
# Take the per-gene result table and annotate it with gene names and gene
# information from Ensembl BioMart.
df4 <- qlf$table
library(biomaRt)
df4$ensembl_gene_id <- rownames(df4)
ensembl <- useEnsembl(
  biomart = "genes",
  host = "http://asia.ensembl.org",
  dataset = "rnorvegicus_gene_ensembl"
)
G_list <- getBM(
  filters = "ensembl_gene_id",
  attributes = c('ensembl_gene_id', 'entrezgene_id', 'rgd_symbol',
                 'gene_biotype', "refseq_mrna"),
  uniqueRows = TRUE,
  values = df4$ensembl_gene_id,
  mart = ensembl
)
# left_join() comes from dplyr; attach tidyverse if it is not available.
df4 <- left_join(df4, G_list, by = 'ensembl_gene_id', copy = FALSE)

# df4 can now hold several rows per gene, because one Ensembl gene may map
# to several entries in the other databases. Save the full table first.
write.csv(df4, '/home/ssdd/data_nvme/trimmer_seq/counts/DEGs_ALL_GENES.csv')

# Keep the first row per gene, then only genes with p.adj < 0.05.
df5 <- distinct(df4, ensembl_gene_id, .keep_all = TRUE)
df5 <- subset(df5, df5$p.adj < 0.05)

# Save the significant genes.
write.csv(df5, '/home/ssdd/data_nvme/trimmer_seq/counts/Supplementary_table_1.csv')

# Volcano plots of the differential expression results.
library(EnhancedVolcano)

# Build a volcano plot for one contrast. All genes in qlf$table are drawn,
# with cut-offs pCutoff = 0.05 and FCcutoff = 1.
#   qlf    glmQLFTest result with a `p.adj` column in its table
#   x_col  name of the logFC column of the contrast to plot
make_volcano <- function(qlf, x_col) {
  EnhancedVolcano(qlf$table,
                  lab = qlf$genes$Symbol,
                  x = x_col, y = 'p.adj',
                  title = "",
                  subtitle = "",
                  pCutoff = 0.05,
                  FCcutoff = 1,
                  pointSize = 2.0,
                  labSize = 6.0,
                  legendLabels = c('Not sig.', 'Log (base 2) FC', 'p-value',
                                   'p-value & Log (base 2) FC'),
                  legendPosition = 'right',
                  legendLabSize = 0,
                  legendIconSize = 0)
}

# Write the volcano plot of one contrast to an EPS file and return the plot
# invisibly.
save_volcano <- function(qlf, x_col, file) {
  p <- make_volcano(qlf, x_col)
  cairo_ps(filename = file,
           width = 10, height = 9, pointsize = 12,
           onefile = FALSE, family = "sans",
           antialias = c("default"),
           fallback_resolution = 300)
  # Close the device even if rendering fails.
  on.exit(dev.off(), add = TRUE)
  # Explicit print(): auto-printing does not happen inside functions or
  # when the script is run through source().
  print(p)
  invisible(p)
}

# On-screen preview of the first contrast.
p <- make_volcano(qlf, 'logFC.F.nT_vs_nF.nT')
p

# EPS images (fig. 3a).
p <- save_volcano(qlf, 'logFC.F.nT_vs_nF.nT', "volcano_Fnt_vs_nFnT.eps")
# NOTE(review): the original script wrote "volcano_Ft_vs_nFT.eps" twice with
# identical settings, and there is no volcano plot for the fourth contrast
# (logFC.nF.T_vs_nF.nT). The duplicate is dropped here; confirm whether the
# fourth contrast was meant to be plotted instead.
p <- save_volcano(qlf, 'logFC.F.T_vs_nF.T', "volcano_Ft_vs_nFT.eps")
p <- save_volcano(qlf, 'logFC.F.T_vs_F.nT', "volcano_FT_vs_FnT.eps")

# Intersection of DEG sets.
# Split the significant genes (df5, p.adj < 0.05) by |log2FC| > 1 in the
# stress comparisons, without (notrim) and with (trim) DSN treatment.
df6 <- subset(df5, abs(df5$logFC.F.nT_vs_nF.nT) > 1)
df7 <- subset(df5, abs(df5$logFC.F.T_vs_nF.T) > 1)

# The same for the DSN treatment comparisons.
df9 <- subset(df5, abs(df5$logFC.F.T_vs_F.nT) > 1)
df10 <- subset(df5, abs(df5$logFC.nF.T_vs_nF.nT) > 1)

library("ggVennDiagram")

# Two-set Venn diagram with the colour scheme used in the paper.
#   sets       list of gene id vectors
#   set_names  labels of the sets, in the same order
make_venn <- function(sets, set_names) {
  ggVennDiagram(sets, category.names = set_names, label_alpha = 0, label_size = 9) +
    scale_fill_distiller(palette = "RdBu", direction = -1) +
    theme(legend.position = "none")
}

# Write a Venn diagram to an EPS file and return the plot invisibly.
save_venn <- function(sets, set_names, file) {
  p <- make_venn(sets, set_names)
  cairo_ps(filename = file,
           width = 4, height = 3, pointsize = 12,
           onefile = FALSE, family = "sans",
           antialias = c("default"),
           fallback_resolution = 300)
  # Close the device even if rendering fails.
  on.exit(dev.off(), add = TRUE)
  # Explicit print(): auto-printing does not happen inside functions or
  # when the script is run through source().
  print(p)
  invisible(p)
}

# On-screen preview, then the EPS images for fig. 3b.
x <- list(df6$ensembl_gene_id, df7$ensembl_gene_id)
p <- make_venn(x, c("F.nT_vs_nF.nT", "F.T_vs_nF.T"))
p
p <- save_venn(x, c("F.nT_vs_nF.nT", "F.T_vs_nF.T"), "venn_1_FST.eps")

x <- list(df9$ensembl_gene_id, df10$ensembl_gene_id)
p <- save_venn(x, c("F.T_vs_F.nT", "nF.T_vs_nF.nT"), "venn_2_trim.eps")

# Identify the DEGs that appear only after DSN treatment (512 in the paper).
library("VennDiagram")
x <- list(df6$ensembl_gene_id, df7$ensembl_gene_id)
ol <- calculate.overlap(x)
ol_size <- lengths(ol)
# ol$a3 holds the genes common to both sets (1250 in the paper).
trim_only <- df7[!df7$ensembl_gene_id %in% ol$a3, ]

# log2FC correlation between DSN-treated and untreated samples.
library(ggpubr)
library(tidyverse)

# Scatter plot of two logFC columns with a linear fit and the Pearson
# correlation annotated, written to an EPS file.
#   data             data frame holding the logFC columns
#   x_col, y_col     names of the logFC columns to compare
#   file             output EPS file name
#   label_x, label_y position of the correlation label
#   y_limits         optional y axis limits; NULL keeps the default range
# Returns the ggplot object invisibly.
save_logfc_corr <- function(data, x_col, y_col, file, label_x, label_y,
                            y_limits = NULL) {
  p <- ggplot(data, aes(x = .data[[x_col]], y = .data[[y_col]])) +
    geom_point() +
    stat_cor(method = "pearson", label.x = label_x, label.y = label_y, size = 5) +
    geom_smooth(method = lm) +
    # Keep the column names as axis titles.
    labs(x = x_col, y = y_col) +
    theme_classic() +
    theme(axis.text = element_text(size = 12),
          axis.title = element_text(size = 15))
  if (!is.null(y_limits)) {
    p <- p + scale_y_continuous(limits = y_limits)
  }
  cairo_ps(filename = file,
           width = 4, height = 3, pointsize = 12,
           onefile = FALSE, family = "sans",
           antialias = c("default"),
           fallback_resolution = 300)
  # Close the device even if rendering fails.
  on.exit(dev.off(), add = TRUE)
  # Explicit print(): auto-printing does not happen inside functions or
  # when the script is run through source().
  print(p)
  invisible(p)
}

# Stress comparisons, all significant genes (fig. 3c).
p <- save_logfc_corr(df5, "logFC.F.nT_vs_nF.nT", "logFC.F.T_vs_nF.T",
                     "all_genes_corr.eps", label_x = -4, label_y = 10,
                     y_limits = c(-6, 12))
# Stress comparisons, only the DEGs that appear after DSN treatment (fig. 3c).
p <- save_logfc_corr(trim_only, "logFC.F.nT_vs_nF.nT", "logFC.F.T_vs_nF.T",
                     "diff_genes_corr.eps", label_x = -1, label_y = 5)
# Trimmed vs untrimmed comparisons under the same stress condition.
p <- save_logfc_corr(df5, "logFC.F.T_vs_F.nT", "logFC.nF.T_vs_nF.nT",
                     "log2fc_corr_tr_untr_nF.eps", label_x = -4, label_y = 10,
                     y_limits = c(-6, 12))

#To investigate transcript abundance and find the 10 most highly expressed transcripts,
#normalize the counts to transcripts per million (TPM).
#First define the normalization function.
#' Convert a count matrix to transcripts per million (TPM).
#'
#' @param x Object coercible with `as.matrix()` to a genes-by-samples matrix
#'   of counts.
#' @param gene.length Gene lengths, one per row of `x` and in the same order
#'   (a shorter vector is recycled by R's usual rules).
#' @return Matrix with the dimensions of `x`: each count divided by its
#'   sample's sum of length-normalized counts, scaled by 1e6 and divided by
#'   the gene length.
calc_tpm <- function(x, gene.length) {
  count_mat <- as.matrix(x)
  # Per-sample sum of length-normalized counts.
  sample_totals <- colSums(count_mat / gene.length)
  # Divide every column by its sample total.
  per_sample <- sweep(count_mat, 2, sample_totals, "/")
  (per_sample * 1e06) / gene.length
}
# Gene lengths are needed for TPM; download transcript lengths from Ensembl
# with biomaRt and use the median transcript length per gene.
G_list2 <- getBM(filters = "ensembl_gene_id",
                 attributes = c('ensembl_gene_id', 'ensembl_transcript_id', 'transcript_length'),
                 uniqueRows = TRUE, values = rownames(y), mart = ensembl)
G_list3 <- G_list2 %>%
  group_by(ensembl_gene_id) %>%
  summarize(genelength = median(transcript_length))
# Plain data frame, so that row names can be set without a tibble warning.
G_list3 <- as.data.frame(G_list3)
rownames(G_list3) <- G_list3$ensembl_gene_id
# Align the lengths with the rows of y. G_list3 is ordered by gene id and can
# lack genes that BioMart did not return, so passing its column directly
# would pair counts with the wrong lengths (or silently recycle them).
gene_len <- G_list3$genelength[match(rownames(y), G_list3$ensembl_gene_id)]
has_len <- !is.na(gene_len)
# TPM normalization, restricted to genes with a known length (a single NA
# length would otherwise turn every TPM value into NA).
tpm_man2 <- calc_tpm(y$counts[has_len, , drop = FALSE],
                     gene.length = gene_len[has_len])
# Split into the two control datasets.
tpm_man3 <- tpm_man2[, c(1, 3)] # control, untrimmed / no DSN
tpm_man4 <- tpm_man2[, c(6, 8)] # control, trimmed / +DSN
# SD and median TPM per gene.
library(matrixStats)
tpm_man3_df <- transform(tpm_man3, SD = rowSds(tpm_man3[, 1:2], na.rm = TRUE), Median = rowMedians(tpm_man3[, 1:2], na.rm = TRUE))
tpm_man4_df <- transform(tpm_man4, SD = rowSds(tpm_man4[, 1:2], na.rm = TRUE), Median = rowMedians(tpm_man4[, 1:2], na.rm = TRUE))
# Tables of the 10 most abundant genes (by median TPM) per treatment.
# Returns columns 3:7 of the ranked table: SD, Median, row_num, trim, genes.
top_ten_by_median <- function(tpm_df, label) {
  top <- head(tpm_df[order(tpm_df$Median, decreasing = TRUE), ], 10)
  top$row_num <- seq.int(nrow(top))
  top$trim <- label
  top$genes <- rownames(top)
  top[, 3:7]
}
df52_new3 <- top_ten_by_median(tpm_man3_df, 'No Trimmer')
df53_new3 <- top_ten_by_median(tpm_man4_df, 'Trimmer')
df54_new3 <- rbind(df52_new3, df53_new3)

# Download annotation for these genes with biomaRt.
G_list4 <- getBM(
  filters = "ensembl_gene_id",
  attributes = c('ensembl_gene_id', 'description', 'rgd_symbol',
                 'rgd_id', 'entrezgene_description'),
  values = df54_new3$genes,
  mart = ensembl
)

# Merge the TPM table with the annotation.
df55_new3 <- merge(G_list4, df54_new3, by.x = "ensembl_gene_id", by.y = "genes")

# Sort by rank within treatment and number the bars for the plot.
df55_new3 <- df55_new3[order(df55_new3$row_num, df55_new3$trim), ]
df55_new3$row_num2 <- seq.int(nrow(df55_new3))

# Fill missing labels in column 3 from other columns of the same row.
# NOTE(review): rows and columns are addressed by position; presumably
# column 3 is rgd_symbol, 5 is entrezgene_description and 1 is
# ensembl_gene_id. These indices depend on the current data; verify them.
df55_new3[1, 3] <- df55_new3[1, 5]
df55_new3[4, 3] <- df55_new3[4, 5]
df55_new3[13, 3] <- df55_new3[13, 5]
df55_new3[9, 3] <- df55_new3[9, 1]
# Bar plot of median TPM (with SD error bars) for the top transcripts,
# filled by treatment and labelled with the gene symbol.
p <- ggplot(df55_new3, aes(x = row_num2, y = Median, fill = trim)) +
  geom_bar(stat = "identity", color = "black",
           position = position_dodge()) +
  geom_errorbar(aes(ymin = Median - SD, ymax = Median + SD), width = .2,
                position = position_dodge(.9)) +
  geom_text(size = 4, aes(label = rgd_symbol), nudge_y = 2e+05, angle = 90) +
  labs(y = expression('TPM')) +
  scale_y_continuous(expand = expansion(mult = c(0, .1))) +
  theme_classic() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_text(size = 24),
        axis.text.y = element_text(size = 15),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.title = element_text(size = 20),
        legend.text = element_text(size = 24))

# On-screen preview.
p

# EPS image for fig. 1d; the plot object built above is reused.
library(ggplot2)
cairo_ps(filename =  "TPM_10transcripts.eps",
         width = 7, height = 7, pointsize = 12,
         onefile = FALSE, family = "sans",
         antialias = c("default"),
         fallback_resolution = 300)
# Explicit print() so the figure is also written when the script is run
# through source(), where top-level auto-printing is off.
print(p)
dev.off()
dev.set()


# After DSN treatment the logFC of some genes is shifted. To look for the
# cause, the significant genes are split into "opposite" genes and
# unidirectionally changed genes.
# As implemented here, a gene is "opposite" when its untreated logFC
# (F.nT_vs_nF.nT) is positive and larger than its DSN-treated logFC
# (F.T_vs_nF.T), or negative and smaller than it.
# NOTE(review): the original comment stated the inequality the other way
# round (log2FC_DSN > log2FC_nDSN for positive values); confirm which
# definition is intended.
pos_gen <- subset(df5, logFC.F.nT_vs_nF.nT > 0)
neg_gen <- subset(df5, logFC.F.nT_vs_nF.nT < 0)
oppos_gen1 <- subset(pos_gen, logFC.F.nT_vs_nF.nT > logFC.F.T_vs_nF.T)
oppos_gen2 <- subset(neg_gen, logFC.F.nT_vs_nF.nT < logFC.F.T_vs_nF.T)
oppos_gen <- rbind(oppos_gen1, oppos_gen2)

# log2FC change rank: rank = |log2FC_DSN| - |log2FC_nDSN|.
oppos_gen$rank <- abs(oppos_gen$logFC.F.T_vs_nF.T) - abs(oppos_gen$logFC.F.nT_vs_nF.nT)

# cDNA sequences of these genes from BioMart.
gc_oppos_seq <- getSequence(
  id = oppos_gen$ensembl_gene_id,
  type = "ensembl_gene_id",
  seqType = "cdna",
  upstream = 0,
  mart = ensembl
)

# Length of every returned sequence, used for the per-gene mean length.
gc_oppos_seq$length <- nchar(gc_oppos_seq$cdna)

# One row per gene: all its cDNA sequences concatenated (for GC counting)
# and the mean length of the individual sequences.
gc_oppos_seq2 <- gc_oppos_seq %>%
  group_by(ensembl_gene_id) %>%
  summarise(cdna = paste(cdna, collapse = ""),
            mean_len = mean(length))

# GC fraction of the concatenated sequence: (#G + #C) / total length.
gc_oppos_seq2$length <- nchar(gc_oppos_seq2$cdna)
gc_oppos_seq2$gcount <- nchar(gsub("[^G]", "", gc_oppos_seq2$cdna))
gc_oppos_seq2$ccount <- nchar(gsub("[^C]", "", gc_oppos_seq2$cdna))
gc_oppos_seq2$gc_count <- gc_oppos_seq2$gcount + gc_oppos_seq2$ccount
gc_oppos_seq2$gc_perc <- gc_oppos_seq2$gc_count / gc_oppos_seq2$length

# Attach the sequence statistics to the logFC table of the opposite genes.
oppos_gen <- left_join(oppos_gen, gc_oppos_seq2, by = 'ensembl_gene_id', copy = FALSE)
# Unidirectionally changed genes: significant genes (df5) that are not in
# the opposite-gene list.
unidir_gen <- df5[!df5$ensembl_gene_id %in% oppos_gen$ensembl_gene_id, ]
# Repeat the sequence analysis for these genes. The download below was
# missing in the original script, which used gc_unidir_gen_seq without ever
# creating it; it mirrors the call used for the opposite genes.
gc_unidir_gen_seq <- getSequence(id = unidir_gen$ensembl_gene_id,
                                 type = "ensembl_gene_id",
                                 seqType = "cdna",
                                 upstream = 0,
                                 mart = ensembl)
# Length of every returned sequence, used for the per-gene mean length.
gc_unidir_gen_seq$length <- nchar(gc_unidir_gen_seq$cdna)
# One row per gene: concatenated cDNA and mean sequence length.
gc_unidir_gen_seq2 <- gc_unidir_gen_seq %>%
  group_by(ensembl_gene_id) %>%
  summarise(cdna = paste(cdna, collapse = ""),
            mean_len = mean(length))

# GC fraction of the concatenated sequence: (#G + #C) / total length.
gc_unidir_gen_seq2$length <- nchar(gc_unidir_gen_seq2$cdna)

gc_unidir_gen_seq2$gcount <- lengths(regmatches(gc_unidir_gen_seq2$cdna, gregexpr("G", gc_unidir_gen_seq2$cdna)))

gc_unidir_gen_seq2$ccount <- lengths(regmatches(gc_unidir_gen_seq2$cdna, gregexpr("C", gc_unidir_gen_seq2$cdna)))

gc_unidir_gen_seq2$gc_count <- gc_unidir_gen_seq2$gcount + gc_unidir_gen_seq2$ccount

gc_unidir_gen_seq2$gc_perc <- gc_unidir_gen_seq2$gc_count / gc_unidir_gen_seq2$length

unidir_gen <- left_join(unidir_gen, gc_unidir_gen_seq2, by = 'ensembl_gene_id', copy = FALSE)
# Variable w marks which list a gene belongs to.
oppos_gen$w <- rep(c('oppos_gen'), times = nrow(oppos_gen))
unidir_gen$w <- rep(c('unidir_gen'), times = nrow(unidir_gen))
# TPM values of the untreated control samples for both gene lists.
tpm_man3_df$ensembl_gene_id <- rownames(tpm_man3_df)
tpm3_oppos_gen <- tpm_man3_df[tpm_man3_df$ensembl_gene_id %in% oppos_gen$ensembl_gene_id, ]

tpm3_unidir_gen <- tpm_man3_df[tpm_man3_df$ensembl_gene_id %in% unidir_gen$ensembl_gene_id, ]
# Keep the two sample columns and the gene id.
tpm3_oppos_gen <- tpm3_oppos_gen[, c(1, 2, 5)]
tpm3_unidir_gen <- tpm3_unidir_gen[, c(1, 2, 5)]
# Long format: one row per gene and control sample.
tpm3_oppos_gen <- pivot_longer(tpm3_oppos_gen, cols = c('n1_1_S5_R1_001_sorted.bam', 'n3_S4_R1_001_sorted.bam'), names_to = "sample_name", values_to = "tpm")
tpm3_unidir_gen <- pivot_longer(tpm3_unidir_gen, cols = c('n1_1_S5_R1_001_sorted.bam', 'n3_S4_R1_001_sorted.bam'), names_to = "sample_name", values_to = "tpm")
# The rank was only computed for the opposite genes; compute it for the
# unidirectional genes with the same formula so both tables can be merged.
unidir_gen$rank <- abs(unidir_gen$logFC.F.T_vs_nF.T) - abs(unidir_gen$logFC.F.nT_vs_nF.nT)
# Add mean cDNA length, GC content, rank and the grouping variable w.
# The source tables are tpm3_oppos_gen / tpm3_unidir_gen (the "*1" names did
# not exist yet on the right-hand side in the original script).
# dplyr::select is called explicitly because AnnotationDbi and biomaRt,
# attached after tidyverse, mask select().
tpm3_oppos_gen1 <- tpm3_oppos_gen %>%
  left_join(dplyr::select(oppos_gen, mean_len, gc_perc, rank, w, ensembl_gene_id), by = "ensembl_gene_id")


tpm3_unidir_gen1 <- tpm3_unidir_gen %>%
  left_join(dplyr::select(unidir_gen, mean_len, gc_perc, rank, w, ensembl_gene_id), by = "ensembl_gene_id")
# The split into opposite and unidirectional genes turned out to matter
# little; the logFC shift is mainly described by the rank, so the two
# tables are combined.
tpm3_alltab <- rbind(tpm3_oppos_gen1, tpm3_unidir_gen1)
# Scatter plots of the log2FC rank against transcript TPM, GC content and
# mean transcript length, for all significant genes.
# library() (unlike require()) stops immediately if the package is missing.
library(scales)

# Scatter plot with |rank| on the x axis and a log2-scaled y axis.
#   data        table with the plotted columns
#   mapping     aes() mapping providing x, y and colour
#   palette     ColorBrewer palette name for the colour scale
#   colour_lab  legend title
#   y_lab       y axis title
rank_scatter <- function(data, mapping, palette, colour_lab, y_lab) {
  ggplot(data, mapping) +
    geom_point() +
    labs(colour = colour_lab, x = expression('log2FC rank'), y = y_lab) +
    scale_y_continuous(trans = "log2",
                       breaks = trans_breaks("log2", function(x) 2^x),
                       labels = trans_format("log2", math_format(2^.x))) +
    scale_color_distiller(palette = palette) +
    theme_classic() +
    theme(axis.text = element_text(size = 12),
          axis.title = element_text(size = 15))
}

# Write one of the rank plots to an EPS file (fig. 3d).
save_rank_plot <- function(plot, file) {
  cairo_ps(filename = file,
           width = 4, height = 3, pointsize = 12,
           onefile = FALSE, family = "sans",
           antialias = c("default"),
           fallback_resolution = 300)
  # Close the device even if rendering fails.
  on.exit(dev.off(), add = TRUE)
  # Explicit print(): auto-printing does not happen inside functions or
  # when the script is run through source().
  print(plot)
  invisible(plot)
}

# Rank vs TPM, coloured by GC content.
p1 <- rank_scatter(tpm3_alltab,
                   aes(x = abs(rank), y = tpm + 1, color = gc_perc),
                   palette = "YlGnBu",
                   colour_lab = expression('GC%'),
                   y_lab = expression('TPM+1'['log2 scale']))
p1
save_rank_plot(p1, "rank_vs_tpm.eps")

# Rank vs GC content, coloured by log2(TPM + 1).
p1 <- rank_scatter(tpm3_alltab,
                   aes(x = abs(rank), y = gc_perc, color = log2(tpm + 1)),
                   palette = "Spectral",
                   colour_lab = expression('TPM'),
                   y_lab = expression('GC'['%']))
p1
save_rank_plot(p1, "rank_vs_gcpers.eps")

# Rank vs mean cDNA length, coloured by GC content.
p1 <- rank_scatter(tpm3_alltab,
                   aes(x = abs(rank), y = mean_len, color = gc_perc),
                   palette = "YlGnBu",
                   colour_lab = expression('GC%'),
                   y_lab = expression('Mean cDNA length'['bp']))
p1
save_rank_plot(p1, "rank_vs_meanlen.eps")

## Functional annotation.
## Install the required Bioconductor packages only when they are missing, so
## that re-running the script does not reinstall them every time.
## org.Rn.eg.db is the rat (Rattus norvegicus) annotation package used for the
## GO mapping below.
go_pkgs <- c("topGO", "Rgraphviz", "org.Rn.eg.db")
go_missing <- go_pkgs[!vapply(go_pkgs, requireNamespace, logical(1),
                              quietly = TRUE)]
if (length(go_missing) > 0) {
  BiocManager::install(go_missing)
}

# First build the GO ontology of all significant genes (p.adj < 0.05).
library(topGO)
qv <- qlf$table[, c(1, 8)]
qv$ensembl_gene_id <- rownames(qv)
# Keep column 8 of qlf$table and the gene id.
# NOTE(review): column 8 is presumably the adjusted p-value -- confirm.
qv <- qv[, c(2, 3)]
rownames(qv)
# Factor marking significant (1) and non-significant (0) genes. Only the
# numeric column is tested; the previous whole-table comparison also compared
# the character gene ids with 0.05 and was correct only by accident.
s <- factor(as.integer(qv[[1]] < 0.05))
table(s)
names(s) <- rownames(qv)

tgo1 <- new("topGOdata",
            ontology = "BP",        # enrichment in Biological Process GO terms
            allGenes = s,
            nodeSize = 10,          # nodeSize passed to topGO (node size cut-off)
            annotationFun = annFUN.org,
            mapping = "org.Rn.eg.db",
            ID = "Ensembl")
r1 <- runTest(tgo1, algorithm = "classic", statistic = "fisher")
hist(score(r1))                                # histogram of p-values
goqv <- p.adjust(score(r1), method = "BH")     # multiple-comparison correction
sort(goqv)[1:10]                               # 10 smallest adjusted p-values

# Table of the 10 most significant nodes: shown once and written to disk.
gt <- GenTable(tgo1, r1, topNodes = 10)
print(gt)
write.csv(gt, '/home/ssdd/data_nvme/trimmer_seq/counts/Supplementary_table_2.csv')
# Graph of nested GO terms around the 5 most significant nodes.
# The currently active device is closed first, then the graph is drawn once
# for inspection and once more into the EPS file (fig. 4a).
dev.off()
go_scores <- score(r1)
showSigOfNodes(tgo1, go_scores, firstSigNodes = 5, useInfo = 'all')

cairo_ps(
  filename = "nodes_table.eps",
  width = 11,
  height = 6,
  pointsize = 12,
  onefile = FALSE,
  family = "sans",
  antialias = "default",
  fallback_resolution = 300
)
showSigOfNodes(tgo1, go_scores,
               firstSigNodes = 5,
               useInfo = 'all',
               useFullNames = TRUE)
dev.off()
dev.set()

## Drawing dotplots and network pictures.
## Split the gene tables into up-regulated (logFC > 1) and down-regulated
## (logFC < -1) subsets and run a GO Biological Process enrichment on each.
library(clusterProfiler)

# Run enrichGO on a vector of Ensembl gene ids with the settings shared by all
# four analyses (rat OrgDb, BP ontology, BH correction, 0.05 cut-offs).
enrich_go_bp <- function(genes) {
  enrichGO(gene          = genes,
           OrgDb         = "org.Rn.eg.db",
           keyType       = "ENSEMBL",
           ont           = "BP",
           pAdjustMethod = "BH",
           pvalueCutoff  = 0.05,
           qvalueCutoff  = 0.05,
           readable      = TRUE)
}

# Up-regulated genes, df6.
tes_set <- subset(df6, df6$logFC.F.nT_vs_nF.nT > 1)
GO_results <- enrich_go_bp(tes_set$ensembl_gene_id)

# Up-regulated genes, df7.
# NOTE(review): df7 is filtered on the same logFC.F.nT_vs_nF.nT column as df6;
# confirm this is the intended contrast for this table.
tes_set_tr <- subset(df7, df7$logFC.F.nT_vs_nF.nT > 1)
GO_results_tr <- enrich_go_bp(tes_set_tr$ensembl_gene_id)

# Down-regulated genes, df6.
tes_set2 <- subset(df6, df6$logFC.F.nT_vs_nF.nT < -1)
GO_results2 <- enrich_go_bp(tes_set2$ensembl_gene_id)

# Down-regulated genes, df7.
tes_set2_tr <- subset(df7, df7$logFC.F.nT_vs_nF.nT < -1)
GO_results2_tr <- enrich_go_bp(tes_set2_tr$ensembl_gene_id)

# The original code assigned `genes_to_test` inside each call; its final value
# is kept here in case later code still reads it.
genes_to_test <- tes_set2_tr$ensembl_gene_id

## Write the four GO enrichment results to Supplementary tables 3-6.
go_tables <- list(
  list(GO_results,     '/home/ssdd/data_nvme/trimmer_seq/counts/Supplementary_table_3.csv'),
  list(GO_results2,    '/home/ssdd/data_nvme/trimmer_seq/counts/Supplementary_table_4.csv'),
  list(GO_results_tr,  '/home/ssdd/data_nvme/trimmer_seq/counts/Supplementary_table_5.csv'),
  list(GO_results2_tr, '/home/ssdd/data_nvme/trimmer_seq/counts/Supplementary_table_6.csv')
)
for (entry in go_tables) {
  write.csv(entry[[1]], entry[[2]])
}

## Plots for the fig.4b picture: one dotplot (top 20 categories) per GO
## enrichment result, each written to its own EPS file.
dotplot_jobs <- list(
  "ann_DOWN_genes_UNTRIM.eps" = GO_results2,
  "ann_DOWN_genes_TRIM.eps"   = GO_results2_tr,
  "ann_UP_genes_UNTRIM.eps"   = GO_results,
  "ann_UP_genes_TRIM.eps"     = GO_results_tr
)
for (eps_file in names(dotplot_jobs)) {
  cairo_ps(filename = eps_file,
           width = 8, height = 8, pointsize = 12,
           onefile = FALSE, family = "sans",
           antialias = "default",
           fallback_resolution = 300)
  # The plot object has to be printed explicitly: inside a loop (and under
  # source()) nothing is drawn by autoprinting.
  print(dotplot(dotplot_jobs[[eps_file]], showCategory = 20))
  dev.off()
  # Kept from the original workflow: re-selects (or opens) a device after the
  # EPS device has been closed.
  dev.set()
}

# Save cnetplots from fig.5a: gene-concept networks for the down- and
# up-regulated GO results of the untrimmed data set.
cnetplot_jobs <- list(
  "netw_DOWN_genes_UNTRIM.eps" = GO_results2,
  "netw_UP_genes_UNTRIM.eps"   = GO_results
)
for (eps_file in names(cnetplot_jobs)) {
  cairo_ps(filename = eps_file,
           width = 10, height = 10, pointsize = 12,
           onefile = FALSE, family = "sans",
           antialias = "default",
           fallback_resolution = 300)
  # Explicit print(): the plot object is not drawn automatically inside a loop
  # or when the script is run through source().
  print(cnetplot(cnetplot_jobs[[eps_file]]))
  dev.off()
  # Kept from the original workflow: re-selects (or opens) a device after the
  # EPS device has been closed.
  dev.set()
}

## KEGG pathway enrichment analysis.
## enrichKEGG is given Entrez gene ids, so rows of df6 without an
## entrezgene_id are dropped before submission.
library(clusterProfiler)

qv_3s <- df6[!is.na(df6$entrezgene_id), ]

kk <- enrichKEGG(
  gene          = qv_3s$entrezgene_id,
  organism      = 'rno',
  keyType       = 'kegg',
  pAdjustMethod = "BH",
  pvalueCutoff  = 0.9,  # deliberately permissive cut-offs to keep more pathways
  qvalueCutoff  = 0.9
)

# Store the KEGG enrichment table as Supplementary table 7.
write.csv(kk, '/home/ssdd/data_nvme/trimmer_seq/counts/Supplementary_table_7.csv')
## Visualize with pathview (installed only when it is missing).
if (!requireNamespace("pathview", quietly = TRUE)) {
  BiocManager::install("pathview")
}
library(pathview)

# Named vector of values to paint onto the pathway map.
# NOTE(review): column 1 of qv_3s is presumably the logFC and column 10 the
# Entrez gene id (gene.idtype = "entrez" below) -- confirm against df6.
logFC <- qv_3s[, 1]
names(logFC) <- as.character(qv_3s[, 10])

## Get png image.
## For fig.5 the following pathways were selected:
## rno04080, rno04060, rno04010, rno04024, rno04310, rno04921
pathview(gene.data = logFC,
         gene.idtype = "entrez",
         cpd.idtype = "kegg",
         pathway.id = "rno04921",
         map.null = FALSE,
         map.symbol = TRUE,
         same.layer = FALSE,
         species = "rno",
         limit = list(gene = 5, cpd = 1))

## Choosing reference genes for PCR.
# Keep genes whose log2FC lies strictly inside (-0.04, 0.04), separately for
# each of the two contrasts stored in df5.
df16 <- subset(df5, abs(df5$logFC.F.nT_vs_nF.nT) < 0.04)
df17 <- subset(df5, abs(df5$logFC.F.T_vs_nF.T) < 0.04)

# Genes shared by both selections (element a3 of the overlap list).
library("VennDiagram")
ol4 <- calculate.overlap(
  list(df16$ensembl_gene_id, df17$ensembl_gene_id)
)
ol4_size <- vapply(ol4, length, integer(1))
ol4$a3

# One-column data frame of the shared ids, duplicated under a readable name.
df19 <- as.data.frame(ol4$a3)
df19$ensemble_id <- df19[["ol4$a3"]]

# Ensembl ids of popular reference genes.
# "ENSRNOG00000001766" is listed twice, exactly as in the original list.
ref.genes <- c(
  "ENSRNOG00000057823", "ENSRNOG00000000913", "ENSRNOG00000001766",
  "ENSRNOG00000013331", "ENSRNOG00000010390", "ENSRNOG00000001489",
  "ENSRNOG00000031367", "ENSRNOG00000034254", "ENSRNOG00000020618",
  "ENSRNOG00000018630", "ENSRNOG00000028834", "ENSRNOG00000019578",
  "ENSRNOG00000017840", "ENSRNOG00000005762", "ENSRNOG00000008195",
  "ENSRNOG00000001148", "ENSRNOG00000047098", "ENSRNOG00000008489",
  "ENSRNOG00000056728", "ENSRNOG00000000521", "ENSRNOG00000058249",
  "ENSRNOG00000011709", "ENSRNOG00000027646", "ENSRNOG00000013009",
  "ENSRNOG00000003689", "ENSRNOG00000056596", "ENSRNOG00000001766",
  "ENSRNOG00000019106", "ENSRNOG00000068679", "ENSRNOG00000056041",
  "ENSRNOG00000018994", "ENSRNOG00000000471", "ENSRNOG00000027864",
  "ENSRNOG00000017123", "ENSRNOG00000019834"
)
# Merge the popular reference genes with the genes common to both data sets
# that have |log2FC| < 0.04.
ref.genes <- c(ref.genes, df19$ensemble_id)

# Pull the rows for these genes out of qlf$table.
df15 <- qlf$table
df15$ensemble_gene_id <- rownames(df15)
df20 <- df15[df15$ensemble_gene_id %in% ref.genes, ]

# library() instead of require(): a missing annotation package now stops the
# script here instead of failing later inside mapIds().
library(org.Rn.eg.db)
df20$Symbol <- mapIds(org.Rn.eg.db,
                      keys = df20$ensemble_gene_id,
                      column = "ALIAS",
                      keytype = "ENSEMBL")

# Add missing annotations by hand. Columns are addressed by name; the original
# used positions 9 (ensemble_gene_id) and 10 (Symbol).
# NOTE(review): the row numbers are tied to the current row order of df20 and
# must be re-checked whenever the gene selection changes.
df20[1, "Symbol"] <- "Tmem80"
df20[13, "Symbol"] <- "AABR07005506.1"
# For these rows the Ensembl id itself is used as the symbol.
fallback_rows <- c(38, 42, 48, 70)
df20[fallback_rows, "Symbol"] <- df20[fallback_rows, "ensemble_gene_id"]
## Counts per million (not log-transformed) as a data frame, with the gene id
## stored in its own column.
cpm <- as.data.frame(cpm(y, log = FALSE))
cpm$ensemble_gene_id <- rownames(cpm)

## Table for SARP.compo: CPM rows of the candidate reference genes, annotated
## with columns 9 and 10 of df20 and reduced to columns 1-10 and 12.
ref_rows <- cpm$ensemble_gene_id %in% df20$ensemble_gene_id
cpm_rg <- left_join(
  cpm[ref_rows, ],
  df20[, c(9, 10)],
  by = 'ensemble_gene_id',
  copy = TRUE
)
cpm_rg <- cpm_rg[, c(1:10, 12)]

# Reshape to one row per sample and one column per gene symbol: the ten sample
# columns are stacked into (df, value) pairs, spread again by Symbol, and the
# result is attached to the sample sheet df1 through the sample name `df`.
cpm_rg <- cpm_rg %>%
  pivot_longer(
    cols = c('n1_1_S5_R1_001_sorted.bam', 'n3_S4_R1_001_sorted.bam',
             'n16_S6_R1_001_sorted.bam', 'n30_S3_R1_001_sorted.bam',
             'n34_S2_R1_001_sorted.bam',
             'tr_1_S1_R1_001_sorted.bam', 'tr_3_S5_R1_001_sorted.bam',
             'tr_16_S2_R1_001_sorted.bam', 'tr_30_S9_R1_001_sorted.bam',
             'tr_34_S11_R1_001_sorted.bam'),
    names_to = "df"
  ) %>%
  pivot_wider(names_from = Symbol, values_from = value)

cpm_rg <- left_join(df1, cpm_rg, by = 'df', copy = FALSE)

# Split the samples by group and keep column 2 plus columns 8-88 in each half.
sarp_cols <- c(2, 8:88)
untrim_rows <- cpm_rg$group %in% c("FST.notrim", "noFST.notrim")
trim_rows <- cpm_rg$group %in% c("FST.trim", "noFST.trim")
df34 <- cpm_rg[untrim_rows, ]
df34 <- df34[, sarp_cols]  # cpm for untrim
df35 <- cpm_rg[trim_rows, ]
df35 <- df35[, sarp_cols]  # cpm for trim

## Next was done with the SARP.compo package
##   [see Curis et al., 2019, Bioinformatics, PMID: 30010788]


exp.data <- df34

## Name of the first column (the grouping variable), kept as a plain character
## string. Wrapping it in as.factor() made data-frame indexing use the factor's
## integer code, which only worked because the column happens to be the first.
group.name <- names(exp.data)[1]
table(exp.data[, group.name])

## Names of genes: every column except the first.
genes <- names(exp.data)[-1]
genes

## Construct the p-value matrix with anva1.fpc as the test function and
## group.name as the condition column.
Mp <- creer.Mp(d = as.data.frame(exp.data), noms = genes, log = TRUE,
               f.p = anva1.fpc, v.X = group.name)

## Construct the graph from the p-value matrix at the 0.05 threshold.
grf <- grf.Mp(Mp, p = 0.05,
              complement = TRUE)

## Graph for the untrimmed data at the 0.05 threshold: drawn on the current
## device first, then saved as the EPS image for fig.6a.
plot(grf, p = 0.05, cex.lab = 1.5)

cairo_ps(
  filename = "ref_genes_graph_non_trim_new.eps",
  width = 11,
  height = 10,
  pointsize = 12,
  onefile = FALSE,
  family = "sans",
  antialias = "default",
  fallback_resolution = 300
)
plot(grf, p = 0.05, cex.lab = 1.5)
dev.off()
dev.set()

# Hierarchy tree built from the same p-value matrix.
arbre <- arbre.Mp(Mp, en.log = TRUE)
str(arbre)
class(arbre)

# Draw the tree on the current device.
plot(arbre,
     ylab = "p-value threshold", xlab = "",
     ylim = c(0, 0.08),
     main = "", sub = "",
     cex.lab = 1.7)

# Same tree saved as EPS, with smaller text and wider margins.
cairo_ps(
  filename = "tree_non_trim.eps",
  width = 10,
  height = 9,
  pointsize = 12,
  onefile = FALSE,
  family = "sans",
  antialias = "default",
  fallback_resolution = 300
)
par(cex = 0.6, mar = c(11, 8, 4, 1))
plot(arbre,
     ylab = "p-value threshold", xlab = "",
     ylim = c(0, 0.08),
     main = "", sub = "",
     cex.lab = 1.7)
dev.off()
dev.set()

### Now do the same with the trimmed CPM table.
exp.data1 <- df35

## Name of the first column (the grouping variable), kept as a plain character
## string. Wrapping it in as.factor() made data-frame indexing use the factor's
## integer code, which only worked because the column happens to be the first.
group.name1 <- names(exp.data1)[1]
table(exp.data1[, group.name1])

## Names of genes: every column except the first.
genes1 <- names(exp.data1)[-1]
genes1

## Construct the p-value matrix with anva1.fpc as the test function and
## group.name1 as the condition column.
Mp1 <- creer.Mp(d = as.data.frame(exp.data1), noms = genes1, log = TRUE,
                f.p = anva1.fpc, v.X = group.name1)

## Construct the graph from the p-value matrix at the 0.05 threshold.
grf1 <- grf.Mp(Mp1, p = 0.05,
               complement = TRUE)

## Graph for the trimmed data at the 0.05 threshold: drawn on the current
## device first, then saved as an EPS image (fig.6a).
plot(grf1, p = 0.05, cex.lab = 1.5)

cairo_ps(
  filename = "ref_genes_graph_trim_new.eps",
  width = 11,
  height = 10,
  pointsize = 12,
  onefile = FALSE,
  family = "sans",
  antialias = "default",
  fallback_resolution = 300
)
plot(grf1, p = 0.05, cex.lab = 1.5)
dev.off()
dev.set()

# Hierarchy tree built from the same p-value matrix.
arbre1 <- arbre.Mp(Mp1, en.log = TRUE)
str(arbre1)
class(arbre1)

# Draw the tree on the current device.
plot(arbre1,
     ylab = "p-value threshold", xlab = "",
     ylim = c(0, 0.08),
     main = "", sub = "",
     cex.lab = 1.7)

# Same tree saved as EPS, with smaller text and wider margins.
cairo_ps(
  filename = "tree_trim.eps",
  width = 10,
  height = 9,
  pointsize = 12,
  onefile = FALSE,
  family = "sans",
  antialias = "default",
  fallback_resolution = 300
)
par(cex = 0.6, mar = c(11, 8, 4, 1))
plot(arbre1,
     ylab = "p-value threshold", xlab = "",
     ylim = c(0, 0.08),
     main = "", sub = "",
     cex.lab = 1.7)
dev.off()
dev.set()

## CPM values of the selected reference genes, in long format, for plotting.
ref.gs <- c('Rps17', 'Rps16', 'Rpl8', 'Rpl13a', 'B3galt4', 'Rpl30', 'Hgprtase')

# CPM rows of all candidate reference genes, annotated with columns 9 and 10
# of df20 and stacked so that each row is one (gene, sample) value.
cpm_gr <- cpm[cpm$ensemble_gene_id %in% df20$ensemble_gene_id, ] %>%
  left_join(df20[, c(9, 10)], by = 'ensemble_gene_id', copy = TRUE) %>%
  pivot_longer(
    cols = c('n1_1_S5_R1_001_sorted.bam', 'n3_S4_R1_001_sorted.bam',
             'n16_S6_R1_001_sorted.bam', 'n30_S3_R1_001_sorted.bam',
             'n34_S2_R1_001_sorted.bam',
             'tr_1_S1_R1_001_sorted.bam', 'tr_3_S5_R1_001_sorted.bam',
             'tr_16_S2_R1_001_sorted.bam', 'tr_30_S9_R1_001_sorted.bam',
             'tr_34_S11_R1_001_sorted.bam'),
    names_to = "df"
  )

# Attach the sample sheet and keep only the genes listed in ref.gs.
cpm_gr <- left_join(df1, cpm_gr, by = 'df', copy = FALSE)
refg_dat <- cpm_gr[cpm_gr$Symbol %in% ref.gs, ]


## Save eps images: CPM of the first set of selected reference genes, one
## jittered point per sample, coloured by experimental group.
cairo_ps(filename = "refgen_sel1.eps",
         width = 8, height = 7, pointsize = 12,
         onefile = FALSE, family = "sans",
         antialias = "default",
         fallback_resolution = 300)

# `group` is mapped by column name; `refg_dat$group` inside aes() bypasses the
# plot data and triggers a ggplot2 warning. The legend title is blanked in
# theme(), so the legend looks the same.
refgen_plot1 <- ggplot(refg_dat, aes(x = Symbol, y = value, color = group)) +
  geom_jitter(size = 7, width = 0.15, show.legend = TRUE) +
  labs(x = NULL, y = 'CPM') +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, size = 18, face = "bold"),
        axis.text.y = element_text(size = 18, face = "bold"),
        axis.title.y = element_text(size = 28),
        legend.title = element_blank(),
        legend.key.size = unit(1, 'cm'),
        legend.text = element_text(size = 18, face = "bold"))
# Explicit print(): autoprinting does not happen under source().
print(refgen_plot1)
dev.off()
# Kept from the original workflow: re-selects (or opens) a device.
dev.set()



# Second set of reference genes: same jitter plot as above, without a legend.
ref.gs2 <- c('G6pdx', 'Sdha', 'Nono', 'CDK1', 'B3galt4', 'Rpl30', 'Hgprtase',
             'Trfr', 'TFIID')
refg_dat2 <- cpm_gr[cpm_gr$Symbol %in% ref.gs2, ]

cairo_ps(filename = "refgen_sel2.eps",
         width = 8, height = 7, pointsize = 12,
         onefile = FALSE, family = "sans",
         antialias = "default",
         fallback_resolution = 300)
# `group` is mapped by column name; `refg_dat2$group` inside aes() bypasses the
# plot data and triggers a ggplot2 warning.
refgen_plot2 <- ggplot(refg_dat2, aes(x = Symbol, y = value, color = group)) +
  geom_jitter(size = 7, width = 0.15, show.legend = FALSE) +
  labs(x = NULL, y = 'CPM') +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, size = 18, face = "bold"),
        axis.text.y = element_text(size = 18, face = "bold"),
        axis.title.y = element_text(size = 28),
        legend.title = element_blank(),
        legend.key.size = unit(1, 'cm'),
        legend.text = element_text(size = 18, face = "bold"))
# Explicit print(): autoprinting does not happen under source().
print(refgen_plot2)
dev.off()
# Kept from the original workflow: re-selects (or opens) a device.
dev.set()


## Correlations of reference genes Ct.
# Install Hmisc only when it is missing, so the script does not reinstall the
# package (and need network access) on every run.
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
library("Hmisc")
# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# Reshape the upper triangle (diagonal excluded) of a correlation matrix into
# a long table with one row per pair of variables.
#
# cormat : matrix of the correlation coefficients
# pmat   : matrix of the correlation p-values, same dimensions as cormat
#
# Returns a data.frame with columns `row`, `column` (labels of the pair),
# `cor` (coefficient) and `p` (p-value), ordered column by column.
flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  # Label the second member of each pair with the column names; fall back to
  # the row names when the matrix has none (the original always used the row
  # names, which is only right when both sets of names are identical).
  col_labels <- colnames(cormat)
  if (is.null(col_labels)) {
    col_labels <- rownames(cormat)
  }
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = col_labels[col(cormat)[ut]],
    cor = cormat[ut],
    p = pmat[ut]
  )
}
# qPCR data of the reference genes (Supplementary table 8).
pcr_count_1 <- read.csv("/home/ssdd/data_nvme/trimmer_seq/Supplementary_table_8.csv", header=TRUE, stringsAsFactors=FALSE)

# Pairwise correlations (and their p-values) between columns 2-7, printed as a
# flat table.
library(Hmisc)
res2 <- rcorr(as.matrix(pcr_count_1[, 2:7]))
flattenCorrMatrix(res2$r, res2$P)

# Install corrplot only when it is missing, so the script does not reinstall
# the package on every run.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)

# Save eps image fig.7a: upper triangle of the correlation matrix, variables
# ordered by hierarchical clustering.
cairo_ps(filename = "refgen_pcr_corr_matrix.eps",
         width = 5, height = 4, pointsize = 12,
         onefile = FALSE, family = "sans",
         antialias = "default",
         fallback_resolution = 300)
corrplot(res2$r, type = "upper", order = "hclust",
         tl.col = "black", tl.srt = 45)
dev.off()
# Kept from the original workflow: re-selects (or opens) a device.
dev.set()

## Analyze the qPCR data of the target genes.
## ∆∆Ct was calculated in Excel against the root mean square of all selected
## reference genes; the resulting table (Supplementary table 9) is loaded here.
pcr_targ <- read.csv("/home/ssdd/data_nvme/trimmer_seq/Supplementary_table_9.csv", header=TRUE, stringsAsFactors=FALSE)

# Long format for plotting: the three fold-change columns are stacked into
# `variable` (column name) and `value`; all other columns are kept as they are.
pcr_targ2 <- tidyr::pivot_longer(pcr_targ,   cols=c('f_esyt1', 'f_dcn', 'f_tcf7l1'), names_to='variable', 
                                 values_to="value")

# Shapiro-Wilk normality test for each fold-change column (all rows pooled).
shapiro.test(pcr_targ$f_esyt1)
shapiro.test(pcr_targ$f_dcn)
shapiro.test(pcr_targ$f_tcf7l1)



# Two-sample t-test of each fold-change column between the levels of `group`
# (t.test() defaults are used, i.e. equal variances are not assumed).
t.test(f_esyt1 ~ group, data = pcr_targ)
t.test(f_dcn ~ group, data = pcr_targ)
t.test(f_tcf7l1 ~ group, data = pcr_targ)


library(ggplot2)
library(ggsignif)

# Box plot (with jittered points) of the qPCR fold changes of the three target
# genes, filled by group, with manually placed significance marks; written to
# "pcr_targen.eps".
cairo_ps(filename = "pcr_targen.eps",
         width = 5, height = 4, pointsize = 12,
         onefile = FALSE, family = "sans",
         antialias = "default",
         fallback_resolution = 300)
# The former par(mar = ...) call was dropped: ggplot2 draws with grid, which
# does not use base-graphics margins.
p <- ggplot(pcr_targ2, aes(x = variable, y = value, fill = group)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(position = position_jitterdodge(jitter.width = 0.2)) +
  # NOTE(review): no colour aesthetic is mapped in this plot, so the `color`
  # label below is not displayed; the string is left untouched.
  labs(color ="гены", x = NULL,
       y = expression(atop("mRNA expression", paste("rel. to RMS of ref. genes Ct's, folds")))) +
  # Labels are matched to the factor levels by name instead of by position,
  # so they cannot be attached to the wrong gene if the level order changes.
  scale_x_discrete(labels = c(f_dcn = "Dcn", f_esyt1 = "Esyt1", f_tcf7l1 = "Tcf7l1")) +
  # Significance marks; bracket positions and heights are set by hand.
  geom_signif(comparisons = list(c(1, 1.3)), annotations = "*", y_position = 1.7,
              tip_length = 0, vjust = 0.4, size = 0.5, textsize = 8) +
  geom_signif(comparisons = list(c(2, 2.3)), annotations = "*", y_position = 1.9,
              tip_length = 0, vjust = 0.4, size = 0.5, textsize = 8) +
  geom_signif(comparisons = list(c(3, 3.3)), annotations = "**", y_position = 1.5,
              tip_length = 0, vjust = 0.4, size = 0.5, textsize = 8) +
  theme_classic() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_text(size = 16),
        axis.text.y = element_text(size = 15),
        axis.text.x = element_text(size = 15),
        legend.title = element_text(size = 15),
        legend.text = element_text(size = 15))
# Explicit print(): autoprinting does not happen under source().
print(p)
dev.off()
# Kept from the original workflow: re-selects (or opens) a device.
dev.set()

## TPM table of the three target genes for both library types.
target_ids <- c("ENSRNOG00000060753", "ENSRNOG00000004554", "ENSRNOG00000014753")

# Rows of the target genes from the first TPM table, tagged with the library
# type and a running row number; columns 3-7 are kept.
targ1 <- tpm_man3_df[target_ids, ]
targ1$trim <- 'No Trimmer'
targ1$row_num <- seq.int(nrow(targ1))
targ1 <- targ1[, 3:7]

# Same for the second TPM table; here the row names are additionally copied
# into an ensembl_gene_id column before tagging.
targ2 <- tpm_man4_df[target_ids, ]
targ2$ensembl_gene_id <- rownames(targ2)
targ2$trim <- 'Trimmer'
targ2$row_num <- seq.int(nrow(targ2))
targ2 <- targ2[, 3:7]

targ3 <- rbind(targ1, targ2)

# Fetch annotation for these genes through the `ensembl` mart and merge it in.
G_list5 <- getBM(
  attributes = c('ensembl_gene_id', 'description', 'rgd_symbol', 'rgd_id',
                 'entrezgene_description'),
  filters = "ensembl_gene_id",
  values = targ3$ensembl_gene_id,
  mart = ensembl
)
targ4 <- merge(G_list5, targ3, by = "ensembl_gene_id")

library(ggplot2)
# Bar plot of the median TPM (error bars: Median +/- SD) of the target genes,
# filled by library type, with the gene symbol printed over the "Trimmer"
# bars; written to "TPM_targenes.eps".
cairo_ps(filename = "TPM_targenes.eps",
         width = 5, height = 4, pointsize = 12,
         onefile = FALSE, family = "sans",
         antialias = "default",
         fallback_resolution = 300)
# The former par(mar = ...) call was dropped: ggplot2 draws with grid, which
# does not use base-graphics margins.
p <- ggplot(targ4, aes(x = row_num, y = Median, fill = trim)) +
  geom_bar(stat = "identity", color = "black",
           position = position_dodge()) +
  geom_errorbar(aes(ymin = Median - SD, ymax = Median + SD), width = .2,
                position = position_dodge(.9)) +
  labs(x = NULL, y = expression('TPM')) +
  # The symbol is printed once per gene (on the "Trimmer" rows only).
  geom_text(size = 9, aes(label = ifelse(trim == "Trimmer", rgd_symbol, "")),
            position = position_nudge(y = 50), vjust = 1, angle = 90) +
  scale_y_continuous(expand = expansion(mult = c(0, .1))) +
  theme_classic() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_text(size = 24),
        axis.text.y = element_text(size = 15),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.title = element_text(size = 14),
        legend.text = element_text(size = 14))

# Explicit print(): autoprinting does not happen under source().
print(p)
dev.off()
# Kept from the original workflow: re-selects (or opens) a device.
dev.set()


## logFC table of the three target genes, in long format for plotting.
target_ids <- c("ENSRNOG00000060753", "ENSRNOG00000004554", "ENSRNOG00000014753")

# Rows of df5 for the target genes, reduced to columns 1, 2 and 11.
targ5 <- df5[df5$ensembl_gene_id %in% target_ids, ]
targ5 <- targ5[, c(1, 2, 11)]
colnames(targ5)

# Stack the two logFC columns into (variable, value) pairs and number the
# resulting rows; the number is used as the x position of each bar.
targ6 <- tidyr::pivot_longer(
  targ5,
  cols = c('logFC.F.nT_vs_nF.nT', 'logFC.F.T_vs_nF.T'),
  names_to = 'variable',
  values_to = "value"
)
targ6$col_num <- seq.int(nrow(targ6))

# Bar plot of the log2FC of the target genes in both contrasts, with the gene
# symbol printed next to the first contrast's bar; written to
# "logFC_targenes.eps".
cairo_ps(filename = "logFC_targenes.eps",
         width = 5, height = 4, pointsize = 12,
         onefile = FALSE, family = "sans", bg = "white",
         antialias = "default",
         fallback_resolution = 300)

p <- ggplot(targ6, aes(x = col_num, y = value, fill = variable)) +
  geom_bar(stat = "identity", color = "black",
           position = position_dodge()) +
  # The symbol is printed once per gene (on the first contrast's rows only).
  geom_text(size = 9,
            aes(label = ifelse(variable == "logFC.F.nT_vs_nF.nT", rgd_symbol, "")),
            position = position_nudge(y = 1.9), vjust = 1, angle = 90) +
  # The x title used to be the whole rgd_symbol column (a vector, not a
  # title); it is hidden by theme() below anyway, so it is simply unset.
  labs(x = NULL, y = expression('log2FC')) +
  theme_classic() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_text(size = 24),
        axis.text.y = element_text(size = 15),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.title = element_blank(),
        legend.text = element_text(size = 15))

# Explicit print(): autoprinting does not happen under source().
print(p)
dev.off()
# Kept from the original workflow: re-selects (or opens) a device.
dev.set()

