1 Setup

suppressPackageStartupMessages({
  library(DESeq2)
  library(edgeR)
  library(LSD)
  library(org.Hs.eg.db)
  library(RColorBrewer)
  library(SummarizedExperiment)
  library(tidyverse)
  library(tximeta)
  library(vsn)
})
pal <- brewer.pal(8,"Dark2")
datadir <- "/home/training/share/Day2"

Reading the metadata and modifying the content

dex and cell will be variables in our model. As they are categorical, we change them to be factors

meta <- read.delim(paste0(datadir, "/airway/airway_meta.txt"), 
                   header = TRUE, as.is = TRUE)
rownames(meta) <- meta$names
meta$dex <- factor(meta$dex)
meta$cell <- factor(meta$cell)

A look at it

meta
##            SampleName    cell   dex albut      names avgLength Experiment
## SRR1039508 GSM1275862  N61311 untrt untrt SRR1039508       126  SRX384345
## SRR1039509 GSM1275863  N61311   trt untrt SRR1039509       126  SRX384346
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512       126  SRX384349
## SRR1039513 GSM1275867 N052611   trt untrt SRR1039513        87  SRX384350
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516       120  SRX384353
## SRR1039517 GSM1275871 N080611   trt untrt SRR1039517       126  SRX384354
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520       101  SRX384357
## SRR1039521 GSM1275875 N061011   trt untrt SRR1039521        98  SRX384358
##               Sample    BioSample
## SRR1039508 SRS508568 SAMN02422669
## SRR1039509 SRS508567 SAMN02422675
## SRR1039512 SRS508571 SAMN02422678
## SRR1039513 SRS508572 SAMN02422670
## SRR1039516 SRS508575 SAMN02422682
## SRR1039517 SRS508576 SAMN02422673
## SRR1039520 SRS508579 SAMN02422683
## SRR1039521 SRS508580 SAMN02422677

What is a factor?

meta$dex
## [1] untrt trt   untrt trt   untrt trt   untrt trt  
## Levels: trt untrt
levels(meta$dex)
## [1] "trt"   "untrt"
as.integer(meta$dex)
## [1] 2 1 2 1 2 1 2 1

2 Alignment vs. pseudo-alignment comparison

2.1 featureCounts

The following code, we do not run, it is there to show how the featureCounts data was generated. featureCounts is a function part of the Rsubread library. It reads BAM files, generated by a “classical” aligner and summarises the overlap of the alignments with the gene annotation. I.e. for every gene locus, it counts the number of reads that aligned there. featureCounts can handle multi-mapping reads, but not in a probabilistic manner.

library(Rsubread)
fc <- featureCounts(files = files, 
                    annot.ext = gtf, 
                    isGTFAnnotationFile = TRUE,
                    GTF.featureType = "exon", 
                    GTF.attrType = "gene_id", 
                    useMetaFeatures = TRUE, 
                    strandSpecific = 0, 
                    isPairedEnd = TRUE, 
                    nthreads = 6)
Read the data generated from the command above
fc <- readRDS(paste0(datadir,
                     "/airway/featureCounts/star_featureCounts.rds"))

let us inspect it

names(fc)
## [1] "counts"     "annotation" "targets"    "stat"
counts_featurecounts <- fc$counts
head(counts_featurecounts)
##                   SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000223972.5         29         28         23         16         21
## ENSG00000227232.5        843        701       1102        495        838
## ENSG00000278267.1         26         19         24         16         13
## ENSG00000243485.5         20          9         30          9         10
## ENSG00000284332.1          0          0          0          0          0
## ENSG00000237613.2         19          6         19          4          8
##                   SRR1039517 SRR1039520 SRR1039521
## ENSG00000223972.5         30          5          2
## ENSG00000227232.5        967        578        573
## ENSG00000278267.1         20         12         16
## ENSG00000243485.5          4          9          3
## ENSG00000284332.1          0          0          0
## ENSG00000237613.2          6         14          7
dim(counts_featurecounts)
## [1] 58721     8
fc$stat
##                           Status SRR1039508 SRR1039509 SRR1039512 SRR1039513
## 1                       Assigned   21007653   19184955   25895949   15496792
## 2            Unassigned_Unmapped          0          0          0          0
## 3      Unassigned_MappingQuality          0          0          0          0
## 4             Unassigned_Chimera          0          0          0          0
## 5      Unassigned_FragmentLength          0          0          0          0
## 6           Unassigned_Duplicate          0          0          0          0
## 7        Unassigned_MultiMapping          0          0          0          0
## 8           Unassigned_Secondary          0          0          0          0
## 9            Unassigned_NonSplit          0          0          0          0
## 10         Unassigned_NoFeatures    1922304    1410877    2030626    1083972
## 11 Unassigned_Overlapping_Length          0          0          0          0
## 12          Unassigned_Ambiguity    1358418    1236366    1584019     943994
##    SRR1039516 SRR1039517 SRR1039520 SRR1039521
## 1    24998626   31477332   19500283   21608001
## 2           0          0          0          0
## 3           0          0          0          0
## 4           0          0          0          0
## 5           0          0          0          0
## 6           0          0          0          0
## 7           0          0          0          0
## 8           0          0          0          0
## 9           0          0          0          0
## 10    2133471    2509956    1539435    1791941
## 11          0          0          0          0
## 12    1556731    1927932    1186465    1283094

let us visualise it

stats <- as.matrix(fc$stat %>% column_to_rownames("Status"))
barplot(stats[rowSums(stats)>0,],
        beside = TRUE,col = pal[1:3],las=2,
        cex.axis = .8,cex.names = .8,
        legend.text = rownames(stats)[rowSums(stats)>0],
        args.legend = list(horiz=TRUE,cex=.8,x="top",bty="n"))

2.2 Salmon

We list all quant.sf output files from Salmon

salmonfiles <- paste0(datadir, "/airway/salmon/", 
                      meta$names, "/quant.sf")
names(salmonfiles) <- meta$names
stopifnot(all(file.exists(salmonfiles)))

We extend the metadata by adding a column “files” to the metadata table. This table must contain at least two columns: “names” and “files” for importing the data using the tximeta package

coldata <- cbind(meta, files = salmonfiles, 
                 stringsAsFactors = FALSE)

How does it look

coldata
##            SampleName    cell   dex albut      names avgLength Experiment
## SRR1039508 GSM1275862  N61311 untrt untrt SRR1039508       126  SRX384345
## SRR1039509 GSM1275863  N61311   trt untrt SRR1039509       126  SRX384346
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512       126  SRX384349
## SRR1039513 GSM1275867 N052611   trt untrt SRR1039513        87  SRX384350
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516       120  SRX384353
## SRR1039517 GSM1275871 N080611   trt untrt SRR1039517       126  SRX384354
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520       101  SRX384357
## SRR1039521 GSM1275875 N061011   trt untrt SRR1039521        98  SRX384358
##               Sample    BioSample
## SRR1039508 SRS508568 SAMN02422669
## SRR1039509 SRS508567 SAMN02422675
## SRR1039512 SRS508571 SAMN02422678
## SRR1039513 SRS508572 SAMN02422670
## SRR1039516 SRS508575 SAMN02422682
## SRR1039517 SRS508576 SAMN02422673
## SRR1039520 SRS508579 SAMN02422683
## SRR1039521 SRS508580 SAMN02422677
##                                                                  files
## SRR1039508 /home/training/share/Day2/airway/salmon/SRR1039508/quant.sf
## SRR1039509 /home/training/share/Day2/airway/salmon/SRR1039509/quant.sf
## SRR1039512 /home/training/share/Day2/airway/salmon/SRR1039512/quant.sf
## SRR1039513 /home/training/share/Day2/airway/salmon/SRR1039513/quant.sf
## SRR1039516 /home/training/share/Day2/airway/salmon/SRR1039516/quant.sf
## SRR1039517 /home/training/share/Day2/airway/salmon/SRR1039517/quant.sf
## SRR1039520 /home/training/share/Day2/airway/salmon/SRR1039520/quant.sf
## SRR1039521 /home/training/share/Day2/airway/salmon/SRR1039521/quant.sf

Now, we can import the quantifications

Remember that Salmon quantifies abundances at the transcript level

This function will fetch from the Gencode archive (locate at the European Bioinformatics Institue), the annotation relevant to our project, including the transcript information and that of their parental gene

Note: enter 2 when prompted

st <- tximeta::tximeta(coldata)
## importing quantifications
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 
## found matching transcriptome:
## [ GENCODE - Homo sapiens - release 29 ]
## building TxDb with 'GenomicFeatures' package
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
##   stop_codon. This information was ignored.
## OK
## generating transcript ranges
## fetching genome info for GENCODE

let us take a look at it

st
## class: RangedSummarizedExperiment 
## dim: 205870 8 
## metadata(6): tximetaInfo quantInfo ... txomeInfo txdbInfo
## assays(3): counts abundance length
## rownames(205870): ENST00000456328.2 ENST00000450305.2 ...
##   ENST00000387460.2 ENST00000387461.2
## rowData names(3): tx_id gene_id tx_name
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
metadata(st)
## $tximetaInfo
## $tximetaInfo$version
## [1] '1.4.2'
## 
## $tximetaInfo$importTime
## [1] "2020-03-05 09:57:55 UTC"
## 
## 
## $quantInfo
## $quantInfo$salmon_version
## [1] "0.12.0" "0.12.0" "0.12.0" "0.12.0" "0.12.0" "0.12.0" "0.12.0" "0.12.0"
## 
## $quantInfo$samp_type
## [1] "none" "none" "none" "none" "none" "none" "none" "none"
## 
## $quantInfo$opt_type
## [1] "vb" "vb" "vb" "vb" "vb" "vb" "vb" "vb"
## 
## $quantInfo$quant_errors
## $quantInfo$quant_errors[[1]]
## list()
## 
## $quantInfo$quant_errors[[2]]
## list()
## 
## $quantInfo$quant_errors[[3]]
## list()
## 
## $quantInfo$quant_errors[[4]]
## list()
## 
## $quantInfo$quant_errors[[5]]
## list()
## 
## $quantInfo$quant_errors[[6]]
## list()
## 
## $quantInfo$quant_errors[[7]]
## list()
## 
## $quantInfo$quant_errors[[8]]
## list()
## 
## 
## $quantInfo$num_libraries
## [1] 1 1 1 1 1 1 1 1
## 
## $quantInfo$library_types
## [1] "IU" "IU" "IU" "IU" "IU" "IU" "IU" "IU"
## 
## $quantInfo$frag_dist_length
## [1] 1001 1001 1001 1001 1001 1001 1001 1001
## 
## $quantInfo$seq_bias_correct
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 
## $quantInfo$gc_bias_correct
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 
## $quantInfo$num_bias_bins
## [1] 4096 4096 4096 4096 4096 4096 4096 4096
## 
## $quantInfo$mapping_type
## [1] "mapping" "mapping" "mapping" "mapping" "mapping" "mapping" "mapping"
## [8] "mapping"
## 
## $quantInfo$num_targets
## [1] 205870 205870 205870 205870 205870 205870 205870 205870
## 
## $quantInfo$serialized_eq_classes
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 
## $quantInfo$length_classes
##        [,1]   [,2]   [,3]   [,4]   [,5]   [,6]   [,7]   [,8]
## [1,]    520    520    520    520    520    520    520    520
## [2,]    669    669    669    669    669    669    669    669
## [3,]   1065   1065   1065   1065   1065   1065   1065   1065
## [4,]   2328   2328   2328   2328   2328   2328   2328   2328
## [5,] 205012 205012 205012 205012 205012 205012 205012 205012
## 
## $quantInfo$index_seq_hash
## [1] "40849ed828ea7d6a94af54a5c40e5d87eb0ce0fc1e9513208a5cffe59d442292"
## 
## $quantInfo$index_name_hash
## [1] "77aca5545a0626421efb4730dd7b95482c77da261f9bdef70d36e25ee68bb7ef"
## 
## $quantInfo$index_seq_hash512
## [1] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [2] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [3] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [4] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [5] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [6] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [7] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [8] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## 
## $quantInfo$index_name_hash512
## [1] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [2] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [3] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [4] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [5] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [6] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [7] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [8] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## 
## $quantInfo$num_bootstraps
## [1] 0 0 0 0 0 0 0 0
## 
## $quantInfo$num_processed
## [1] 22935521 21155707 28136282 16823088 27298970 34298260 21275888 23487860
## 
## $quantInfo$num_mapped
## [1] 21072103 19264813 26088219 15654204 25210669 31839924 19643454 21784047
## 
## $quantInfo$percent_mapped
## [1] 91.87541 91.06201 92.72092 93.05191 92.35026 92.83248 92.32730 92.74598
## 
## $quantInfo$call
## [1] "quant" "quant" "quant" "quant" "quant" "quant" "quant" "quant"
## 
## $quantInfo$start_time
## [1] "Thu Mar 21 21:33:45 2019" "Thu Mar 21 21:35:50 2019"
## [3] "Thu Mar 21 21:37:52 2019" "Thu Mar 21 21:40:11 2019"
## [5] "Thu Mar 21 21:41:52 2019" "Thu Mar 21 21:44:04 2019"
## [7] "Thu Mar 21 21:46:38 2019" "Thu Mar 21 21:48:32 2019"
## 
## $quantInfo$end_time
## [1] "Thu Mar 21 21:35:49 2019" "Thu Mar 21 21:37:51 2019"
## [3] "Thu Mar 21 21:40:10 2019" "Thu Mar 21 21:41:51 2019"
## [5] "Thu Mar 21 21:44:02 2019" "Thu Mar 21 21:46:37 2019"
## [7] "Thu Mar 21 21:48:31 2019" "Thu Mar 21 21:50:28 2019"
## 
## 
## $countsFromAbundance
## [1] "no"
## 
## $level
## [1] "txp"
## 
## $txomeInfo
## $txomeInfo$source
## [1] "GENCODE"
## 
## $txomeInfo$organism
## [1] "Homo sapiens"
## 
## $txomeInfo$release
## [1] "29"
## 
## $txomeInfo$genome
## [1] "GRCh38"
## 
## $txomeInfo$fasta
## [1] "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.transcripts.fa.gz"
## 
## $txomeInfo$gtf
## [1] "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.annotation.gtf.gz"
## 
## $txomeInfo$sha256
## [1] "40849ed828ea7d6a94af54a5c40e5d87eb0ce0fc1e9513208a5cffe59d442292"
## 
## 
## $txdbInfo
##                                                                                            Db type 
##                                                                                             "TxDb" 
##                                                                                 Supporting package 
##                                                                                  "GenomicFeatures" 
##                                                                                        Data source 
## "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.annotation.gtf.gz" 
##                                                                                           Organism 
##                                                                                                 NA 
##                                                                                        Taxonomy ID 
##                                                                                                 NA 
##                                                                                   miRBase build ID 
##                                                                                                 NA 
##                                                                                             Genome 
##                                                                                                 NA 
##                                                                                    transcript_nrow 
##                                                                                           "206694" 
##                                                                                          exon_nrow 
##                                                                                           "702834" 
##                                                                                           cds_nrow 
##                                                                                           "274013" 
##                                                                                      Db created by 
##                                                        "GenomicFeatures package from Bioconductor" 
##                                                                                      Creation time 
##                                                     "2020-03-05 10:01:18 +0000 (Thu, 05 Mar 2020)" 
##                                                           GenomicFeatures version at creation time 
##                                                                                           "1.38.0" 
##                                                                   RSQLite version at creation time 
##                                                                                            "2.1.2" 
##                                                                                    DBSCHEMAVERSION 
##                                                                                              "1.2"
colData(st)
## DataFrame with 8 rows and 9 columns
##             SampleName     cell      dex       albut       names avgLength
##            <character> <factor> <factor> <character> <character> <integer>
## SRR1039508  GSM1275862   N61311    untrt       untrt  SRR1039508       126
## SRR1039509  GSM1275863   N61311      trt       untrt  SRR1039509       126
## SRR1039512  GSM1275866  N052611    untrt       untrt  SRR1039512       126
## SRR1039513  GSM1275867  N052611      trt       untrt  SRR1039513        87
## SRR1039516  GSM1275870  N080611    untrt       untrt  SRR1039516       120
## SRR1039517  GSM1275871  N080611      trt       untrt  SRR1039517       126
## SRR1039520  GSM1275874  N061011    untrt       untrt  SRR1039520       101
## SRR1039521  GSM1275875  N061011      trt       untrt  SRR1039521        98
##             Experiment      Sample    BioSample
##            <character> <character>  <character>
## SRR1039508   SRX384345   SRS508568 SAMN02422669
## SRR1039509   SRX384346   SRS508567 SAMN02422675
## SRR1039512   SRX384349   SRS508571 SAMN02422678
## SRR1039513   SRX384350   SRS508572 SAMN02422670
## SRR1039516   SRX384353   SRS508575 SAMN02422682
## SRR1039517   SRX384354   SRS508576 SAMN02422673
## SRR1039520   SRX384357   SRS508579 SAMN02422683
## SRR1039521   SRX384358   SRS508580 SAMN02422677
rowData(st)
## DataFrame with 205870 rows and 3 columns
##                       tx_id           gene_id           tx_name
##                   <integer>   <CharacterList>       <character>
## ENST00000456328.2         1 ENSG00000223972.5 ENST00000456328.2
## ENST00000450305.2         2 ENSG00000223972.5 ENST00000450305.2
## ENST00000488147.1      9483 ENSG00000227232.5 ENST00000488147.1
## ENST00000619216.1      9484 ENSG00000278267.1 ENST00000619216.1
## ENST00000473358.1         3 ENSG00000243485.5 ENST00000473358.1
## ...                     ...               ...               ...
## ENST00000361681.2    206692 ENSG00000198695.2 ENST00000361681.2
## ENST00000387459.1    206693 ENSG00000210194.1 ENST00000387459.1
## ENST00000361789.2    206684 ENSG00000198727.2 ENST00000361789.2
## ENST00000387460.2    206685 ENSG00000210195.2 ENST00000387460.2
## ENST00000387461.2    206694 ENSG00000210196.2 ENST00000387461.2
assays(st) #plural
## List of length 3
## names(3): counts abundance length
head(assay(st)) # the first one by default
##                   SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENST00000456328.2     17.896     16.714     12.576      8.970     19.224
## ENST00000450305.2      0.000      0.000      0.000      0.000      0.000
## ENST00000488147.1     66.727     75.392    188.265     43.748     94.695
## ENST00000619216.1      0.000      0.000      0.000      0.000      0.000
## ENST00000473358.1      0.000      0.000      0.000      0.000      0.000
## ENST00000469289.1      0.000      0.000      0.000      0.000      0.000
##                   SRR1039517 SRR1039520 SRR1039521
## ENST00000456328.2      0.000      0.000      0.000
## ENST00000450305.2      0.000      0.000      0.000
## ENST00000488147.1     92.538     57.415     64.596
## ENST00000619216.1      0.000      0.000      1.000
## ENST00000473358.1      0.000      0.000      0.000
## ENST00000469289.1      0.000      1.059      0.000

However, we usually want to (start) analyse(ing) the data at the gene level

So we summarize the quantifications on the gene level.

sg <- tximeta::summarizeToGene(st)
## loading existing TxDb created: 2020-03-05 09:58:03
## Loading required package: GenomicFeatures
## obtaining transcript-to-gene mapping from TxDb
## generating gene ranges
## summarizing abundance
## summarizing counts
## summarizing length

Did it work? how many transcripts?

dim(assay(st))
## [1] 205870      8

how many genes?

dim(assay(sg))
## [1] 58294     8

And, how does it look?

head(assay(sg))
##                    SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14    707.210    463.445    896.252    421.186   1185.870
## ENSG00000000005.5       0.000      0.000      0.000      0.000      0.000
## ENSG00000000419.12    454.000    508.999    606.000    352.999    583.000
## ENSG00000000457.13    311.880    266.807    358.017    217.296    354.140
## ENSG00000000460.16     91.451     74.463     52.361     45.144    107.779
## ENSG00000000938.12      0.000      0.000      2.000      0.000      1.000
##                    SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14   1086.289    802.000    596.220
## ENSG00000000005.5       0.000      0.000      0.000
## ENSG00000000419.12    773.000    410.000    500.001
## ENSG00000000457.13    423.712    297.742    288.368
## ENSG00000000460.16    100.610     97.742     84.634
## ENSG00000000938.12      0.000      0.000      0.000

Looks fine, but given that we have the gene annotation available, let us add gene symbols

sg <- tximeta::addIds(sg, "SYMBOL", gene = TRUE)
## mapping to new IDs using 'org.Hs.eg.db' data package
## if all matching IDs are desired, and '1:many mappings' are reported,
## set multiVals='list' to obtain all the matching IDs
## 'select()' returned 1:many mapping between keys and columns
head(rowData(sg))
## DataFrame with 6 rows and 2 columns
##                               gene_id      SYMBOL
##                           <character> <character>
## ENSG00000000003.14 ENSG00000000003.14      TSPAN6
## ENSG00000000005.5   ENSG00000000005.5        TNMD
## ENSG00000000419.12 ENSG00000000419.12        DPM1
## ENSG00000000457.13 ENSG00000000457.13       SCYL3
## ENSG00000000460.16 ENSG00000000460.16    C1orf112
## ENSG00000000938.12 ENSG00000000938.12         FGR

Because the tools (DESeq2 and edgeR), we use downstream expect integers (non decimal counts), we round the values. This has no impact on the analysis

counts_salmon <- round(assay(sg, "counts"))

2.3 Comparison

Let’s compare featureCounts vs. Salmon for one sample

spl <- "SRR1039508"

for comparing, we need to make sure, both counts table are sorted in the same way. We create a data.frame containing both sorted according to their gene IDs

gns <- rownames(counts_salmon)
quants <- data.frame(featureCounts = counts_featurecounts[gns, spl],
                     salmon = counts_salmon[gns, spl])

head(quants)
##                    featureCounts salmon
## ENSG00000000003.14           702    707
## ENSG00000000005.5              0      0
## ENSG00000000419.12           467    454
## ENSG00000000457.13           279    312
## ENSG00000000460.16            62     91
## ENSG00000000938.12             0      0

Let us plot the raw counts

heatscatter(quants$featureCounts,quants$salmon,
            xlab="featureCounts",ylab="salmon",
            main="scatterplot (raw counts)")
abline(0,1,lty=2,col="black")