library(stringr)
library(DESeq2)
library(Rtsne)
library(ggplot2)
library(caret)
library(LiblineaR)
library(ranger)
setwd('~/share/Day4/cancer-ml')
load('rawdata.rd')

Similarly to before, we want to subset our genes to those that actually matter. Since we don’t have a binary (case/control) setup, but rather a multi-factor (which cancer) one, we can test in DESeq wether a gene changes in any condition. DESeq2 does this via a Likelihood Ratio Test (LRT):

# Make a new column in the metadata for a short representation of the cancer type
meta$mc <- factor(str_match(meta$Patient.type, '(\\()([A-Za-z]+)(\\))')[,3])
head(meta)
##       Sample            Patient.type Gender Age
## 1  Breast-03 Breast carcinoma (BrCa)      F  35
## 2  Breast-08 Breast carcinoma (BrCa)      F  60
## 3  Breast-10 Breast carcinoma (BrCa)      F  59
## 4 Breast-100 Breast carcinoma (BrCa)      F  54
## 5  Breast-15 Breast carcinoma (BrCa)      F  58
## 6  Breast-16 Breast carcinoma (BrCa)      F  74
##                          Mutation Metastasis Organ.of.metastasis
## 1                HER2+, PIK3CA WT          Y         Brain, bone
## 2              HER2 WT, PIK3CA WT          Y                Bone
## 3                HER2+, PIK3CA WT          Y        Brain, liver
## 4 HER2 WT, PIK3CA WT, triple neg.          N                   -
## 5                HER2+, PIK3CA WT          Y               Brain
## 6 HER2 WT, PIK3CA WT, triple neg.          Y                Lung
##   Cancer.treatment   mc
## 1           CT, RT BrCa
## 2           CT, RT BrCa
## 3           CT, RT BrCa
## 4             None BrCa
## 5  Surgery, CT, RT BrCa
## 6      Surgery, CT BrCa
# Perform LRT
dds.mc <- DESeqDataSetFromMatrix(expr, meta, ~mc)
dds.mc <- DESeq(dds.mc, test = "LRT", full = ~mc, reduced = ~1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 243 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing

Again, select genes with a P-value < 1%

sel <- which(results(dds.mc)$padj < 0.01)

We again subset the VST data, create a vector of true value and split the data into a training/testing set.

# Subset the VST data
dat <- t(assay(vst)) [,sel]
# Create truth vector
target <- meta$mc
# Split data and truth in a training and testing set
index <- createDataPartition(target, p = 0.75, list = FALSE, times = 1)
tts <- list(
  'train' = list(
    'dat' = dat[index[,1],],
    'target' = target[index[,1]]
  ),
  'test' = list(
    'dat' = dat[-index[,1],],
    'target' = target[-index[,1]]
  )
)

Now we train a new model and predict the test data. We use regularized logistic regression here, which we did not cover in the lecture, but shows much better accuracy:

# Train the model, this takes a long time so we will cheat and load a pre-calculated
# model. The command used to estimate good settings:

# model_mc <- train(x = dat, y = target, method = 'regLogistic', 
#                   trControl = trainControl(method = "cv", 
#                                            number = 10, 
#                                            verboseIter = TRUE))
model_mc <- LiblineaR(tts$train$dat, tts$train$target, type = 0, epsilon = 0.01, cost = 2)

# Predict new unknown data
pred_mc <- predict(model_mc, tts$test$dat)

For multiclass predictors, ROC curves are not the best way to look at the data, so let’s have a look at a confusion matrix instead:

cm <- confusionMatrix(pred_mc$predictions, reference = tts$test$target)
cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BrCa CRC GBM HBC HD NSCLC PAAD
##      BrCa     5   1   0   0  0     3    0
##      CRC      2   6   0   2  0     0    5
##      GBM      0   0   4   0  0     1    0
##      HBC      0   0   0   1  0     0    0
##      HD       0   0   5   0  9     0    0
##      NSCLC    0   1   0   0  0     4    0
##      PAAD     2   1   0   0  0     0    3
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5818          
##                  95% CI : (0.4411, 0.7135)
##     No Information Rate : 0.1636          
##     P-Value [Acc > NIR] : 2.473e-12       
##                                           
##                   Kappa : 0.5033          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: BrCa Class: CRC Class: GBM Class: HBC
## Sensitivity              0.55556     0.6667    0.44444    0.33333
## Specificity              0.91304     0.8043    0.97826    1.00000
## Pos Pred Value           0.55556     0.4000    0.80000    1.00000
## Neg Pred Value           0.91304     0.9250    0.90000    0.96296
## Prevalence               0.16364     0.1636    0.16364    0.05455
## Detection Rate           0.09091     0.1091    0.07273    0.01818
## Detection Prevalence     0.16364     0.2727    0.09091    0.01818
## Balanced Accuracy        0.73430     0.7355    0.71135    0.66667
##                      Class: HD Class: NSCLC Class: PAAD
## Sensitivity             1.0000      0.50000     0.37500
## Specificity             0.8913      0.97872     0.93617
## Pos Pred Value          0.6429      0.80000     0.50000
## Neg Pred Value          1.0000      0.92000     0.89796
## Prevalence              0.1636      0.14545     0.14545
## Detection Rate          0.1636      0.07273     0.05455
## Detection Prevalence    0.2545      0.09091     0.10909
## Balanced Accuracy       0.9457      0.73936     0.65559