library(tidyverse)
library(stringr) ## for str_pad() function
library(purrr) ## map
library(phangorn)
library(phylosim)
library(parallel)
library(rBayesianOptimization)
library(DCLEAR)
Assume that we have data from subchallenge 2 data in ../data/subC2/
folder. Let's load the first file.
states <- c('0', '-', LETTERS)
sequence_length <- 200L
prefix <- '../data/subC2/SubC2_train_0001'
txt_file <- sprintf('%s.txt', prefix)
nw_file <- sprintf('%s_REF.nw', prefix)
print(txt_file)
Print first 5 lines to see how it looks
x <- read.table(txt_file, header = FALSE, sep = '\t')
tip_names <- x[, 1]
print(x[1:5,])
Process data and save it as phyDat
object form
states2num = 1:length(states)
names(states2num) = states
x <- do.call('rbind', strsplit(as.character(x[, 2]), ''))
rownames(x) <- tip_names
x <- x %>% phyDat(type = 'USER', levels = states)
y <- read.tree(nw_file)
x
is a phyDat
object.
x
And, y
is phylo
object for tree.
plot(y)
Our algorithm is divided in 2 parts; distance calculation
, and tree reconstruction
.
For distance calculation
, we will compare 1) hamming distance, 2) weighted hamming with entrophy weights, and 3) weighted hamming with bayesian optimization search.
For tree reconstruction
, we will simply use Neighbor Joining (NJ) as tree reconstruction method.
The simple one is hamming distance
D_h = x %>% dist.hamming()
D_h is a dist
object that calculated pairwise distance among 100 cells
length(D_h)
We will explain how to get dist
object using weghted hamming method. There are two methods. The first one is using Inverse information as weights. and the second one is estimating weights using bayesian optimization.
Prepare training data first
set.seed(99)
idx1 = sample(100)
train_idx = idx1[1:70]
test_idx = idx1[71:100]
Let 70% of data as training set.
SDs_train = NULL
for(i in 1:length(train_idx)) {
idx = train_idx[i]
states <- c('0', '-', LETTERS)
sequence_length <- 200L
prefix <- paste('../data/subC2/SubC2_train_',str_pad(idx,4, pad = "0"), sep="")
txt_file <- sprintf('%s.txt', prefix)
nw_file <- sprintf('%s_REF.nw', prefix)
x <- read.table(txt_file, header = FALSE, sep = '\t')
tip_names <- x[, 1]
states2num = 1:length(states)
names(states2num) = states
x <- do.call('rbind', strsplit(as.character(x[, 2]), ''))
rownames(x) <- tip_names
x <- x %>% phyDat(type = 'USER', levels = states)
y <- read.tree(nw_file)
item1 = list(seq=x, tree=y)
SDs_train[[i]] = item1
}
Let 30% of the data as testing set
SDs_test = NULL
for(i in 1:length(test_idx)) {
idx = test_idx[i]
states <- c('0', '-', LETTERS)
sequence_length <- 200L
prefix <- paste('../data/subC2/SubC2_train_',str_pad(idx,4, pad = "0"), sep="")
txt_file <- sprintf('%s.txt', prefix)
nw_file <- sprintf('%s_REF.nw', prefix)
x <- read.table(txt_file, header = FALSE, sep = '\t')
tip_names <- x[, 1]
states2num = 1:length(states)
names(states2num) = states
x <- do.call('rbind', strsplit(as.character(x[, 2]), ''))
rownames(x) <- tip_names
x <- x %>% phyDat(type = 'USER', levels = states)
y <- read.tree(nw_file)
item1 = list(seq=x, tree=y)
SDs_test[[i]] = item1
}
eval_test
is a function for evaluating our method in test set
eval_test <- function(InfoW, SDs_test, dir = 'res', fnm= "v1") {
Dwh = NULL
Dh = NULL
dir.create(dir, showWarnings = FALSE)
for (i in 1:length(SDs_test)) {
x = SDs_test[[i]][1]$seq
y = SDs_test[[i]]$tree
tree1 = x %>% dist_weighted_hamming(InfoW, dropout=FALSE) %>% fastme.bal()
dist_wh = tree1 %>% RF.dist(y, normalize = TRUE)
write.tree(tree1, paste(dir, '/Constructed_', i,'_',fnm,'.newick', sep="") )
write.tree(y, paste(dir, '/true_', i,'_',fnm,'.newick', sep="") )
# dist_h = x %>% dist.hamming() %>% fastme.bal() %>% RF.dist(y, normalize = TRUE)
Dwh = c(Dwh, dist_wh)
# Dh = c(Dh, dist_h)
}
return(mean(Dwh))
}
InfoW = rep(1,23)
#InfoW[1] = 1
#InfoW[2:23] = 1
eval_test(InfoW, SDs_test, fnm = 'hamming')
A function to calculate Inverse information from data:
getInfoW <- function(SDs_train, outcome_pos=3:25, upper_bound = 7, W0 = 1, W_ = 1 ) {
n_t = length(SDs_train)
seqs <- rep(0,30)
for(i in 1:n_t) {
tb = table(unlist(SDs_train[[i]]$seq))
for(j in names(tb)) {
seqs[as.numeric(j)] = seqs[as.numeric(j)] + tb[j]
}
}
seqs = seqs[outcome_pos]
ws = seqs / sum(seqs)
InfoW = -log(ws)
InfoW[InfoW > upper_bound] = upper_bound
InfoW = c(W0,W_,InfoW)
return(InfoW)
}
Using inverse information method, we can get evaluation score as below
InfoW = getInfoW(SDs_train, outcome_pos = 3:25, upper_bound = 7, W0 = 1, W_ = 1)
eval_test(InfoW, SDs_test, fnm = 'v1_1')
Assume two different weights, weights for interval dropout, Wdropout
, and weight for other outcome states Wothers
.
WHfit <- function( Wdropout, Wothers ) {
InfoW = rep(1,7)
InfoW[1] = 1
InfoW[2] = Wdropout
InfoW[3:23] = Wothers
ds = NULL
for(i in 1:length(SDs_train)){
D_wh = dist_weighted_hamming(SDs_train[[i]][1]$seq, InfoW, dropout = FALSE)
tree_wh = fastme.bal(D_wh)
ds = c(ds, -RF.dist(tree_wh, SDs_train[[i]]$tree, normalize = TRUE) )
}
result = list( Score = mean(ds), Pred = 0 )
return(result)
}
search_bound <- list(Wdropout = c(0.1,1),
Wothers = c(1, 15) )
set.seed(123)
search_grid <- data.frame(Wdropout = runif(30, 0.1, 1.5),
Wothers = runif(30, 1.5, 15))
head(search_grid)
set.seed(123)
bayes_WH <- BayesianOptimization(FUN = WHfit, bounds = search_bound,
init_points = 0, init_grid_dt = search_grid,
n_iter = 30, acq = "ucb")
With Bayesian hyperparameter optimization, we get estimates for Wdropout
and Wothers
. Put these estimates as wegiht and get the evaluation performance.
InfoW = rep(1,7)
InfoW[1] = 1
InfoW[2] = 0.5835
InfoW[3:23] = 1.8797
eval_test(InfoW, SDs_test, fnm = 'v1_2')