|
|
library(biomaRt) |
|
|
# Handle to the Ensembl BioMart human gene dataset.
# NOTE(review): `human` is not referenced by any statement visible below;
# confirm whether it is still needed before removing it.
human <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
|
|
|
|
|
|
|
|
# ---- Load inputs -----------------------------------------------------------
# All four tables use their first CSV column as row names.
testing <- read.csv("testing.csv", row.names = 1)
training <- read.csv(
  "/share/vault/Users/gz2294/Data/DMS/ClinVar.HGMD.PrimateAI.syn/training.csv",
  row.names = 1
)
# Lookup tables: UniProt ID <-> Ensembl gene ID, and Ensembl gene ID -> paralog.
uniprot2geneid <- read.csv("uniprot2geneid.csv", row.names = 1)
geneid2paralog <- read.csv("geneid2paralog.csv", row.names = 1)

# ---- Remove test proteins and their paralogs from the training set ---------
# 1. Ensembl gene IDs of every protein that appears in the test set.
is.testing.protein <- uniprot2geneid$uniprot_gn_id %in% testing$uniprotID
testing.geneids <- uniprot2geneid$ensembl_gene_id[is.testing.protein]

# 2. Paralog gene IDs of those genes.
is.testing.gene <- geneid2paralog$ensembl_gene_id %in% testing.geneids
testing.paralogs <- geneid2paralog$hsapiens_paralog_ensembl_gene[is.testing.gene]

# 3. Map the paralog genes back to UniProt IDs.
is.paralog.gene <- uniprot2geneid$ensembl_gene_id %in% testing.paralogs
testing.paralogs.uid <- uniprot2geneid$uniprot_gn_id[is.paralog.gene]

# 4. Keep only training rows whose protein is neither a test protein nor a
#    paralog of one.
excluded.uid <- c(testing.paralogs.uid, testing$uniprotID)
training.filetered <- training[!(training$uniprotID %in% excluded.uid), ]
training <- training.filetered
|
|
|
|
|
|
|
|
# Number of equally sized training splits to produce.
split.n <- 4

set.seed(0)

# Drop just enough rows so that nrow(training) is an exact multiple of
# split.n. Only rows with score == 0 are eligible for removal.
n.extra <- nrow(training) %% split.n
zero.score.rows <- which(training$score == 0)
if (n.extra > length(zero.score.rows)) {
  stop(
    "Need to drop ", n.extra, " rows with score == 0 but only ",
    length(zero.score.rows), " are available.",
    call. = FALSE
  )
}

# Index through sample.int() instead of calling sample(zero.score.rows, ...):
# when its first argument is a single number k, sample() draws from 1:k rather
# than from the vector itself, which would drop an arbitrary row. For two or
# more candidates the draw is identical to sample(zero.score.rows, n.extra).
to.drop <- zero.score.rows[sample.int(length(zero.score.rows), n.extra)]

if (length(to.drop) > 0) {
  training <- training[-to.drop, ]
}
|
|
# Randomly pick whole proteins until about `number_to_select` rows are covered.
#
# Args:
#   freq_table:       data frame with columns `Var1` (protein ID) and `Freq`
#                     (row count for that ID), e.g. as.data.frame(table(ids)).
#   number_to_select: target number of rows; the summed `Freq` of the picked
#                     IDs never exceeds it.
#
# Returns:
#   An unnamed list of length two: [[1]] the picked IDs as a character vector
#   (NULL when nothing could be picked), [[2]] `freq_table` with the picked
#   IDs removed.
#
# Side effect: resets the global RNG with set.seed(0) on every call, so the
# same inputs always produce the same picks.
split.by.uniprotID <- function(freq_table, number_to_select) {
  set.seed(0)

  picked_ids <- c()
  n_picked <- 0

  repeat {
    # Only IDs that still fit into the remaining budget are eligible.
    budget <- number_to_select - n_picked
    pool <- freq_table[freq_table$Freq <= budget, ]

    keep_going <- (n_picked < number_to_select) && (nrow(pool) > 0)
    if (!keep_going) {
      break
    }

    choice <- sample(as.character(pool$Var1), size = 1)
    picked_ids <- c(picked_ids, choice)
    n_picked <- n_picked + freq_table$Freq[freq_table$Var1 == choice]

    # Remove the chosen ID so it cannot be drawn again.
    freq_table <- freq_table[!freq_table$Var1 %in% choice, ]
  }

  list(picked_ids, freq_table)
}
|
|
|
|
|
# Target number of rows per split (named for the default split.n of 4).
quarter.size <- nrow(training) %/% split.n

# One row per protein ID (`Var1`) with its number of training rows (`Freq`).
training_freq_table <- as.data.frame(table(training$uniprotID))

# Fill each split with whole proteins, so that the rows of a protein picked
# here all land in the same split. Every pass draws from the proteins left
# over by the previous pass. seq_len() keeps the loop correct even for
# split.n == 1, where 2:split.n would count down as 2, 1.
splits <- vector("list", split.n)
left_freq_table <- training_freq_table
for (s in seq_len(split.n)) {
  tmp <- split.by.uniprotID(freq_table = left_freq_table, quarter.size)
  # Row indices in `training` that belong to the proteins picked for split s.
  splits[[s]] <- which(training$uniprotID %in% tmp[[1]])
  left_freq_table <- tmp[[2]]
}
|
|
|
|
|
# Row indices of proteins that were not assigned to any split as a whole.
left_split <- which(training$uniprotID %in% left_freq_table$Var1)

# Top up every split that is still short of quarter.size with individual
# leftover rows; rows of one leftover protein may end up in different splits.
for (s in seq_len(split.n)) {
  # Reseed on every pass so the draw for each split is reproducible.
  set.seed(0)

  n.missing <- quarter.size - length(splits[[s]])
  if (n.missing > 0) {
    # Index through sample.int() instead of calling sample(left_split, ...):
    # when only one leftover row index k remains, sample(k, 1) would draw
    # from 1:k and could pick a row that already belongs to another split.
    # With two or more leftovers the draw is identical to the sample() call.
    to.add <- left_split[sample.int(length(left_split), n.missing)]
    splits[[s]] <- c(splits[[s]], to.add)
    left_split <- left_split[!left_split %in% to.add]
  }
}
|
|
# Record the character length of both sequence columns.
training$sequence.len <- nchar(training$sequence)
training$sequence.len.orig <- nchar(training$sequence.orig)

# Write each split to training.<k>.csv, numbered from 0, with rows shuffled.
for (s in seq_len(split.n)) {
  tmp.split <- training[splits[[s]], ]

  # Reseed so the shuffle of every split is reproducible.
  set.seed(0)
  tmp.split <- tmp.split[sample.int(nrow(tmp.split)), ]

  # Report the split size.
  print(nrow(tmp.split))

  # Missing values are written as "." instead of "NA".
  write.csv(tmp.split, paste0("training.", s - 1, ".csv"), na = ".")
}

# Write the filtered, trimmed training table (with the new length columns)
# to the working directory.
write.csv(training, file = "training.csv")
|
|
|
|
|
|