library(biomaRt) # version 2.50.0

# Paralogue-removal experiment: collect the UniProt IDs of every paralogue of
# a gene that appears in the testing data.

# Ensembl mart handle. It is only referenced by the commented-out getBM()
# queries below, which document how the cached lookup CSVs were produced.
human <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# Read the testing and training tables (first CSV column = row names).
testing <- read.csv("testing.csv", row.names = 1)
training <- read.csv(
  "/share/vault/Users/gz2294/Data/DMS/ClinVar.HGMD.PrimateAI.syn/training.csv",
  row.names = 1
)

# UniProt ID <-> Ensembl gene ID lookup, loaded from a cached CSV.
# Query that generated the cache:
# uniprot2geneid <- getBM(attributes = c("uniprot_gn_id", "ensembl_gene_id"),
#                         mart = human)
# # filter uniprot ID in testing and training
# uniprot2geneid <- uniprot2geneid[uniprot2geneid$uniprot_gn_id %in%
#                                    c(testing$uniprotID, training$uniprotID),]
uniprot2geneid <- read.csv("uniprot2geneid.csv", row.names = 1)

# Ensembl gene ID -> human paralogue gene ID lookup, loaded from a cached CSV.
# Query that generated the cache:
# geneid2paralog <- getBM(attributes = c("ensembl_gene_id",
#                                        "hsapiens_paralog_ensembl_gene"),
#                         filters = "ensembl_gene_id",
#                         values = uniprot2geneid$ensembl_gene_id,
#                         mart = human)
geneid2paralog <- read.csv("geneid2paralog.csv", row.names = 1)

# Find the paralogues of the testing set in three hops:
# testing UniProt IDs -> gene IDs -> paralogue gene IDs -> UniProt IDs.
is.testing.uid <- uniprot2geneid$uniprot_gn_id %in% testing$uniprotID
testing.geneids <- uniprot2geneid$ensembl_gene_id[is.testing.uid]

is.testing.gene <- geneid2paralog$ensembl_gene_id %in% testing.geneids
testing.paralogs <- geneid2paralog$hsapiens_paralog_ensembl_gene[is.testing.gene]

is.paralog.gene <- uniprot2geneid$ensembl_gene_id %in% testing.paralogs
testing.paralogs.uid <- uniprot2geneid$uniprot_gn_id[is.paralog.gene]

# Next: remove all paralog uids from the training data, and the testing uids
# themselves.
# ---- Filter the training set, then cut it into equally sized splits ----

# Draw `size` elements from `x` without replacement.
# Unlike sample(x, size), this never reinterprets a length-one numeric `x` as
# the range 1:x, so it is safe for index vectors that may hold one element.
# For length(x) > 1 it returns exactly what sample(x, size) would return.
pick.from <- function(x, size) {
  x[sample.int(length(x), size)]
}

# Drop every training row whose uniprotID is a testing protein or a paralogue
# of one.
training.filetered <- training[
  !training$uniprotID %in% c(testing.paralogs.uid, testing$uniprotID),
]
training <- training.filetered

# Make the row count an exact multiple of split.n by randomly dropping the
# remainder rows, chosen among the rows with score == 0.
split.n <- 4
set.seed(0)
n.extra <- nrow(training) - floor(nrow(training) / split.n) * split.n
zero.score.rows <- which(training$score == 0)
if (n.extra > length(zero.score.rows)) {
  stop(
    "Need to drop ", n.extra, " rows with score == 0 but only ",
    length(zero.score.rows), " are available.",
    call. = FALSE
  )
}
to.drop <- pick.from(zero.score.rows, n.extra)
if (length(to.drop) > 0) {
  training <- training[-to.drop, ]
}

#' Randomly pick whole uniprotIDs until a row budget is (nearly) filled
#'
#' Repeatedly samples one uniprotID among those whose row count still fits in
#' the remaining budget, until the budget is reached or no ID fits any more.
#' The RNG is re-seeded with 0 on every call.
#'
#' @param freq_table Data frame with columns `Var1` (uniprotID) and `Freq`
#'   (number of rows for that ID), as produced by `as.data.frame(table(...))`.
#' @param number_to_select Maximum total number of rows to select.
#' @return A list whose first element (`selected`) is the character vector of
#'   chosen uniprotIDs (NULL if none) and whose second element (`remaining`)
#'   is `freq_table` without the chosen IDs.
split.by.uniprotID <- function(freq_table, number_to_select) {
  set.seed(0)
  selected <- 0
  selected_uniprotIDs <- c()
  candidates <- freq_table[freq_table$Freq <= number_to_select - selected, ]
  while (selected < number_to_select && nrow(candidates) > 0) {
    # as.character() keeps sample() on a character vector, so a single
    # remaining candidate is returned as-is.
    selected_uniprotID <- sample(as.character(candidates$Var1), size = 1)
    selected_uniprotIDs <- c(selected_uniprotIDs, selected_uniprotID)
    selected <- selected +
      freq_table$Freq[freq_table$Var1 == selected_uniprotID]
    # Remove the chosen ID, then recompute which IDs still fit the budget.
    freq_table <- freq_table[!freq_table$Var1 %in% selected_uniprotID, ]
    candidates <- freq_table[freq_table$Freq <= number_to_select - selected, ]
  }
  list(selected = selected_uniprotIDs, remaining = freq_table)
}

# Do the split: assign whole uniprotIDs to each split, up to quarter.size rows.
quarter.size <- floor(nrow(training) / split.n)
training_freq_table <- as.data.frame(table(training$uniprotID))
left_freq_table <- training_freq_table
splits <- vector("list", split.n)
for (s in seq_len(split.n)) {
  tmp <- split.by.uniprotID(freq_table = left_freq_table, quarter.size)
  splits[[s]] <- which(training$uniprotID %in% tmp[[1]])
  left_freq_table <- tmp[[2]]
}

# Fill in splits: top up every short split with rows drawn at random from the
# uniprotIDs that were not assigned to any split.
left_split <- which(training$uniprotID %in% left_freq_table$Var1)
for (s in seq_len(split.n)) {
  set.seed(0)
  n.missing <- quarter.size - length(splits[[s]])
  if (n.missing > 0) {
    # pick.from() instead of sample(): left_split may hold a single row index.
    to.add <- pick.from(left_split, n.missing)
    splits[[s]] <- c(splits[[s]], to.add)
    left_split <- left_split[!left_split %in% to.add]
  }
}

training$sequence.len <- nchar(training$sequence)
training$sequence.len.orig <- nchar(training$sequence.orig)

# Write each split, shuffled, to training.<s - 1>.csv.
for (s in seq_len(split.n)) {
  tmp.split <- training[splits[[s]], ]
  # shuffle
  set.seed(0)
  tmp.split <- tmp.split[sample(nrow(tmp.split)), ]
  print(nrow(tmp.split))
  write.csv(tmp.split, paste0("training.", s - 1, ".csv"), na = ".")
  # Values are not auto-printed inside a for loop, so print explicitly.
  print(table(tmp.split$split))
}

# set training.head
# training.head <- training[1:50,]
# write.csv(training.head, file = 'training.head.csv')

# Write the full filtered training table.
write.csv(training, file = "training.csv")