| | |
| | |
| | |
| | source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R') |
| |
|
| | dnv.table.seperate.multiple.effects <- function( |
| | dnv.table, njobs=42, split.chr=";", |
| | cols.to.split=c("TransIDs", "TransEffs", "AAChg", "Symbol", "GeneID", "HGNC", "GeneEff"), |
| | carno.only=TRUE) { |
| | |
| | duplicated.effects.idx <- grep(split.chr, dnv.table$AAChg) |
| | if (length(duplicated.effects.idx)!=0) { |
| | dnv.table.nodup <- dnv.table[-duplicated.effects.idx,] |
| | dnv.table.dup <- dnv.table[duplicated.effects.idx,] |
| | library(doParallel) |
| | cl <- makeCluster(njobs) |
| | registerDoParallel(cl) |
| | cols.to.split <- cols.to.split[cols.to.split %in% colnames(dnv.table.dup)] |
| | dnv.table.dup.expand <- foreach (i = 1:dim(dnv.table.dup)[1], .combine = rbind, .multicombine=TRUE) %dopar% { |
| | row.number <- strsplit(dnv.table.dup[i, cols.to.split[1]], split = split.chr)[[1]] |
| | tmp <- dnv.table.dup[rep(i, length(row.number)),] |
| | for (k in cols.to.split) { |
| | split.res <- strsplit(dnv.table.dup[i, k], split = split.chr)[[1]][1:nrow(tmp)] |
| | tmp[,k] <- split.res |
| | } |
| | if (carno.only) { |
| | tmp <- tmp[match(dnv.table.dup$TransCanon[i], tmp$TransIDs),] |
| | } |
| | tmp |
| | } |
| | stopCluster(cl) |
| | dnv.table <- rbind(dnv.table.nodup, dnv.table.dup.expand) |
| | } |
| | dnv.table |
| | } |
| |
|
| |
|
| | dnv.table.to.uniprot.by.HGNC <- function( |
| | dnv.table, VarID.column, Score.column, |
| | HGNC.column, aaChg.column, offset.column=NA) { |
| | if (!is.na(offset.column)) { |
| | offset <- as.numeric(dnv.table[,offset.column]) |
| | } else { |
| | offset <- rep(0, dim(dnv.table)[1]) |
| | } |
| | result <- data.frame(VarID = dnv.table[,VarID.column], |
| | aaChg = dnv.table[,aaChg.column], |
| | uniprotID = NA, |
| | ref = NA, |
| | pos = NA, |
| | alt = NA, |
| | score = dnv.table[,Score.column], |
| | sequence = NA, |
| | wt = NA, |
| | sequence.len = NA, |
| | seq.start = NA, |
| | seq.end = NA, |
| | pos.orig = NA, |
| | sequence.orig = NA, |
| | wt.orig = NA, |
| | sequence.len.orig = NA, |
| | HGNC = dnv.table[,HGNC.column]) |
| | unique_genes <- as.character(unique(result$HGNC)) |
| | uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/swissprot.ID.mapping.tsv') |
| | uniprot_ID <- vector("list", length(unique_genes)) |
| | uniprot_wt.sequence <- vector("list", length(unique_genes)) |
| | for (i in 1:length(unique_genes)) { |
| | uniprot_ID[[i]] <- uniprot_ID.mapping$Entry[grep(paste0('\\b', unique_genes[i], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | uniprot_wt.sequence[[i]] <- uniprot_ID.mapping$Sequence[grep(paste0('\\b', unique_genes[i], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | if (length(uniprot_ID[[i]]>1)) { |
| | print(unique_genes[i]) |
| | } |
| | } |
| | |
| | empty_ids <- c() |
| | for (i in 1:length(uniprot_ID)) { |
| | if (length(uniprot_ID[[i]])==0) { |
| | empty_ids <- c(empty_ids, i) |
| | } |
| | } |
| | uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv') |
| | |
| | uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = T),] |
| | for (id in empty_ids) { |
| | uniprot_ID[[id]] <- uniprot_ID.mapping$Entry[grep(paste0('\\b', unique_genes[id], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | uniprot_wt.sequence[[id]] <- uniprot_ID.mapping$Sequence[grep(paste0('\\b', unique_genes[id], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | } |
| | for (i in 1:dim(result)[1]) { |
| | aaChg <- result$aaChg[i] |
| | uniprot_IDs <- uniprot_ID[[match(result$HGNC[i], unique_genes)]] |
| | wt.sequences <- uniprot_wt.sequence[[match(result$HGNC[i], unique_genes)]] |
| | j = 1 |
| | while (is.na(result$sequence[i]) & j <= length(uniprot_IDs)) { |
| | result$uniprotID[i] <- uniprot_IDs[j] |
| | wt.sequence <- wt.sequences[j] |
| | |
| | substitute_res <- parse_one_substitute(aaChg, wt.sequence, offset) |
| | result$ref[i] <- substitute_res$ref |
| | result$pos[i] <- substitute_res$pos |
| | result$alt[i] <- substitute_res$alt |
| | result$sequence[i] <- substitute_res$sequence |
| | result$wt[i] <- substitute_res$wt |
| | result$sequence.len[i] <- substitute_res$sequence.len |
| | result$seq.start[i] = substitute_res$seq.start |
| | result$seq.end[i] = substitute_res$seq.end |
| | result$pos.orig[i] = substitute_res$pos.orig |
| | result$sequence.orig[i] = substitute_res$sequence.orig |
| | result$wt.orig[i] = substitute_res$wt.orig |
| | result$sequence.len.orig[i] = substitute_res$sequence.len.orig |
| | j = j + 1 |
| | } |
| | } |
| | result <- dplyr::bind_cols(dnv.table[,!colnames(dnv.table) %in% colnames(result)], result) |
| | result.noNA <- result[!is.na(result$sequence),] |
| | output <- list(result=result, |
| | result.noNA=result.noNA) |
| | output |
| | } |
| |
|
| |
|
| | dnv.table.to.uniprot.by.uniprotID <- function( |
| | dnv.table, VarID.column, Score.column, |
| | uniprotID.column, aaChg.column, offset.column=NA) { |
| | if (!is.na(offset.column)) { |
| | offset <- as.numeric(dnv.table[,offset.column]) |
| | } else { |
| | offset <- rep(0, dim(dnv.table)[1]) |
| | } |
| | result <- data.frame(VarID = dnv.table[,VarID.column], |
| | aaChg = dnv.table[,aaChg.column], |
| | uniprotID = dnv.table[,uniprotID.column], |
| | ref = NA, |
| | pos = NA, |
| | alt = NA, |
| | score = dnv.table[,Score.column], |
| | sequence = NA, |
| | wt = NA, |
| | sequence.len = NA, |
| | seq.start = NA, |
| | seq.end = NA, |
| | pos.orig = NA, |
| | sequence.orig = NA, |
| | wt.orig = NA, |
| | sequence.len.orig = NA, |
| | HGNC = NA) |
| | uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv') |
| | |
| | uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = T),] |
| | for (i in 1:dim(result)[1]) { |
| | aaChg <- result$aaChg[i] |
| | uniprot_IDs <- result$uniprotID[i] |
| | if (grepl("-", uniprot_IDs)) { |
| | url.request <- paste0('https://rest.uniprot.org/uniprotkb/', uniprot_IDs, '.fasta') |
| | txt <- strsplit(RCurl::getURL(url.request), split = '\n')[[1]] |
| | wt.sequences <- paste(txt[2:length(txt)], collapse = '') |
| | } else { |
| | wt.sequences <- uniprot_ID.mapping$Sequence[match(uniprot_IDs, uniprot_ID.mapping$Entry)] |
| | } |
| | j = 1 |
| | while (is.na(result$sequence[i]) & j <= length(uniprot_IDs)) { |
| | wt.sequence <- wt.sequences[j] |
| | |
| | substitute_res <- parse_one_substitute(aaChg, wt.sequence, offset[i]) |
| | result$ref[i] <- substitute_res$ref |
| | result$pos[i] <- substitute_res$pos |
| | result$alt[i] <- substitute_res$alt |
| | result$sequence[i] <- substitute_res$sequence |
| | result$wt[i] <- substitute_res$wt |
| | result$sequence.len[i] <- substitute_res$sequence.len |
| | result$seq.start[i] = substitute_res$seq.start |
| | result$seq.end[i] = substitute_res$seq.end |
| | result$pos.orig[i] = substitute_res$pos.orig |
| | result$sequence.orig[i] = substitute_res$sequence.orig |
| | result$wt.orig[i] = substitute_res$wt.orig |
| | result$sequence.len.orig[i] = substitute_res$sequence.len.orig |
| | j = j + 1 |
| | } |
| | } |
| | result <- dplyr::bind_cols(dnv.table[,!colnames(dnv.table) %in% colnames(result)], result) |
| | result.noNA <- result[!is.na(result$sequence),] |
| | output <- list(result=result, |
| | result.noNA=result.noNA) |
| | output |
| | } |
| |
|
| |
|
| | dnv.table.to.uniprot.by.HGNC.parallel <- function( |
| | dnv.table, VarID.column, Score.column, |
| | HGNC.column, aaChg.column, offset.column=NA, njobs=42) { |
| | library(doParallel) |
| | cl <- makeCluster(njobs) |
| | registerDoParallel(cl) |
| | if (!is.na(offset.column)) { |
| | offset <- as.numeric(dnv.table[,offset.column]) |
| | } else { |
| | offset <- rep(0, dim(dnv.table)[1]) |
| | } |
| | prompt <- data.frame(VarID = dnv.table[,VarID.column], |
| | aaChg = dnv.table[,aaChg.column], |
| | score = dnv.table[,Score.column], |
| | HGNC = dnv.table[,HGNC.column]) |
| | unique_genes <- as.character(unique(prompt$HGNC)) |
| | uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/swissprot.ID.mapping.tsv') |
| | |
| | uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = T),] |
| | |
| | uniprot_ID <- foreach (i = 1:length(unique_genes)) %dopar% { |
| | tmp <- uniprot_ID.mapping$Entry[grep(paste0('\\b', unique_genes[i], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | if (length(tmp>1)) { |
| | print(unique_genes[i]) |
| | } |
| | tmp |
| | } |
| | uniprot_wt.sequence <- foreach (i = 1:length(unique_genes)) %dopar% { |
| | tmp <- uniprot_ID.mapping$Sequence[grep(paste0('\\b', unique_genes[i], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | tmp |
| | } |
| | |
| | empty_ids <- c() |
| | for (i in 1:length(uniprot_ID)) { |
| | if (length(uniprot_ID[[i]])==0) { |
| | empty_ids <- c(empty_ids, i) |
| | } |
| | } |
| | uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv') |
| | |
| | uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = T),] |
| | for (id in empty_ids) { |
| | uniprot_ID[[id]] <- uniprot_ID.mapping$Entry[grep(paste0('\\b', unique_genes[id], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | uniprot_wt.sequence[[id]] <- uniprot_ID.mapping$Sequence[grep(paste0('\\b', unique_genes[id], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | } |
| | result <- foreach (i = 1:dim(prompt)[1], .combine = rbind, .multicombine=TRUE) %dopar% { |
| | source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R') |
| | substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA, |
| | sequence = NA, sequence.len = NA, |
| | seq.start = NA, seq.end = NA, |
| | pos.orig = NA, sequence.orig = NA, |
| | wt.orig = NA, sequence.len.orig = NA) |
| | aaChg <- prompt$aaChg[i] |
| | uniprot_IDs <- uniprot_ID[[match(prompt$HGNC[i], unique_genes)]] |
| | wt.sequences <- uniprot_wt.sequence[[match(prompt$HGNC[i], unique_genes)]] |
| | j = 1 |
| | while (is.na(substitute_res$sequence) & j <= length(uniprot_IDs)) { |
| | matched_uniprot_ID <- uniprot_IDs[j] |
| | wt.sequence <- wt.sequences[j] |
| | |
| | substitute_res <- parse_one_substitute(aaChg, wt.sequence, offset[i]) |
| | j = j + 1 |
| | } |
| | tmp <- data.frame(VarID = prompt$VarID[i], |
| | aaChg = prompt$aaChg[i], |
| | uniprotID = matched_uniprot_ID, |
| | ref = substitute_res$ref, |
| | pos = substitute_res$pos, |
| | alt = substitute_res$alt, |
| | score = prompt$score[i], |
| | sequence = substitute_res$sequence, |
| | wt = substitute_res$wt, |
| | sequence.len = substitute_res$sequence.len, |
| | seq.start = substitute_res$seq.start, |
| | seq.end = substitute_res$seq.end, |
| | pos.orig = substitute_res$pos.orig, |
| | sequence.orig = substitute_res$sequence.orig, |
| | wt.orig = substitute_res$wt.orig, |
| | sequence.len.orig = substitute_res$sequence.len.orig, |
| | HGNC = prompt$HGNC[i]) |
| | tmp |
| | } |
| | stopCluster(cl) |
| | result <- dplyr::bind_cols(dnv.table[,!colnames(dnv.table) %in% colnames(result)], result) |
| | result.noNA <- result[!is.na(result$sequence),] |
| | output <- list(result=result, |
| | result.noNA=result.noNA) |
| | output |
| | } |
| |
|
| |
|
| | dnv.table.to.uniprot.by.HGNC.af2.parallel <- function( |
| | dnv.table, VarID.column, Score.column, |
| | HGNC.column, aaChg.column, offset.column=NA, njobs=42) { |
| | library(doParallel) |
| | cl <- makeCluster(njobs) |
| | registerDoParallel(cl) |
| | if (!is.na(offset.column)) { |
| | offset <- as.numeric(dnv.table[,offset.column]) |
| | } else { |
| | offset <- rep(0, dim(dnv.table)[1]) |
| | } |
| | prompt <- data.frame(VarID = dnv.table[,VarID.column], |
| | aaChg = dnv.table[,aaChg.column], |
| | score = dnv.table[,Score.column], |
| | HGNC = dnv.table[,HGNC.column]) |
| | unique_genes <- as.character(unique(prompt$HGNC)) |
| | af2.mapping <- read.csv('/share/pascal/Users/gz2294/Data/af2_uniprot/swissprot_and_human.csv', row.names = 1) |
| | uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/swissprot.ID.mapping.tsv') |
| | |
| | uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = T),] |
| | |
| | uniprot_ID <- foreach (i = 1:length(unique_genes)) %dopar% { |
| | tmp <- af2.mapping$uniprotID[match(unique_genes[i], af2.mapping$HGNC)] |
| | if (is.na(tmp)) { |
| | tmp <- uniprot_ID.mapping$Entry[grep(paste0('\\b', unique_genes[i], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | if (length(tmp>1)) { |
| | print(unique_genes[i]) |
| | } |
| | } |
| | tmp |
| | } |
| | uniprot_wt.sequence <- foreach (i = 1:length(unique_genes)) %dopar% { |
| | tmp <- af2.mapping$seq[match(unique_genes[i], af2.mapping$HGNC)] |
| | if (is.na(tmp)) { |
| | tmp <- uniprot_ID.mapping$Sequence[grep(paste0('\\b', unique_genes[i], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | } |
| | tmp |
| | } |
| | |
| | empty_ids <- c() |
| | for (i in 1:length(uniprot_ID)) { |
| | if (length(uniprot_ID[[i]])==0) { |
| | empty_ids <- c(empty_ids, i) |
| | } |
| | } |
| | uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv') |
| | |
| | uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = T),] |
| | for (id in empty_ids) { |
| | uniprot_ID[[id]] <- uniprot_ID.mapping$Entry[grep(paste0('\\b', unique_genes[id], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | uniprot_wt.sequence[[id]] <- uniprot_ID.mapping$Sequence[grep(paste0('\\b', unique_genes[id], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | } |
| | result <- foreach (i = 1:dim(prompt)[1], .combine = rbind, .multicombine=TRUE) %dopar% { |
| | source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R') |
| | substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA, |
| | sequence = NA, sequence.len = NA, |
| | seq.start = NA, seq.end = NA, |
| | pos.orig = NA, sequence.orig = NA, |
| | wt.orig = NA, sequence.len.orig = NA) |
| | aaChg <- prompt$aaChg[i] |
| | uniprot_IDs <- uniprot_ID[[match(prompt$HGNC[i], unique_genes)]] |
| | wt.sequences <- uniprot_wt.sequence[[match(prompt$HGNC[i], unique_genes)]] |
| | j = 1 |
| | while (is.na(substitute_res$sequence) & j <= length(uniprot_IDs)) { |
| | matched_uniprot_ID <- uniprot_IDs[j] |
| | wt.sequence <- wt.sequences[j] |
| | |
| | substitute_res <- parse_one_substitute(aaChg, wt.sequence, offset[i]) |
| | j = j + 1 |
| | } |
| | tmp <- data.frame(VarID = prompt$VarID[i], |
| | aaChg = prompt$aaChg[i], |
| | uniprotID = matched_uniprot_ID, |
| | ref = substitute_res$ref, |
| | pos = substitute_res$pos, |
| | alt = substitute_res$alt, |
| | score = prompt$score[i], |
| | sequence = substitute_res$sequence, |
| | wt = substitute_res$wt, |
| | sequence.len = substitute_res$sequence.len, |
| | seq.start = substitute_res$seq.start, |
| | seq.end = substitute_res$seq.end, |
| | pos.orig = substitute_res$pos.orig, |
| | sequence.orig = substitute_res$sequence.orig, |
| | wt.orig = substitute_res$wt.orig, |
| | sequence.len.orig = substitute_res$sequence.len.orig, |
| | HGNC = prompt$HGNC[i]) |
| | tmp |
| | } |
| | stopCluster(cl) |
| | result <- dplyr::bind_cols(dnv.table[,!colnames(dnv.table) %in% colnames(result)], result) |
| | result.noNA <- result[!is.na(result$sequence),] |
| | output <- list(result=result, |
| | result.noNA=result.noNA) |
| | output |
| | } |
| |
|
| |
|
| |
|
| | dnv.table.to.uniprot.by.uniprotID.parallel <- function( |
| | dnv.table, VarID.column, Score.column, |
| | uniprotID.column, aaChg.column, offset.column=NA, length.column=NA, |
| | match.length=FALSE, njobs=42) { |
| | library(doParallel) |
| | cl <- makeCluster(njobs) |
| | registerDoParallel(cl) |
| | if (!is.na(offset.column)) { |
| | offset <- as.numeric(dnv.table[,offset.column]) |
| | } else { |
| | offset <- rep(0, dim(dnv.table)[1]) |
| | } |
| | if (match.length) { |
| | prompt <- data.frame(VarID = dnv.table[,VarID.column], |
| | aaChg = dnv.table[,aaChg.column], |
| | score = dnv.table[,Score.column], |
| | uniprotID = dnv.table[,uniprotID.column], |
| | seq.match.length = dnv.table[,length.column]) |
| | } else { |
| | prompt <- data.frame(VarID = dnv.table[,VarID.column], |
| | aaChg = dnv.table[,aaChg.column], |
| | score = dnv.table[,Score.column], |
| | uniprotID = dnv.table[,uniprotID.column]) |
| | } |
| | uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv') |
| | |
| | uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = T),] |
| | result <- foreach (i = 1:dim(prompt)[1], .combine = rbind, .multicombine=TRUE) %dopar% { |
| | source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R') |
| | substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA, |
| | sequence = NA, sequence.len = NA, |
| | seq.start = NA, seq.end = NA, |
| | pos.orig = NA, sequence.orig = NA, |
| | wt.orig = NA, sequence.len.orig = NA) |
| | aaChg <- prompt$aaChg[i] |
| | uniprot_IDs <- prompt$uniprotID[i] |
| | if (grepl("-", uniprot_IDs)) { |
| | url.request <- paste0('https://rest.uniprot.org/uniprotkb/', uniprot_IDs, '.fasta') |
| | txt <- strsplit(RCurl::getURL(url.request), split = '\n')[[1]] |
| | wt.sequences <- paste(txt[2:length(txt)], collapse = '') |
| | } else { |
| | wt.sequences <- uniprot_ID.mapping$Sequence[match(uniprot_IDs, uniprot_ID.mapping$Entry)] |
| | } |
| | if (match.length) { |
| | if (!is.na(wt.sequences) & nchar(wt.sequences) != prompt$seq.match.length[i]) { |
| | wt.sequences <- NA |
| | } |
| | } |
| | j = 1 |
| | while (is.na(substitute_res$sequence) & j <= length(uniprot_IDs)) { |
| | matched_uniprot_ID <- uniprot_IDs[j] |
| | wt.sequence <- wt.sequences[j] |
| | |
| | substitute_res <- parse_one_substitute(aaChg, wt.sequence, offset[i]) |
| | j = j + 1 |
| | } |
| | tmp <- data.frame(VarID = prompt$VarID[i], |
| | aaChg = prompt$aaChg[i], |
| | uniprotID = matched_uniprot_ID, |
| | ref = substitute_res$ref, |
| | pos = substitute_res$pos, |
| | alt = substitute_res$alt, |
| | score = prompt$score[i], |
| | sequence = substitute_res$sequence, |
| | wt = substitute_res$wt, |
| | sequence.len = substitute_res$sequence.len, |
| | seq.start = substitute_res$seq.start, |
| | seq.end = substitute_res$seq.end, |
| | pos.orig = substitute_res$pos.orig, |
| | sequence.orig = substitute_res$sequence.orig, |
| | wt.orig = substitute_res$wt.orig, |
| | sequence.len.orig = substitute_res$sequence.len.orig, |
| | HGNC = NA) |
| | tmp |
| | } |
| | stopCluster(cl) |
| | result <- dplyr::bind_cols(dnv.table[,!colnames(dnv.table) %in% colnames(result)], result) |
| | result.noNA <- result[!is.na(result$sequence),] |
| | output <- list(result=result, |
| | result.noNA=result.noNA) |
| | output |
| | } |
| |
|
| |
|
| | dnv.table.to.uniprot.by.af2.uniprotID.parallel <- function( |
| | dnv.table, VarID.column, Score.column, |
| | uniprotID.column, aaChg.column, offset.column=NA, length.column=NA, |
| | match.length=FALSE, njobs=96) { |
| | library(doParallel) |
| | cl <- makeCluster(njobs) |
| | registerDoParallel(cl) |
| | if (!is.na(offset.column)) { |
| | offset <- as.numeric(dnv.table[,offset.column]) |
| | } else { |
| | offset <- rep(0, dim(dnv.table)[1]) |
| | } |
| | if (match.length) { |
| | prompt <- data.frame(VarID = dnv.table[,VarID.column], |
| | aaChg = dnv.table[,aaChg.column], |
| | score = dnv.table[,Score.column], |
| | uniprotID = dnv.table[,uniprotID.column], |
| | seq.match.length = dnv.table[,length.column]) |
| | } else { |
| | prompt <- data.frame(VarID = dnv.table[,VarID.column], |
| | aaChg = dnv.table[,aaChg.column], |
| | score = dnv.table[,Score.column], |
| | uniprotID = dnv.table[,uniprotID.column]) |
| | } |
| | uniprot_ID.mapping <- read.csv('/share/pascal/Users/gz2294/Data/af2_uniprot/swissprot_and_human.csv') |
| | |
| | |
| | result <- foreach (i = 1:dim(prompt)[1], .combine = rbind, .multicombine=TRUE) %dopar% { |
| | source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R') |
| | substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA, |
| | sequence = NA, sequence.len = NA, |
| | seq.start = NA, seq.end = NA, |
| | pos.orig = NA, sequence.orig = NA, |
| | wt.orig = NA, sequence.len.orig = NA) |
| | aaChg <- prompt$aaChg[i] |
| | uniprot_IDs <- prompt$uniprotID[i] |
| | if (grepl("-", uniprot_IDs) | !uniprot_IDs %in% uniprot_ID.mapping$uniprotID) { |
| | url.request <- paste0('https://rest.uniprot.org/uniprotkb/', uniprot_IDs, '.fasta') |
| | txt <- strsplit(RCurl::getURL(url.request), split = '\n')[[1]] |
| | wt.sequences <- paste(txt[2:length(txt)], collapse = '') |
| | } else { |
| | wt.sequences <- uniprot_ID.mapping$seq[match(uniprot_IDs, uniprot_ID.mapping$uniprotID)] |
| | } |
| | if (match.length) { |
| | if (!is.na(wt.sequences) & nchar(wt.sequences) != prompt$seq.match.length[i]) { |
| | wt.sequences <- NA |
| | } |
| | } |
| | j = 1 |
| | while (is.na(substitute_res$sequence) & j <= length(uniprot_IDs)) { |
| | matched_uniprot_ID <- uniprot_IDs[j] |
| | wt.sequence <- wt.sequences[j] |
| | |
| | substitute_res <- parse_one_substitute(aaChg, wt.sequence, offset[i]) |
| | j = j + 1 |
| | } |
| | tmp <- data.frame(VarID = prompt$VarID[i], |
| | aaChg = prompt$aaChg[i], |
| | uniprotID = matched_uniprot_ID, |
| | ref = substitute_res$ref, |
| | pos = substitute_res$pos, |
| | alt = substitute_res$alt, |
| | score = prompt$score[i], |
| | sequence = substitute_res$sequence, |
| | wt = substitute_res$wt, |
| | sequence.len = substitute_res$sequence.len, |
| | seq.start = substitute_res$seq.start, |
| | seq.end = substitute_res$seq.end, |
| | pos.orig = substitute_res$pos.orig, |
| | sequence.orig = substitute_res$sequence.orig, |
| | wt.orig = substitute_res$wt.orig, |
| | sequence.len.orig = substitute_res$sequence.len.orig) |
| | tmp |
| | } |
| | stopCluster(cl) |
| | result <- dplyr::bind_cols(dnv.table[,!colnames(dnv.table) %in% colnames(result)], result) |
| | result.noNA <- result[!is.na(result$sequence),] |
| | output <- list(result=result, |
| | result.noNA=result.noNA) |
| | output |
| | } |
| |
|
| |
|
| | dnv.table.to.uniprot.by.ensembl_geneID.parallel <- function( |
| | dnv.table, VarID.column, Score.column, |
| | ensembl.column, aaChg.column, offset.column=NA, njobs=42) { |
| | library(doParallel) |
| | cl <- makeCluster(njobs) |
| | registerDoParallel(cl) |
| | if (!is.na(offset.column)) { |
| | offset <- as.numeric(dnv.table[,offset.column]) |
| | } else { |
| | offset <- rep(0, dim(dnv.table)[1]) |
| | } |
| | prompt <- data.frame(VarID = dnv.table[,VarID.column], |
| | aaChg = dnv.table[,aaChg.column], |
| | score = dnv.table[,Score.column], |
| | ensembl = dnv.table[,ensembl.column]) |
| | unique_genes <- as.character(unique(prompt$ensembl)) |
| | ensembl.mapping <- read.csv('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/ensembl.uniprot.ID.mapping.csv') |
| | uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv') |
| | |
| | uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = T),] |
| | |
| | uniprot_ID <- foreach (i = 1:length(unique_genes)) %dopar% { |
| | tmp <- ensembl.mapping$uniprot_gn_id[grep(paste0('\\b', unique_genes[i], '\\b'), |
| | ensembl.mapping$ensembl_gene_id)] |
| | if (length(tmp>1)) { |
| | print(unique_genes[i]) |
| | } |
| | tmp |
| | } |
| | uniprot_wt.sequence <- foreach (i = 1:length(unique_genes)) %dopar% { |
| | tmp <- uniprot_ID.mapping$Sequence[grep(paste0('\\b', uniprot_ID[[i]], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | tmp |
| | } |
| | |
| | empty_ids <- c() |
| | for (i in 1:length(uniprot_ID)) { |
| | if (length(uniprot_ID[[i]])==0) { |
| | empty_ids <- c(empty_ids, i) |
| | } |
| | } |
| | uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv') |
| | |
| | uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = T),] |
| | for (id in empty_ids) { |
| | uniprot_ID[[id]] <- uniprot_ID.mapping$Entry[grep(paste0('\\b', unique_genes[id], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | uniprot_wt.sequence[[id]] <- uniprot_ID.mapping$Sequence[grep(paste0('\\b', unique_genes[id], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | } |
| | result <- foreach (i = 1:dim(prompt)[1], .combine = rbind, .multicombine=TRUE) %dopar% { |
| | source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R') |
| | substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA, |
| | sequence = NA, sequence.len = NA, |
| | seq.start = NA, seq.end = NA, |
| | pos.orig = NA, sequence.orig = NA, |
| | wt.orig = NA, sequence.len.orig = NA) |
| | aaChg <- prompt$aaChg[i] |
| | uniprot_IDs <- uniprot_ID[[match(prompt$ensembl[i], unique_genes)]] |
| | wt.sequences <- uniprot_wt.sequence[[match(prompt$ensembl[i], unique_genes)]] |
| | j = 1 |
| | while (is.na(substitute_res$sequence) & j <= length(uniprot_IDs)) { |
| | matched_uniprot_ID <- uniprot_IDs[j] |
| | wt.sequence <- wt.sequences[j] |
| | |
| | substitute_res <- parse_one_substitute(aaChg, wt.sequence, offset[i]) |
| | j = j + 1 |
| | } |
| | tmp <- data.frame(VarID = prompt$VarID[i], |
| | aaChg = prompt$aaChg[i], |
| | uniprotID = matched_uniprot_ID, |
| | ref = substitute_res$ref, |
| | pos = substitute_res$pos, |
| | alt = substitute_res$alt, |
| | score = prompt$score[i], |
| | sequence = substitute_res$sequence, |
| | wt = substitute_res$wt, |
| | sequence.len = substitute_res$sequence.len, |
| | seq.start = substitute_res$seq.start, |
| | seq.end = substitute_res$seq.end, |
| | pos.orig = substitute_res$pos.orig, |
| | sequence.orig = substitute_res$sequence.orig, |
| | wt.orig = substitute_res$wt.orig, |
| | sequence.len.orig = substitute_res$sequence.len.orig, |
| | ensembl = prompt$ensembl[i]) |
| | tmp |
| | } |
| | stopCluster(cl) |
| | result <- dplyr::bind_cols(dnv.table[,!colnames(dnv.table) %in% colnames(result)], result) |
| | result.noNA <- result[!is.na(result$sequence),] |
| | output <- list(result=result, |
| | result.noNA=result.noNA) |
| | output |
| | } |
| |
|
| |
|
| | dnv.table.to.uniprot.by.ensembl_transcriptID.parallel <- function( |
| | dnv.table, VarID.column, Score.column, |
| | ensembl.column, aaChg.column, offset.column=NA, njobs=42) { |
| | library(doParallel) |
| | cl <- makeCluster(njobs) |
| | registerDoParallel(cl) |
| | if (!is.na(offset.column)) { |
| | offset <- as.numeric(dnv.table[,offset.column]) |
| | } else { |
| | offset <- rep(0, dim(dnv.table)[1]) |
| | } |
| | prompt <- data.frame(VarID = dnv.table[,VarID.column], |
| | aaChg = dnv.table[,aaChg.column], |
| | score = dnv.table[,Score.column], |
| | ensembl = dnv.table[,ensembl.column]) |
| | unique_genes <- as.character(unique(prompt$ensembl)) |
| | ensembl.mapping <- read.csv('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/ensembl.uniprot.ID.mapping.csv') |
| | uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv') |
| | |
| | uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = T),] |
| | |
| | uniprot_ID <- foreach (i = 1:length(unique_genes)) %dopar% { |
| | tmp <- ensembl.mapping$uniprot_gn_id[grep(paste0('\\b', unique_genes[i], '\\b'), |
| | ensembl.mapping$ensembl_gene_id)] |
| | if (length(tmp>1)) { |
| | print(unique_genes[i]) |
| | } |
| | tmp |
| | } |
| | uniprot_wt.sequence <- foreach (i = 1:length(unique_genes)) %dopar% { |
| | tmp <- uniprot_ID.mapping$Sequence[grep(paste0('\\b', uniprot_ID[[i]], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | tmp |
| | } |
| | |
| | empty_ids <- c() |
| | for (i in 1:length(uniprot_ID)) { |
| | if (length(uniprot_ID[[i]])==0) { |
| | empty_ids <- c(empty_ids, i) |
| | } |
| | } |
| | uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv') |
| | |
| | uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = T),] |
| | for (id in empty_ids) { |
| | uniprot_ID[[id]] <- uniprot_ID.mapping$Entry[grep(paste0('\\b', unique_genes[id], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | uniprot_wt.sequence[[id]] <- uniprot_ID.mapping$Sequence[grep(paste0('\\b', unique_genes[id], '\\b'), |
| | uniprot_ID.mapping$Gene.Names)] |
| | } |
| | result <- foreach (i = 1:dim(prompt)[1], .combine = rbind, .multicombine=TRUE) %dopar% { |
| | source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R') |
| | substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA, |
| | sequence = NA, sequence.len = NA, |
| | seq.start = NA, seq.end = NA, |
| | pos.orig = NA, sequence.orig = NA, |
| | wt.orig = NA, sequence.len.orig = NA) |
| | aaChg <- prompt$aaChg[i] |
| | uniprot_IDs <- uniprot_ID[[match(prompt$ensembl[i], unique_genes)]] |
| | wt.sequences <- uniprot_wt.sequence[[match(prompt$ensembl[i], unique_genes)]] |
| | j = 1 |
| | while (is.na(substitute_res$sequence) & j <= length(uniprot_IDs)) { |
| | matched_uniprot_ID <- uniprot_IDs[j] |
| | wt.sequence <- wt.sequences[j] |
| | |
| | substitute_res <- parse_one_substitute(aaChg, wt.sequence, offset[i]) |
| | j = j + 1 |
| | } |
| | tmp <- data.frame(VarID = prompt$VarID[i], |
| | aaChg = prompt$aaChg[i], |
| | uniprotID = matched_uniprot_ID, |
| | ref = substitute_res$ref, |
| | pos = substitute_res$pos, |
| | alt = substitute_res$alt, |
| | score = prompt$score[i], |
| | sequence = substitute_res$sequence, |
| | wt = substitute_res$wt, |
| | sequence.len = substitute_res$sequence.len, |
| | seq.start = substitute_res$seq.start, |
| | seq.end = substitute_res$seq.end, |
| | pos.orig = substitute_res$pos.orig, |
| | sequence.orig = substitute_res$sequence.orig, |
| | wt.orig = substitute_res$wt.orig, |
| | sequence.len.orig = substitute_res$sequence.len.orig, |
| | ensembl = prompt$ensembl[i]) |
| | tmp |
| | } |
| | stopCluster(cl) |
| | result <- dplyr::bind_cols(dnv.table[,!colnames(dnv.table) %in% colnames(result)], result) |
| | result.noNA <- result[!is.na(result$sequence),] |
| | output <- list(result=result, |
| | result.noNA=result.noNA) |
| | output |
| | } |
| |
|
| |
|
| | dnv.table.to.uniprot.by.ensembl_transcriptID.from.dir.parallel <- function( |
| | dnv.table, VarID.column, Score.column, |
| | ensembl.column, aaChg.column, ensembl.dir, offset.column=NA, njobs=42) { |
| | library(doParallel) |
| | cl <- makeCluster(njobs) |
| | registerDoParallel(cl) |
| | if (!is.na(offset.column)) { |
| | offset <- as.numeric(dnv.table[,offset.column]) |
| | } else { |
| | offset <- rep(0, dim(dnv.table)[1]) |
| | } |
| | prompt <- data.frame(VarID = dnv.table[,VarID.column], |
| | aaChg = dnv.table[,aaChg.column], |
| | score = dnv.table[,Score.column], |
| | ensembl = dnv.table[,ensembl.column]) |
| | unique_genes <- as.character(unique(prompt$ensembl)) |
| | ensembl.mapping <- read.csv('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/ensembl.uniprot.ID.mapping.csv') |
| | uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv') |
| | |
| | uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = T),] |
| | |
| | uniprot_ID <- as.list(unique_genes) |
| | uniprot_wt.sequence <- foreach (i = 1:length(unique_genes)) %dopar% { |
| | tmp <- read.csv(paste0(ensembl.dir, unique_genes[i], '.csv'), row.names = 1) |
| | tmp <- tmp$X0 |
| | tmp |
| | } |
| | result <- foreach (i = 1:dim(prompt)[1], .combine = rbind, .multicombine=TRUE) %dopar% { |
| | source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R') |
| | substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA, |
| | sequence = NA, sequence.len = NA, |
| | seq.start = NA, seq.end = NA, |
| | pos.orig = NA, sequence.orig = NA, |
| | wt.orig = NA, sequence.len.orig = NA) |
| | aaChg <- prompt$aaChg[i] |
| | uniprot_IDs <- uniprot_ID[[match(prompt$ensembl[i], unique_genes)]] |
| | wt.sequences <- uniprot_wt.sequence[[match(prompt$ensembl[i], unique_genes)]] |
| | j = 1 |
| | while (is.na(substitute_res$sequence) & j <= length(uniprot_IDs)) { |
| | matched_uniprot_ID <- uniprot_IDs[j] |
| | wt.sequence <- wt.sequences[j] |
| | |
| | substitute_res <- parse_one_substitute(aaChg, wt.sequence, offset[i]) |
| | j = j + 1 |
| | } |
| | tmp <- data.frame(VarID = prompt$VarID[i], |
| | aaChg = prompt$aaChg[i], |
| | uniprotID = matched_uniprot_ID, |
| | ref = substitute_res$ref, |
| | pos = substitute_res$pos, |
| | alt = substitute_res$alt, |
| | score = prompt$score[i], |
| | sequence = substitute_res$sequence, |
| | wt = substitute_res$wt, |
| | sequence.len = substitute_res$sequence.len, |
| | seq.start = substitute_res$seq.start, |
| | seq.end = substitute_res$seq.end, |
| | pos.orig = substitute_res$pos.orig, |
| | sequence.orig = substitute_res$sequence.orig, |
| | wt.orig = substitute_res$wt.orig, |
| | sequence.len.orig = substitute_res$sequence.len.orig, |
| | ensembl = prompt$ensembl[i]) |
| | tmp |
| | } |
| | stopCluster(cl) |
| | result <- dplyr::bind_cols(dnv.table[,!colnames(dnv.table) %in% colnames(result)], result) |
| | result.noNA <- result[!is.na(result$sequence),] |
| | output <- list(result=result, |
| | result.noNA=result.noNA) |
| | output |
| | } |
| |
|
| |
|
| | dnv.table.to.uniprot.by.wt.seq.parallel <- function( |
| | dnv.table, VarID.column, Score.column, |
| | wt.seq.column, aaChg.column, offset.column=NA, njobs=42) { |
| | library(doParallel) |
| | cl <- makeCluster(njobs) |
| | registerDoParallel(cl) |
| | if (!is.na(offset.column)) { |
| | offset <- as.numeric(dnv.table[,offset.column]) |
| | } else { |
| | offset <- rep(0, dim(dnv.table)[1]) |
| | } |
| | prompt <- data.frame(VarID = dnv.table[,VarID.column], |
| | aaChg = dnv.table[,aaChg.column], |
| | score = dnv.table[,Score.column], |
| | wt.orig = dnv.table[,wt.seq.column]) |
| | result <- foreach (i = 1:dim(prompt)[1], .combine = rbind, .multicombine=TRUE) %dopar% { |
| | source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R') |
| | substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA, |
| | sequence = NA, sequence.len = NA, |
| | seq.start = NA, seq.end = NA, |
| | pos.orig = NA, sequence.orig = NA, |
| | wt.orig = NA, sequence.len.orig = NA) |
| | aaChg <- prompt$aaChg[i] |
| | uniprot_ID <- NA |
| | wt.sequence <- prompt$wt.orig[i] |
| | substitute_res <- parse_one_substitute(aaChg, wt.sequence, offset[i]) |
| | tmp <- data.frame(VarID = prompt$VarID[i], |
| | aaChg = prompt$aaChg[i], |
| | uniprotID = NA, |
| | ref = substitute_res$ref, |
| | pos = substitute_res$pos, |
| | alt = substitute_res$alt, |
| | score = prompt$score[i], |
| | sequence = substitute_res$sequence, |
| | wt = substitute_res$wt, |
| | sequence.len = substitute_res$sequence.len, |
| | seq.start = substitute_res$seq.start, |
| | seq.end = substitute_res$seq.end, |
| | pos.orig = substitute_res$pos.orig, |
| | sequence.orig = substitute_res$sequence.orig, |
| | wt.orig = substitute_res$wt.orig, |
| | sequence.len.orig = substitute_res$sequence.len.orig) |
| | tmp |
| | } |
| | stopCluster(cl) |
| | result <- dplyr::bind_cols(dnv.table[,!colnames(dnv.table) %in% colnames(result)], result) |
| | result.noNA <- result[!is.na(result$sequence),] |
| | output <- list(result=result, |
| | result.noNA=result.noNA) |
| | output |
| | } |
| |
|
| | source('/share/pascal/Users/gz2294/Pipeline/uniprot.table.add.annotation.R') |
| |
|
| |
|
| |
|
| |
|
| |
|