PreMode / analysis /dnv.table.to.uniprot.R

Upload folder using huggingface_hub

7718235 verified almost 2 years ago

40 kB

	# parse dnv table to HGNC, columns:
	# Required: VarID, Score, HGNC, aaChg
	# Optional: offset
	source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R')

	# Expand rows whose annotation columns hold several `split.chr`-separated
	# effects into one row per effect.
	#
	# Args:
	#   dnv.table:     data.frame with an `AAChg` column; rows whose `AAChg`
	#                  matches `split.chr` are the ones that get expanded.
	#   njobs:         number of workers handed to makeCluster().
	#   split.chr:     separator; used as a regular expression by grep()/strsplit().
	#   cols.to.split: columns to split; names absent from the table are ignored.
	#   carno.only:    if TRUE, keep for every expanded variant only the row whose
	#                  `TransIDs` equals that variant's `TransCanon`.
	#
	# Returns:
	#   The input table unchanged when no `AAChg` contains `split.chr`; otherwise
	#   the untouched rows followed by the expanded rows.
	dnv.table.seperate.multiple.effects <- function(
	    dnv.table, njobs=42, split.chr=";",
	    cols.to.split=c("TransIDs", "TransEffs", "AAChg", "Symbol", "GeneID", "HGNC", "GeneEff"),
	    carno.only=TRUE) {
	  # expect to receive standard output from VEP
	  duplicated.effects.idx <- grep(split.chr, dnv.table$AAChg)
	  if (length(duplicated.effects.idx) == 0) {
	    # nothing to expand: no cluster is started at all
	    return(dnv.table)
	  }
	  # drop = FALSE keeps a data.frame even when the table has a single column
	  dnv.table.nodup <- dnv.table[-duplicated.effects.idx, , drop = FALSE]
	  dnv.table.dup <- dnv.table[duplicated.effects.idx, , drop = FALSE]
	  cols.to.split <- cols.to.split[cols.to.split %in% colnames(dnv.table.dup)]
	  if (length(cols.to.split) == 0) {
	    stop("none of `cols.to.split` is a column of `dnv.table`", call. = FALSE)
	  }
	  library(doParallel)
	  cl <- makeCluster(njobs)
	  # release the workers even if one of the tasks fails
	  on.exit(stopCluster(cl), add = TRUE)
	  registerDoParallel(cl)
	  dnv.table.dup.expand <- foreach (i = seq_len(nrow(dnv.table.dup)), .combine = rbind, .multicombine=TRUE) %dopar% {
	    # the first split column decides how many rows this variant expands to
	    n.effects <- length(strsplit(as.character(dnv.table.dup[[cols.to.split[1]]][i]),
	                                 split = split.chr)[[1]])
	    tmp <- dnv.table.dup[rep(i, n.effects), , drop = FALSE]
	    for (k in cols.to.split) {
	      # as.character() so factor columns can be split as well; columns with
	      # fewer pieces than `n.effects` are padded with NA
	      pieces <- strsplit(as.character(dnv.table.dup[[k]][i]), split = split.chr)[[1]]
	      tmp[[k]] <- pieces[seq_len(n.effects)]
	    }
	    if (carno.only) {
	      tmp <- tmp[match(dnv.table.dup$TransCanon[i], tmp$TransIDs), , drop = FALSE]
	    }
	    tmp
	  }
	  rbind(dnv.table.nodup, dnv.table.dup.expand)
	}


	# Map variants to UniProt sequences via their HGNC gene symbol (serial).
	#
	# Every unique gene symbol is searched (as a whole word) in the `Gene.Names`
	# column of the Swiss-Prot mapping table; genes without a hit are searched
	# again in the full UniProt mapping table, sorted by decreasing `Length`.
	# For every variant the candidate entries are tried in order with
	# parse_one_substitute() (expected to come from the sourced
	# parse.variant.wt.sequence.R) until one yields a non-NA `sequence`.
	#
	# Args:
	#   dnv.table:     data.frame of variants.
	#   VarID.column, Score.column, HGNC.column, aaChg.column:
	#                  columns of `dnv.table` holding the variant id, score,
	#                  gene symbol and amino-acid change.
	#   offset.column: optional column with a numeric per-variant offset
	#                  (0 for every variant when NA).
	#
	# Returns:
	#   list(result = all variants, result.noNA = variants with a sequence).
	dnv.table.to.uniprot.by.HGNC <- function(
	    dnv.table, VarID.column, Score.column,
	    HGNC.column, aaChg.column, offset.column=NA) {
	  if (!is.na(offset.column)) {
	    offset <- as.numeric(dnv.table[,offset.column])
	  } else {
	    offset <- rep(0, dim(dnv.table)[1])
	  }
	  result <- data.frame(VarID = dnv.table[,VarID.column],
	                       aaChg = dnv.table[,aaChg.column],
	                       uniprotID = NA,
	                       ref = NA,
	                       pos = NA,
	                       alt = NA,
	                       score = dnv.table[,Score.column],
	                       sequence = NA,
	                       wt = NA,
	                       sequence.len = NA,
	                       seq.start = NA,
	                       seq.end = NA,
	                       pos.orig = NA,
	                       sequence.orig = NA,
	                       wt.orig = NA,
	                       sequence.len.orig = NA,
	                       HGNC = dnv.table[,HGNC.column])
	  unique_genes <- as.character(unique(result$HGNC))
	  # first match reviewed IDs
	  uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/swissprot.ID.mapping.tsv')
	  uniprot_ID <- vector("list", length(unique_genes))
	  uniprot_wt.sequence <- vector("list", length(unique_genes))
	  for (i in seq_along(unique_genes)) {
	    # one grep per gene serves both the IDs and the sequences
	    hits <- grep(paste0('\\b', unique_genes[i], '\\b'), uniprot_ID.mapping$Gene.Names)
	    uniprot_ID[[i]] <- uniprot_ID.mapping$Entry[hits]
	    uniprot_wt.sequence[[i]] <- uniprot_ID.mapping$Sequence[hits]
	    # report genes that map to more than one reviewed entry
	    if (length(hits) > 1) {
	      print(unique_genes[i])
	    }
	  }
	  # next match unreviewed IDs
	  empty_ids <- which(lengths(uniprot_ID) == 0)
	  uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv')
	  # order to make sure first use longest transcript
	  uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = TRUE),]
	  for (id in empty_ids) {
	    hits <- grep(paste0('\\b', unique_genes[id], '\\b'), uniprot_ID.mapping$Gene.Names)
	    uniprot_ID[[id]] <- uniprot_ID.mapping$Entry[hits]
	    uniprot_wt.sequence[[id]] <- uniprot_ID.mapping$Sequence[hits]
	  }
	  parsed.fields <- c("ref", "pos", "alt", "sequence", "wt", "sequence.len",
	                     "seq.start", "seq.end", "pos.orig", "sequence.orig",
	                     "wt.orig", "sequence.len.orig")
	  for (i in seq_len(nrow(result))) {
	    aaChg <- result$aaChg[i]
	    gene.idx <- match(result$HGNC[i], unique_genes)
	    uniprot_IDs <- uniprot_ID[[gene.idx]]
	    wt.sequences <- uniprot_wt.sequence[[gene.idx]]
	    j <- 1
	    # try the candidates in order until one produces a sequence
	    while (is.na(result$sequence[i]) && j <= length(uniprot_IDs)) {
	      result$uniprotID[i] <- uniprot_IDs[j]
	      # parse variant; the offset of THIS variant, not the whole vector
	      substitute_res <- parse_one_substitute(aaChg, wt.sequences[j], offset[i])
	      for (field in parsed.fields) {
	        result[[field]][i] <- substitute_res[[field]]
	      }
	      j <- j + 1
	    }
	  }
	  # drop = FALSE keeps the name of a single remaining input column
	  extra.cols <- !colnames(dnv.table) %in% colnames(result)
	  result <- dplyr::bind_cols(dnv.table[, extra.cols, drop = FALSE], result)
	  result.noNA <- result[!is.na(result$sequence),]
	  list(result=result,
	       result.noNA=result.noNA)
	}


	# Map variants to UniProt sequences via a UniProt accession (serial).
	#
	# Accessions containing "-" are downloaded as FASTA from rest.uniprot.org;
	# all others are looked up in the `Entry` column of the local UniProt
	# mapping table. The sequence is handed to parse_one_substitute() (expected
	# to come from the sourced parse.variant.wt.sequence.R).
	#
	# Args:
	#   dnv.table:     data.frame of variants.
	#   VarID.column, Score.column, uniprotID.column, aaChg.column:
	#                  columns of `dnv.table` holding the variant id, score,
	#                  UniProt accession and amino-acid change.
	#   offset.column: optional column with a numeric per-variant offset
	#                  (0 for every variant when NA).
	#
	# Returns:
	#   list(result = all variants, result.noNA = variants with a sequence).
	dnv.table.to.uniprot.by.uniprotID <- function(
	    dnv.table, VarID.column, Score.column,
	    uniprotID.column, aaChg.column, offset.column=NA) {
	  if (!is.na(offset.column)) {
	    offset <- as.numeric(dnv.table[,offset.column])
	  } else {
	    offset <- rep(0, dim(dnv.table)[1])
	  }
	  result <- data.frame(VarID = dnv.table[,VarID.column],
	                       aaChg = dnv.table[,aaChg.column],
	                       uniprotID = dnv.table[,uniprotID.column],
	                       ref = NA,
	                       pos = NA,
	                       alt = NA,
	                       score = dnv.table[,Score.column],
	                       sequence = NA,
	                       wt = NA,
	                       sequence.len = NA,
	                       seq.start = NA,
	                       seq.end = NA,
	                       pos.orig = NA,
	                       sequence.orig = NA,
	                       wt.orig = NA,
	                       sequence.len.orig = NA,
	                       HGNC = NA)
	  uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv')
	  # order to make sure first use longest transcript
	  uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = TRUE),]
	  parsed.fields <- c("ref", "pos", "alt", "sequence", "wt", "sequence.len",
	                     "seq.start", "seq.end", "pos.orig", "sequence.orig",
	                     "wt.orig", "sequence.len.orig")
	  for (i in seq_len(nrow(result))) {
	    aaChg <- result$aaChg[i]
	    uniprot_IDs <- result$uniprotID[i]
	    if (grepl("-", uniprot_IDs)) {
	      url.request <- paste0('https://rest.uniprot.org/uniprotkb/', uniprot_IDs, '.fasta')
	      txt <- strsplit(RCurl::getURL(url.request), split = '\n')[[1]]
	      # a usable FASTA reply is a '>' header line followed by sequence lines;
	      # anything else (empty reply, error page) counts as "no sequence"
	      if (length(txt) >= 2 && startsWith(txt[1], ">")) {
	        wt.sequences <- paste(txt[-1], collapse = '')
	      } else {
	        wt.sequences <- NA
	      }
	    } else {
	      wt.sequences <- uniprot_ID.mapping$Sequence[match(uniprot_IDs, uniprot_ID.mapping$Entry)]
	    }
	    j <- 1
	    while (is.na(result$sequence[i]) && j <= length(uniprot_IDs)) {
	      # parse variant
	      substitute_res <- parse_one_substitute(aaChg, wt.sequences[j], offset[i])
	      for (field in parsed.fields) {
	        result[[field]][i] <- substitute_res[[field]]
	      }
	      j <- j + 1
	    }
	  }
	  # drop = FALSE keeps the name of a single remaining input column
	  extra.cols <- !colnames(dnv.table) %in% colnames(result)
	  result <- dplyr::bind_cols(dnv.table[, extra.cols, drop = FALSE], result)
	  result.noNA <- result[!is.na(result$sequence),]
	  list(result=result,
	       result.noNA=result.noNA)
	}


	# Map variants to UniProt sequences via their HGNC gene symbol (parallel).
	#
	# Every unique gene symbol is searched (as a whole word) in the `Gene.Names`
	# column of the Swiss-Prot mapping table; genes without a hit are searched
	# again in the full UniProt mapping table. Both tables are sorted by
	# decreasing `Length` first. For every variant the candidate entries are
	# tried in order with parse_one_substitute() (sourced on the workers from
	# parse.variant.wt.sequence.R) until one yields a non-NA `sequence`.
	#
	# Args:
	#   dnv.table:     data.frame of variants.
	#   VarID.column, Score.column, HGNC.column, aaChg.column:
	#                  columns of `dnv.table` holding the variant id, score,
	#                  gene symbol and amino-acid change.
	#   offset.column: optional column with a numeric per-variant offset
	#                  (0 for every variant when NA).
	#   njobs:         number of workers handed to makeCluster().
	#
	# Returns:
	#   list(result = all variants, result.noNA = variants with a sequence).
	dnv.table.to.uniprot.by.HGNC.parallel <- function(
	    dnv.table, VarID.column, Score.column,
	    HGNC.column, aaChg.column, offset.column=NA, njobs=42) {
	  library(doParallel)
	  cl <- makeCluster(njobs)
	  # release the workers even if one of the tasks fails
	  on.exit(stopCluster(cl), add = TRUE)
	  registerDoParallel(cl)
	  if (!is.na(offset.column)) {
	    offset <- as.numeric(dnv.table[,offset.column])
	  } else {
	    offset <- rep(0, dim(dnv.table)[1])
	  }
	  prompt <- data.frame(VarID = dnv.table[,VarID.column],
	                       aaChg = dnv.table[,aaChg.column],
	                       score = dnv.table[,Score.column],
	                       HGNC = dnv.table[,HGNC.column])
	  unique_genes <- as.character(unique(prompt$HGNC))
	  uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/swissprot.ID.mapping.tsv')
	  # order to make sure first use longest transcript
	  uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = TRUE),]
	  # first match reviewed IDs; one grep per gene yields IDs and sequences
	  reviewed <- foreach (i = seq_along(unique_genes)) %dopar% {
	    hits <- grep(paste0('\\b', unique_genes[i], '\\b'), uniprot_ID.mapping$Gene.Names)
	    list(ids = uniprot_ID.mapping$Entry[hits],
	         seqs = uniprot_ID.mapping$Sequence[hits])
	  }
	  uniprot_ID <- lapply(reviewed, `[[`, "ids")
	  uniprot_wt.sequence <- lapply(reviewed, `[[`, "seqs")
	  # report genes that map to more than one reviewed entry
	  for (gene in unique_genes[lengths(uniprot_ID) > 1]) {
	    print(gene)
	  }
	  # next match unreviewed IDs
	  empty_ids <- which(lengths(uniprot_ID) == 0)
	  uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv')
	  # order to make sure first use longest transcript
	  uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = TRUE),]
	  for (id in empty_ids) {
	    hits <- grep(paste0('\\b', unique_genes[id], '\\b'), uniprot_ID.mapping$Gene.Names)
	    uniprot_ID[[id]] <- uniprot_ID.mapping$Entry[hits]
	    uniprot_wt.sequence[[id]] <- uniprot_ID.mapping$Sequence[hits]
	  }
	  result <- foreach (i = seq_len(nrow(prompt)), .combine = rbind, .multicombine=TRUE) %dopar% {
	    source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R')
	    substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA,
	                           sequence = NA, sequence.len = NA,
	                           seq.start = NA, seq.end = NA,
	                           pos.orig = NA, sequence.orig = NA,
	                           wt.orig = NA, sequence.len.orig = NA)
	    aaChg <- prompt$aaChg[i]
	    gene.idx <- match(prompt$HGNC[i], unique_genes)
	    uniprot_IDs <- uniprot_ID[[gene.idx]]
	    wt.sequences <- uniprot_wt.sequence[[gene.idx]]
	    # stays NA when the gene has no candidate entry at all
	    matched_uniprot_ID <- NA
	    j <- 1
	    while (is.na(substitute_res$sequence) && j <= length(uniprot_IDs)) {
	      matched_uniprot_ID <- uniprot_IDs[j]
	      # parse variant
	      substitute_res <- parse_one_substitute(aaChg, wt.sequences[j], offset[i])
	      j <- j + 1
	    }
	    data.frame(VarID = prompt$VarID[i],
	               aaChg = prompt$aaChg[i],
	               uniprotID = matched_uniprot_ID,
	               ref = substitute_res$ref,
	               pos = substitute_res$pos,
	               alt = substitute_res$alt,
	               score = prompt$score[i],
	               sequence = substitute_res$sequence,
	               wt = substitute_res$wt,
	               sequence.len = substitute_res$sequence.len,
	               seq.start = substitute_res$seq.start,
	               seq.end = substitute_res$seq.end,
	               pos.orig = substitute_res$pos.orig,
	               sequence.orig = substitute_res$sequence.orig,
	               wt.orig = substitute_res$wt.orig,
	               sequence.len.orig = substitute_res$sequence.len.orig,
	               HGNC = prompt$HGNC[i])
	  }
	  # drop = FALSE keeps the name of a single remaining input column
	  extra.cols <- !colnames(dnv.table) %in% colnames(result)
	  result <- dplyr::bind_cols(dnv.table[, extra.cols, drop = FALSE], result)
	  result.noNA <- result[!is.na(result$sequence),]
	  list(result=result,
	       result.noNA=result.noNA)
	}


	# Map variants to UniProt sequences via their HGNC gene symbol, preferring
	# the AlphaFold2 mapping table (parallel).
	#
	# For every unique gene symbol the `uniprotID` and `seq` of the row of the
	# af2 table whose `HGNC` equals the symbol are used. Whichever of the two
	# is NA is filled from a whole-word search of the symbol in the
	# `Gene.Names` column of the Swiss-Prot mapping table; genes that still
	# have no ID are searched in the full UniProt mapping table. For every
	# variant the candidates are tried in order with parse_one_substitute()
	# (sourced on the workers from parse.variant.wt.sequence.R) until one
	# yields a non-NA `sequence`.
	#
	# Args:
	#   dnv.table:     data.frame of variants.
	#   VarID.column, Score.column, HGNC.column, aaChg.column:
	#                  columns of `dnv.table` holding the variant id, score,
	#                  gene symbol and amino-acid change.
	#   offset.column: optional column with a numeric per-variant offset
	#                  (0 for every variant when NA).
	#   njobs:         number of workers handed to makeCluster().
	#
	# Returns:
	#   list(result = all variants, result.noNA = variants with a sequence).
	dnv.table.to.uniprot.by.HGNC.af2.parallel <- function(
	    dnv.table, VarID.column, Score.column,
	    HGNC.column, aaChg.column, offset.column=NA, njobs=42) {
	  library(doParallel)
	  cl <- makeCluster(njobs)
	  # release the workers even if one of the tasks fails
	  on.exit(stopCluster(cl), add = TRUE)
	  registerDoParallel(cl)
	  if (!is.na(offset.column)) {
	    offset <- as.numeric(dnv.table[,offset.column])
	  } else {
	    offset <- rep(0, dim(dnv.table)[1])
	  }
	  prompt <- data.frame(VarID = dnv.table[,VarID.column],
	                       aaChg = dnv.table[,aaChg.column],
	                       score = dnv.table[,Score.column],
	                       HGNC = dnv.table[,HGNC.column])
	  unique_genes <- as.character(unique(prompt$HGNC))
	  af2.mapping <- read.csv('/share/pascal/Users/gz2294/Data/af2_uniprot/swissprot_and_human.csv', row.names = 1)
	  uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/swissprot.ID.mapping.tsv')
	  # order to make sure first use longest transcript
	  uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = TRUE),]
	  # first match af2 / reviewed IDs
	  reviewed <- foreach (i = seq_along(unique_genes)) %dopar% {
	    af2.row <- match(unique_genes[i], af2.mapping$HGNC)
	    ids <- af2.mapping$uniprotID[af2.row]
	    seqs <- af2.mapping$seq[af2.row]
	    if (is.na(ids) || is.na(seqs)) {
	      # fall back to Swiss-Prot for whatever the af2 table does not provide
	      hits <- grep(paste0('\\b', unique_genes[i], '\\b'), uniprot_ID.mapping$Gene.Names)
	      if (is.na(seqs)) {
	        seqs <- uniprot_ID.mapping$Sequence[hits]
	      }
	      if (is.na(ids)) {
	        ids <- uniprot_ID.mapping$Entry[hits]
	      }
	    }
	    list(ids = ids, seqs = seqs)
	  }
	  uniprot_ID <- lapply(reviewed, `[[`, "ids")
	  uniprot_wt.sequence <- lapply(reviewed, `[[`, "seqs")
	  # report genes that map to more than one reviewed entry (only the
	  # Swiss-Prot fallback can return several)
	  for (gene in unique_genes[lengths(uniprot_ID) > 1]) {
	    print(gene)
	  }
	  # next match unreviewed IDs
	  empty_ids <- which(lengths(uniprot_ID) == 0)
	  uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv')
	  # order to make sure first use longest transcript
	  uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = TRUE),]
	  for (id in empty_ids) {
	    hits <- grep(paste0('\\b', unique_genes[id], '\\b'), uniprot_ID.mapping$Gene.Names)
	    uniprot_ID[[id]] <- uniprot_ID.mapping$Entry[hits]
	    uniprot_wt.sequence[[id]] <- uniprot_ID.mapping$Sequence[hits]
	  }
	  result <- foreach (i = seq_len(nrow(prompt)), .combine = rbind, .multicombine=TRUE) %dopar% {
	    source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R')
	    substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA,
	                           sequence = NA, sequence.len = NA,
	                           seq.start = NA, seq.end = NA,
	                           pos.orig = NA, sequence.orig = NA,
	                           wt.orig = NA, sequence.len.orig = NA)
	    aaChg <- prompt$aaChg[i]
	    gene.idx <- match(prompt$HGNC[i], unique_genes)
	    uniprot_IDs <- uniprot_ID[[gene.idx]]
	    wt.sequences <- uniprot_wt.sequence[[gene.idx]]
	    # stays NA when the gene has no candidate entry at all
	    matched_uniprot_ID <- NA
	    j <- 1
	    while (is.na(substitute_res$sequence) && j <= length(uniprot_IDs)) {
	      matched_uniprot_ID <- uniprot_IDs[j]
	      # parse variant
	      substitute_res <- parse_one_substitute(aaChg, wt.sequences[j], offset[i])
	      j <- j + 1
	    }
	    data.frame(VarID = prompt$VarID[i],
	               aaChg = prompt$aaChg[i],
	               uniprotID = matched_uniprot_ID,
	               ref = substitute_res$ref,
	               pos = substitute_res$pos,
	               alt = substitute_res$alt,
	               score = prompt$score[i],
	               sequence = substitute_res$sequence,
	               wt = substitute_res$wt,
	               sequence.len = substitute_res$sequence.len,
	               seq.start = substitute_res$seq.start,
	               seq.end = substitute_res$seq.end,
	               pos.orig = substitute_res$pos.orig,
	               sequence.orig = substitute_res$sequence.orig,
	               wt.orig = substitute_res$wt.orig,
	               sequence.len.orig = substitute_res$sequence.len.orig,
	               HGNC = prompt$HGNC[i])
	  }
	  # drop = FALSE keeps the name of a single remaining input column
	  extra.cols <- !colnames(dnv.table) %in% colnames(result)
	  result <- dplyr::bind_cols(dnv.table[, extra.cols, drop = FALSE], result)
	  result.noNA <- result[!is.na(result$sequence),]
	  list(result=result,
	       result.noNA=result.noNA)
	}



	# Map variants to UniProt sequences via a UniProt accession (parallel).
	#
	# Accessions containing "-" are downloaded as FASTA from rest.uniprot.org;
	# all others are looked up in the `Entry` column of the local UniProt
	# mapping table. With `match.length = TRUE` a sequence whose length differs
	# from the value in `length.column` is discarded (set to NA). The sequence
	# is handed to parse_one_substitute() (sourced on the workers from
	# parse.variant.wt.sequence.R).
	#
	# Args:
	#   dnv.table:     data.frame of variants.
	#   VarID.column, Score.column, uniprotID.column, aaChg.column:
	#                  columns of `dnv.table` holding the variant id, score,
	#                  UniProt accession and amino-acid change.
	#   offset.column: optional column with a numeric per-variant offset
	#                  (0 for every variant when NA).
	#   length.column: column with the expected sequence length; required when
	#                  `match.length` is TRUE.
	#   match.length:  whether to enforce the expected sequence length.
	#   njobs:         number of workers handed to makeCluster().
	#
	# Returns:
	#   list(result = all variants, result.noNA = variants with a sequence).
	dnv.table.to.uniprot.by.uniprotID.parallel <- function(
	    dnv.table, VarID.column, Score.column,
	    uniprotID.column, aaChg.column, offset.column=NA, length.column=NA,
	    match.length=FALSE, njobs=42) {
	  if (match.length && is.na(length.column)) {
	    stop("`length.column` must be given when `match.length` is TRUE", call. = FALSE)
	  }
	  library(doParallel)
	  cl <- makeCluster(njobs)
	  # release the workers even if one of the tasks fails
	  on.exit(stopCluster(cl), add = TRUE)
	  registerDoParallel(cl)
	  if (!is.na(offset.column)) {
	    offset <- as.numeric(dnv.table[,offset.column])
	  } else {
	    offset <- rep(0, dim(dnv.table)[1])
	  }
	  prompt <- data.frame(VarID = dnv.table[,VarID.column],
	                       aaChg = dnv.table[,aaChg.column],
	                       score = dnv.table[,Score.column],
	                       uniprotID = dnv.table[,uniprotID.column])
	  if (match.length) {
	    prompt$seq.match.length <- dnv.table[,length.column]
	  }
	  uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv')
	  # order to make sure first use longest transcript
	  uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = TRUE),]
	  result <- foreach (i = seq_len(nrow(prompt)), .combine = rbind, .multicombine=TRUE) %dopar% {
	    source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R')
	    substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA,
	                           sequence = NA, sequence.len = NA,
	                           seq.start = NA, seq.end = NA,
	                           pos.orig = NA, sequence.orig = NA,
	                           wt.orig = NA, sequence.len.orig = NA)
	    aaChg <- prompt$aaChg[i]
	    uniprot_IDs <- prompt$uniprotID[i]
	    if (grepl("-", uniprot_IDs)) {
	      url.request <- paste0('https://rest.uniprot.org/uniprotkb/', uniprot_IDs, '.fasta')
	      txt <- strsplit(RCurl::getURL(url.request), split = '\n')[[1]]
	      # a usable FASTA reply is a '>' header line followed by sequence lines;
	      # anything else (empty reply, error page) counts as "no sequence"
	      if (length(txt) >= 2 && startsWith(txt[1], ">")) {
	        wt.sequences <- paste(txt[-1], collapse = '')
	      } else {
	        wt.sequences <- NA
	      }
	    } else {
	      wt.sequences <- uniprot_ID.mapping$Sequence[match(uniprot_IDs, uniprot_ID.mapping$Entry)]
	    }
	    if (match.length) {
	      if (!is.na(wt.sequences) && nchar(wt.sequences) != prompt$seq.match.length[i]) {
	        wt.sequences <- NA
	      }
	    }
	    matched_uniprot_ID <- NA
	    j <- 1
	    while (is.na(substitute_res$sequence) && j <= length(uniprot_IDs)) {
	      matched_uniprot_ID <- uniprot_IDs[j]
	      # parse variant
	      substitute_res <- parse_one_substitute(aaChg, wt.sequences[j], offset[i])
	      j <- j + 1
	    }
	    data.frame(VarID = prompt$VarID[i],
	               aaChg = prompt$aaChg[i],
	               uniprotID = matched_uniprot_ID,
	               ref = substitute_res$ref,
	               pos = substitute_res$pos,
	               alt = substitute_res$alt,
	               score = prompt$score[i],
	               sequence = substitute_res$sequence,
	               wt = substitute_res$wt,
	               sequence.len = substitute_res$sequence.len,
	               seq.start = substitute_res$seq.start,
	               seq.end = substitute_res$seq.end,
	               pos.orig = substitute_res$pos.orig,
	               sequence.orig = substitute_res$sequence.orig,
	               wt.orig = substitute_res$wt.orig,
	               sequence.len.orig = substitute_res$sequence.len.orig,
	               HGNC = NA)
	  }
	  # drop = FALSE keeps the name of a single remaining input column
	  extra.cols <- !colnames(dnv.table) %in% colnames(result)
	  result <- dplyr::bind_cols(dnv.table[, extra.cols, drop = FALSE], result)
	  result.noNA <- result[!is.na(result$sequence),]
	  list(result=result,
	       result.noNA=result.noNA)
	}


	# Map variants to sequences via a UniProt accession, using the AlphaFold2
	# mapping table as the local sequence source (parallel).
	#
	# Accessions that contain "-" or are absent from the `uniprotID` column of
	# the af2 table are downloaded as FASTA from rest.uniprot.org; all others
	# take the `seq` of the matching af2 row. With `match.length = TRUE` a
	# sequence whose length differs from the value in `length.column` is
	# discarded (set to NA). The sequence is handed to parse_one_substitute()
	# (sourced on the workers from parse.variant.wt.sequence.R).
	#
	# Args:
	#   dnv.table:     data.frame of variants.
	#   VarID.column, Score.column, uniprotID.column, aaChg.column:
	#                  columns of `dnv.table` holding the variant id, score,
	#                  UniProt accession and amino-acid change.
	#   offset.column: optional column with a numeric per-variant offset
	#                  (0 for every variant when NA).
	#   length.column: column with the expected sequence length; required when
	#                  `match.length` is TRUE.
	#   match.length:  whether to enforce the expected sequence length.
	#   njobs:         number of workers handed to makeCluster().
	#
	# Returns:
	#   list(result = all variants, result.noNA = variants with a sequence).
	#   Unlike the sibling functions, no HGNC column is added.
	dnv.table.to.uniprot.by.af2.uniprotID.parallel <- function(
	    dnv.table, VarID.column, Score.column,
	    uniprotID.column, aaChg.column, offset.column=NA, length.column=NA,
	    match.length=FALSE, njobs=96) {
	  if (match.length && is.na(length.column)) {
	    stop("`length.column` must be given when `match.length` is TRUE", call. = FALSE)
	  }
	  library(doParallel)
	  cl <- makeCluster(njobs)
	  # release the workers even if one of the tasks fails
	  on.exit(stopCluster(cl), add = TRUE)
	  registerDoParallel(cl)
	  if (!is.na(offset.column)) {
	    offset <- as.numeric(dnv.table[,offset.column])
	  } else {
	    offset <- rep(0, dim(dnv.table)[1])
	  }
	  prompt <- data.frame(VarID = dnv.table[,VarID.column],
	                       aaChg = dnv.table[,aaChg.column],
	                       score = dnv.table[,Score.column],
	                       uniprotID = dnv.table[,uniprotID.column])
	  if (match.length) {
	    prompt$seq.match.length <- dnv.table[,length.column]
	  }
	  uniprot_ID.mapping <- read.csv('/share/pascal/Users/gz2294/Data/af2_uniprot/swissprot_and_human.csv')
	  result <- foreach (i = seq_len(nrow(prompt)), .combine = rbind, .multicombine=TRUE) %dopar% {
	    source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R')
	    substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA,
	                           sequence = NA, sequence.len = NA,
	                           seq.start = NA, seq.end = NA,
	                           pos.orig = NA, sequence.orig = NA,
	                           wt.orig = NA, sequence.len.orig = NA)
	    aaChg <- prompt$aaChg[i]
	    uniprot_IDs <- prompt$uniprotID[i]
	    # isoforms and accessions unknown to the af2 table are fetched online
	    if (grepl("-", uniprot_IDs) || !uniprot_IDs %in% uniprot_ID.mapping$uniprotID) {
	      url.request <- paste0('https://rest.uniprot.org/uniprotkb/', uniprot_IDs, '.fasta')
	      txt <- strsplit(RCurl::getURL(url.request), split = '\n')[[1]]
	      # a usable FASTA reply is a '>' header line followed by sequence lines;
	      # anything else (empty reply, error page) counts as "no sequence"
	      if (length(txt) >= 2 && startsWith(txt[1], ">")) {
	        wt.sequences <- paste(txt[-1], collapse = '')
	      } else {
	        wt.sequences <- NA
	      }
	    } else {
	      wt.sequences <- uniprot_ID.mapping$seq[match(uniprot_IDs, uniprot_ID.mapping$uniprotID)]
	    }
	    if (match.length) {
	      if (!is.na(wt.sequences) && nchar(wt.sequences) != prompt$seq.match.length[i]) {
	        wt.sequences <- NA
	      }
	    }
	    matched_uniprot_ID <- NA
	    j <- 1
	    while (is.na(substitute_res$sequence) && j <= length(uniprot_IDs)) {
	      matched_uniprot_ID <- uniprot_IDs[j]
	      # parse variant
	      substitute_res <- parse_one_substitute(aaChg, wt.sequences[j], offset[i])
	      j <- j + 1
	    }
	    data.frame(VarID = prompt$VarID[i],
	               aaChg = prompt$aaChg[i],
	               uniprotID = matched_uniprot_ID,
	               ref = substitute_res$ref,
	               pos = substitute_res$pos,
	               alt = substitute_res$alt,
	               score = prompt$score[i],
	               sequence = substitute_res$sequence,
	               wt = substitute_res$wt,
	               sequence.len = substitute_res$sequence.len,
	               seq.start = substitute_res$seq.start,
	               seq.end = substitute_res$seq.end,
	               pos.orig = substitute_res$pos.orig,
	               sequence.orig = substitute_res$sequence.orig,
	               wt.orig = substitute_res$wt.orig,
	               sequence.len.orig = substitute_res$sequence.len.orig)
	  }
	  # drop = FALSE keeps the name of a single remaining input column
	  extra.cols <- !colnames(dnv.table) %in% colnames(result)
	  result <- dplyr::bind_cols(dnv.table[, extra.cols, drop = FALSE], result)
	  result.noNA <- result[!is.na(result$sequence),]
	  list(result=result,
	       result.noNA=result.noNA)
	}


	# Map variants to UniProt sequences via an Ensembl gene ID (parallel).
	#
	# Every unique Ensembl ID is searched (as a whole word) in the
	# `ensembl_gene_id` column of the Ensembl mapping table and the matching
	# `uniprot_gn_id` values become the candidate entries. Their sequences are
	# taken from the local UniProt mapping table. IDs without any candidate
	# are searched in the `Gene.Names` column of the UniProt table instead.
	# For every variant the candidates are tried in order with
	# parse_one_substitute() (sourced on the workers from
	# parse.variant.wt.sequence.R) until one yields a non-NA `sequence`.
	#
	# Args:
	#   dnv.table:     data.frame of variants.
	#   VarID.column, Score.column, ensembl.column, aaChg.column:
	#                  columns of `dnv.table` holding the variant id, score,
	#                  Ensembl ID and amino-acid change.
	#   offset.column: optional column with a numeric per-variant offset
	#                  (0 for every variant when NA).
	#   njobs:         number of workers handed to makeCluster().
	#
	# Returns:
	#   list(result = all variants, result.noNA = variants with a sequence).
	dnv.table.to.uniprot.by.ensembl_geneID.parallel <- function(
	    dnv.table, VarID.column, Score.column,
	    ensembl.column, aaChg.column, offset.column=NA, njobs=42) {
	  library(doParallel)
	  cl <- makeCluster(njobs)
	  # release the workers even if one of the tasks fails
	  on.exit(stopCluster(cl), add = TRUE)
	  registerDoParallel(cl)
	  if (!is.na(offset.column)) {
	    offset <- as.numeric(dnv.table[,offset.column])
	  } else {
	    offset <- rep(0, dim(dnv.table)[1])
	  }
	  prompt <- data.frame(VarID = dnv.table[,VarID.column],
	                       aaChg = dnv.table[,aaChg.column],
	                       score = dnv.table[,Score.column],
	                       ensembl = dnv.table[,ensembl.column])
	  unique_genes <- as.character(unique(prompt$ensembl))
	  ensembl.mapping <- read.csv('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/ensembl.uniprot.ID.mapping.csv')
	  uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv')
	  # order to make sure first use longest transcript
	  uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = TRUE),]
	  # first match the IDs listed in the Ensembl mapping
	  uniprot_ID <- foreach (i = seq_along(unique_genes)) %dopar% {
	    ensembl.mapping$uniprot_gn_id[grep(paste0('\\b', unique_genes[i], '\\b'),
	                                       ensembl.mapping$ensembl_gene_id)]
	  }
	  # report Ensembl IDs that map to more than one UniProt entry
	  for (gene in unique_genes[lengths(uniprot_ID) > 1]) {
	    print(gene)
	  }
	  # One sequence per candidate ID, in the same order, so that
	  # uniprot_IDs[j] and wt.sequences[j] always belong together.
	  # NOTE(review): assumes `uniprot_gn_id` holds UniProt accessions that
	  # correspond to the `Entry` column - confirm against the mapping files.
	  uniprot_wt.sequence <- lapply(uniprot_ID, function(ids) {
	    uniprot_ID.mapping$Sequence[match(ids, uniprot_ID.mapping$Entry)]
	  })
	  # next match unreviewed IDs
	  empty_ids <- which(lengths(uniprot_ID) == 0)
	  for (id in empty_ids) {
	    hits <- grep(paste0('\\b', unique_genes[id], '\\b'), uniprot_ID.mapping$Gene.Names)
	    uniprot_ID[[id]] <- uniprot_ID.mapping$Entry[hits]
	    uniprot_wt.sequence[[id]] <- uniprot_ID.mapping$Sequence[hits]
	  }
	  result <- foreach (i = seq_len(nrow(prompt)), .combine = rbind, .multicombine=TRUE) %dopar% {
	    source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R')
	    substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA,
	                           sequence = NA, sequence.len = NA,
	                           seq.start = NA, seq.end = NA,
	                           pos.orig = NA, sequence.orig = NA,
	                           wt.orig = NA, sequence.len.orig = NA)
	    aaChg <- prompt$aaChg[i]
	    gene.idx <- match(prompt$ensembl[i], unique_genes)
	    uniprot_IDs <- uniprot_ID[[gene.idx]]
	    wt.sequences <- uniprot_wt.sequence[[gene.idx]]
	    # stays NA when the ID has no candidate entry at all
	    matched_uniprot_ID <- NA
	    j <- 1
	    while (is.na(substitute_res$sequence) && j <= length(uniprot_IDs)) {
	      matched_uniprot_ID <- uniprot_IDs[j]
	      # parse variant
	      substitute_res <- parse_one_substitute(aaChg, wt.sequences[j], offset[i])
	      j <- j + 1
	    }
	    data.frame(VarID = prompt$VarID[i],
	               aaChg = prompt$aaChg[i],
	               uniprotID = matched_uniprot_ID,
	               ref = substitute_res$ref,
	               pos = substitute_res$pos,
	               alt = substitute_res$alt,
	               score = prompt$score[i],
	               sequence = substitute_res$sequence,
	               wt = substitute_res$wt,
	               sequence.len = substitute_res$sequence.len,
	               seq.start = substitute_res$seq.start,
	               seq.end = substitute_res$seq.end,
	               pos.orig = substitute_res$pos.orig,
	               sequence.orig = substitute_res$sequence.orig,
	               wt.orig = substitute_res$wt.orig,
	               sequence.len.orig = substitute_res$sequence.len.orig,
	               ensembl = prompt$ensembl[i])
	  }
	  # drop = FALSE keeps the name of a single remaining input column
	  extra.cols <- !colnames(dnv.table) %in% colnames(result)
	  result <- dplyr::bind_cols(dnv.table[, extra.cols, drop = FALSE], result)
	  result.noNA <- result[!is.na(result$sequence),]
	  list(result=result,
	       result.noNA=result.noNA)
	}


	# Map variants to UniProt sequences via an Ensembl ID (parallel).
	#
	# Every unique Ensembl ID is searched (as a whole word) in the
	# `ensembl_gene_id` column of the Ensembl mapping table and the matching
	# `uniprot_gn_id` values become the candidate entries. Their sequences are
	# taken from the local UniProt mapping table. IDs without any candidate
	# are searched in the `Gene.Names` column of the UniProt table instead.
	# For every variant the candidates are tried in order with
	# parse_one_substitute() (sourced on the workers from
	# parse.variant.wt.sequence.R) until one yields a non-NA `sequence`.
	#
	# NOTE(review): despite the function name, the lookup uses the
	# `ensembl_gene_id` column - confirm whether a transcript-ID column of the
	# mapping file was intended.
	#
	# Args:
	#   dnv.table:     data.frame of variants.
	#   VarID.column, Score.column, ensembl.column, aaChg.column:
	#                  columns of `dnv.table` holding the variant id, score,
	#                  Ensembl ID and amino-acid change.
	#   offset.column: optional column with a numeric per-variant offset
	#                  (0 for every variant when NA).
	#   njobs:         number of workers handed to makeCluster().
	#
	# Returns:
	#   list(result = all variants, result.noNA = variants with a sequence).
	dnv.table.to.uniprot.by.ensembl_transcriptID.parallel <- function(
	    dnv.table, VarID.column, Score.column,
	    ensembl.column, aaChg.column, offset.column=NA, njobs=42) {
	  library(doParallel)
	  cl <- makeCluster(njobs)
	  # release the workers even if one of the tasks fails
	  on.exit(stopCluster(cl), add = TRUE)
	  registerDoParallel(cl)
	  if (!is.na(offset.column)) {
	    offset <- as.numeric(dnv.table[,offset.column])
	  } else {
	    offset <- rep(0, dim(dnv.table)[1])
	  }
	  prompt <- data.frame(VarID = dnv.table[,VarID.column],
	                       aaChg = dnv.table[,aaChg.column],
	                       score = dnv.table[,Score.column],
	                       ensembl = dnv.table[,ensembl.column])
	  unique_genes <- as.character(unique(prompt$ensembl))
	  ensembl.mapping <- read.csv('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/ensembl.uniprot.ID.mapping.csv')
	  uniprot_ID.mapping <- read.delim('/share/pascal/Users/gz2294/Data/Protein/uniprot.ID/uniprot.ID.mapping.tsv')
	  # order to make sure first use longest transcript
	  uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = TRUE),]
	  # first match the IDs listed in the Ensembl mapping
	  uniprot_ID <- foreach (i = seq_along(unique_genes)) %dopar% {
	    ensembl.mapping$uniprot_gn_id[grep(paste0('\\b', unique_genes[i], '\\b'),
	                                       ensembl.mapping$ensembl_gene_id)]
	  }
	  # report Ensembl IDs that map to more than one UniProt entry
	  for (gene in unique_genes[lengths(uniprot_ID) > 1]) {
	    print(gene)
	  }
	  # One sequence per candidate ID, in the same order, so that
	  # uniprot_IDs[j] and wt.sequences[j] always belong together.
	  # NOTE(review): assumes `uniprot_gn_id` holds UniProt accessions that
	  # correspond to the `Entry` column - confirm against the mapping files.
	  uniprot_wt.sequence <- lapply(uniprot_ID, function(ids) {
	    uniprot_ID.mapping$Sequence[match(ids, uniprot_ID.mapping$Entry)]
	  })
	  # next match unreviewed IDs
	  empty_ids <- which(lengths(uniprot_ID) == 0)
	  for (id in empty_ids) {
	    hits <- grep(paste0('\\b', unique_genes[id], '\\b'), uniprot_ID.mapping$Gene.Names)
	    uniprot_ID[[id]] <- uniprot_ID.mapping$Entry[hits]
	    uniprot_wt.sequence[[id]] <- uniprot_ID.mapping$Sequence[hits]
	  }
	  result <- foreach (i = seq_len(nrow(prompt)), .combine = rbind, .multicombine=TRUE) %dopar% {
	    source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R')
	    substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA,
	                           sequence = NA, sequence.len = NA,
	                           seq.start = NA, seq.end = NA,
	                           pos.orig = NA, sequence.orig = NA,
	                           wt.orig = NA, sequence.len.orig = NA)
	    aaChg <- prompt$aaChg[i]
	    gene.idx <- match(prompt$ensembl[i], unique_genes)
	    uniprot_IDs <- uniprot_ID[[gene.idx]]
	    wt.sequences <- uniprot_wt.sequence[[gene.idx]]
	    # stays NA when the ID has no candidate entry at all
	    matched_uniprot_ID <- NA
	    j <- 1
	    while (is.na(substitute_res$sequence) && j <= length(uniprot_IDs)) {
	      matched_uniprot_ID <- uniprot_IDs[j]
	      # parse variant
	      substitute_res <- parse_one_substitute(aaChg, wt.sequences[j], offset[i])
	      j <- j + 1
	    }
	    data.frame(VarID = prompt$VarID[i],
	               aaChg = prompt$aaChg[i],
	               uniprotID = matched_uniprot_ID,
	               ref = substitute_res$ref,
	               pos = substitute_res$pos,
	               alt = substitute_res$alt,
	               score = prompt$score[i],
	               sequence = substitute_res$sequence,
	               wt = substitute_res$wt,
	               sequence.len = substitute_res$sequence.len,
	               seq.start = substitute_res$seq.start,
	               seq.end = substitute_res$seq.end,
	               pos.orig = substitute_res$pos.orig,
	               sequence.orig = substitute_res$sequence.orig,
	               wt.orig = substitute_res$wt.orig,
	               sequence.len.orig = substitute_res$sequence.len.orig,
	               ensembl = prompt$ensembl[i])
	  }
	  # drop = FALSE keeps the name of a single remaining input column
	  extra.cols <- !colnames(dnv.table) %in% colnames(result)
	  result <- dplyr::bind_cols(dnv.table[, extra.cols, drop = FALSE], result)
	  result.noNA <- result[!is.na(result$sequence),]
	  list(result=result,
	       result.noNA=result.noNA)
	}


	# Map variants to sequences stored in per-ID CSV files (parallel).
	#
	# For every unique Ensembl ID the file `<ensembl.dir><ID>.csv` is read
	# (first column as row names) and its `X0` column supplies the sequence.
	# `ensembl.dir` is pasted in front of the ID as is, so it has to end with a
	# path separator. The Ensembl ID itself is reported in the `uniprotID`
	# column of the result. Each sequence is handed to parse_one_substitute()
	# (sourced on the workers from parse.variant.wt.sequence.R).
	#
	# Args:
	#   dnv.table:     data.frame of variants.
	#   VarID.column, Score.column, ensembl.column, aaChg.column:
	#                  columns of `dnv.table` holding the variant id, score,
	#                  Ensembl ID and amino-acid change.
	#   ensembl.dir:   directory prefix of the per-ID CSV files.
	#   offset.column: optional column with a numeric per-variant offset
	#                  (0 for every variant when NA).
	#   njobs:         number of workers handed to makeCluster().
	#
	# Returns:
	#   list(result = all variants, result.noNA = variants with a sequence).
	dnv.table.to.uniprot.by.ensembl_transcriptID.from.dir.parallel <- function(
	    dnv.table, VarID.column, Score.column,
	    ensembl.column, aaChg.column, ensembl.dir, offset.column=NA, njobs=42) {
	  library(doParallel)
	  cl <- makeCluster(njobs)
	  # release the workers even if one of the tasks fails
	  on.exit(stopCluster(cl), add = TRUE)
	  registerDoParallel(cl)
	  if (!is.na(offset.column)) {
	    offset <- as.numeric(dnv.table[,offset.column])
	  } else {
	    offset <- rep(0, dim(dnv.table)[1])
	  }
	  prompt <- data.frame(VarID = dnv.table[,VarID.column],
	                       aaChg = dnv.table[,aaChg.column],
	                       score = dnv.table[,Score.column],
	                       ensembl = dnv.table[,ensembl.column])
	  unique_genes <- as.character(unique(prompt$ensembl))
	  # the Ensembl ID doubles as the reported identifier
	  uniprot_ID <- as.list(unique_genes)
	  uniprot_wt.sequence <- foreach (i = seq_along(unique_genes)) %dopar% {
	    seq.table <- read.csv(paste0(ensembl.dir, unique_genes[i], '.csv'), row.names = 1)
	    seq.table$X0
	  }
	  result <- foreach (i = seq_len(nrow(prompt)), .combine = rbind, .multicombine=TRUE) %dopar% {
	    source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R')
	    substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA,
	                           sequence = NA, sequence.len = NA,
	                           seq.start = NA, seq.end = NA,
	                           pos.orig = NA, sequence.orig = NA,
	                           wt.orig = NA, sequence.len.orig = NA)
	    aaChg <- prompt$aaChg[i]
	    gene.idx <- match(prompt$ensembl[i], unique_genes)
	    uniprot_IDs <- uniprot_ID[[gene.idx]]
	    wt.sequences <- uniprot_wt.sequence[[gene.idx]]
	    matched_uniprot_ID <- NA
	    j <- 1
	    while (is.na(substitute_res$sequence) && j <= length(uniprot_IDs)) {
	      matched_uniprot_ID <- uniprot_IDs[j]
	      # parse variant
	      substitute_res <- parse_one_substitute(aaChg, wt.sequences[j], offset[i])
	      j <- j + 1
	    }
	    data.frame(VarID = prompt$VarID[i],
	               aaChg = prompt$aaChg[i],
	               uniprotID = matched_uniprot_ID,
	               ref = substitute_res$ref,
	               pos = substitute_res$pos,
	               alt = substitute_res$alt,
	               score = prompt$score[i],
	               sequence = substitute_res$sequence,
	               wt = substitute_res$wt,
	               sequence.len = substitute_res$sequence.len,
	               seq.start = substitute_res$seq.start,
	               seq.end = substitute_res$seq.end,
	               pos.orig = substitute_res$pos.orig,
	               sequence.orig = substitute_res$sequence.orig,
	               wt.orig = substitute_res$wt.orig,
	               sequence.len.orig = substitute_res$sequence.len.orig,
	               ensembl = prompt$ensembl[i])
	  }
	  # drop = FALSE keeps the name of a single remaining input column
	  extra.cols <- !colnames(dnv.table) %in% colnames(result)
	  result <- dplyr::bind_cols(dnv.table[, extra.cols, drop = FALSE], result)
	  result.noNA <- result[!is.na(result$sequence),]
	  list(result=result,
	       result.noNA=result.noNA)
	}


	# Parse variants against a wild-type sequence supplied in the table itself
	# (parallel).
	#
	# Every variant is handed, together with the sequence from `wt.seq.column`
	# and its offset, to parse_one_substitute() (sourced on the workers from
	# parse.variant.wt.sequence.R). No identifier lookup takes place, so the
	# `uniprotID` column of the result is always NA.
	#
	# Args:
	#   dnv.table:     data.frame of variants.
	#   VarID.column, Score.column, wt.seq.column, aaChg.column:
	#                  columns of `dnv.table` holding the variant id, score,
	#                  wild-type sequence and amino-acid change.
	#   offset.column: optional column with a numeric per-variant offset
	#                  (0 for every variant when NA).
	#   njobs:         number of workers handed to makeCluster().
	#
	# Returns:
	#   list(result = all variants, result.noNA = variants with a sequence).
	dnv.table.to.uniprot.by.wt.seq.parallel <- function(
	    dnv.table, VarID.column, Score.column,
	    wt.seq.column, aaChg.column, offset.column=NA, njobs=42) {
	  library(doParallel)
	  cl <- makeCluster(njobs)
	  # release the workers even if one of the tasks fails
	  on.exit(stopCluster(cl), add = TRUE)
	  registerDoParallel(cl)
	  if (!is.na(offset.column)) {
	    offset <- as.numeric(dnv.table[,offset.column])
	  } else {
	    offset <- rep(0, dim(dnv.table)[1])
	  }
	  prompt <- data.frame(VarID = dnv.table[,VarID.column],
	                       aaChg = dnv.table[,aaChg.column],
	                       score = dnv.table[,Score.column],
	                       wt.orig = dnv.table[,wt.seq.column])
	  result <- foreach (i = seq_len(nrow(prompt)), .combine = rbind, .multicombine=TRUE) %dopar% {
	    source('/share/pascal/Users/gz2294/Pipeline/parse.variant.wt.sequence.R')
	    substitute_res <- parse_one_substitute(prompt$aaChg[i], prompt$wt.orig[i], offset[i])
	    data.frame(VarID = prompt$VarID[i],
	               aaChg = prompt$aaChg[i],
	               uniprotID = NA,
	               ref = substitute_res$ref,
	               pos = substitute_res$pos,
	               alt = substitute_res$alt,
	               score = prompt$score[i],
	               sequence = substitute_res$sequence,
	               wt = substitute_res$wt,
	               sequence.len = substitute_res$sequence.len,
	               seq.start = substitute_res$seq.start,
	               seq.end = substitute_res$seq.end,
	               pos.orig = substitute_res$pos.orig,
	               sequence.orig = substitute_res$sequence.orig,
	               wt.orig = substitute_res$wt.orig,
	               sequence.len.orig = substitute_res$sequence.len.orig)
	  }
	  # drop = FALSE keeps the name of a single remaining input column
	  extra.cols <- !colnames(dnv.table) %in% colnames(result)
	  result <- dplyr::bind_cols(dnv.table[, extra.cols, drop = FALSE], result)
	  result.noNA <- result[!is.na(result$sequence),]
	  list(result=result,
	       result.noNA=result.noNA)
	}

	source('/share/pascal/Users/gz2294/Pipeline/uniprot.table.add.annotation.R')