# DNABERT_save / SNP /SNP.py
# nancyH's picture
# Upload folder using huggingface_hub
# ab6c03c verified
#### ::: DNABERT-viz SNP analysis ::: ####
import os
import sys
sys.path.append('../motif')
import pandas as pd
import numpy as np
import argparse
import motif_utils as utils
def main(argv=None):
    """Score the effect of mutations on model predictions.

    Reads the original and the mutated sequence tables (tab-separated, first
    row treated as a header) together with their ``.npy`` prediction arrays,
    joins them on the original row index, computes two per-mutation scores and
    writes the joined table to ``<save_file_dir>/mutations.tsv``.

    Output columns: ``idx``, ``orig_seq``, ``orig_pred``, ``mut_seq``,
    ``mut_pred``, ``diff`` and ``logOR``.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read them from ``sys.argv[1:]``.

    Raises:
        FileNotFoundError: If one of the input files does not exist.
    """
    parser = argparse.ArgumentParser()
    # The four input options are optional so that their defaults can actually
    # take effect; argparse never applies a default to a required option.
    parser.add_argument(
        "--orig_seq_file",
        default='../examples/sample_data/ft/prom-core/6/dev.tsv',
        type=str,
        help="Path to original input sequence+label .tsv file.",
    )
    parser.add_argument(
        "--orig_pred_file",
        type=str,
        default='../examples/result/prom-core/6/pred.npy',
        help="Path to predictions pred.npy of original sequences.",
    )
    parser.add_argument(
        "--mut_seq_file",
        default='examples/dev.tsv',
        type=str,
        help="Path to mutated sequence+index .tsv file.",
    )
    parser.add_argument(
        "--mut_pred_file",
        type=str,
        default='examples/pred.npy',
        help="Path to predictions pred_results.npy of mutated sequences.",
    )
    parser.add_argument(
        "--save_file_dir",
        default='.',
        type=str,
        help="Path to save outputs",
    )
    # TODO: add the conditions

    args = parser.parse_args(argv)

    # original sequences: one row per sequence, identified by its row index
    orig_dev = pd.read_csv(args.orig_seq_file, sep='\t', header=0)
    orig_dev.columns = ['sequence', 'label']
    orig_dev['orig_seq'] = orig_dev['sequence'].apply(utils.kmer2seq)
    orig_dev['idx'] = orig_dev.index
    orig_dev['orig_pred'] = np.load(args.orig_pred_file)

    # mutated sequences: the third column holds the row index of the original
    # sequence each mutant belongs to (the label column is ignored)
    mut_dev = pd.read_csv(args.mut_seq_file, sep='\t', header=0)
    mut_dev.columns = ['sequence', 'label', 'idx']
    mut_dev['mut_seq'] = mut_dev['sequence'].apply(utils.kmer2seq)
    mut_dev['mut_pred'] = np.load(args.mut_pred_file)

    # merge: pair every mutant with its original sequence (inner join on idx)
    dev = pd.merge(
        orig_dev[['idx', 'orig_seq', 'orig_pred']],
        mut_dev[['idx', 'mut_seq', 'mut_pred']],
        on='idx',
    )

    # Prediction change, weighted by the larger of the two predictions.
    dev['diff'] = (
        (dev['mut_pred'] - dev['orig_pred'])
        * dev[['orig_pred', 'mut_pred']].max(axis=1)
    )
    # log2 odds of the original minus log2 odds of the mutant. Predictions of
    # exactly 0 or 1 give infinite odds and therefore inf/NaN values here.
    dev['logOR'] = (
        np.log2(dev['orig_pred'] / (1 - dev['orig_pred']))
        - np.log2(dev['mut_pred'] / (1 - dev['mut_pred']))
    )

    # Create the output directory on demand; an empty string means the
    # current directory and must not be passed to makedirs.
    if args.save_file_dir:
        os.makedirs(args.save_file_dir, exist_ok=True)
    dev.to_csv(os.path.join(args.save_file_dir, 'mutations.tsv'), sep='\t')
# Run the SNP analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()