#### ::: mutate seqs ::: #### import os import sys sys.path.append('../motif') import pandas as pd import numpy as np import argparse import motif_utils as utils def mutate(seq, start, end, target=None): """ Mutate input sequence at specified position. If target is not None, returns the mutated seq. Otherwise, returns a numpy array with shape (4,1) with all four mutated possibilities. Arguments: seq -- str, original sequence. start -- int, starting index where nucleotide needs to be changed. Counting starts at zero. end -- int, ending index where nucleotide needs to be changed. Counting starts at zero. Keyword arguments: target -- str, the target nucleotide(s) to be changed to (default: None). Returns: mutated_seq -- str, mutated sequence. """ assert end >= start and start >= 0 and end <= len(seq), "Wrong start and end index input." if target is not None: mutated_seq = seq[:start] + str(target) + seq[end:] else: mutated_seq = [] for n in ['A','T','G','C']: m_seq = seq[:start] + str(n) + seq[end:] mutated_seq.append(m_seq) mutated_seq = np.asarray(mutated_seq) return mutated_seq def main(): parser = argparse.ArgumentParser() parser.add_argument( "seq_file", type=str, help="Path to input sequence+label .tsv file.", ) parser.add_argument( "save_file_dir", type=str, help="Path to save the mutated seqs", ) parser.add_argument( "--mut_file", default=None, type=str, help="Path to the file defining how each input seq should be mutated", ) parser.add_argument( "--k", default=3, type=int, help="length of kmer for conversion of mutated seqs" ) # TODO: add the conditions args = parser.parse_args() os.makedirs(args.save_file_dir, exist_ok=True) mutated_dev = {'index':[],'seq':[]} dev = pd.read_csv(args.seq_file,sep='\t',header=0) dev.columns = ['sequence','label'] dev['seq'] = dev['sequence'].apply(utils.kmer2seq) if args.mut_file is not None: mut_file = pd.read_csv(args.mut_file, sep='\t',header=None) mut_file = mut_file.fillna('') mut_file.columns = ['idx','start', 'end', 'allele'] mut_file['idx'] = mut_file['idx'].astype(int) mut_file['start'] = mut_file['start'].astype(int) mut_file['end'] = mut_file['end'].astype(int) dev_selected = dev.iloc[mut_file['idx'].tolist(),:].reset_index() for i, row in dev_selected.iterrows(): seq = row['seq'] mut = mut_file.iloc[i] mut_seq = mutate(seq, mut['start'], mut['end'], target = mut['allele']) mut_seq = utils.seq2kmer(mut_seq, args.k) mutated_dev['index'].append(mut['idx']) mutated_dev['seq'].append(mut_seq) else: for i, row in dev.iterrows(): seq = row['seq'] for j in range(len(seq)): mut_seq = mutate(seq, j, j+1) mut_seq = [utils.seq2kmer(seq, args.k) for seq in mut_seq] idx = [i] * 4 mutated_dev['index'].extend(idx) mutated_dev['seq'].extend(mut_seq) mutated_dev = pd.DataFrame.from_dict(mutated_dev) mutated_dev = mutated_dev[['seq','index']] mutated_dev.columns = ['sequence','index'] mutated_dev['label'] = 0 mutated_dev.iloc[0, mutated_dev.columns.get_loc('label')] = 1 mutated_dev = mutated_dev[['sequence','label','index']] mutated_dev.to_csv(os.path.join(args.save_file_dir,'dev.tsv'),sep='\t',header=True, index=False) if __name__ == "__main__": main()