File size: 3,815 Bytes
ab6c03c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#### ::: mutate seqs ::: ####

import os
import sys
sys.path.append('../motif')
import pandas as pd
import numpy as np
import argparse
import motif_utils as utils


def mutate(seq, start, end, target=None):
    """
    Replace the slice seq[start:end] of a sequence with new nucleotide(s).

    If target is not None, returns the single mutated sequence as a str.
    Otherwise, returns a numpy array with shape (4,) holding the four
    sequences obtained by putting 'A', 'T', 'G' and 'C' (in that order) in
    place of the slice.

    Arguments:
    seq -- str, original sequence.
    start -- int, index of the first position to replace. Counting starts at zero.
    end -- int, index one past the last position to replace (exclusive, as in
           Python slicing). With start == end nothing is removed and the
           replacement is inserted at that position.

    Keyword arguments:
    target -- the replacement nucleotide(s); converted with str(), so an empty
              string deletes the slice (default: None).

    Returns:
    mutated_seq -- str when target is given, otherwise a numpy array of four str.

    Raises:
    AssertionError -- if 0 <= start <= end <= len(seq) does not hold.

    """
    # Raise explicitly rather than through an `assert` statement: asserts are
    # removed under `python -O`, which would let out-of-range indices through
    # and silently yield a wrong sequence. The exception type and message are
    # kept the same as before.
    if not 0 <= start <= end <= len(seq):
        raise AssertionError("Wrong start and end index input.")

    prefix, suffix = seq[:start], seq[end:]

    if target is not None:
        return prefix + str(target) + suffix

    # No target given: enumerate every possible nucleotide at the slice. One of
    # the four results equals the input when the slice is a single base that
    # already is A/T/G/C.
    return np.asarray([prefix + n + suffix for n in ['A', 'T', 'G', 'C']])

def main():
    """
    Command-line entry point: mutate the sequences of a .tsv file and save them.

    Reads the tab-separated `seq_file` (first line is a header, two columns that
    are renamed to 'sequence' and 'label'), converts every 'sequence' entry with
    utils.kmer2seq, mutates the result and writes `dev.tsv` into
    `save_file_dir` with the columns 'sequence', 'label' and 'index'.

    Two modes:
    - with --mut_file: a headerless tab-separated file with the four columns
      idx, start, end, allele. Row n of that file mutates the input row at
      position idx (0-based, positional) by replacing seq[start:end] with
      allele. A missing allele is read as '' and therefore deletes the slice.
    - without --mut_file: every position of every input sequence is replaced
      by each of 'A', 'T', 'G', 'C', giving 4 * len(seq) sequences per input.

    Every mutated sequence is converted back with utils.seq2kmer(seq, k) before
    being written. 'index' records which input row a mutated sequence came from.

    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "seq_file",
        type=str,
        help="Path to input sequence+label .tsv file.",
    )

    parser.add_argument(
        "save_file_dir",
        type=str,
        help="Path to save the mutated seqs",
    )

    parser.add_argument(
        "--mut_file",
        default=None,
        type=str,
        help="Path to the file defining how each input seq should be mutated",
    )

    parser.add_argument(
        "--k",
        default=3,
        type=int,
        help="length of kmer for conversion of mutated seqs"
    )

    # TODO: add the conditions
    args = parser.parse_args()

    os.makedirs(args.save_file_dir, exist_ok=True)

    # Accumulates the output rows: source row position and mutated k-mer string.
    mutated_dev = {'index':[],'seq':[]}

    dev = pd.read_csv(args.seq_file,sep='\t',header=0)
    dev.columns = ['sequence','label']
    # presumably turns the k-mer tokenised text back into the raw nucleotide
    # string -- see motif_utils.kmer2seq to confirm
    dev['seq'] = dev['sequence'].apply(utils.kmer2seq)

    if args.mut_file is not None:
        mut_file = pd.read_csv(args.mut_file, sep='\t',header=None)
        # An empty allele field is parsed as NaN; '' makes it a deletion.
        mut_file = mut_file.fillna('')
        mut_file.columns = ['idx','start', 'end', 'allele']
        mut_file['idx'] = mut_file['idx'].astype(int)
        mut_file['start'] = mut_file['start'].astype(int)
        mut_file['end'] = mut_file['end'].astype(int)
        # Pick the input rows in mut_file order; after reset_index() position i
        # of dev_selected lines up with position i of mut_file.
        dev_selected = dev.iloc[mut_file['idx'].tolist(),:].reset_index()
        for i, row in dev_selected.iterrows():
            seq = row['seq']
            mut = mut_file.iloc[i]
            mut_seq = mutate(seq, mut['start'], mut['end'], target = mut['allele'])
            mut_seq = utils.seq2kmer(mut_seq, args.k)
            mutated_dev['index'].append(mut['idx'])
            mutated_dev['seq'].append(mut_seq)
    else:
        for i, row in dev.iterrows():
            seq = row['seq']
            for j in range(len(seq)):
                # Four variants of seq with position j set to A, T, G, C.
                variants = mutate(seq, j, j+1)
                kmer_variants = [utils.seq2kmer(variant, args.k) for variant in variants]
                mutated_dev['index'].extend([i] * 4)
                mutated_dev['seq'].extend(kmer_variants)

    mutated_dev = pd.DataFrame.from_dict(mutated_dev)
    mutated_dev = mutated_dev[['seq','index']]
    mutated_dev.columns = ['sequence','index']
    mutated_dev['label'] = 0
    # The first row is labelled 1, all others 0.
    # NOTE(review): presumably so that both label values occur in the file for
    # whatever consumes dev.tsv -- confirm.
    # Guarded because .iloc[0] raises IndexError when nothing was mutated
    # (e.g. an input file with a header but no rows); in that case a
    # header-only dev.tsv is written instead.
    if not mutated_dev.empty:
        mutated_dev.iloc[0, mutated_dev.columns.get_loc('label')] = 1
    mutated_dev = mutated_dev[['sequence','label','index']]

    mutated_dev.to_csv(os.path.join(args.save_file_dir,'dev.tsv'),sep='\t',header=True, index=False)
    

# Run the command-line interface only when this file is executed as a script,
# not when it is imported (e.g. to reuse mutate()).
if __name__ == "__main__":
    main()