File size: 3,483 Bytes
eaca1e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import pickle
import re
import time
import random as rand
from os.path import join

from sklearn.model_selection import train_test_split

from src.snapconfig import config

def apply_filter(l_filt, file_name):
    """Return True if a spectrum file name passes the charge/mods filter.

    File names are expected to embed five dash-separated fields ending in
    ``.pt`` or ``.npy``; the 4th field is the charge and the 5th the number
    of modifications (e.g. ``12-34-1234.56-2-1.pt``).

    Args:
        l_filt: dict with keys ``"charge"`` and ``"mods"``. A charge of 0
            means "accept any charge"; otherwise the file's charge must be
            <= the filter's. The file's mods count must be <= ``l_filt["mods"]``.
        file_name: spectrum file name to test.

    Returns:
        True if the file satisfies the filter, False otherwise (including
        when the name does not match the expected pattern).
    """
    # NOTE: original pattern used `[pt|npy]` (a character class) and
    # unescaped dots; fixed to a real alternation with escaped dots.
    match = re.search(r"(\d+)-(\d+)-(\d+\.\d+)-(\d+)-(\d+)\.(?:pt|npy)", file_name)
    if match is None:
        # Previously a bare `except` printed and then fell through with
        # unbound locals (NameError). Treat unparsable names as rejected.
        print(file_name)
        return False

    l_charge = int(match.group(4))
    mods = int(match.group(5))

    # Restored the real filter result; the block previously ended with the
    # debug leftover `return True#False`, which disabled filtering entirely.
    return ((l_filt["charge"] == 0 or l_charge <= l_filt["charge"])
            and mods <= l_filt["mods"])

def load_file_names(l_filt, l_listing_path, count=None):
    """Load peptide file names and their matching spectra file names.

    Reads the pickled directory listing at *l_listing_path* (a sequence of
    ``(peptide, [spectrum, ...])`` pairs), shuffles it in place, and keeps —
    from at most the first *count* entries — every peptide that has at least
    one spectrum passing ``apply_filter(l_filt, ...)``.

    Args:
        l_filt: filter dict forwarded to ``apply_filter``.
        l_listing_path: path to the pickled listing file.
        count: optional cap on how many (shuffled) entries to scan;
            ``None`` scans them all.

    Returns:
        Tuple ``(pep_file_names, spec_file_names_lists)`` of equal length.
    """
    with open(l_listing_path, 'rb') as listing_file:
        dir_listing = pickle.load(listing_file)

    rand.shuffle(dir_listing)

    l_pep_file_names = []
    l_spec_file_names_lists = []
    for pep, spec_list in dir_listing[:count]:
        kept_specs = [spec for spec in spec_list if apply_filter(l_filt, spec)]
        if kept_specs:
            l_pep_file_names.append(pep)
            l_spec_file_names_lists.append(kept_specs)

    assert len(l_pep_file_names) == len(l_spec_file_names_lists)
    return l_pep_file_names, l_spec_file_names_lists

if __name__ == '__main__':
    # Build the spectra filter from the input section of the config.
    charge = config.get_config(section='input', key='charge')
    use_mods = config.get_config(section='input', key='use_mods')
    num_mods = config.get_config(section='input', key='num_mods')
    filt = {'charge': charge, 'mods': num_mods if use_mods else 0}

    # ML split parameters. batch_size is read for parity with the rest of
    # the pipeline but is not used in this script.
    test_size = config.get_config(section='ml', key='test_size')
    train_count = config.get_config(section='ml', key='train_count')
    batch_size = config.get_config(section='ml', key='batch_size')
    if train_count == 0:
        train_count = None  # 0 in the config means "use everything"

    in_tensor_dir = config.get_config(section='preprocess', key='in_tensor_dir')
    print(in_tensor_dir)
    listing_path = join(in_tensor_dir, 'pep_spec.pkl')
    pep_file_names, spec_file_names_lists = load_file_names(
        filt, listing_path, train_count)

    # Time-based seed: every run produces a different train/test split.
    split_rand_state = int(time.time())
    train_peps, test_peps, train_specs, test_specs = train_test_split(
        pep_file_names, spec_file_names_lists, test_size=test_size,
        random_state=split_rand_state, shuffle=True)

    print("Writing train test split listings as pickles.")
    split_outputs = (
        ("train_peps.pkl", train_peps),
        ("train_specs.pkl", train_specs),
        ("test_peps.pkl", test_peps),
        ("test_specs.pkl", test_specs),
    )
    for out_name, payload in split_outputs:
        with open(join(in_tensor_dir, out_name), "wb") as out_file:
            pickle.dump(payload, out_file)