Spaces:
Sleeping
Sleeping
| from os import listdir | |
| from os.path import isfile, join | |
| from pathlib import Path | |
| import shutil | |
| import re | |
| import numpy as np | |
| import torch | |
| from src.snapconfig import config | |
def create_out_dir(dir_path, exist_ok=True):
    """Prepare an output directory containing ``spectra`` and ``peptides``.

    Args:
        dir_path: Path of the output directory.
        exist_ok: When True an existing directory (and its contents) is kept
            and only missing sub-directories are added.  When False an
            existing directory is removed and recreated empty.

    Raises:
        FileExistsError: ``dir_path`` exists but is not a directory.
    """
    out_path = Path(dir_path)
    if out_path.is_dir() and not exist_ok:
        shutil.rmtree(out_path)

    # exist_ok=True on every mkdir so that re-using a populated directory does
    # not fail on the sub-directories that are already there.
    out_path.mkdir(parents=True, exist_ok=True)
    for sub_dir in ('spectra', 'peptides'):
        (out_path / sub_dir).mkdir(exist_ok=True)
def verify_in_dir(dir_path, ext, ignore_list=None):
    """Return the paths of all non-hidden ``*.<ext>`` files in ``dir_path``.

    Args:
        dir_path: Directory to scan (not recursive).
        ext: File extension without the leading dot, e.g. ``"mgf"``.
        ignore_list: Optional collection of file names (not paths) to skip.

    Returns:
        List of ``join(dir_path, name)`` for every matching regular file, in
        the order reported by ``os.listdir``.

    Raises:
        AssertionError: ``dir_path`` is not an existing directory, or no
            matching file was found.
    """
    ignored = () if ignore_list is None else ignore_list

    in_path = Path(dir_path)
    assert in_path.exists() and in_path.is_dir(), \
        '{} is not a directory'.format(dir_path)

    # Match on the full ".ext" suffix so that a file that is merely *named*
    # like the extension (e.g. "mgf") is not picked up.
    suffix = '.' + ext
    files = [join(dir_path, name) for name in listdir(dir_path)
             if isfile(join(dir_path, name))
             and not name.startswith('.')
             and name.endswith(suffix)
             and name not in ignored]
    assert len(files) > 0, 'no {} files in {}'.format(suffix, dir_path)
    return files
def isfloat(str_float):
    """Tell whether ``float(str_float)`` succeeds without a ``ValueError``."""
    try:
        float(str_float)
    except ValueError:
        return False
    else:
        return True
def mod_repl(match):
    """Regex replacement callback backed by ``config.ModCHAR``.

    The matched text is parsed as a float, rounded to two decimals and
    converted back to a string; that string is the lookup key.  Returns the
    ``config.ModCHAR`` entry for the key, or an empty string when there is
    none.
    """
    key = str(round(float(match.group(0)), 2))
    if key in config.ModCHAR:
        return config.ModCHAR[key]
    return ""
def mod_repl_2(match):
    """Regex replacement callback: the matched number, rounded to two
    decimals, wrapped in square brackets (e.g. ``15.9949`` -> ``[15.99]``)."""
    rounded = round(float(match.group(0)), 2)
    return '[{}]'.format(rounded)
def preprocess_msps(msp_dir, out_dir):
    """Split every ``.msp`` file in ``msp_dir`` into per-spectrum files.

    For each accepted record two files sharing the base name
    ``<count>-<species_id>-<mass>-<charge>-<is_modified>`` are written:
    ``<out_dir>/spectra/<base>.pt`` (the binned spectrum as a float tensor)
    and ``<out_dir>/peptides/<base>.pep`` (the peptide sequence).  ``count``
    runs across all input files and ``species_id`` is the position of the
    source file in the directory listing.  ``out_dir`` is deleted first if it
    already exists.

    A record is accepted when its ``Name:`` line parses, the peptide is
    shorter than the configured ``pep_seq_len`` and the rounded ``MW:`` value
    is below the configured ``spec_size``.

    Args:
        msp_dir: Directory containing the ``.msp`` files.
        out_dir: Directory that is (re)created to receive the output.

    Raises:
        AssertionError: ``msp_dir`` is not a directory or holds no ``.msp``
            file.
    """
    in_path = Path(msp_dir)
    assert in_path.exists() and in_path.is_dir()

    msp_files = [join(msp_dir, f) for f in listdir(msp_dir)
                 if isfile(join(msp_dir, f)) and f.split('.')[-1] == 'msp']
    assert len(msp_files) > 0

    out_path = Path(out_dir)
    if out_path.exists() and out_path.is_dir():
        shutil.rmtree(out_path)
    out_path.mkdir()
    Path(join(out_path, 'spectra')).mkdir()
    Path(join(out_path, 'peptides')).mkdir()

    print('reading {} files'.format(len(msp_files)))

    # The configuration does not depend on the file, so read it once.
    spec_size = config.get_config(section='input', key='spec_size')
    seq_len = config.get_config(section='ml', key='pep_seq_len')

    name_rex = re.compile(r"Name:\s(?P<pep>[a-zA-Z]+)/(?P<charge>\d+)"
                          r"(?:_(?P<num_mods>\d+)(?P<mods>.*))?")
    mw_rex = re.compile(r"MW:\s([-+]?[0-9]*\.?[0-9]*)")
    num_peaks_rex = re.compile(r"Num peaks:\s([0-9]*\.?[0-9]*)")

    count = 0
    max_peaks = max_moz = 0
    for species_id, msp_file in enumerate(msp_files):
        print('Reading: {}'.format(msp_file))

        with open(msp_file, "r") as msp:
            lines = msp.readlines()
        n_lines = len(lines)
        print('len of file: ' + str(n_lines))

        # A record is built from three consecutive header lines; the flags
        # record which of them have been seen and accepted so far.
        is_name = is_mw = False
        prev = 0
        i = 0
        while i < n_lines:
            line = lines[i]
            i += 1

            if line.startswith('Name:'):
                # A new record starts here: drop whatever was left over from
                # an incomplete previous one.
                is_name = is_mw = False
                name_groups = name_rex.search(line)
                if not name_groups:
                    continue
                pep = name_groups['pep']
                if len(pep) + 1 > seq_len:
                    continue
                l_charge = int(name_groups['charge'])
                # The "_<num_mods>..." suffix is optional in the pattern; a
                # name without it is treated as unmodified.
                num_mods = int(name_groups['num_mods'] or 0)
                is_name = True
                continue

            if is_name and line.startswith('MW:'):
                mass = float(mw_rex.findall(line)[0])
                is_mw = round(mass) < spec_size
                if not is_mw:
                    is_name = False
                continue

            if not (is_name and is_mw and line.startswith('Num peaks:')):
                continue

            num_peaks = int(num_peaks_rex.findall(line)[0])
            max_peaks = max(max_peaks, num_peaks)

            # The peak list runs up to the next blank line (or end of file);
            # intensities are accumulated into integer m/z bins.
            spec = np.zeros(spec_size)
            while i < n_lines and lines[i].strip():
                mz_splits = lines[i].split('\t')
                i += 1
                moz, intensity = float(mz_splits[0]), float(mz_splits[1])
                max_moz = max(max_moz, moz)
                mz_bin = round(moz)
                # Peaks outside the spectrum vector are ignored instead of
                # raising or wrapping around to the end of the array.
                if 0 <= mz_bin < spec_size:
                    spec[mz_bin] += round(intensity)
            spec = np.clip(spec, None, 1000.0)

            is_name = is_mw = False

            # NOTE(review): 3.725 / 51.479 look like a precomputed mean and
            # standard deviation used for standardisation -- confirm.
            spec_tensor = torch.tensor((np.asarray(spec) - 3.725) / 51.479,
                                       dtype=torch.float)
            base_name = '{}-{}-{}-{}-{}'.format(
                count, species_id, mass, l_charge, int(num_mods > 0))
            torch.save(spec_tensor, join(out_dir, 'spectra', base_name + '.pt'))
            with open(join(out_path, 'peptides', base_name + '.pep'), 'w+') as pep_file:
                pep_file.write(pep)
            count += 1

            # Coarse progress report, based on the position in the file.
            new = int((i / n_lines) * 100)
            if new > prev + 10:
                print(str(new) + '%')
                prev = new

        print('max peaks: ' + str(max_peaks))
        print('count: ' + str(count))
        print('max moz: ' + str(max_moz))
def _read_mgf_peaks(lines, i, l_charge, spec_size):
    """Bin the peak list of one MGF spectrum at 0.1 m/z resolution.

    Scanning starts at ``lines[i]``: lines are skipped until one whose first
    field (split on space, tab or ``=``) parses as a float, then peaks are
    read up to the ``END IONS`` line or the end of ``lines``.

    Args:
        lines: All lines of the MGF file.
        i: Index of the first line to look at.
        l_charge: Precursor charge; the first ``l_charge`` slots of the
            result are reserved as charge flags (index ``k``, value 1).
        spec_size: Exclusive upper bound for a bin index.

    Returns:
        Tuple ``(ind, val, num_peaks, top_moz, i)``: bin indices, their summed
        intensities, the number of peak lines read (including out-of-range
        ones), the largest m/z read, and the index of the line at which
        scanning stopped.
    """
    n_lines = len(lines)
    ind = list(range(l_charge))
    val = [1] * len(ind)
    n_flags = len(ind)

    while i < n_lines and not isfloat(re.split(' |\t|=', lines[i])[0]):
        i += 1

    num_peaks = 0
    top_moz = 0
    while i < n_lines and 'END IONS' not in lines[i].upper():
        mz_line = lines[i]
        i += 1
        if not mz_line.strip():
            continue
        num_peaks += 1
        mz_splits = re.split(' |\t', mz_line)
        moz, intensity = float(mz_splits[0]), float(mz_splits[1])
        top_moz = max(top_moz, moz)
        mz_bin = round(moz * 10)
        if 0 < mz_bin < spec_size:
            # Consecutive peaks falling into the same bin are summed; the
            # charge-flag slots are never merged into.
            if len(ind) > n_flags and ind[-1] == mz_bin:
                val[-1] += intensity
            else:
                ind.append(mz_bin)
                val.append(intensity)
    return ind, val, num_peaks, top_moz, i


def preprocess_mgfs(mgf_dir, out_dir):
    """Split every ``.mgf`` file in ``mgf_dir`` into per-spectrum files.

    For each accepted spectrum a sparse representation (a 2-row array of bin
    indices and min-max normalised intensities) is saved to
    ``<out_dir>/spectra/<id>-<mass>-<charge>.npy`` and the peptide sequence
    (numeric modifications rewritten by ``mod_repl_2``) to
    ``<out_dir>/peptides/<id>.pep``.  ``id`` is a counter running across all
    input files so that outputs of different files do not overwrite each
    other.  Per-bin ``means.npy`` and ``stds.npy`` over all accepted spectra
    are written to ``out_dir`` at the end.  ``out_dir`` is recreated from
    scratch.

    A spectrum is accepted when ``round(PEPMASS) * 10`` is below the
    configured ``spec_size``, a ``CHARGE`` and a ``SEQ`` line follow, and at
    least 15 peak lines are present.

    Args:
        mgf_dir: Directory containing the ``.mgf`` files.
        out_dir: Directory that is (re)created to receive the output.

    Raises:
        AssertionError: propagated from ``verify_in_dir`` when ``mgf_dir`` is
            not a directory or holds no ``.mgf`` file.
    """
    mgf_files = verify_in_dir(mgf_dir, "mgf")
    create_out_dir(out_dir, exist_ok=False)

    print('reading {} files'.format(len(mgf_files)))

    spec_size = config.get_config(section='input', key='spec_size')

    # These statistics are reported at the end but are not updated anywhere
    # in this function; they are kept so the report format is unchanged.
    ch = np.zeros(20)
    modified = 0
    unmodified = 0
    unique_pep_set = set()

    # Running per-bin sum and sum of squares for the mean / std computation.
    summ = np.zeros(spec_size)
    sq_sum = np.zeros(spec_size)
    N = 0

    pepmass_rex = re.compile(r"PEPMASS=([-+]?[0-9]*\.?[0-9]*)")
    charge_rex = re.compile(r"CHARGE=([-+]?[0-9]*\.?[0-9]*)")
    mod_rex = re.compile(r'([-+]?\d*\.\d+|[-+]?\d+)')

    tot_count = 0
    max_moz = 0
    for mgf_file in mgf_files:
        print('Reading: {}'.format(mgf_file))

        with open(mgf_file, "r") as mgf:
            lines = mgf.readlines()
        n_lines = len(lines)

        count = lcount = 0
        mass_ign = 0
        pep_len_ign = 0
        dup_ign = 0

        print('len of file: ' + str(n_lines))
        # Header lines are expected in the order PEPMASS, CHARGE, SEQ; the
        # flags record which of the first two have been accepted.
        is_mw = is_charge = False
        prev = 0
        i = 0
        while i < n_lines:
            line = lines[i]
            i += 1

            if line.startswith('PEPMASS'):
                count += 1
                mass = float(pepmass_rex.findall(line)[0])
                is_mw = round(mass) * 10 < spec_size
                if not is_mw:
                    is_charge = False
                    mass_ign += 1
                continue

            if not is_mw:
                continue

            if line.startswith('CHARGE'):
                l_charge = int(charge_rex.findall(line)[0])
                is_charge = True
                # Replace the PEPMASS value by (value - PROTON) * charge.
                mass = (mass - config.PROTON) * l_charge
                continue

            if not (is_charge and line.startswith("SEQ")):
                continue

            seq = re.sub(r"[()]", "", line.strip()).split('=')[-1]
            pep = mod_rex.sub(mod_repl_2, seq)

            ind, val, num_peaks, top_moz, i = _read_mgf_peaks(
                lines, i, l_charge, spec_size)
            max_moz = max(max_moz, top_moz)

            # The record is complete (or rejected) either way.
            is_mw = is_charge = False
            if num_peaks < 15:
                continue

            ind = np.array(ind, dtype=int)
            val = np.array(val, dtype=float)
            if val.size:
                # Min-max normalise; a flat spectrum maps to all zeros rather
                # than dividing by zero.
                lowest = val.min()
                span = val.max() - lowest
                val = (val - lowest) / span if span > 0 else np.zeros_like(val)
            # Restore the charge flags that normalisation rescaled.
            val[:max(l_charge, 0)] = 1
            spec = np.array([ind, val])

            # np.add.at accumulates correctly even if an index is repeated.
            np.add.at(summ, ind, val)
            np.add.at(sq_sum, ind, val ** 2)
            N += 1

            spec_file_name = '{}-{}-{}.npy'.format(tot_count, mass, l_charge)
            np.save(join(out_dir, 'spectra', spec_file_name), spec)
            pep_file_name = '{}.pep'.format(tot_count)
            with open(join(out_dir, 'peptides', pep_file_name), 'w+') as pep_file:
                pep_file.write(pep)
            lcount += 1
            tot_count += 1

            # Coarse progress report, based on the position in the file.
            new = int((i / n_lines) * 100)
            if new >= prev + 10:
                print('count: ' + str(lcount))
                print(str(new) + '%')
                prev = new

        print('In current file, read {} out of {}'.format(lcount, count))
        print("Ignored: large mass: {}, pep len: {}, dup: {}".format(mass_ign, pep_len_ign, dup_ign))
        print('overall running count: ' + str(tot_count))
        print('max moz: ' + str(max_moz))

    print("Statistics:")
    print("Charge distribution:")
    print(ch)
    print("Modified:\t{}".format(modified))
    print("Unmodified:\t{}".format(unmodified))
    print("Unique Peptides:\t{}".format(len(unique_pep_set)))
    print("Sum: {}".format(summ))
    print("Sum-Squared: {}".format(sq_sum))
    print("N: {}".format(N))
    # When no spectrum was accepted N is 0 and the divisions below yield NaN.
    means = summ / N
    print("mean: {}".format(means))
    # Clamp at zero: rounding can make E[x^2] - E[x]^2 slightly negative,
    # which would turn into NaN under the square root.
    stds = np.sqrt(np.maximum((sq_sum / N) - means**2, 0))
    # Bins with (near) zero spread get an infinite std.
    stds[stds < 0.0000001] = float("inf")
    print("std: {}".format(stds))
    np.save(join(out_dir, 'means.npy'), means)
    np.save(join(out_dir, 'stds.npy'), stds)