bilalsm's picture
Upload folder using huggingface_hub
eaca1e1 verified
from os import listdir
from os.path import isfile, join
from pathlib import Path
import shutil
import re
import numpy as np
import torch
from src.snapconfig import config
def create_out_dir(dir_path, exist_ok=True):
    """Prepare an output directory containing ``spectra`` and ``peptides``.

    Args:
        dir_path: Path of the output directory.
        exist_ok: When True, an already existing directory (and any existing
            sub-directories and their contents) is reused as is. When False,
            an existing directory is deleted and recreated empty.

    Raises:
        FileExistsError: If ``dir_path`` exists but is not a directory.
        FileNotFoundError: If the parent of ``dir_path`` does not exist.
    """
    out_path = Path(dir_path)

    # Start from a clean slate only when the caller explicitly asks for it.
    if out_path.is_dir() and not exist_ok:
        shutil.rmtree(out_path)

    # ``exist_ok=True`` only tolerates an existing *directory*; a regular file
    # at this path still raises FileExistsError.
    out_path.mkdir(exist_ok=True)

    # Reusing an existing output directory must not fail just because the
    # sub-directories were already created by a previous run.
    for sub_dir in ('spectra', 'peptides'):
        (out_path / sub_dir).mkdir(exist_ok=True)
def verify_in_dir(dir_path, ext, ignore_list=None):
    """Return the paths of all usable files with extension ``ext``.

    A file is usable when it is a regular file directly inside ``dir_path``,
    its name does not start with ``.``, the text after its last dot equals
    ``ext``, and its name is not contained in ``ignore_list``.

    Args:
        dir_path: Directory to scan (not recursive).
        ext: File extension without the leading dot, e.g. ``"mgf"``.
        ignore_list: Optional container of file names to skip.

    Returns:
        Sorted list of ``join(dir_path, name)`` for every matching file.

    Raises:
        AssertionError: If ``dir_path`` is not an existing directory or no
            matching file is found.
    """
    # ``None`` instead of a shared mutable ``[]`` default.
    ignored = () if ignore_list is None else ignore_list

    in_path = Path(dir_path)
    # Raised explicitly (rather than via ``assert``) so the checks survive
    # ``python -O`` while callers still see the same exception type.
    if not (in_path.exists() and in_path.is_dir()):
        raise AssertionError(
            'input directory does not exist: {}'.format(dir_path))

    files = []
    # Sorted so the order (and any index derived from it by callers) does not
    # depend on the arbitrary order returned by ``listdir``.
    for name in sorted(listdir(dir_path)):
        if name.startswith('.') or name in ignored:
            continue
        # Require a real "<stem>.<ext>" name; a file called exactly ``ext``
        # has no extension and must not match.
        _, dot, suffix = name.rpartition('.')
        if not dot or suffix != ext:
            continue
        full_path = join(dir_path, name)
        if isfile(full_path):
            files.append(full_path)

    if not files:
        raise AssertionError(
            'no .{} files found in {}'.format(ext, dir_path))
    return files
def isfloat(str_float):
    """Report whether ``str_float`` can be parsed by the built-in ``float``.

    Returns True when ``float(str_float)`` succeeds and False when it raises
    ``ValueError``. Any other exception propagates to the caller.
    """
    try:
        float(str_float)
    except ValueError:
        return False
    else:
        return True
def mod_repl(match):
    """Regex replacement callback mapping a numeric match to a mod symbol.

    The matched text is parsed as a float, rounded to two decimals and turned
    back into a string. That string is looked up in ``config.ModCHAR``; the
    mapped value is returned, or an empty string when there is no entry.
    """
    key = str(round(float(match.group(0)), 2))
    if key in config.ModCHAR:
        return config.ModCHAR[key]
    return ""
def mod_repl_2(match):
    """Regex replacement callback wrapping a numeric match in brackets.

    The matched text is parsed as a float and rounded to two decimals; the
    result is returned as ``"[<value>]"``, e.g. ``"+57.021"`` -> ``"[57.02]"``.
    """
    rounded = round(float(match.group(0)), 2)
    return '[{}]'.format(str(rounded))
def preprocess_msps(msp_dir, out_dir):
    """Split ``.msp`` files into one spectrum tensor and one peptide file each.

    Every regular file in ``msp_dir`` whose extension is ``msp`` is scanned
    line by line. A record is emitted once a ``Name:`` line, an ``MW:`` line
    and a ``Num peaks:`` line (followed by tab-separated ``m/z<TAB>intensity``
    lines) have been seen, in that order. For each emitted record:

    * ``<out_dir>/spectra/<count>-<file_idx>-<mass>-<charge>-<is_mod>.pt``
      holds a float tensor of length ``spec_size`` (intensities summed per
      integer m/z bin, clipped at 1000, then shifted and scaled).
    * ``<out_dir>/peptides/<same stem>.pep`` holds the peptide letters.

    Records are skipped when the name line does not match the expected
    pattern, when ``len(peptide) + 1`` exceeds the configured
    ``pep_seq_len``, or when the rounded mass is not below ``spec_size``.

    Args:
        msp_dir: Directory containing the ``.msp`` input files.
        out_dir: Output directory. It is deleted first if it already exists.

    Raises:
        AssertionError: If ``msp_dir`` is not a directory or holds no
            ``.msp`` file.
    """
    in_path = Path(msp_dir)
    assert in_path.exists() and in_path.is_dir()
    msp_files = [join(msp_dir, f) for f in listdir(msp_dir)
                 if isfile(join(msp_dir, f)) and f.split('.')[-1] == 'msp']
    assert len(msp_files) > 0

    out_path = Path(out_dir)
    if out_path.exists() and out_path.is_dir():
        shutil.rmtree(out_path)
    out_path.mkdir()
    Path(join(out_path, 'spectra')).mkdir()
    Path(join(out_path, 'peptides')).mkdir()

    print('reading {} files'.format(len(msp_files)))

    # The configuration does not change between files, so read it once.
    spec_size = config.get_config(section='input', key='spec_size')
    seq_len = config.get_config(section='ml', key='pep_seq_len')

    name_rex = re.compile(r"Name:\s(?P<pep>[a-zA-Z]+)/(?P<charge>\d+)"
                          r"(?:_(?P<num_mods>\d+)(?P<mods>.*))?")
    mw_rex = re.compile(r"MW:\s([-+]?[0-9]*\.?[0-9]*)")
    num_peaks_rex = re.compile(r"Num peaks:\s([0-9]*\.?[0-9]*)")

    count = 0
    max_peaks = max_moz = 0
    for species_id, msp_file in enumerate(msp_files):
        print('Reading: {}'.format(msp_file))
        with open(msp_file, "r") as f:
            lines = f.readlines()
        num_lines = len(lines)
        print('len of file: ' + str(num_lines))

        is_name = is_mw = is_num_peaks = False
        prev = 0
        i = 0
        while i < num_lines:
            line = lines[i]
            i += 1

            if line.startswith('Name:'):
                # A new record starts here: drop whatever an incomplete
                # previous record left behind so its state cannot be combined
                # with the fields of this one.
                is_name = is_mw = is_num_peaks = False
                name_groups = name_rex.search(line)
                if not name_groups:
                    continue
                pep = name_groups['pep']
                if len(pep) + 1 > seq_len:
                    continue
                l_charge = int(name_groups['charge'])
                # The "_<n><mods>" suffix is optional in the pattern; a name
                # without it carries no modifications.
                num_mods = int(name_groups['num_mods'] or 0)
                is_name = True

            if is_name and line.startswith('MW:'):
                mass = float(mw_rex.findall(line)[0])
                if round(mass) < spec_size:
                    is_mw = True
                else:
                    is_name = is_mw = is_num_peaks = False
                    continue

            if is_name and is_mw and line.startswith('Num peaks:'):
                num_peaks = int(num_peaks_rex.findall(line)[0])
                if num_peaks > max_peaks:
                    max_peaks = num_peaks

                spec = np.zeros(spec_size)
                # The peak list ends at the first blank line, or at end of
                # file when the last record is not followed by one.
                while i < num_lines and lines[i].strip():
                    mz_splits = lines[i].split('\t')
                    i += 1
                    moz, intensity = float(mz_splits[0]), float(mz_splits[1])
                    if moz > max_moz:
                        max_moz = moz
                    mz_bin = round(moz)
                    # Peaks outside the spectrum vector are dropped instead of
                    # raising IndexError or wrapping around.
                    if 0 <= mz_bin < spec_size:
                        spec[mz_bin] += round(intensity)

                spec = np.clip(spec, None, 1000.0)
                is_num_peaks = True

            if is_name and is_mw and is_num_peaks:
                is_name = is_mw = is_num_peaks = False

                # NOTE(review): 3.725 / 51.479 look like a precomputed dataset
                # mean / std used for standardisation - confirm their origin.
                spec_tensor = torch.tensor(
                    (np.asarray(spec) - 3.725) / 51.479, dtype=torch.float)
                stem = '{}-{}-{}-{}-{}'.format(
                    count, species_id, mass, l_charge, int(num_mods > 0))
                torch.save(spec_tensor, join(out_dir, 'spectra', stem + '.pt'))
                with open(join(out_path, 'peptides', stem + '.pep'), 'w+') as f:
                    f.write(pep)
                count = count + 1

                # Coarse progress report, at most about once per 10 % of file.
                new = int((i / num_lines) * 100)
                if new > prev + 10:
                    print(str(new) + '%')
                    prev = new

        # Running totals over all files processed so far.
        print('max peaks: ' + str(max_peaks))
        print('count: ' + str(count))
        print('max moz: ' + str(max_moz))
def preprocess_mgfs(mgf_dir, out_dir):
    """Split ``.mgf`` files into sparse spectrum arrays and peptide files.

    Every ``.mgf`` file found by ``verify_in_dir`` is scanned line by line.
    A spectrum is emitted when ``PEPMASS``, ``CHARGE`` and ``SEQ`` lines have
    been seen in that order and at least 15 peak lines precede ``END IONS``.
    For each emitted spectrum:

    * ``<out_dir>/spectra/<id>-<mass>-<charge>.npy`` holds a ``2 x K`` array:
      row 0 are bin indices (``round(m/z * 10)``), row 1 the min-max
      normalised intensities. The first ``charge`` columns are indicator
      entries with index ``0..charge-1`` and value 1.
    * ``<out_dir>/peptides/<id>.pep`` holds the sequence with every number
      rewritten as ``[<rounded>]`` (see ``mod_repl_2``).

    ``<id>`` is unique across all input files. After all files are read, the
    per-bin mean and standard deviation of the stored values are written to
    ``<out_dir>/means.npy`` and ``<out_dir>/stds.npy``; bins with (near) zero
    deviation get a standard deviation of ``inf``.

    Args:
        mgf_dir: Directory containing the ``.mgf`` input files.
        out_dir: Output directory. It is recreated from scratch.
    """
    mgf_files = verify_in_dir(mgf_dir, "mgf")
    create_out_dir(out_dir, exist_ok=False)

    print('reading {} files'.format(len(mgf_files)))

    spec_size = config.get_config(section='input', key='spec_size')

    pepmass_rex = re.compile(r"PEPMASS=([-+]?[0-9]*\.?[0-9]*)")
    charge_rex = re.compile(r"CHARGE=([-+]?[0-9]*\.?[0-9]*)")
    mod_repl_rex = re.compile(r'([-+]?\d*\.\d+|[-+]?\d+)')
    header_split_rex = re.compile(' |\t|=')
    peak_split_rex = re.compile(' |\t')

    # Statistics reported at the end of the run.
    ch = np.zeros(20)          # ch[c] = number of stored spectra with charge c
    modified = 0
    unmodified = 0
    unique_pep_set = set()

    # Running per-bin sums used to derive means.npy / stds.npy.
    summ = np.zeros(spec_size)
    sq_sum = np.zeros(spec_size)
    N = 0

    tot_count = 0
    max_moz = 0

    for mgf_file in mgf_files:
        print('Reading: {}'.format(mgf_file))
        with open(mgf_file, "r") as f:
            lines = f.readlines()
        num_lines = len(lines)

        count = lcount = 0
        mass_ign = 0
        # Reported for output compatibility; this function performs no
        # peptide-length or duplicate filtering, so both stay at zero.
        pep_len_ign = 0
        dup_ign = 0

        print('len of file: ' + str(num_lines))

        is_name = is_mw = is_charge = is_seq = False
        prev = 0
        i = 0
        while i < num_lines:
            line = lines[i]
            i += 1

            if line.startswith('PEPMASS'):
                count += 1
                # A new spectrum starts: forget flags left over from an
                # incomplete previous one.
                is_name = is_mw = is_charge = is_seq = False
                mass = float(pepmass_rex.findall(line)[0])
                if round(mass) * 10 < spec_size:
                    is_mw = True
                else:
                    mass_ign += 1
                    continue

            if is_mw and line.startswith('CHARGE'):
                l_charge = int(charge_rex.findall(line)[0])
                is_charge = True
                # NOTE(review): presumably converts the precursor m/z into the
                # neutral mass - confirm against config.PROTON's definition.
                mass = (mass - config.PROTON) * l_charge

            if is_mw and is_charge and line.startswith("SEQ"):
                line = re.sub(r"[()]", "", line.strip()).split('=')[-1]
                pep, num_mods = mod_repl_rex.subn(mod_repl_2, line)
                is_seq = True

            if is_mw and is_charge and is_seq:
                # Leading indicator entries: one per unit of charge.
                ind = list(range(l_charge))
                val = [1] * l_charge

                # Skip the remaining header lines. Stop at END IONS as well,
                # so a spectrum without peaks cannot swallow the next one.
                while (i < num_lines
                       and 'END IONS' not in lines[i].upper()
                       and not isfloat(header_split_rex.split(lines[i])[0])):
                    i += 1

                num_peaks = 0
                while i < num_lines and 'END IONS' not in lines[i].upper():
                    if lines[i] == '\n':
                        i += 1
                        continue
                    mz_splits = peak_split_rex.split(lines[i])
                    i += 1
                    num_peaks += 1
                    moz, intensity = float(mz_splits[0]), float(mz_splits[1])
                    if moz > max_moz:
                        max_moz = moz
                    mz_bin = round(moz * 10)
                    if 0 < mz_bin < spec_size:
                        # Consecutive peaks falling into the same bin are
                        # summed; the comparison must use the rounded bin and
                        # must never merge into a charge indicator entry.
                        if len(ind) > l_charge and ind[-1] == mz_bin:
                            val[-1] += intensity
                        else:
                            ind.append(mz_bin)
                            val.append(intensity)

                if num_peaks < 15 or not ind:
                    is_name = is_mw = is_charge = is_seq = False
                    continue

                ind = np.array(ind)
                val = np.array(val, dtype=float)
                lowest = np.amin(val)
                span = np.amax(val) - lowest
                # Min-max normalisation; a constant vector maps to zeros
                # instead of dividing by zero.
                val = (val - lowest) / span if span > 0 else np.zeros_like(val)
                val[:l_charge] = 1
                assert len(ind) == len(val)
                spec = np.array([ind, val])

                # ``np.add.at`` accumulates correctly even if an index occurs
                # more than once; ``summ[ind] += val`` would keep only one.
                np.add.at(summ, ind, val)
                np.add.at(sq_sum, ind, val ** 2)
                N += 1
                is_name = True

            if is_name and is_mw and is_charge and is_seq:
                is_name = is_mw = is_charge = is_seq = False

                # Name outputs by the global counter: the per-file counter
                # restarts at zero, so files from a second input would
                # overwrite the peptides of the first.
                spec_file_name = '{}-{}-{}.npy'.format(tot_count, mass, l_charge)
                np.save(join(out_dir, 'spectra', spec_file_name), spec)
                pep_file_name = '{}.pep'.format(tot_count)
                with open(join(out_dir, 'peptides', pep_file_name), 'w+') as f:
                    f.write(pep)

                lcount += 1
                tot_count += 1

                if 0 <= l_charge < len(ch):
                    ch[l_charge] += 1
                if num_mods > 0:
                    modified += 1
                else:
                    unmodified += 1
                unique_pep_set.add(pep)

                # Coarse progress report, about once per 10 % of the file.
                new = int((i / num_lines) * 100)
                if new >= prev + 10:
                    print('count: ' + str(lcount))
                    print(str(new) + '%')
                    prev = new

        print('In current file, read {} out of {}'.format(lcount, count))
        print("Ignored: large mass: {}, pep len: {}, dup: {}".format(
            mass_ign, pep_len_ign, dup_ign))
        print('overall running count: ' + str(tot_count))
        print('max moz: ' + str(max_moz))

    print("Statistics:")
    print("Charge distribution:")
    print(ch)
    print("Modified:\t{}".format(modified))
    print("Unmodified:\t{}".format(unmodified))
    print("Unique Peptides:\t{}".format(len(unique_pep_set)))
    print("Sum: {}".format(summ))
    print("Sum-Squared: {}".format(sq_sum))
    print("N: {}".format(N))

    # With no stored spectrum the sums are all zero; dividing by 1 keeps the
    # result at zero instead of producing NaN.
    denom = max(N, 1)
    means = summ / denom
    print("mean: {}".format(means))
    # Rounding can push E[x^2] - E[x]^2 slightly below zero; clamp before sqrt.
    stds = np.sqrt(np.maximum((sq_sum / denom) - means ** 2, 0.0))
    stds[stds < 0.0000001] = float("inf")
    print("std: {}".format(stds))
    np.save(join(out_dir, 'means.npy'), means)
    np.save(join(out_dir, 'stds.npy'), stds)