Spaces:

bilalsm
/

specollate-search

Sleeping

App Files Files Community

specollate-search / src /snaputils /preprocess.py

bilalsm

Upload folder using huggingface_hub

eaca1e1 verified 7 months ago

raw

history blame contribute delete

12.3 kB

	from os import listdir
	from os.path import isfile, join
	from pathlib import Path
	import shutil
	import re

	import numpy as np
	import torch

	from src.snapconfig import config


	def create_out_dir(dir_path, exist_ok=True):
	    """Prepare the output directory layout used by the preprocessing routines.

	    After the call ``dir_path`` exists and contains the ``spectra`` and
	    ``peptides`` sub-directories.

	    Args:
	        dir_path: Path of the output directory.
	        exist_ok: When True (default) an existing directory and its contents
	            are kept. When False an existing directory is removed
	            recursively and recreated empty.

	    Raises:
	        FileExistsError: If ``dir_path`` exists but is not a directory.
	    """
	    out_path = Path(dir_path)
	    if out_path.is_dir() and not exist_ok:
	        shutil.rmtree(out_path)

	    # Idempotent creation: re-running with exist_ok=True must not fail on the
	    # sub-directories left behind by a previous run, and missing parent
	    # directories are created on the way.
	    out_path.mkdir(parents=True, exist_ok=True)
	    for sub_dir in ('spectra', 'peptides'):
	        (out_path / sub_dir).mkdir(exist_ok=True)


	def verify_in_dir(dir_path, ext, ignore_list=None):
	    """Return the regular files in ``dir_path`` that carry the extension ``ext``.

	    Hidden files (names starting with ``.``) and names listed in
	    ``ignore_list`` are skipped. The scan is not recursive.

	    Args:
	        dir_path: Directory to scan.
	        ext: Extension to match, without the leading dot (e.g. ``"mgf"``).
	        ignore_list: Optional collection of file names (not paths) to skip.

	    Returns:
	        List of ``join(dir_path, name)`` paths, in ``listdir`` order.

	    Raises:
	        AssertionError: If ``dir_path`` is not an existing directory or no
	            file matches.
	    """
	    ignored = () if ignore_list is None else ignore_list

	    # Raised explicitly rather than via ``assert`` so the checks survive
	    # ``python -O`` while callers still see the same exception type.
	    if not Path(dir_path).is_dir():
	        raise AssertionError('{} is not an existing directory'.format(dir_path))

	    # Comparing the real suffix avoids accepting a file that is merely
	    # *named* like the extension (e.g. a file called "mgf").
	    wanted_suffix = '.' + ext
	    files = [join(dir_path, name) for name in listdir(dir_path)
	             if not name.startswith('.')
	             and name not in ignored
	             and Path(name).suffix == wanted_suffix
	             and isfile(join(dir_path, name))]
	    if not files:
	        raise AssertionError('no .{} files found in {}'.format(ext, dir_path))
	    return files


	def isfloat(str_float):
	    """Tell whether ``float()`` accepts ``str_float`` without a ValueError."""
	    try:
	        float(str_float)
	    except ValueError:
	        return False
	    else:
	        return True


	def mod_repl(match):
	    """Replacement callback: translate a matched number via ``config.ModCHAR``.

	    The matched text is parsed as a float, rounded to two decimals and
	    converted back to a string to form the lookup key. A key that is not
	    present in ``config.ModCHAR`` yields the empty string.
	    """
	    mod_key = str(round(float(match.group(0)), 2))
	    if mod_key in config.ModCHAR:
	        return config.ModCHAR[mod_key]
	    return ""


	def mod_repl_2(match):
	    """Replacement callback: wrap the matched number, rounded to 2 decimals, in brackets."""
	    rounded = round(float(match.group(0)), 2)
	    return '[{}]'.format(str(rounded))


	def preprocess_msps(msp_dir, out_dir):
	    """Convert the ``.msp`` files in ``msp_dir`` into per-spectrum output files.

	    ``out_dir`` is deleted if it exists and recreated with ``spectra`` and
	    ``peptides`` sub-directories. For every accepted record two files sharing
	    the base name ``<count>-<species_id>-<mass>-<charge>-<is_modified>`` are
	    written: ``spectra/<base>.pt`` (a float tensor of length ``spec_size``)
	    and ``peptides/<base>.pep`` (the peptide letters from the ``Name:`` line).
	    ``species_id`` is the position of the source file in the file listing and
	    ``count`` runs across all files.

	    A record is accepted when its ``Name:`` line parses, the peptide length
	    plus one does not exceed the configured ``pep_seq_len``, and the rounded
	    ``MW:`` value is below the configured ``spec_size``.

	    Args:
	        msp_dir: Directory holding the ``.msp`` input files.
	        out_dir: Output directory (recreated from scratch).

	    Raises:
	        AssertionError: If ``msp_dir`` is not a directory or holds no
	            ``.msp`` file (raised by ``verify_in_dir``).
	    """
	    # Reuse the module helpers instead of repeating the directory handling.
	    msp_files = verify_in_dir(msp_dir, 'msp')
	    create_out_dir(out_dir, exist_ok=False)

	    print('reading {} files'.format(len(msp_files)))

	    # Looked up once; the values do not change between files.
	    spec_size = config.get_config(section='input', key='spec_size')
	    seq_len = config.get_config(section='ml', key='pep_seq_len')

	    name_rex = re.compile(r"Name:\s(?P<pep>[a-zA-Z]+)/(?P<charge>\d+)"
	                          r"(?:_(?P<num_mods>\d+)(?P<mods>.*))?")
	    # Capture the whole number, not just its first two or three characters.
	    mass_rex = re.compile(r"MW:\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)")
	    peaks_rex = re.compile(r"Num peaks:\s*(\d+)")

	    count = 0
	    max_peaks = max_moz = 0
	    for species_id, msp_file in enumerate(msp_files):
	        print('Reading: {}'.format(msp_file))

	        with open(msp_file, "r") as msp_handle:
	            lines = msp_handle.readlines()
	        num_lines = len(lines)
	        print('len of file: ' + str(num_lines))

	        is_name = is_mw = False
	        prev = 0
	        i = 0
	        while i < num_lines:
	            line = lines[i]
	            i += 1

	            if line.startswith('Name:'):
	                # A new record starts: forget whatever an incomplete previous
	                # record left behind so its fields cannot leak into this one.
	                is_name = is_mw = False
	                name_groups = name_rex.search(line)
	                if not name_groups:
	                    continue

	                pep = name_groups['pep']
	                if len(pep) + 1 > seq_len:
	                    continue

	                l_charge = int(name_groups['charge'])
	                # The "_<n><mods>" suffix is optional; absent means no mods.
	                num_mods = int(name_groups['num_mods'] or 0)
	                is_name = True

	            elif is_name and line.startswith('MW:'):
	                mass_match = mass_rex.search(line)
	                mass = float(mass_match.group(1)) if mass_match else None
	                if mass is not None and round(mass) < spec_size:
	                    is_mw = True
	                else:
	                    # Too heavy (or unparsable): drop the record.
	                    is_name = is_mw = False

	            elif is_name and is_mw and line.startswith('Num peaks:'):
	                peaks_match = peaks_rex.search(line)
	                if peaks_match:
	                    max_peaks = max(max_peaks, int(peaks_match.group(1)))

	                # The peak list runs until the next blank line (or end of file).
	                spec = np.zeros(spec_size)
	                while i < num_lines and lines[i].strip():
	                    mz_splits = lines[i].split(None, 2)
	                    i += 1
	                    moz, intensity = float(mz_splits[0]), float(mz_splits[1])
	                    max_moz = max(max_moz, moz)
	                    bin_idx = round(moz)
	                    # Peaks outside the spectrum vector are skipped instead of
	                    # raising IndexError or wrapping around to the far end.
	                    if 0 <= bin_idx < spec_size:
	                        spec[bin_idx] += round(intensity)

	                spec = np.clip(spec, None, 1000.0)

	                # NOTE(review): 3.725 and 51.479 look like a precomputed
	                # mean/std used for standardisation - confirm their origin.
	                spec_tensor = torch.tensor((spec - 3.725) / 51.479, dtype=torch.float)

	                base_name = '{}-{}-{}-{}-{}'.format(
	                    count, species_id, mass, l_charge, int(num_mods > 0))
	                torch.save(spec_tensor, join(out_dir, 'spectra', base_name + '.pt'))
	                with open(join(out_dir, 'peptides', base_name + '.pep'), 'w') as pep_handle:
	                    pep_handle.write(pep)

	                count += 1
	                is_name = is_mw = False

	                new = int((i / num_lines) * 100)
	                if new > prev + 10:
	                    print(str(new) + '%')
	                    prev = new

	        print('max peaks: ' + str(max_peaks))
	        print('count: ' + str(count))
	        print('max moz: ' + str(max_moz))


	def preprocess_mgfs(mgf_dir, out_dir):
	    """Convert the ``.mgf`` files in ``mgf_dir`` into sparse spectra and peptide files.

	    ``out_dir`` is recreated from scratch. For every accepted spectrum the
	    function writes ``spectra/<idx>-<mass>-<charge>.npy`` (a 2 x K array:
	    row 0 holds bin indices, row 1 the min-max normalised intensities) and
	    ``peptides/<idx>.pep`` (the ``SEQ`` value with numeric modifications
	    rewritten by ``mod_repl_2``). ``idx`` is unique across all input files so
	    that records from different files never overwrite each other. Per-bin
	    ``means.npy`` and ``stds.npy`` over all accepted spectra are saved in
	    ``out_dir`` at the end.

	    A spectrum is accepted when ``round(PEPMASS) * 10 < spec_size``, the
	    record has ``CHARGE`` and ``SEQ`` lines (in that order, after
	    ``PEPMASS``) and at least 15 peak lines. Bin indices are
	    ``round(m/z * 10)``; the first ``charge`` entries of each spectrum are
	    placeholder bins ``0..charge-1`` with value 1.

	    Args:
	        mgf_dir: Directory holding the ``.mgf`` input files.
	        out_dir: Output directory (deleted and recreated).

	    Raises:
	        AssertionError: If ``mgf_dir`` is not a directory or holds no
	            ``.mgf`` file (raised by ``verify_in_dir``).
	    """
	    mgf_files = verify_in_dir(mgf_dir, "mgf")
	    create_out_dir(out_dir, exist_ok=False)

	    print('reading {} files'.format(len(mgf_files)))

	    spec_size = config.get_config(section='input', key='spec_size')

	    # Capture complete numbers; alternation/character classes replace the
	    # literal-pipe patterns that could never match real input.
	    pepmass_rex = re.compile(r"PEPMASS=\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)")
	    charge_rex = re.compile(r"CHARGE=\s*([-+]?\d+)")
	    mod_rex = re.compile(r'([-+]?\d*\.\d+|[-+]?\d+)')
	    header_split_rex = re.compile(r'[ \t=]')

	    # These statistics are reported below but nothing in this function
	    # updates them; they are kept so the printed report stays unchanged.
	    ch = np.zeros(20)
	    modified = 0
	    unmodified = 0
	    unique_pep_set = set()

	    summ = np.zeros(spec_size)
	    sq_sum = np.zeros(spec_size)
	    N = 0

	    tot_count = 0
	    max_moz = 0
	    for mgf_file in mgf_files:
	        print('Reading: {}'.format(mgf_file))

	        with open(mgf_file, "r") as mgf_handle:
	            lines = mgf_handle.readlines()
	        num_lines = len(lines)

	        count = lcount = 0
	        mass_ign = 0
	        pep_len_ign = 0
	        dup_ign = 0

	        print('len of file: ' + str(num_lines))
	        is_mw = is_charge = is_seq = False
	        prev = 0
	        i = 0
	        while i < num_lines:
	            line = lines[i]
	            i += 1

	            if line.startswith('BEGIN IONS'):
	                # New record: fields of an incomplete previous record must
	                # not be combined with this one.
	                is_mw = is_charge = is_seq = False
	                continue

	            if line.startswith('PEPMASS'):
	                count += 1
	                mass_match = pepmass_rex.search(line)
	                if mass_match is None:
	                    is_mw = is_charge = is_seq = False
	                    continue
	                mass = float(mass_match.group(1))
	                if round(mass) * 10 < spec_size:
	                    is_mw = True
	                else:
	                    is_mw = is_charge = is_seq = False
	                    mass_ign += 1
	                continue

	            if is_mw and not is_charge and line.startswith('CHARGE'):
	                charge_match = charge_rex.search(line)
	                if charge_match is None:
	                    is_mw = is_charge = is_seq = False
	                    continue
	                l_charge = int(charge_match.group(1))
	                is_charge = True
	                mass = (mass - config.PROTON) * l_charge
	                continue

	            if is_mw and is_charge and line.startswith("SEQ"):
	                seq = re.sub(r"[()]", "", line.strip()).split('=')[-1]
	                pep = mod_rex.sub(mod_repl_2, seq)
	                is_seq = True

	            if not (is_mw and is_charge and is_seq):
	                continue

	            # The record header is complete; whatever happens below, the
	            # next record starts from a clean state.
	            is_mw = is_charge = is_seq = False

	            # Placeholder entries: bins 0..charge-1 carry the value 1.
	            n_slots = max(l_charge, 0)
	            ind = list(range(n_slots))
	            val = [1.0] * n_slots

	            # Skip remaining header lines: the peak list starts at the first
	            # line whose leading token is numeric. Stop at END IONS / EOF so
	            # an empty peak list cannot run into the next record.
	            while (i < num_lines
	                   and 'END IONS' not in lines[i].upper()
	                   and not isfloat(header_split_rex.split(lines[i])[0])):
	                i += 1

	            num_peaks = 0
	            while i < num_lines and 'END IONS' not in lines[i].upper():
	                mz_splits = lines[i].split()
	                i += 1
	                if not mz_splits:
	                    continue  # blank line inside the peak list
	                num_peaks += 1
	                moz, intensity = float(mz_splits[0]), float(mz_splits[1])
	                max_moz = max(max_moz, moz)
	                bin_idx = round(moz * 10)  # one-decimal m/z precision
	                if 0 < bin_idx < spec_size:
	                    # Consecutive peaks landing in the same bin are merged.
	                    if len(ind) > n_slots and ind[-1] == bin_idx:
	                        val[-1] += intensity
	                    else:
	                        ind.append(bin_idx)
	                        val.append(intensity)

	            if num_peaks < 15 or not ind:
	                continue

	            ind = np.array(ind, dtype=np.int64)
	            val = np.array(val, dtype=float)
	            lowest = np.amin(val)
	            span = np.amax(val) - lowest
	            # A flat spectrum would otherwise divide by zero and yield NaNs.
	            val = (val - lowest) / span if span > 0 else np.zeros_like(val)
	            val[:n_slots] = 1
	            assert len(ind) == len(val)
	            spec = np.array([ind, val])

	            # np.add.at accumulates correctly even if a bin index repeats;
	            # plain fancy-index "+=" would count such a bin only once.
	            np.add.at(summ, ind, val)
	            np.add.at(sq_sum, ind, val ** 2)
	            N += 1

	            # tot_count (not the per-file counter) keeps names unique when
	            # several MGF files are processed into the same directory.
	            spec_file_name = '{}-{}-{}.npy'.format(tot_count, mass, l_charge)
	            np.save(join(out_dir, 'spectra', spec_file_name), spec)

	            pep_file_name = '{}.pep'.format(tot_count)
	            with open(join(out_dir, 'peptides', pep_file_name), 'w') as pep_handle:
	                pep_handle.write(pep)

	            lcount += 1
	            tot_count += 1

	            new = int((i / num_lines) * 100)
	            if new >= prev + 10:
	                print('count: ' + str(lcount))
	                print(str(new) + '%')
	                prev = new

	        print('In current file, read {} out of {}'.format(lcount, count))
	        print("Ignored: large mass: {}, pep len: {}, dup: {}".format(mass_ign, pep_len_ign, dup_ign))
	        print('overall running count: ' + str(tot_count))
	        print('max moz: ' + str(max_moz))

	    print("Statistics:")
	    print("Charge distribution:")
	    print(ch)
	    print("Modified:\t{}".format(modified))
	    print("Unmodified:\t{}".format(unmodified))
	    print("Unique Peptides:\t{}".format(len(unique_pep_set)))
	    print("Sum: {}".format(summ))
	    print("Sum-Squared: {}".format(sq_sum))
	    print("N: {}".format(N))
	    # Guard against an empty data set (N == 0) and against tiny negative
	    # variances produced by floating-point cancellation.
	    denom = max(N, 1)
	    means = summ / denom
	    print("mean: {}".format(means))
	    stds = np.sqrt(np.maximum((sq_sum / denom) - means**2, 0.0))
	    # Bins with (near-)zero spread get an infinite std.
	    stds[stds < 0.0000001] = float("inf")
	    print("std: {}".format(stds))
	    np.save(join(out_dir, 'means.npy'), means)
	    np.save(join(out_dir, 'stds.npy'), stds)