Spaces:

bilalsm
/

specollate-search

Sleeping

App Files Files Community

specollate-search / src /snapsearch /preprocess.py

bilalsm

Upload folder using huggingface_hub

eaca1e1 verified 7 months ago

raw

history blame contribute delete

6.31 kB

	from os import listdir
	from os.path import isfile, join
	from pathlib import Path
	import shutil
	import re
	import math

	import numpy as np

	from src.snapconfig import config


	def create_out_dir(dir_path, exist_ok=True):
	    """Create the output tree ``dir_path`` with ``spectra`` and ``peptides`` inside.

	    Args:
	        dir_path: Root directory of the output tree. Missing parent
	            directories are created as well.
	        exist_ok: When False, an existing directory at ``dir_path`` is
	            removed with all of its contents and recreated empty. When
	            True, an existing directory and whatever it already holds are
	            kept, and only the missing sub-directories are added.

	    Raises:
	        FileExistsError: If ``dir_path`` (or one of the two sub-directory
	            names) exists but is not a directory.
	    """
	    out_path = Path(dir_path)

	    # Wipe a previous run only when the caller explicitly asked for a fresh tree.
	    if not exist_ok and out_path.is_dir():
	        shutil.rmtree(out_path)

	    # exist_ok=True on every mkdir makes a repeated call with the default
	    # arguments a no-op instead of a FileExistsError on the sub-directories.
	    out_path.mkdir(parents=True, exist_ok=True)
	    for sub_dir in ('spectra', 'peptides'):
	        (out_path / sub_dir).mkdir(exist_ok=True)


	def verify_in_dir(dir_path, ext, ignore_list=None):
	    """Return the paths of the visible ``*.ext`` files directly inside ``dir_path``.

	    Args:
	        dir_path: Directory to scan (not recursive).
	        ext: File extension to keep, without the leading dot (e.g. ``"mgf"``).
	        ignore_list: Optional iterable of file names (not paths) to skip.

	    Returns:
	        A non-empty list of ``join(dir_path, name)`` paths, sorted by file
	        name so the processing order is reproducible across runs.

	    Raises:
	        AssertionError: If ``dir_path`` is not an existing directory or no
	            matching file is found. Raised explicitly so the check is not
	            stripped when Python runs with ``-O``.
	    """
	    in_path = Path(dir_path)
	    if not in_path.is_dir():
	        raise AssertionError(
	            'input directory does not exist: {}'.format(dir_path))

	    ignored = set(ignore_list) if ignore_list else set()
	    # Requiring the dot keeps an extension-less file named exactly `ext`
	    # (e.g. a file called "mgf") from being mistaken for a match.
	    suffix = '.' + ext

	    files = [
	        join(dir_path, f)
	        for f in sorted(listdir(dir_path))
	        if isfile(join(dir_path, f))
	        and not f.startswith('.')
	        and f.endswith(suffix)
	        and f not in ignored
	    ]
	    if not files:
	        raise AssertionError(
	            'no .{} files found in {}'.format(ext, dir_path))
	    return files


	def isfloat(str_float):
	    """Report whether ``float(str_float)`` succeeds without a ValueError."""
	    try:
	        float(str_float)
	    except ValueError:
	        # Not parseable as a floating point literal.
	        return False
	    else:
	        return True


	def mod_repl(match):
	    """Regex replacement callback: map a matched number to its ``config.ModCHAR`` entry.

	    The matched text is parsed as a float, rounded to two decimals and
	    turned back into a string; that string is the lookup key. An empty
	    string is returned when ``config.ModCHAR`` has no such key.
	    """
	    rounded_key = str(round(float(match.group(0)), 2))
	    if rounded_key in config.ModCHAR:
	        return config.ModCHAR[rounded_key]
	    return ""


	def mod_repl_2(match):
	    """Regex replacement callback: rewrite a matched number as ``[<value>]``.

	    The matched text is parsed as a float and rounded to two decimals
	    before being wrapped in square brackets.
	    """
	    rounded_value = round(float(match.group(0)), 2)
	    return '[{}]'.format(rounded_value)


	def _mgf_header_value(pattern, line, mgf_file):
	    """Return capture group 1 of ``pattern`` in ``line``; raise ValueError if absent."""
	    found = pattern.search(line)
	    if found is None:
	        raise ValueError(
	            'cannot parse header line {!r} in {}'.format(line.strip(), mgf_file))
	    return found.group(1)


	def _skip_to_peaks(lines, start):
	    """Return the index of the first peak line or END IONS line at/after ``start``."""
	    pos = start
	    while pos < len(lines):
	        if 'END IONS' in lines[pos].upper():
	            break
	        fields = lines[pos].split()
	        # A peak line is recognised by a numeric first column; header lines
	        # ("KEY=value") and blank lines are stepped over.
	        if fields and isfloat(fields[0]):
	            break
	        pos += 1
	    return pos


	def _read_peak_block(lines, start, spec_size):
	    """Bin the peak lines from ``lines[start]`` up to the next END IONS line.

	    Returns:
	        ``(next_index, bins, intensities, num_peaks, largest_bin)`` where
	        ``next_index`` points at the END IONS line (or ``len(lines)`` when
	        the file ends first), ``bins``/``intensities`` hold only the peaks
	        with ``0 < bin < spec_size``, ``num_peaks`` counts every peak line
	        read and ``largest_bin`` is the biggest bin seen, in range or not.
	    """
	    bins = []
	    intensities = []
	    num_peaks = 0
	    largest_bin = 0
	    pos = start
	    while pos < len(lines) and 'END IONS' not in lines[pos].upper():
	        fields = lines[pos].split()
	        pos += 1
	        if not fields:
	            continue  # blank line inside the peak list
	        num_peaks += 1
	        moz = round(float(fields[0]) * 10)  # m/z scaled by 10 and rounded
	        intensity = math.sqrt(float(fields[1]) + 1.0)  # +1 avoids sqrt(0)
	        if moz > largest_bin:
	            largest_bin = moz
	        if 0 < moz < spec_size:
	            if bins and bins[-1] == moz:
	                # Consecutive peaks landing in the same bin keep the strongest.
	                intensities[-1] = max(intensity, intensities[-1])
	            else:
	                bins.append(moz)
	                intensities.append(intensity)
	    return pos, bins, intensities, num_peaks, largest_bin


	def preprocess_mgfs(mgf_dir, out_dir):
	    """Convert every ``.mgf`` file in ``mgf_dir`` into per-spectrum ``.npy`` files.

	    For each spectrum that has a PEPMASS header followed by a CHARGE header
	    and at least 10 peak lines, the peaks are binned (m/z * 10, rounded),
	    square-root transformed, min-max scaled to [0, 1] and saved to
	    ``out_dir/spectra/<index>-<mass>-<charge>.npy`` as a 2-row array of
	    (bin indices, scaled intensities). ``<mass>`` is
	    ``(PEPMASS - config.PROTON) * charge``. Per-bin mean and standard
	    deviation over all saved spectra are written to ``out_dir/means.npy``
	    and ``out_dir/stds.npy``.

	    Args:
	        mgf_dir: Directory containing the ``.mgf`` input files.
	        out_dir: Output directory; it is deleted and recreated if present.

	    Raises:
	        AssertionError: Propagated from ``verify_in_dir`` when ``mgf_dir``
	            is unusable.
	        ValueError: If a PEPMASS or CHARGE header carries no parseable number.
	    """
	    mgf_files = verify_in_dir(mgf_dir, "mgf")
	    create_out_dir(out_dir, exist_ok=False)

	    print('reading {} files'.format(len(mgf_files)))

	    spec_size = config.get_config(section='input', key='spec_size')

	    # Raw-string patterns; the quantifiers let multi-digit values such as
	    # "PEPMASS=1234.5678 9000" and "CHARGE=2+" parse in full.
	    pepmass_re = re.compile(
	        r"PEPMASS=\s*([-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?)")
	    charge_re = re.compile(r"CHARGE=\s*([-+]?[0-9]+)")

	    # These counters are never updated in this function; they are kept only
	    # because the final report prints them.
	    ch = np.zeros(20)
	    modified = 0
	    unmodified = 0
	    unique_pep_set = set()

	    # Running per-bin sum and sum of squares over all saved spectra.
	    summ = np.zeros(spec_size)
	    sq_sum = np.zeros(spec_size)
	    N = 0

	    tot_count = 0
	    max_moz = 0
	    for mgf_file in mgf_files:
	        print('Reading: {}'.format(mgf_file))

	        with open(mgf_file, "r") as f:
	            lines = f.readlines()

	        count = lcount = 0

	        # Reported below but never incremented in this function.
	        mass_ign = 0
	        pep_len_ign = 0
	        dup_ign = 0

	        n_lines = len(lines)
	        print('len of file: ' + str(n_lines))
	        have_mass = False
	        mass = None
	        prev = 0
	        i = 0
	        while i < n_lines:
	            line = lines[i]
	            i += 1

	            if line.startswith('BEGIN IONS'):
	                # Never carry a PEPMASS over from a spectrum that had no CHARGE.
	                have_mass = False
	                continue

	            if line.startswith('PEPMASS'):
	                count += 1
	                mass = float(_mgf_header_value(pepmass_re, line, mgf_file))
	                have_mass = True
	                continue

	            if not (have_mass and line.startswith('CHARGE')):
	                continue

	            l_charge = int(_mgf_header_value(charge_re, line, mgf_file))
	            mass = (mass - config.PROTON) * l_charge
	            have_mass = False

	            i = _skip_to_peaks(lines, i)
	            i, spec_ind, spec_val, num_peaks, largest_bin = _read_peak_block(
	                lines, i, spec_size)
	            if largest_bin > max_moz:
	                max_moz = largest_bin

	            # Too few peaks, or none inside [1, spec_size): nothing to save.
	            if num_peaks < 10 or not spec_ind:
	                continue

	            ind = np.array(spec_ind)
	            val = np.array(spec_val)
	            lowest = np.amin(val)
	            span = np.amax(val) - lowest
	            # A flat spectrum has span == 0; divide by 1 so it scales to zeros
	            # instead of NaN, which would corrupt the running statistics.
	            val = (val - lowest) / (span if span > 0 else 1.0)

	            spec = np.array([ind, val])

	            summ[ind] += val
	            sq_sum[ind] += val ** 2
	            N += 1

	            # NOTE(review): lcount restarts for every input file, so two files
	            # could produce the same name if mass and charge also coincide.
	            spec_file_name = '{}-{}-{}.npy'.format(lcount, mass, l_charge)
	            np.save(join(out_dir, 'spectra', spec_file_name), spec)

	            lcount += 1
	            tot_count += 1

	            # Progress report roughly every 10% of the file.
	            new = int((i / n_lines) * 100)
	            if new >= prev + 10:
	                print('count: ' + str(lcount))
	                print(str(new) + '%')
	                prev = new

	        print('In current file, read {} out of {}'.format(lcount, count))
	        print("Ignored: large mass: {}, pep len: {}, dup: {}".format(mass_ign, pep_len_ign, dup_ign))
	        print('overall running count: ' + str(tot_count))
	        print('max moz: ' + str(max_moz))

	    print("Statistics:")
	    print("Charge distribution:")
	    print(ch)
	    print("Modified:\t{}".format(modified))
	    print("Unmodified:\t{}".format(unmodified))
	    print("Unique Peptides:\t{}".format(len(unique_pep_set)))
	    print("Sum: {}".format(summ))
	    print("Sum-Squared: {}".format(sq_sum))
	    print("N: {}".format(N))
	    # With no saved spectrum, divide by 1 so the statistics are zeros, not NaN.
	    denom = N if N > 0 else 1
	    means = summ / denom
	    print("mean: {}".format(means))
	    # Clamp at zero: rounding can push E[x^2] - E[x]^2 slightly negative,
	    # and sqrt of that would be NaN, which slips past the threshold below.
	    variance = np.maximum((sq_sum / denom) - means ** 2, 0.0)
	    stds = np.sqrt(variance)
	    # Bins with (near) zero spread get an infinite std.
	    stds[stds < 0.0000001] = float("inf")
	    print("std: {}".format(stds))
	    np.save(join(out_dir, 'means.npy'), means)
	    np.save(join(out_dir, 'stds.npy'), stds)