Buckets:

AlienChen
/

Storage

about 1 month ago

1.02 kB

	import pandas as pd

	# Load the training table from its fixed location on the cluster filesystem.
	df = pd.read_csv('/scratch/pranamlab/tong/SMILES_BindEvaluator/datasets/train.csv')

	# Summed length of every entry in the 'Target' column; this is the total
	# number of positions over which binding / non-binding sites are counted.
	# NOTE(review): presumably each entry is a sequence string with one
	# position per character -- confirm against the dataset.
	targets = df['Target'].tolist()
	total_length = sum(map(len, targets))

	def parse_sites(x):
	    """Count the binding-site indices listed in a comma-separated value.

	    Despite its name, this function returns the *number* of sites rather
	    than the parsed list, because the results are summed across rows.
	    Missing values yield 0 (not an empty list) so that every return value
	    is an int and can be summed safely.

	    Examples:
	        "49,50,51" -> 3
	        '"49, 50"' -> 2
	        "", None, NaN -> 0

	    Args:
	        x: Raw cell value. It is converted with ``str()`` before parsing
	            and may be wrapped in one pair of single or double quotes.

	    Returns:
	        int: Number of non-empty comma-separated tokens, 0 when the value
	        is missing or empty.

	    Raises:
	        ValueError: If a non-empty token is not a valid integer literal.
	    """
	    if x is None:
	        return 0

	    s = str(x).strip()
	    # A float NaN stringifies to "nan"; treat it like an empty cell.
	    if s == "" or s.lower() == "nan":
	        return 0

	    # Remove one pair of surrounding quotes, if present.
	    if (s.startswith('"') and s.endswith('"')) or (s.startswith("'") and s.endswith("'")):
	        s = s[1:-1].strip()
	        if s == "":
	            return 0

	    count = 0
	    for token in s.split(","):
	        token = token.strip()
	        if token:
	            # Conversion is kept purely as validation: a malformed index
	            # should fail loudly instead of being silently counted.
	            int(token)
	            count += 1
	    return count

	# Count positive (binding) positions by summing the per-row site counts,
	# then treat every remaining position as negative (non-binding).
	# NOTE(review): this assumes each listed site lies inside its own target
	# sequence and is listed only once -- confirm against the dataset.
	binding_sites = df['Binding Sites'].tolist()
	num_binding_sites = sum(map(parse_sites, binding_sites))
	num_non_binding_sites = total_length - num_binding_sites

	# Each weight is total / (2 * class_count), so both classes end up with
	# the same aggregate weight (total / 2 each).
	# NOTE: raises ZeroDivisionError if either class has no positions.
	positive_denominator = 2 * num_binding_sites
	negative_denominator = 2 * num_non_binding_sites
	weight_pos = total_length / positive_denominator
	weight_neg = total_length / negative_denominator

	print(f"Positive Weight: {weight_pos}")
	print(f"Negative Weight: {weight_neg}")

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.