Add BERTose and AFFINose training code release

1d6f391 verified about 2 months ago

19 kB

	"""
	Build combined Carbogrove + glycowork dataset for Bertint V5.

	Steps:
	1. Load Carbogrove processed data (315K records)
	2. Load glycowork binding data (wide) → long (620K records)
	3. Merge and filter to proteins with sequences
	4. Compute per-experiment RANK PERCENTILE as training target
	5. Compute auxiliary targets (z-scores) for ablation
	6. Save combined dataset + sequence files

	Output columns:
	protein_id - canonical protein name
	glycan_wurcs - WURCS string (for Bertose)
	target_raw - original FractionBound or Z-score
	target_rank - per-experiment rank percentile (0,1] ← PRIMARY
	target_zscore - z-score per source (global) ← ABLATION
	target_zscore_exp - z-score within each experiment ← ABLATION
	log_conc - log10(concentration+1) or -1 (unknown)
	has_conc - 1 if concentration known, 0 if not
	source - 'carbogrove' or 'glycowork'
	data_source - platform name (CG) or 'glycowork'
	exp_id - experiment identifier
	exp_size - records in this experiment

	Canonical IDs:
	- Protein: amino acid sequence (from UniProt or glycowork 'target')
	- Glycan: WURCS string (for Bertose embeddings)
	"""

	import pandas as pd
	import numpy as np
	import csv
	import json
	import os
	import logging
	from pathlib import Path

	# Root logging setup: INFO and above, each record prefixed with a
	# timestamp and its level name.
	logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO)
	logger = logging.getLogger(__name__)

	# ── Paths ──────────────────────────────────────────────────────────
	# BASE is the parent of the directory that contains this file; every
	# input and output directory hangs directly off it.
	BASE = Path(__file__).parent.parent
	DATA_DIR = BASE.joinpath('19274777-2')
	PROCESSED_DIR = BASE.joinpath('processed_v5')
	OUTPUT_DIR = BASE.joinpath('combined_dataset')

	# ── Step 1: Load Carbogrove ────────────────────────────────────────
	def load_carbogrove() -> tuple:
	    """
	    Load Carbogrove binding records plus protein-sequence and glycan lookups.

	    Reads ``training_data.csv`` from PROCESSED_DIR, attaches a WURCS string
	    to every record via ``AllGlycanInfo_V2.csv``, drops records without one,
	    attaches the platform name from ``AllDataInfo_V2.csv`` and adds the
	    standardized columns consumed by the merge step.

	    Returns:
	        A 3-tuple ``(cg, prot_seqs, glycan_info)``:
	        cg          - DataFrame of Carbogrove records with the added columns
	                      WURCS, data_source, source, target_raw, log_conc,
	                      has_conc and exp_id.
	        prot_seqs   - dict: protein name -> {'sequence': ..., 'source': ...}.
	                      UniProt-fetched entries take precedence over
	                      recovered ones.
	        glycan_info - dict: GlycanID (str, as read from the CSV) -> WURCS.
	    """
	    logger.info("Loading Carbogrove data...")

	    # Load main binding data
	    cg = pd.read_csv(PROCESSED_DIR / 'training_data.csv')
	    logger.info(f" Base Carbogrove: {len(cg):,} records, "
	                f"{cg['LectinName'].nunique()} lectins")

	    # Load glycan info for WURCS mapping.
	    # `row.get(...) or ''` also covers short rows, for which DictReader
	    # fills the missing field with None instead of omitting the key.
	    # NOTE(review): keys are strings here; if pandas parses
	    # training_data.csv's GlycanID column as integers the .map() below
	    # will not match — confirm GlycanID is non-numeric.
	    glycan_info = {}
	    with open(DATA_DIR / 'AllGlycanInfo_V2.csv', newline='') as f:
	        for row in csv.DictReader(f):
	            wurcs = (row.get('WURCS') or '').strip()
	            if wurcs:
	                glycan_info[row['GlycanID']] = wurcs

	    # Load ALL protein sequences:
	    # 1. UniProt-fetched sequences (most complete)
	    prot_seqs = {}
	    cg_seq_path = OUTPUT_DIR / 'carbogrove_protein_sequences.csv'
	    if cg_seq_path.exists():
	        cg_seqs = pd.read_csv(cg_seq_path)
	        for pid, seq, src in zip(cg_seqs['protein_id'],
	                                 cg_seqs['sequence'],
	                                 cg_seqs['source']):
	            prot_seqs[pid] = {'sequence': seq, 'source': src}
	        logger.info(f" Loaded {len(prot_seqs)} Carbogrove protein sequences")

	    # 2. Recovered sequences (lectins without UniProt IDs)
	    recovered_path = DATA_DIR / 'recovered_sequences.csv'
	    if recovered_path.exists():
	        recovered = pd.read_csv(recovered_path)
	        recovered = recovered[recovered['Status'] == 'RECOVERED']
	        n_recovered = 0
	        for name, seq, src in zip(recovered['LectinName'],
	                                  recovered['Sequence'],
	                                  recovered['Source']):
	            # Never override an existing entry; skip missing or
	            # implausibly short sequences (20 residues or fewer).
	            if name in prot_seqs or pd.isna(seq) or len(str(seq)) <= 20:
	                continue
	            prot_seqs[name] = {'sequence': seq, 'source': src}
	            n_recovered += 1
	        # Count only the entries actually inserted, not every RECOVERED row.
	        logger.info(f" Added {n_recovered} recovered lectin sequences")

	    # Add WURCS to Carbogrove
	    cg['WURCS'] = cg['GlycanID'].map(glycan_info)
	    n_with_wurcs = cg['WURCS'].notna().sum()
	    logger.info(f" Records with WURCS: {n_with_wurcs:,} / {len(cg):,}")

	    # Drop records without WURCS (can't generate Bertose embeddings)
	    cg = cg[cg['WURCS'].notna()].copy()
	    logger.info(f" After WURCS filter: {len(cg):,} records")

	    # Load experiment metadata for DataSource (platform)
	    data_info = {}
	    with open(DATA_DIR / 'AllDataInfo_V2.csv', newline='') as f:
	        for row in csv.DictReader(f):
	            data_info[int(row['DataID'])] = row.get('DataSource', 'unknown')
	    cg['data_source'] = cg['DataID'].map(data_info)
	    logger.info(f" DataSources: {cg['data_source'].nunique()} platforms")

	    # Standardize columns (keep DataID for per-experiment rank)
	    cg['source'] = 'carbogrove'
	    cg['target_raw'] = cg['FractionBound']
	    # A missing Concentration gets the same "unknown" encoding the
	    # glycowork rows use (log_conc = -1, has_conc = 0) instead of a NaN
	    # log_conc flagged as known.
	    conc_known = cg['Concentration'].notna()
	    cg['log_conc'] = np.log10(cg['Concentration'] + 1).where(conc_known, -1.0)
	    cg['has_conc'] = conc_known.astype(int)
	    cg['exp_id'] = 'cg_' + cg['DataID'].astype(str)

	    return cg, prot_seqs, glycan_info


	# ── Step 2: Load & convert glycowork ──────────────────────────────
	def load_glycowork(iupac_to_wurcs: dict) -> pd.DataFrame:
	    """
	    Load glycowork binding data, convert wide→long,
	    filter to glycans with WURCS mappings.

	    The wide table has one row per protein entry ('protein' = name,
	    'target' = amino-acid sequence) and one column per glycan. Rows with
	    no protein name or with a sequence shorter than 50 characters are
	    skipped, as are empty cells.

	    Args:
	        iupac_to_wurcs: mapping from glycan column name to WURCS string;
	            glycan columns absent from it are ignored.

	    Returns:
	        Long-format DataFrame with columns protein_name, protein_seq,
	        glycan_iupac, WURCS, target_raw, target_zscore, source, log_conc,
	        has_conc, data_source and exp_id. The columns are present even
	        when no record survives the filters.
	    """
	    logger.info("Loading glycowork data...")
	    from glycowork.glycan_data.loader import glycan_binding

	    gw = glycan_binding
	    meta_cols = ['protein', 'target']
	    glycan_cols = [c for c in gw.columns if c not in meta_cols]
	    logger.info(f" Wide format: {gw.shape[0]} proteins × {len(glycan_cols)} glycans")

	    # Filter to glycans with WURCS
	    mapped_glycans = [g for g in glycan_cols if g in iupac_to_wurcs]
	    logger.info(f" Glycans with WURCS: {len(mapped_glycans)} / {len(glycan_cols)}")

	    # Wide → Long (only for mapped glycans)
	    logger.info(" Converting wide → long format...")
	    record_cols = ['protein_name', 'protein_seq', 'glycan_iupac',
	                   'WURCS', 'target_raw', 'target_zscore']
	    records = []
	    skipped_nan_protein = 0
	    skipped_short_seq = 0

	    for _, row in gw.iterrows():
	        protein_name = row['protein']
	        protein_seq = row['target']

	        # Skip rows with NaN protein name
	        if pd.isna(protein_name) or str(protein_name).strip() == 'nan':
	            skipped_nan_protein += 1
	            continue

	        protein_name = str(protein_name).strip()
	        protein_seq = str(protein_seq) if pd.notna(protein_seq) else ''

	        # Skip proteins without sequences
	        if len(protein_seq) < 50:
	            skipped_short_seq += 1
	            continue

	        # Select the measured (non-NaN) cells of this row in one
	        # vectorized step instead of probing every glycan column from
	        # Python; column order is preserved.
	        measured = row[mapped_glycans]
	        measured = measured[measured.notna()]
	        for glycan_iupac, val in measured.items():
	            score = float(val)
	            records.append({
	                'protein_name': protein_name,
	                'protein_seq': protein_seq,
	                'glycan_iupac': glycan_iupac,
	                'WURCS': iupac_to_wurcs[glycan_iupac],
	                'target_raw': score,
	                'target_zscore': score,  # Already z-scored
	            })

	    logger.info(f" Skipped {skipped_nan_protein} rows with NaN protein")
	    logger.info(f" Skipped {skipped_short_seq} rows with short/no sequence")

	    # Explicit columns keep the frame well-formed when `records` is empty;
	    # otherwise the column lookups below raise KeyError.
	    gw_long = pd.DataFrame(records, columns=record_cols)
	    logger.info(f" Long format: {len(gw_long):,} records")
	    logger.info(f" Proteins: {gw_long['protein_name'].nunique()}")
	    logger.info(f" Glycans: {gw_long['glycan_iupac'].nunique()}")

	    gw_long['source'] = 'glycowork'
	    gw_long['log_conc'] = -1.0  # Sentinel: concentration unknown
	    gw_long['has_conc'] = 0
	    gw_long['data_source'] = 'glycowork'
	    # One "experiment" per protein name
	    gw_long['exp_id'] = 'gw_' + gw_long['protein_name']

	    return gw_long


	# ── Step 3: Merge datasets ────────────────────────────────────────
	def merge_datasets(
	    cg: pd.DataFrame,
	    gw: pd.DataFrame,
	    prot_seqs: dict,
	) -> pd.DataFrame:
	    """
	    Merge Carbogrove and glycowork into unified format and deduplicate.

	    Args:
	        cg: Carbogrove records; must provide LectinName, WURCS, target_raw,
	            log_conc, has_conc, source, data_source and exp_id.
	        gw: glycowork long-format records; must provide protein_name and
	            the same remaining columns as `cg`.
	        prot_seqs: unused here; kept so existing callers keep working.

	    Returns:
	        One row per (protein_id, glycan_wurcs, exp_id), sorted by those
	        keys, with columns:
	        - protein_id: Carbogrove LectinName or glycowork protein name
	        - glycan_wurcs: WURCS string (for Bertose)
	        - exp_id: experiment identifier
	        - target_raw: mean of the original target over duplicates
	        - log_conc: log10(concentration + 1), or -1 when unknown
	        - has_conc: 1 if concentration known, 0 if not
	        - source: 'carbogrove' or 'glycowork'
	        - data_source: platform name
	    """
	    logger.info("Merging datasets...")

	    # Standardize Carbogrove columns
	    cg_unified = pd.DataFrame({
	        'protein_id': cg['LectinName'],
	        'glycan_wurcs': cg['WURCS'],
	        'target_raw': cg['target_raw'],
	        'log_conc': cg['log_conc'],
	        'has_conc': cg['has_conc'],
	        'source': cg['source'],
	        'data_source': cg['data_source'],
	        'exp_id': cg['exp_id'],
	    })

	    # Standardize glycowork columns
	    gw_unified = pd.DataFrame({
	        'protein_id': gw['protein_name'],
	        'glycan_wurcs': gw['WURCS'],
	        'target_raw': gw['target_raw'],
	        'log_conc': gw['log_conc'],
	        'has_conc': gw['has_conc'],
	        'source': gw['source'],
	        'data_source': gw['data_source'],
	        'exp_id': gw['exp_id'],
	    })

	    # Concatenate
	    combined = pd.concat([cg_unified, gw_unified], ignore_index=True)
	    logger.info(f" Combined (pre-dedup): {len(combined):,} records")
	    logger.info(f" Carbogrove: {(combined['source'] == 'carbogrove').sum():,}")
	    logger.info(f" glycowork: {(combined['source'] == 'glycowork').sum():,}")

	    # ── Deduplication ──────────────────────────────────────────────
	    # Two sources of duplicates:
	    # 1. glycowork: proteins can have multiple rows in the binding
	    #    matrix (same protein tested on different arrays), creating
	    #    duplicate (protein, glycan) pairs with different Z-scores.
	    # 2. Carbogrove: different GlycanIDs can map to the same WURCS
	    #    (structurally identical glycans with different printed names),
	    #    creating duplicate (protein, glycan, experiment) entries.
	    #
	    # Fix: average target_raw per (protein_id, glycan_wurcs, exp_id).
	    # Other columns (log_conc, has_conc, source, data_source) are
	    # constant within each group, so we take the first value.
	    group_cols = ['protein_id', 'glycan_wurcs', 'exp_id']

	    # groupby() discards rows whose key contains NaN. Drop them here
	    # explicitly so they are reported on their own instead of being
	    # counted as "duplicates removed" below.
	    missing_key = combined[group_cols].isna().any(axis=1)
	    n_missing_key = int(missing_key.sum())
	    if n_missing_key:
	        logger.info(f" Dropped {n_missing_key:,} records with a missing "
	                    f"protein_id / glycan_wurcs / exp_id")
	        combined = combined[~missing_key]

	    before = len(combined)

	    combined = combined.groupby(group_cols, as_index=False).agg({
	        'target_raw': 'mean',    # Average replicate measurements
	        'log_conc': 'first',     # Constant within experiment
	        'has_conc': 'first',     # Constant within source
	        'source': 'first',       # Constant within experiment
	        'data_source': 'first',  # Constant within experiment
	    })

	    after = len(combined)
	    removed = before - after
	    logger.info(f" Deduplicated: {before:,} → {after:,} ({removed:,} duplicates removed)")

	    return combined


	# ── Step 4: Generate protein sequence file ─────────────────────────
	def build_protein_sequence_file(
	    combined: pd.DataFrame,
	    prot_seqs: dict,
	    gw_long: pd.DataFrame,
	) -> tuple:
	    """
	    Build protein_id → sequence mapping for ESM-C embedding generation.

	    Nothing is written to disk here; the caller saves the result.

	    Args:
	        combined: merged binding records with a 'protein_id' column.
	        prot_seqs: Carbogrove lookup, name -> {'sequence': ..., ...};
	            entries whose sequence is 20 characters or shorter are ignored.
	        gw_long: glycowork long-format records with 'protein_name' and
	            'protein_seq'; used only for names not already covered by
	            `prot_seqs`.

	    Returns:
	        A 2-tuple ``(seq_df, combined_filtered)``:
	        seq_df            - columns protein_id, sequence, seq_len; one row
	                            per protein that still has binding records.
	        combined_filtered - copy of `combined` restricted to proteins with
	                            a sequence.
	    """
	    logger.info("Building protein sequence file...")

	    # Carbogrove proteins: from UniProt-fetched + recovered
	    seq_map = {
	        name: str(info['sequence'])
	        for name, info in prot_seqs.items()
	        if 'sequence' in info and len(str(info['sequence'])) > 20
	    }

	    n_cg = len(seq_map)
	    logger.info(f" Carbogrove sequences: {n_cg}")

	    # glycowork proteins: sequence in data.
	    # The threshold is >= 50 to agree with load_glycowork, which only
	    # rejects sequences shorter than 50; with "> 50" a 50-residue protein
	    # was loaded and then dropped as "missing".
	    gw_seqs = gw_long.groupby('protein_name')['protein_seq'].first()
	    for name, seq in gw_seqs.items():
	        if name not in seq_map and len(str(seq)) >= 50:
	            seq_map[name] = str(seq)

	    logger.info(f" glycowork sequences added: {len(seq_map) - n_cg}")

	    # Check coverage
	    known = set(seq_map)
	    all_proteins = set(combined['protein_id'].unique())
	    covered = all_proteins & known
	    missing = all_proteins - known
	    # Guard the percentage against an empty input frame.
	    pct_covered = len(covered) / len(all_proteins) * 100 if all_proteins else 0.0

	    logger.info(f" Total unique proteins: {len(all_proteins)}")
	    logger.info(f" With sequences: {len(covered)} ({pct_covered:.1f}%)")
	    logger.info(f" Missing sequences: {len(missing)}")
	    if missing:
	        # key=str keeps the sort from failing on mixed-type ids (e.g. NaN)
	        logger.info(f" Missing: {sorted(missing, key=str)}")

	    # Drop records for proteins without sequences from combined
	    records_before = len(combined)
	    combined_filtered = combined[combined['protein_id'].isin(known)].copy()
	    records_after = len(combined_filtered)
	    dropped = records_before - records_after
	    if dropped > 0:
	        logger.info(f" Dropped {dropped:,} records with missing protein sequences")

	    # Keep only sequences that are still referenced. The lookup set is
	    # built once rather than once per candidate protein.
	    referenced = set(combined_filtered['protein_id'].unique())
	    seq_df = pd.DataFrame(
	        [
	            {'protein_id': pid, 'sequence': seq, 'seq_len': len(seq)}
	            for pid, seq in seq_map.items()
	            if pid in referenced
	        ],
	        columns=['protein_id', 'sequence', 'seq_len'],
	    )
	    return seq_df, combined_filtered


	# ── Step 5: Rank-based normalization ───────────────────────────────
	def compute_rank_targets(combined: pd.DataFrame) -> pd.DataFrame:
	    """
	    Compute per-experiment rank percentile + auxiliary z-score targets.

	    Targets computed:
	    target_rank - rank percentile within experiment, ties averaged;
	                  values lie in (0, 1] (PRIMARY)
	    target_zscore - z-score within each source (ABLATION)
	    target_zscore_exp - z-score within each experiment; 0.0 when the
	                  experiment has no spread (ABLATION)
	    exp_size - number of non-missing target_raw values in the experiment

	    Experiments with fewer than 5 records are dropped after the targets
	    are computed.

	    Args:
	        combined: records with 'exp_id', 'source' and 'target_raw' columns.
	            The frame is not modified.

	    Returns:
	        A new DataFrame with the four columns above added.
	    """
	    logger.info("Computing targets...")

	    # Work on a copy so the caller's frame is left untouched.
	    combined = combined.copy()
	    raw = combined['target_raw']
	    by_exp = raw.groupby(combined['exp_id'])
	    by_source = raw.groupby(combined['source'])

	    # 1. Rank within each experiment, normalized to (0, 1]
	    combined['target_rank'] = by_exp.rank(method='average', pct=True)

	    # 2. Global z-score per source
	    combined['target_zscore'] = (
	        (raw - by_source.transform('mean'))
	        / (by_source.transform('std') + 1e-10)
	    )
	    logger.info(" Global z-scores computed per source")

	    # 3. Z-score within each experiment. A zero or undefined (single
	    #    record) standard deviation gives 0.0 rather than a huge or NaN value.
	    exp_std = by_exp.transform('std')
	    exp_z = (raw - by_exp.transform('mean')) / (exp_std + 1e-10)
	    combined['target_zscore_exp'] = exp_z.where(exp_std > 0, 0.0)
	    logger.info(" Per-experiment z-scores computed")

	    # 4. Experiment size (for potential weighting)
	    combined['exp_size'] = by_exp.transform('count').astype(int)

	    # Validation: check rank distribution
	    n_exps = combined['exp_id'].nunique()
	    exp_sizes = combined.groupby('exp_id').size()
	    logger.info(f" Ranked across {n_exps:,} experiments")
	    logger.info(f" Experiment sizes: min={exp_sizes.min()}, "
	                f"median={exp_sizes.median():.0f}, max={exp_sizes.max()}")

	    # Drop experiments with fewer than 5 glycans — too noisy for ranking.
	    # NOTE(review): the per-source z-scores above were computed with these
	    # records included — confirm that is intended.
	    small_exps = (exp_sizes < 5).sum()
	    if small_exps > 0:
	        valid_exps = exp_sizes[exp_sizes >= 5].index
	        before = len(combined)
	        combined = combined[combined['exp_id'].isin(valid_exps)].copy()
	        after = len(combined)
	        if before > after:
	            logger.info(f" Dropped {before - after:,} records from "
	                        f"experiments with <5 glycans")

	    # Per-source stats
	    for src in ['carbogrove', 'glycowork']:
	        sub = combined[combined['source'] == src]
	        r = sub['target_rank']
	        z = sub['target_zscore']
	        logger.info(f" [{src}] rank: mean={r.mean():.3f} std={r.std():.3f} | "
	                    f"zscore: mean={z.mean():.3f} std={z.std():.3f}")

	    return combined


	# ── Step 7: Summary statistics ─────────────────────────────────────
	def print_summary(combined: pd.DataFrame) -> None:
	    """
	    Log a summary of the final dataset.

	    Emits one section per source ('carbogrove', 'glycowork') and one for
	    the whole frame, each with record/protein/glycan/experiment counts and
	    the mean and standard deviation of target_rank and target_zscore,
	    followed by the column list and overall unique counts.
	    """
	    banner = "=" * 70
	    logger.info(banner)
	    logger.info("COMBINED DATASET SUMMARY")
	    logger.info(banner)

	    for section in ('carbogrove', 'glycowork', 'all'):
	        # 'all' is the pseudo-source standing for the unfiltered frame
	        if section != 'all':
	            label = section.upper()
	            sub = combined[combined['source'] == section]
	        else:
	            label = 'COMBINED'
	            sub = combined

	        logger.info(f"\n [{label}]")
	        logger.info(f" Records: {len(sub):,}")
	        logger.info(f" Proteins: {sub['protein_id'].nunique()}")
	        logger.info(f" Glycans (WURCS): {sub['glycan_wurcs'].nunique()}")
	        logger.info(f" Experiments: {sub['exp_id'].nunique()}")
	        logger.info(f" target_rank: mean={sub['target_rank'].mean():.3f}, "
	                    f"std={sub['target_rank'].std():.3f}")
	        logger.info(f" target_zscore: mean={sub['target_zscore'].mean():.3f}, "
	                    f"std={sub['target_zscore'].std():.3f}")

	    # Column inventory
	    logger.info(f"\n Columns: {list(combined.columns)}")
	    logger.info(f" Unique glycan WURCS: {combined['glycan_wurcs'].nunique()}")
	    logger.info(f" Unique protein IDs: {combined['protein_id'].nunique()}")


	# ── Main ───────────────────────────────────────────────────────────
	def main() -> None:
	    """
	    Build the combined dataset and write it to OUTPUT_DIR.

	    Writes three CSV files: combined_binding_data.csv (final records),
	    protein_sequences.csv (protein_id → sequence) and
	    unique_glycan_wurcs.csv (distinct WURCS strings), then logs a summary.
	    """
	    # Create output directory
	    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

	    # Step 1: Load Carbogrove (the glycan lookup is not needed here)
	    cg, prot_seqs, _glycan_info = load_carbogrove()

	    # Load IUPAC → WURCS mapping. Decoded as UTF-8 explicitly so the
	    # result does not depend on the platform's locale encoding.
	    iupac_wurcs_path = DATA_DIR / 'glycowork_iupac_to_wurcs.json'
	    with open(iupac_wurcs_path, encoding='utf-8') as f:
	        iupac_to_wurcs = json.load(f)
	    logger.info(f"Loaded {len(iupac_to_wurcs)} IUPAC→WURCS mappings")

	    # Step 2: Load & convert glycowork
	    gw = load_glycowork(iupac_to_wurcs)

	    # Step 3: Merge
	    combined = merge_datasets(cg, gw, prot_seqs)

	    # Step 4: Protein sequences (also filters combined to drop missing)
	    seq_df, combined_final = build_protein_sequence_file(combined, prot_seqs, gw)

	    # Step 5: Rank-based normalization + auxiliary targets
	    combined_final = compute_rank_targets(combined_final)

	    # Step 6: Save outputs
	    logger.info("\nSaving outputs...")
	    combined_final.to_csv(OUTPUT_DIR / 'combined_binding_data.csv', index=False)
	    logger.info(f" Saved combined_binding_data.csv: {len(combined_final):,} records")

	    seq_df.to_csv(OUTPUT_DIR / 'protein_sequences.csv', index=False)
	    logger.info(f" Saved protein_sequences.csv: {len(seq_df)} proteins")

	    # Save unique WURCS for Bertose embedding generation
	    unique_wurcs = combined_final['glycan_wurcs'].unique()
	    wurcs_df = pd.DataFrame({'wurcs': unique_wurcs})
	    wurcs_df.to_csv(OUTPUT_DIR / 'unique_glycan_wurcs.csv', index=False)
	    logger.info(f" Saved unique_glycan_wurcs.csv: {len(wurcs_df)} glycans")

	    # Step 7: Summary
	    print_summary(combined_final)


	# Run the pipeline only when executed as a script, not on import.
	if __name__ == '__main__':
	    main()