"""
Build combined Carbogrove + glycowork dataset for Bertint V5.

Steps:
    1. Load Carbogrove processed data (315K records)
    2. Load glycowork binding data and convert wide -> long (620K records)
    3. Merge and filter to proteins with sequences
    4. Compute per-experiment RANK PERCENTILE as training target
    5. Compute auxiliary targets (z-scores) for ablation
    6. Save combined dataset + sequence files

Output columns:
    protein_id        - canonical protein name
    glycan_wurcs      - WURCS string (for Bertose)
    target_raw        - original FractionBound or Z-score
    target_rank       - per-experiment rank percentile [0,1]  <- PRIMARY
    target_zscore     - z-score per source (global)           <- ABLATION
    target_zscore_exp - z-score within each experiment        <- ABLATION
    log_conc          - log10(concentration+1) or -1 (unknown)
    has_conc          - 1 if concentration known, 0 if not
    source            - 'carbogrove' or 'glycowork'
    data_source       - platform name (CG) or 'glycowork'
    exp_id            - experiment identifier
    exp_size          - records in this experiment

Canonical IDs:
    - Protein: amino acid sequence (from UniProt or glycowork 'target')
    - Glycan: WURCS string (for Bertose embeddings)
"""
|
|
| import pandas as pd |
| import numpy as np |
| import csv |
| import json |
| import os |
| import logging |
| from pathlib import Path |
|
|
# Logging: timestamped, INFO-level messages for the whole script.
_LOG_FORMAT = '%(asctime)s [%(levelname)s] %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

# Directory layout, anchored two levels above this file.
BASE = Path(__file__).parent.parent
# Raw input tables (glycan info, data info, recovered sequences, IUPAC map).
DATA_DIR = BASE.joinpath('19274777-2')
# Location of the processed Carbogrove ``training_data.csv``.
PROCESSED_DIR = BASE.joinpath('processed_v5')
# Where the combined dataset and sequence files are written.
OUTPUT_DIR = BASE.joinpath('combined_dataset')
|
|
| |
def load_carbogrove() -> "tuple[pd.DataFrame, dict, dict]":
    """Load Carbogrove records together with protein and glycan lookups.

    Reads ``training_data.csv`` from ``PROCESSED_DIR``, attaches a WURCS
    string to every record via ``AllGlycanInfo_V2.csv`` (records without one
    are dropped), attaches the platform name via ``AllDataInfo_V2.csv`` and
    adds the unified columns consumed by the merge step.

    Returns:
        A 3-tuple ``(cg, prot_seqs, glycan_info)``:

        cg          - filtered Carbogrove frame with the added columns
                      ``WURCS``, ``data_source``, ``source``, ``target_raw``,
                      ``log_conc``, ``has_conc`` and ``exp_id``
        prot_seqs   - ``{protein name: {'sequence': ..., 'source': ...}}``
        glycan_info - ``{GlycanID: WURCS}`` for glycans with a non-empty WURCS
    """
    logger.info("Loading Carbogrove data...")

    cg = pd.read_csv(PROCESSED_DIR / 'training_data.csv')
    logger.info(f" Base Carbogrove: {len(cg):,} records, "
                f"{cg['LectinName'].nunique()} lectins")

    # GlycanID -> WURCS, keeping only glycans that actually have a WURCS.
    glycan_info = {}
    with open(DATA_DIR / 'AllGlycanInfo_V2.csv', newline='') as f:
        for row in csv.DictReader(f):
            # DictReader fills fields missing from a short row with None,
            # so ``row.get('WURCS', '')`` alone is not None-safe.
            wurcs = (row.get('WURCS') or '').strip()
            if wurcs:
                glycan_info[row['GlycanID']] = wurcs

    # Protein sequences from a previously written sequence file, if present.
    prot_seqs = {}
    cg_seq_path = OUTPUT_DIR / 'carbogrove_protein_sequences.csv'
    if cg_seq_path.exists():
        cg_seqs = pd.read_csv(cg_seq_path)
        for pid, seq, src in zip(
            cg_seqs['protein_id'], cg_seqs['sequence'], cg_seqs['source']
        ):
            prot_seqs[pid] = {'sequence': seq, 'source': src}
        logger.info(f" Loaded {len(prot_seqs)} Carbogrove protein sequences")

    # Recovered lectin sequences only fill gaps; existing entries win.
    recovered_path = DATA_DIR / 'recovered_sequences.csv'
    if recovered_path.exists():
        recovered = pd.read_csv(recovered_path)
        recovered = recovered[recovered['Status'] == 'RECOVERED']
        n_added = 0
        for name, seq, src in zip(
            recovered['LectinName'], recovered['Sequence'], recovered['Source']
        ):
            if name in prot_seqs or pd.isna(seq) or len(str(seq)) <= 20:
                continue
            prot_seqs[name] = {'sequence': seq, 'source': src}
            n_added += 1
        # Report what was actually inserted, not the number of candidate rows.
        logger.info(f" Added {n_added} recovered lectin sequences")

    # NOTE(review): glycan_info keys are strings (csv.DictReader); this map
    # only matches if pandas also parsed GlycanID as strings - confirm.
    cg['WURCS'] = cg['GlycanID'].map(glycan_info)
    n_with_wurcs = cg['WURCS'].notna().sum()
    logger.info(f" Records with WURCS: {n_with_wurcs:,} / {len(cg):,}")

    cg = cg[cg['WURCS'].notna()].copy()
    logger.info(f" After WURCS filter: {len(cg):,} records")

    # DataID -> platform name.
    data_info = {}
    with open(DATA_DIR / 'AllDataInfo_V2.csv', newline='') as f:
        for row in csv.DictReader(f):
            data_info[int(row['DataID'])] = row.get('DataSource', 'unknown')
    cg['data_source'] = cg['DataID'].map(data_info)
    logger.info(f" DataSources: {cg['data_source'].nunique()} platforms")

    # Unified columns. A missing concentration is encoded as documented in
    # the module docstring: log_conc = -1 and has_conc = 0.
    conc_known = cg['Concentration'].notna()
    cg['source'] = 'carbogrove'
    cg['target_raw'] = cg['FractionBound']
    cg['log_conc'] = np.log10(cg['Concentration'] + 1).where(conc_known, -1.0)
    cg['has_conc'] = conc_known.astype(int)
    cg['exp_id'] = 'cg_' + cg['DataID'].astype(str)

    return cg, prot_seqs, glycan_info
|
|
|
|
| |
def load_glycowork(iupac_to_wurcs: dict) -> pd.DataFrame:
    """Load glycowork binding data and convert it from wide to long format.

    The wide table has one row per protein (columns ``protein`` and
    ``target``) and one column per glycan. Only glycan columns present in
    *iupac_to_wurcs* are kept. Rows with a missing protein name, or with a
    sequence (``target``) shorter than 50 characters, are skipped.

    Args:
        iupac_to_wurcs: mapping from glycan column name (IUPAC) to WURCS.

    Returns:
        Long-format frame with one row per non-null (protein, glycan) value
        and the columns ``protein_name``, ``protein_seq``, ``glycan_iupac``,
        ``WURCS``, ``target_raw``, ``target_zscore``, ``source``,
        ``log_conc``, ``has_conc``, ``data_source`` and ``exp_id``.
        The columns exist even when no record survives the filters.
    """
    logger.info("Loading glycowork data...")
    # Imported here rather than at module level, so glycowork is only
    # required when this function actually runs.
    from glycowork.glycan_data.loader import glycan_binding

    gw = glycan_binding
    meta_cols = ['protein', 'target']
    glycan_cols = [c for c in gw.columns if c not in meta_cols]
    logger.info(f" Wide format: {gw.shape[0]} proteins x {len(glycan_cols)} glycans")

    mapped_glycans = [g for g in glycan_cols if g in iupac_to_wurcs]
    logger.info(f" Glycans with WURCS: {len(mapped_glycans)} / {len(glycan_cols)}")

    logger.info(" Converting wide -> long format...")
    records = []
    skipped_nan_protein = 0
    skipped_short_seq = 0

    for _, row in gw.iterrows():
        protein_name = row['protein']
        protein_seq = row['target']

        if pd.isna(protein_name) or str(protein_name).strip() == 'nan':
            skipped_nan_protein += 1
            continue

        protein_name = str(protein_name).strip()
        protein_seq = str(protein_seq) if pd.notna(protein_seq) else ''

        if len(protein_seq) < 50:
            skipped_short_seq += 1
            continue

        # One vectorised null filter per row instead of a label lookup and
        # null check for every mapped glycan.
        measured = row[mapped_glycans].dropna()
        for glycan_iupac, val in measured.items():
            records.append({
                'protein_name': protein_name,
                'protein_seq': protein_seq,
                'glycan_iupac': glycan_iupac,
                'WURCS': iupac_to_wurcs[glycan_iupac],
                'target_raw': float(val),
                'target_zscore': float(val),
            })

    logger.info(f" Skipped {skipped_nan_protein} rows with NaN protein")
    logger.info(f" Skipped {skipped_short_seq} rows with short/no sequence")

    # Explicit columns keep the frame usable when ``records`` is empty;
    # without them the column accesses below raise KeyError.
    gw_long = pd.DataFrame(records, columns=[
        'protein_name', 'protein_seq', 'glycan_iupac',
        'WURCS', 'target_raw', 'target_zscore',
    ])
    logger.info(f" Long format: {len(gw_long):,} records")
    logger.info(f" Proteins: {gw_long['protein_name'].nunique()}")
    logger.info(f" Glycans: {gw_long['glycan_iupac'].nunique()}")

    # glycowork carries no concentration: encode it as unknown (-1 / 0).
    gw_long['source'] = 'glycowork'
    gw_long['log_conc'] = -1.0
    gw_long['has_conc'] = 0
    gw_long['data_source'] = 'glycowork'
    # Each protein name is treated as one experiment.
    gw_long['exp_id'] = 'gw_' + gw_long['protein_name']

    return gw_long
|
|
|
|
| |
# Columns copied unchanged from either source into the unified frame.
_UNIFIED_PASSTHROUGH = (
    'target_raw', 'log_conc', 'has_conc', 'source', 'data_source', 'exp_id',
)


def _to_unified(frame, protein_col):
    """Project *frame* onto the unified columns, taking IDs from *protein_col*."""
    unified = {
        'protein_id': frame[protein_col],
        'glycan_wurcs': frame['WURCS'],
    }
    unified.update((col, frame[col]) for col in _UNIFIED_PASSTHROUGH)
    return pd.DataFrame(unified)


def merge_datasets(
    cg: pd.DataFrame,
    gw: pd.DataFrame,
    prot_seqs: dict,
) -> pd.DataFrame:
    """Merge Carbogrove and glycowork records into one deduplicated frame.

    Both inputs are projected onto the same columns and concatenated. Rows
    sharing the same (protein_id, glycan_wurcs, exp_id) are then collapsed
    into one: ``target_raw`` is averaged, the other columns take the first
    value in the group. Rows with a missing value in any of the three key
    columns are dropped, and logged separately from duplicates.

    Args:
        cg: Carbogrove frame with ``LectinName``, ``WURCS`` and the unified
            columns (``target_raw``, ``log_conc``, ``has_conc``, ``source``,
            ``data_source``, ``exp_id``).
        gw: glycowork long frame with ``protein_name``, ``WURCS`` and the
            same unified columns.
        prot_seqs: accepted for interface compatibility; not used here.

    Returns:
        Frame with the columns ``protein_id``, ``glycan_wurcs``, ``exp_id``,
        ``target_raw``, ``log_conc``, ``has_conc``, ``source`` and
        ``data_source``.
    """
    logger.info("Merging datasets...")

    cg_unified = _to_unified(cg, 'LectinName')
    gw_unified = _to_unified(gw, 'protein_name')

    combined = pd.concat([cg_unified, gw_unified], ignore_index=True)
    logger.info(f" Combined (pre-dedup): {len(combined):,} records")
    logger.info(f" Carbogrove: {(combined['source'] == 'carbogrove').sum():,}")
    logger.info(f" glycowork: {(combined['source'] == 'glycowork').sum():,}")

    group_cols = ['protein_id', 'glycan_wurcs', 'exp_id']

    # groupby discards rows whose key contains NaN. Remove them explicitly
    # so they are reported as such instead of inflating the duplicate count.
    missing_key = combined[group_cols].isna().any(axis=1)
    n_missing_key = int(missing_key.sum())
    if n_missing_key:
        logger.warning(f" Dropping {n_missing_key:,} records with a missing "
                       f"protein, glycan or experiment ID")
        combined = combined[~missing_key]

    before = len(combined)

    combined = combined.groupby(group_cols, as_index=False).agg({
        'target_raw': 'mean',
        'log_conc': 'first',
        'has_conc': 'first',
        'source': 'first',
        'data_source': 'first',
    })

    after = len(combined)
    removed = before - after
    logger.info(f" Deduplicated: {before:,} -> {after:,} ({removed:,} duplicates removed)")

    return combined
|
|
|
|
| |
def build_protein_sequence_file(
    combined: pd.DataFrame,
    prot_seqs: dict,
    gw_long: pd.DataFrame,
) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Build the protein_id -> sequence table and drop records without one.

    Sequences come from *prot_seqs* first (entries longer than 20
    characters) and are then completed from the glycowork long frame for
    names not already covered. The glycowork threshold is ``>= 50`` so that
    it accepts exactly the sequences the glycowork loader lets through
    (the loader skips ``len < 50``).

    Args:
        combined: merged binding records with a ``protein_id`` column.
        prot_seqs: ``{protein name: {'sequence': ..., ...}}``.
        gw_long: glycowork long frame with ``protein_name`` and
            ``protein_seq`` columns; may be empty.

    Returns:
        ``(seq_df, combined_filtered)`` where ``seq_df`` has the columns
        ``protein_id``, ``sequence`` and ``seq_len`` for every protein that
        remains in ``combined_filtered``, and ``combined_filtered`` is a
        copy of *combined* restricted to proteins with a sequence.
    """
    logger.info("Building protein sequence file...")

    seq_map = {
        name: str(info['sequence'])
        for name, info in prot_seqs.items()
        if 'sequence' in info and len(str(info['sequence'])) > 20
    }
    n_cg = len(seq_map)
    logger.info(f" Carbogrove sequences: {n_cg}")

    # An empty glycowork frame contributes nothing and may lack the columns.
    if not gw_long.empty:
        gw_seqs = gw_long.groupby('protein_name')['protein_seq'].first()
        for name, seq in gw_seqs.items():
            if name not in seq_map and len(str(seq)) >= 50:
                seq_map[name] = str(seq)

    logger.info(f" glycowork sequences added: {len(seq_map) - n_cg}")

    all_proteins = set(combined['protein_id'].unique())
    covered = all_proteins & seq_map.keys()
    missing = all_proteins - seq_map.keys()
    # Guard the percentage against an empty input frame.
    covered_pct = len(covered) / len(all_proteins) * 100 if all_proteins else 0.0

    logger.info(f" Total unique proteins: {len(all_proteins)}")
    logger.info(f" With sequences: {len(covered)} ({covered_pct:.1f}%)")
    logger.info(f" Missing sequences: {len(missing)}")
    if missing:
        # key=str keeps the sort well-defined if an ID is not a string.
        logger.info(f" Missing: {sorted(missing, key=str)}")

    records_before = len(combined)
    combined_filtered = combined[combined['protein_id'].isin(list(seq_map))].copy()
    dropped = records_before - len(combined_filtered)
    if dropped > 0:
        logger.info(f" Dropped {dropped:,} records with missing protein sequences")

    # Build the membership set once rather than once per sequence.
    kept_ids = set(combined_filtered['protein_id'].unique())
    seq_df = pd.DataFrame(
        [
            {'protein_id': pid, 'sequence': seq, 'seq_len': len(seq)}
            for pid, seq in seq_map.items()
            if pid in kept_ids
        ],
        columns=['protein_id', 'sequence', 'seq_len'],
    )
    return seq_df, combined_filtered
|
|
|
|
| |
def compute_rank_targets(
    combined: pd.DataFrame,
    min_exp_size: int = 5,
) -> pd.DataFrame:
    """Compute per-experiment rank percentile plus auxiliary z-score targets.

    Experiments with fewer than *min_exp_size* records are removed first, so
    every statistic (in particular the per-source global z-score) describes
    exactly the rows that are returned. The input frame is not modified.

    Targets computed:
        target_rank       - rank percentile within experiment (PRIMARY)
        target_zscore     - z-score per source (global)        (ABLATION)
        target_zscore_exp - z-score within each experiment     (ABLATION)
        exp_size          - non-null ``target_raw`` count in the experiment

    Args:
        combined: records with ``exp_id``, ``source`` and ``target_raw``.
        min_exp_size: smallest experiment (in records) that is kept.

    Returns:
        A new frame containing the retained rows and the added columns.
    """
    logger.info("Computing targets...")

    # Filter on a copy so the caller's frame is never touched.
    exp_sizes = combined.groupby('exp_id').size()
    kept_sizes = exp_sizes[exp_sizes >= min_exp_size]
    before = len(combined)
    combined = combined[combined['exp_id'].isin(kept_sizes.index)].copy()
    after = len(combined)
    if before > after:
        logger.info(f" Dropped {before - after:,} records from "
                    f"experiments with <{min_exp_size} glycans")

    # Group-wise quantities, computed before any column is added.
    by_exp = combined.groupby('exp_id')['target_raw']
    rank_pct = by_exp.rank(method='average', pct=True)
    exp_mean = by_exp.transform('mean')
    exp_std = by_exp.transform('std')
    exp_count = by_exp.transform('count')

    combined['target_rank'] = rank_pct

    # Global z-score, standardised separately within each source.
    if 'target_zscore' not in combined.columns:
        combined['target_zscore'] = np.nan
    for src in ['carbogrove', 'glycowork']:
        mask = combined['source'] == src
        raw = combined.loc[mask, 'target_raw']
        combined.loc[mask, 'target_zscore'] = (
            (raw - raw.mean()) / (raw.std() + 1e-10)
        )
    logger.info(" Global z-scores computed per source")

    # Per-experiment z-score; zero-variance (or single-record) experiments
    # get 0.0 rather than a division by ~0.
    z_exp = (combined['target_raw'] - exp_mean) / (exp_std + 1e-10)
    combined['target_zscore_exp'] = z_exp.where(exp_std > 0, 0.0)
    logger.info(" Per-experiment z-scores computed")

    combined['exp_size'] = exp_count.astype(int)

    logger.info(f" Ranked across {len(kept_sizes):,} experiments")
    logger.info(f" Experiment sizes: min={kept_sizes.min()}, "
                f"median={kept_sizes.median():.0f}, max={kept_sizes.max()}")

    for src in ['carbogrove', 'glycowork']:
        sub = combined[combined['source'] == src]
        r = sub['target_rank']
        z = sub['target_zscore']
        logger.info(f" [{src}] rank: mean={r.mean():.3f} std={r.std():.3f} | "
                    f"zscore: mean={z.mean():.3f} std={z.std():.3f}")

    return combined
|
|
|
|
| |
def print_summary(combined: pd.DataFrame) -> None:
    """Log record, protein, glycan and experiment counts per source and overall.

    One section is logged for each of 'carbogrove' and 'glycowork', followed
    by a section for the whole frame, then the column list and the overall
    unique glycan / protein counts.
    """
    banner = "=" * 70
    logger.info(banner)
    logger.info("COMBINED DATASET SUMMARY")
    logger.info(banner)

    # (heading, rows) pairs: the two sources first, the full frame last.
    sections = [
        (name.upper(), combined[combined['source'] == name])
        for name in ('carbogrove', 'glycowork')
    ]
    sections.append(('COMBINED', combined))

    for label, sub in sections:
        rank_col = sub['target_rank']
        zscore_col = sub['target_zscore']
        n_proteins = sub['protein_id'].nunique()
        n_glycans = sub['glycan_wurcs'].nunique()
        n_experiments = sub['exp_id'].nunique()

        logger.info(f"\n [{label}]")
        logger.info(f" Records: {len(sub):,}")
        logger.info(f" Proteins: {n_proteins}")
        logger.info(f" Glycans (WURCS): {n_glycans}")
        logger.info(f" Experiments: {n_experiments}")
        logger.info(f" target_rank: mean={rank_col.mean():.3f}, "
                    f"std={rank_col.std():.3f}")
        logger.info(f" target_zscore: mean={zscore_col.mean():.3f}, "
                    f"std={zscore_col.std():.3f}")

    column_names = list(combined.columns)
    logger.info(f"\n Columns: {column_names}")
    logger.info(f" Unique glycan WURCS: {combined['glycan_wurcs'].nunique()}")
    logger.info(f" Unique protein IDs: {combined['protein_id'].nunique()}")
|
|
|
|
| |
def main() -> None:
    """Build the combined dataset and write it to ``OUTPUT_DIR``.

    Runs the pipeline end to end: load Carbogrove, load the IUPAC -> WURCS
    map, load glycowork, merge, attach protein sequences, compute targets,
    then write ``combined_binding_data.csv``, ``protein_sequences.csv`` and
    ``unique_glycan_wurcs.csv`` and log a summary.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # The glycan lookup returned here is not needed by the later steps.
    cg, prot_seqs, _glycan_info = load_carbogrove()

    # Decode the JSON mapping as UTF-8 explicitly instead of relying on the
    # locale-dependent default of open().
    iupac_wurcs_path = DATA_DIR / 'glycowork_iupac_to_wurcs.json'
    with open(iupac_wurcs_path, encoding='utf-8') as f:
        iupac_to_wurcs = json.load(f)
    logger.info(f"Loaded {len(iupac_to_wurcs)} IUPAC->WURCS mappings")

    gw = load_glycowork(iupac_to_wurcs)

    combined = merge_datasets(cg, gw, prot_seqs)

    # Keeps only records whose protein has a usable sequence.
    seq_df, combined_final = build_protein_sequence_file(combined, prot_seqs, gw)

    combined_final = compute_rank_targets(combined_final)

    logger.info("\nSaving outputs...")
    combined_final.to_csv(OUTPUT_DIR / 'combined_binding_data.csv', index=False)
    logger.info(f" Saved combined_binding_data.csv: {len(combined_final):,} records")

    seq_df.to_csv(OUTPUT_DIR / 'protein_sequences.csv', index=False)
    logger.info(f" Saved protein_sequences.csv: {len(seq_df)} proteins")

    # Distinct glycans of the final dataset, one WURCS per row.
    unique_wurcs = combined_final['glycan_wurcs'].unique()
    wurcs_df = pd.DataFrame({'wurcs': unique_wurcs})
    wurcs_df.to_csv(OUTPUT_DIR / 'unique_glycan_wurcs.csv', index=False)
    logger.info(f" Saved unique_glycan_wurcs.csv: {len(wurcs_df)} glycans")

    print_summary(combined_final)


if __name__ == '__main__':
    main()
|
|