#!/usr/bin/env python3
"""
Create V5-B training dataset by excluding leaked benchmark WURCS.

Reads the exclusion list and the two pickled datasets (plain and BPE) from
fixed locations under ``BASE``, drops every entry whose ``wurcs``/``WURCS``
field appears in the exclusion list, and writes the filtered datasets to
``bert_v5b_excluded/data`` under ``BASE``.
"""
import pickle
from pathlib import Path

BASE = Path("/work/ratul1/supantha/glycan-SD-VS/bert_training_v3/v3.1_cluster_training")


def load_exclusions(path):
    """Return the set of non-empty, whitespace-stripped lines in *path*."""
    with open(path, encoding="utf-8") as f:
        return {line.strip() for line in f if line.strip()}


def load_pickle(path):
    """Unpickle and return the object stored at *path*.

    NOTE: pickle can execute arbitrary code while loading; only use this on
    files produced by this project's own pipeline.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


def save_pickle(obj, path):
    """Pickle *obj* to *path*, creating missing parent directories first."""
    # The output directory may not exist yet on a fresh run.
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def entry_wurcs(entry):
    """Return the entry's 'wurcs' value, else its 'WURCS' value, else ''."""
    return entry.get('wurcs', entry.get('WURCS', ''))


def filter_sequences(sequences, excluded_wurcs):
    """Return the entries of *sequences* whose WURCS is not in *excluded_wurcs*.

    Entries are expected to be dict-like (they must provide ``.get``).
    Entries without any WURCS key are kept, since '' is never excluded.
    """
    return [s for s in sequences if entry_wurcs(s) not in excluded_wurcs]


def count_missing_wurcs(sequences):
    """Count entries that have neither a 'wurcs' nor a 'WURCS' key."""
    return sum(1 for s in sequences if 'wurcs' not in s and 'WURCS' not in s)


def main():
    """Filter the plain and BPE datasets and write the excluded versions."""
    # Load exclusion list
    excluded_wurcs = load_exclusions(BASE / "bench/data_leakage_analysis/leaked_wurcs_list.txt")
    print(f"Loaded {len(excluded_wurcs)} WURCS to exclude")

    # Load original sequences
    sequences = load_pickle(BASE / "data/sequences.pkl")
    print(f"Original dataset: {len(sequences)} sequences")

    # Filter - assuming sequences is a list of dicts with 'wurcs' key
    filtered = filter_sequences(sequences, excluded_wurcs)
    print(f"After exclusion: {len(filtered)} sequences")
    print(f"Removed: {len(sequences) - len(filtered)} sequences")

    # Entries with no WURCS field can never match the exclusion list, so
    # make them visible instead of letting them pass through unnoticed.
    missing = count_missing_wurcs(sequences)
    if missing:
        print(f"WARNING: {missing} sequences have no 'wurcs'/'WURCS' key and were kept unchecked")

    # Save
    output_path = BASE / "bert_v5b_excluded/data/sequences_excluded.pkl"
    save_pickle(filtered, output_path)
    print(f"Saved to {output_path}")

    # Also filter BPE sequences
    sequences_bpe = load_pickle(BASE / "data/sequences_bpe.pkl")
    print(f"Original BPE dataset: {len(sequences_bpe)} sequences")

    # Need to match by index or wurcs field - check structure first
    if len(sequences_bpe) == 0:
        # Nothing to inspect; still write an (empty) output so that no stale
        # file from an earlier run is picked up downstream.
        print("BPE dataset is empty; writing empty filtered BPE dataset")
        bpe_is_dicts = True
    else:
        first = sequences_bpe[0]
        bpe_is_dicts = isinstance(first, dict)
        print(f"Sample BPE entry keys: {first.keys() if bpe_is_dicts else 'not a dict'}")

    if bpe_is_dicts:
        filtered_bpe = filter_sequences(sequences_bpe, excluded_wurcs)
        print(f"After BPE exclusion: {len(filtered_bpe)} sequences")

        output_bpe = BASE / "bert_v5b_excluded/data/sequences_bpe_excluded.pkl"
        save_pickle(filtered_bpe, output_bpe)
        print(f"Saved BPE to {output_bpe}")
    else:
        # Without a WURCS field the entries cannot be matched here; say so
        # loudly rather than silently producing no BPE output.
        print("WARNING: BPE entries are not dicts; BPE dataset was NOT filtered and no BPE output was written")


if __name__ == "__main__":
    main()