"""
Create V5-B training dataset by excluding leaked benchmark WURCS.

Reads the list of leaked WURCS strings, drops every matching entry from the
pickled sequence dataset (and from the BPE dataset when its entries are
dicts), and writes the filtered copies under ``bert_v5b_excluded/data``.
"""
import pickle
from pathlib import Path


BASE = Path("/work/ratul1/supantha/glycan-SD-VS/bert_training_v3/v3.1_cluster_training")


def _wurcs_of(entry):
    """Return the entry's WURCS value: key 'wurcs', else 'WURCS', else ''."""
    return entry.get('wurcs', entry.get('WURCS', ''))


def _drop_excluded(entries, excluded_wurcs):
    """Return the entries whose WURCS value is not in ``excluded_wurcs``."""
    return [entry for entry in entries if _wurcs_of(entry) not in excluded_wurcs]


def _load_pickle(path):
    """Load and return the object pickled at ``path``."""
    # NOTE: pickle.load can execute arbitrary code while loading; only point
    # this script at dataset files you trust.
    with open(path, "rb") as f:
        return pickle.load(f)


def _save_pickle(obj, path):
    """Pickle ``obj`` to ``path``, overwriting any existing file."""
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def main(base: Path = BASE) -> None:
    """Filter leaked WURCS out of the datasets found under ``base``.

    Inputs (relative to ``base``):
        bench/data_leakage_analysis/leaked_wurcs_list.txt  one WURCS per line
        data/sequences.pkl                                 pickled dict entries
        data/sequences_bpe.pkl                             pickled BPE entries

    Outputs (relative to ``base``; the directory is created if missing):
        bert_v5b_excluded/data/sequences_excluded.pkl
        bert_v5b_excluded/data/sequences_bpe_excluded.pkl  (dict entries only)

    The BPE output is skipped, with a message, when the BPE dataset is empty
    or when its first entry is not a dict.
    """
    # Load the WURCS strings to exclude. Blank lines are ignored, so the set
    # never contains '' and entries without any WURCS key are always kept.
    with open(base / "bench/data_leakage_analysis/leaked_wurcs_list.txt") as f:
        excluded_wurcs = {line.strip() for line in f if line.strip()}
    print(f"Loaded {len(excluded_wurcs)} WURCS to exclude")

    # Load the original dataset.
    sequences = _load_pickle(base / "data/sequences.pkl")
    print(f"Original dataset: {len(sequences)} sequences")

    # Drop every entry whose WURCS is on the exclusion list.
    filtered = _drop_excluded(sequences, excluded_wurcs)
    print(f"After exclusion: {len(filtered)} sequences")
    print(f"Removed: {len(sequences) - len(filtered)} sequences")

    # Create the output directory first: open(..., "wb") does not create
    # missing parent directories and would raise FileNotFoundError.
    output_dir = base / "bert_v5b_excluded/data"
    output_dir.mkdir(parents=True, exist_ok=True)

    output_path = output_dir / "sequences_excluded.pkl"
    _save_pickle(filtered, output_path)
    print(f"Saved to {output_path}")

    # Load the BPE dataset.
    sequences_bpe = _load_pickle(base / "data/sequences_bpe.pkl")
    print(f"Original BPE dataset: {len(sequences_bpe)} sequences")

    # An empty dataset has no first entry to inspect; indexing it would raise
    # IndexError.
    if len(sequences_bpe) == 0:
        print("BPE dataset is empty; skipping BPE exclusion")
        return

    # Inspect the first entry to see whether BPE entries can be filtered.
    sample = sequences_bpe[0]
    entries_are_dicts = isinstance(sample, dict)
    print(f"Sample BPE entry keys: {sample.keys() if entries_are_dicts else 'not a dict'}")

    # Only dict entries can be looked up by WURCS key; say so instead of
    # silently writing nothing.
    if not entries_are_dicts:
        print("BPE entries are not dicts; skipping BPE exclusion")
        return

    filtered_bpe = _drop_excluded(sequences_bpe, excluded_wurcs)
    print(f"After BPE exclusion: {len(filtered_bpe)} sequences")

    output_bpe = output_dir / "sequences_bpe_excluded.pkl"
    _save_pickle(filtered_bpe, output_bpe)
    print(f"Saved BPE to {output_bpe}")


if __name__ == "__main__":
    main()