bertose-affinose-training-code / code /benchmarks /create_excluded_dataset.py
supanthadey1's picture
Add BERTose and AFFINose training code release
1d6f391 verified
Raw
History Blame Contribute Delete
1.83 kB
#!/usr/bin/env python3
"""
Create V5-B training dataset by excluding leaked benchmark WURCS.
"""
import pickle
from pathlib import Path
BASE = Path("/work/ratul1/supantha/glycan-SD-VS/bert_training_v3/v3.1_cluster_training")


def load_excluded_wurcs(path):
    """Return the set of WURCS strings listed in the text file *path*.

    Each line is stripped of surrounding whitespace; lines that are blank
    after stripping are ignored.
    """
    with open(path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return {entry for entry in stripped_lines if entry}


def _wurcs_of(record):
    """Return the record's 'wurcs' value, else 'WURCS', else ''."""
    return record.get('wurcs', record.get('WURCS', ''))


def filter_excluded(records, excluded):
    """Return the records whose WURCS value is not in *excluded*.

    Args:
        records: iterable of dict-like records exposing ``.get``.
        excluded: container of WURCS values to drop.

    Returns:
        A new list, in the original order. Records carrying neither a
        'wurcs' nor a 'WURCS' key are compared as '' and are therefore kept
        unless '' itself is excluded.
    """
    return [record for record in records if _wurcs_of(record) not in excluded]


def _load_pickle(path):
    """Unpickle and return the object stored at *path*."""
    # NOTE: pickle can execute arbitrary code while loading; only point this
    # script at pickle files produced by this project.
    with open(path, "rb") as f:
        return pickle.load(f)


def _save_pickle(obj, path):
    """Pickle *obj* to *path*, creating missing parent directories first."""
    # Without this, a fresh checkout fails with FileNotFoundError on write.
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def main(base=BASE):
    """Write copies of the sequence datasets with the leaked WURCS removed.

    Reads ``bench/data_leakage_analysis/leaked_wurcs_list.txt``,
    ``data/sequences.pkl`` and ``data/sequences_bpe.pkl`` under *base*, and
    writes the filtered results to ``bert_v5b_excluded/data/`` under *base*.

    Args:
        base: root directory of the training tree; defaults to ``BASE``.
    """
    base = Path(base)

    # Load exclusion list
    excluded_wurcs = load_excluded_wurcs(
        base / "bench/data_leakage_analysis/leaked_wurcs_list.txt"
    )
    print(f"Loaded {len(excluded_wurcs)} WURCS to exclude")

    # Load original sequences
    sequences = _load_pickle(base / "data/sequences.pkl")
    print(f"Original dataset: {len(sequences)} sequences")

    # Filter - assuming sequences is a list of dicts with 'wurcs' key
    filtered = filter_excluded(sequences, excluded_wurcs)
    print(f"After exclusion: {len(filtered)} sequences")
    print(f"Removed: {len(sequences) - len(filtered)} sequences")

    # Save
    output_path = base / "bert_v5b_excluded/data/sequences_excluded.pkl"
    _save_pickle(filtered, output_path)
    print(f"Saved to {output_path}")

    # Also filter BPE sequences
    sequences_bpe = _load_pickle(base / "data/sequences_bpe.pkl")
    print(f"Original BPE dataset: {len(sequences_bpe)} sequences")

    # An empty dataset has no sample entry to inspect; indexing [0] would
    # raise IndexError, so report it and stop instead.
    if not sequences_bpe:
        print("BPE dataset is empty; skipping BPE exclusion")
        return

    # The BPE entries can only be matched by their wurcs field when they are
    # dicts - check structure first
    sample = sequences_bpe[0]
    if not isinstance(sample, dict):
        print("Sample BPE entry keys: not a dict")
        # Make the skip explicit rather than silently producing no output.
        print("WARNING: BPE entries are not dicts; BPE exclusion skipped, no file written")
        return
    print(f"Sample BPE entry keys: {sample.keys()}")

    filtered_bpe = filter_excluded(sequences_bpe, excluded_wurcs)
    print(f"After BPE exclusion: {len(filtered_bpe)} sequences")

    output_bpe = base / "bert_v5b_excluded/data/sequences_bpe_excluded.pkl"
    _save_pickle(filtered_bpe, output_bpe)
    print(f"Saved BPE to {output_bpe}")


if __name__ == "__main__":
    main()