File size: 1,833 Bytes
1d6f391
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/env python3
"""
Create V5-B training dataset by excluding leaked benchmark WURCS.
"""
import pickle
from pathlib import Path

BASE = Path("/work/ratul1/supantha/glycan-SD-VS/bert_training_v3/v3.1_cluster_training")


def _read_exclusion_list(path):
    """Return the set of non-blank, whitespace-stripped lines of the text file *path*."""
    with open(path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return {entry for entry in stripped_lines if entry}


def _load_pickle(path):
    """Unpickle and return the object stored at *path*.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files that come from a trusted source.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


def _save_pickle(obj, path):
    """Pickle *obj* to *path*, creating the parent directory if it is missing."""
    # Without this, writing fails with FileNotFoundError when the output
    # directory has not been created beforehand.
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def _drop_excluded(entries, excluded):
    """Return, in order, the entries whose WURCS value is not in *excluded*.

    The WURCS value of an entry is ``entry['wurcs']`` if that key exists,
    otherwise ``entry['WURCS']``, otherwise ``''``.  Entries must provide a
    dict-style ``.get``; anything else raises ``AttributeError``.
    """
    return [
        entry
        for entry in entries
        if entry.get("wurcs", entry.get("WURCS", "")) not in excluded
    ]


def main(base=BASE):
    """Write copies of the sequence datasets with the listed WURCS removed.

    Reads the exclusion list and the two pickled datasets from below *base*
    and writes the filtered datasets to ``bert_v5b_excluded/data`` below
    *base*, printing a short progress report along the way.

    Args:
        base: Root directory holding ``bench/`` and ``data/``; defaults to
            the module-level ``BASE``.
    """
    # Load exclusion list
    excluded_wurcs = _read_exclusion_list(
        base / "bench/data_leakage_analysis/leaked_wurcs_list.txt"
    )
    print(f"Loaded {len(excluded_wurcs)} WURCS to exclude")

    # Load original sequences
    sequences = _load_pickle(base / "data/sequences.pkl")
    print(f"Original dataset: {len(sequences)} sequences")

    # Filter - assuming sequences is a list of dicts with 'wurcs' key
    filtered = _drop_excluded(sequences, excluded_wurcs)
    print(f"After exclusion: {len(filtered)} sequences")
    print(f"Removed: {len(sequences) - len(filtered)} sequences")

    # Save
    output_path = base / "bert_v5b_excluded/data/sequences_excluded.pkl"
    _save_pickle(filtered, output_path)
    print(f"Saved to {output_path}")

    # Also filter BPE sequences
    sequences_bpe = _load_pickle(base / "data/sequences_bpe.pkl")
    print(f"Original BPE dataset: {len(sequences_bpe)} sequences")

    # Inspect the first entry to decide whether entries can be matched by
    # their wurcs field; an empty dataset has no first entry to inspect.
    has_entries = len(sequences_bpe) > 0
    first_is_dict = has_entries and isinstance(sequences_bpe[0], dict)
    if first_is_dict:
        sample_keys = sequences_bpe[0].keys()
    elif has_entries:
        sample_keys = "not a dict"
    else:
        sample_keys = "empty dataset"
    print(f"Sample BPE entry keys: {sample_keys}")

    if has_entries and not first_is_dict:
        # No wurcs field to match on: leave the BPE data alone, but say so
        # instead of finishing silently without an output file.
        print("BPE entries are not dicts; skipping BPE exclusion")
        return

    filtered_bpe = _drop_excluded(sequences_bpe, excluded_wurcs)
    print(f"After BPE exclusion: {len(filtered_bpe)} sequences")

    output_bpe = base / "bert_v5b_excluded/data/sequences_bpe_excluded.pkl"
    _save_pickle(filtered_bpe, output_bpe)
    print(f"Saved BPE to {output_bpe}")


if __name__ == "__main__":
    main()