File size: 1,472 Bytes
# src/data/leakage.py
#
# Verifies zero overlap between training PDB IDs and CASF-2016.
# Saves a supplementary CSV confirming each CASF complex was not in training.
# This CSV goes directly into the paper as Supplementary Table S1.
import pandas as pd
from pathlib import Path
def _normalize_pdb_ids(ids: pd.Series) -> set:
    """Return the set of cleaned PDB IDs (stripped, lower-cased) from *ids*."""
    # Drop missing values first so they never reach the string operations or
    # end up as a non-string member of the set (which would break sorting).
    cleaned = ids.dropna().astype(str).str.strip().str.lower()
    # Whitespace-only cells collapse to '' after stripping; they are not IDs.
    return set(cleaned[cleaned != ''])


def check_leakage(train_df: pd.DataFrame,
                  casf_df: pd.DataFrame,
                  out_path: Path) -> int:
    """
    Compare PDB IDs between the training and CASF test sets.

    IDs are read from the 'pdb_id' column of each frame and compared
    case-insensitively, ignoring surrounding whitespace and missing values.

    A report CSV with one row per unique CASF ID (columns 'PDB_ID' and
    'In_Training') is written to out_path; missing parent directories are
    created. A summary is printed to stdout.

    Args:
        train_df: Training complexes; must contain a 'pdb_id' column.
        casf_df: CASF complexes; must contain a 'pdb_id' column.
        out_path: Destination of the report CSV.

    Returns:
        Number of PDB IDs present in both sets (should be 0).
    """
    train_ids = _normalize_pdb_ids(train_df['pdb_id'])
    casf_ids = _normalize_pdb_ids(casf_df['pdb_id'])
    overlap = train_ids & casf_ids

    # Build supplementary table. Columns are given explicitly so the header
    # is still written when there are no CASF IDs at all.
    ordered_casf = sorted(casf_ids)
    report = pd.DataFrame(
        {
            'PDB_ID': [pid.upper() for pid in ordered_casf],
            'In_Training': [
                'Yes' if pid in train_ids else 'No' for pid in ordered_casf
            ],
        },
        columns=['PDB_ID', 'In_Training'],
    )

    # Make sure the destination directory exists before writing.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    report.to_csv(out_path, index=False)

    print("\nLeakage Check:")
    print(f" Training complexes: {len(train_ids)}")
    print(f" CASF complexes: {len(casf_ids)}")
    print(f" Overlap: {len(overlap)}")
    if overlap:
        print("\n WARNING — overlapping PDB IDs:")
        for pid in sorted(overlap):
            print(f" {pid}")
    else:
        print(" Result: CLEAN — zero overlap confirmed")
    print(f" Report saved: {out_path}")
    return len(overlap)
|