#!/usr/bin/env python3
"""Remove train-test leakage from dnrti_5class.jsonl.

Checks prefix-80 and full-text overlap against all test sets.

Every record is expected to be one JSON object per line with a "text" key.
A training record is dropped when either its full text or its first
``PREFIX_LEN`` characters match any record from any file in ``TEST_SETS``.
Surviving lines are copied verbatim to ``OUTPUT``.
"""
import json

TRAIN = "/home/ubuntu/alkyline/data/processed/dnrti_5class.jsonl"
OUTPUT = "/home/ubuntu/alkyline/data/processed/dnrti_5class_deleaked.jsonl"
TEST_SETS = [
    "/home/ubuntu/alkyline/data/processed/enriched_5class_test.jsonl",
    "/home/ubuntu/alkyline/data/processed/cyner_test.jsonl",
    "/home/ubuntu/alkyline/data/processed/securebert2_5class_test.jsonl",
    "/home/ubuntu/alkyline/data/processed/aptner_5class_test_clean.jsonl",
]

# Number of leading characters used as the "prefix" fingerprint.
PREFIX_LEN = 80


def _iter_records(path: str):
    """Yield ``(raw_line, text)`` for every non-blank JSONL line in *path*.

    Blank / whitespace-only lines are skipped so that a trailing empty line
    does not abort the run with a ``JSONDecodeError``.  Malformed JSON and
    records without a "text" key still raise, so bad data is not silently
    ignored.
    """
    # Explicit UTF-8: without it the decoding depends on the machine locale.
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            yield raw_line, json.loads(raw_line)["text"]


def build_test_fingerprints(test_paths, prefix_len: int = PREFIX_LEN):
    """Collect the prefix and full-text fingerprints of all test records.

    Args:
        test_paths: iterable of JSONL file paths holding the test sets.
        prefix_len: number of leading characters kept as prefix fingerprint.

    Returns:
        A ``(prefixes, fulltexts)`` tuple of two sets of strings.
    """
    prefixes = set()
    fulltexts = set()
    for path in test_paths:
        for _, text in _iter_records(path):
            prefixes.add(text[:prefix_len])
            fulltexts.add(text)
    return prefixes, fulltexts


def filter_leaked(
    train_path: str,
    output_path: str,
    test_prefixes: set,
    test_fulltexts: set,
    prefix_len: int = PREFIX_LEN,
):
    """Copy the non-leaking records of *train_path* to *output_path*.

    Args:
        train_path: JSONL file with the training records.
        output_path: destination file; it is overwritten.
        test_prefixes: prefix fingerprints of the test records.
        test_fulltexts: full-text fingerprints of the test records.
        prefix_len: prefix length; must match the one used to build
            *test_prefixes*.

    Returns:
        A ``(kept, removed)`` tuple of record counts.  Skipped blank lines
        are counted in neither.
    """
    kept = 0
    removed = 0
    with open(output_path, "w", encoding="utf-8") as fout:
        for raw_line, text in _iter_records(train_path):
            # When both sets come from build_test_fingerprints with the same
            # prefix_len, a full-text hit always implies a prefix hit; the
            # explicit full-text test is kept so the function stays correct
            # for independently supplied sets.
            if text[:prefix_len] in test_prefixes or text in test_fulltexts:
                removed += 1
            else:
                # Write the original line untouched to keep the record
                # byte-for-byte as it appeared in the training file.
                fout.write(raw_line)
                kept += 1
    return kept, removed


def main() -> None:
    """Run the de-leaking pass with the configured paths and print a summary."""
    # Build test fingerprint sets
    test_prefixes, test_fulltexts = build_test_fingerprints(TEST_SETS)
    print(
        f"Test fingerprints: {len(test_prefixes)} prefix-{PREFIX_LEN}, "
        f"{len(test_fulltexts)} full-text"
    )

    # Filter
    kept, removed = filter_leaked(TRAIN, OUTPUT, test_prefixes, test_fulltexts)
    total = kept + removed
    print(f"Original: {total}")
    print(f"Removed: {removed}")
    print(f"Kept: {kept}")
    print(f"Output: {OUTPUT}")


if __name__ == "__main__":
    main()