File size: 1,395 Bytes
#!/usr/bin/env python3
"""Remove train-test leakage from dnrti_5class.jsonl.

Every input file is JSON Lines: one JSON object per line, each carrying a
"text" key.  A training record is dropped when its first 80 characters, or
its full text, also occur in any of the test sets; all other records are
copied unchanged to the output file.
"""
import json

TRAIN = "/home/ubuntu/alkyline/data/processed/dnrti_5class.jsonl"
OUTPUT = "/home/ubuntu/alkyline/data/processed/dnrti_5class_deleaked.jsonl"
TEST_SETS = [
    "/home/ubuntu/alkyline/data/processed/enriched_5class_test.jsonl",
    "/home/ubuntu/alkyline/data/processed/cyner_test.jsonl",
    "/home/ubuntu/alkyline/data/processed/securebert2_5class_test.jsonl",
    "/home/ubuntu/alkyline/data/processed/aptner_5class_test_clean.jsonl",
]

# Number of leading characters used as the "prefix" fingerprint.
PREFIX_LEN = 80


def _iter_records(handle):
    """Yield ``(raw_line, text)`` for every non-blank JSONL line in *handle*.

    Blank lines (for example a trailing empty line) are skipped instead of
    being handed to ``json.loads``, which would raise on them.
    """
    for raw_line in handle:
        if not raw_line.strip():
            continue
        yield raw_line, json.loads(raw_line)["text"]


def load_test_fingerprints(paths, prefix_len=PREFIX_LEN):
    """Collect the fingerprints of every test record found in *paths*.

    Args:
        paths: Iterable of JSONL file paths to read.
        prefix_len: Number of leading characters kept as the prefix
            fingerprint.

    Returns:
        A ``(prefixes, fulltexts)`` tuple of two sets of strings: the
        truncated texts and the complete texts.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "text" key.
    """
    prefixes = set()
    fulltexts = set()
    for path in paths:
        # Explicit UTF-8 so decoding does not depend on the host locale.
        with open(path, encoding="utf-8") as f:
            for _, text in _iter_records(f):
                prefixes.add(text[:prefix_len])
                fulltexts.add(text)
    return prefixes, fulltexts


def filter_leaked(train_path, output_path, prefixes, fulltexts,
                  prefix_len=PREFIX_LEN):
    """Copy the non-leaked records of *train_path* to *output_path*.

    A record counts as leaked when its prefix is in *prefixes* or its full
    text is in *fulltexts*.  Kept lines are written back verbatim, so their
    original JSON formatting is preserved.

    Args:
        train_path: JSONL file to filter.
        output_path: Destination file; overwritten if it already exists.
        prefixes: Set of prefix fingerprints from the test sets.
        fulltexts: Set of full texts from the test sets.
        prefix_len: Prefix length; must match the one used to build
            *prefixes*.

    Returns:
        A ``(kept, removed)`` tuple of record counts.  Blank lines are
        counted in neither.
    """
    kept = 0
    removed = 0
    with open(train_path, encoding="utf-8") as fin, \
            open(output_path, "w", encoding="utf-8") as fout:
        for raw_line, text in _iter_records(fin):
            # When both sets come from load_test_fingerprints with the same
            # prefix_len, an exact match always shares its prefix as well;
            # the full-text test is kept as an explicit safeguard.
            if text[:prefix_len] in prefixes or text in fulltexts:
                removed += 1
            else:
                fout.write(raw_line)
                kept += 1
    return kept, removed


def main():
    """Build the test fingerprints, filter TRAIN into OUTPUT, print a summary."""
    # Build test fingerprint sets
    test_prefix80, test_fulltext = load_test_fingerprints(TEST_SETS)
    print(f"Test fingerprints: {len(test_prefix80)} prefix-80, {len(test_fulltext)} full-text")

    # Filter
    kept, removed = filter_leaked(TRAIN, OUTPUT, test_prefix80, test_fulltext)
    total = kept + removed
    print(f"Original: {total}")
    print(f"Removed: {removed}")
    print(f"Kept: {kept}")
    print(f"Output: {OUTPUT}")


if __name__ == "__main__":
    main()
|