arcspan / scripts /deleak_dnrti.py
chairulridjal's picture
Add files using upload-large-folder tool
3dac39e verified
#!/usr/bin/env python3
"""Remove train-test leakage from dnrti_5class.jsonl.
Checks prefix-80 and full-text overlap against all test sets.
"""
import json
TRAIN = "/home/ubuntu/alkyline/data/processed/dnrti_5class.jsonl"
OUTPUT = "/home/ubuntu/alkyline/data/processed/dnrti_5class_deleaked.jsonl"
TEST_SETS = [
    "/home/ubuntu/alkyline/data/processed/enriched_5class_test.jsonl",
    "/home/ubuntu/alkyline/data/processed/cyner_test.jsonl",
    "/home/ubuntu/alkyline/data/processed/securebert2_5class_test.jsonl",
    "/home/ubuntu/alkyline/data/processed/aptner_5class_test_clean.jsonl",
]

# Number of leading characters of "text" used as the near-duplicate fingerprint.
PREFIX_LEN = 80


def _iter_records(handle):
    """Yield each non-blank raw line of an open JSONL file.

    Blank lines (e.g. a stray empty line at the end of the file) are skipped
    so they do not reach ``json.loads`` and abort the whole run.
    """
    for line in handle:
        if line.strip():
            yield line


def build_test_fingerprints(paths, prefix_len=PREFIX_LEN):
    """Collect the fingerprints of every record in the given test files.

    Args:
        paths: Iterable of JSONL file paths; every record must have a
            "text" field.
        prefix_len: Number of leading characters used as the prefix
            fingerprint.

    Returns:
        A ``(prefixes, fulltexts)`` tuple of sets: the first ``prefix_len``
        characters of each text, and each complete text.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "text" field.
    """
    prefixes = set()
    fulltexts = set()
    for path in paths:
        # Explicit UTF-8: the locale default may fail on, or mis-decode,
        # non-ASCII text and thereby change the fingerprints.
        with open(path, encoding="utf-8") as f:
            for line in _iter_records(f):
                text = json.loads(line)["text"]
                prefixes.add(text[:prefix_len])
                fulltexts.add(text)
    return prefixes, fulltexts


def filter_leaked(train_path, output_path, prefixes, fulltexts,
                  prefix_len=PREFIX_LEN):
    """Copy the training records that do not overlap with the test sets.

    A record is dropped when the first ``prefix_len`` characters of its text
    are in ``prefixes`` or its complete text is in ``fulltexts``. Kept lines
    are written to ``output_path`` exactly as they were read.

    Args:
        train_path: JSONL file to filter; every record must have a "text"
            field.
        output_path: Destination JSONL file (overwritten).
        prefixes: Set of test-set prefix fingerprints.
        fulltexts: Set of complete test-set texts.
        prefix_len: Prefix length; must match the one used to build
            ``prefixes``.

    Returns:
        A ``(kept, removed)`` tuple of record counts.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "text" field.
    """
    kept = 0
    removed = 0
    with open(train_path, encoding="utf-8") as fin, \
            open(output_path, "w", encoding="utf-8") as fout:
        for line in _iter_records(fin):
            text = json.loads(line)["text"]
            # The full-text test is implied by the prefix test when both sets
            # were built with the same prefix_len; it is kept as an explicit
            # guard in case the caller supplies differently built sets.
            if text[:prefix_len] in prefixes or text in fulltexts:
                removed += 1
            else:
                fout.write(line)
                kept += 1
    return kept, removed


def main():
    """Deleak TRAIN against TEST_SETS, write OUTPUT, and print a summary."""
    # Build test fingerprint sets
    test_prefix80, test_fulltext = build_test_fingerprints(TEST_SETS)
    print(f"Test fingerprints: {len(test_prefix80)} prefix-80, {len(test_fulltext)} full-text")

    # Filter
    kept, removed = filter_leaked(TRAIN, OUTPUT, test_prefix80, test_fulltext)
    total = kept + removed
    print(f"Original: {total}")
    print(f"Removed: {removed}")
    print(f"Kept: {kept}")
    print(f"Output: {OUTPUT}")


if __name__ == "__main__":
    main()