"""Remove train-test leakage from dnrti_5class.jsonl.

Checks prefix-80 and full-text overlap against all test sets.

Every input is read as JSON Lines: one JSON object per line, each carrying a
"text" field. A training record is dropped when its first ``PREFIX_LEN``
characters, or its complete text, match any record in any test set. Surviving
training lines are copied to ``OUTPUT`` unchanged.
"""
import json


TRAIN = "/home/ubuntu/alkyline/data/processed/dnrti_5class.jsonl"
OUTPUT = "/home/ubuntu/alkyline/data/processed/dnrti_5class_deleaked.jsonl"

TEST_SETS = [
    "/home/ubuntu/alkyline/data/processed/enriched_5class_test.jsonl",
    "/home/ubuntu/alkyline/data/processed/cyner_test.jsonl",
    "/home/ubuntu/alkyline/data/processed/securebert2_5class_test.jsonl",
    "/home/ubuntu/alkyline/data/processed/aptner_5class_test_clean.jsonl",
]

# Number of leading characters used for the prefix fingerprint.
PREFIX_LEN = 80


def load_test_fingerprints(paths, prefix_len=PREFIX_LEN):
    """Collect the prefix and full-text fingerprints of all test records.

    Args:
        paths: Iterable of JSONL file paths; every non-blank line must be a
            JSON object with a "text" key.
        prefix_len: How many leading characters form the prefix fingerprint.

    Returns:
        A ``(prefixes, fulltexts)`` tuple of sets of strings.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "text" field.
    """
    prefixes = set()
    fulltexts = set()
    for path in paths:
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(path, encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing empty line) instead of
                # letting json.loads fail on them.
                if not line.strip():
                    continue
                text = json.loads(line)["text"]
                prefixes.add(text[:prefix_len])
                fulltexts.add(text)
    return prefixes, fulltexts


def remove_leakage(train_path, output_path, prefixes, fulltexts,
                   prefix_len=PREFIX_LEN):
    """Copy the training records that do not overlap with the test sets.

    Args:
        train_path: JSONL file holding the training records.
        output_path: Destination file; created or truncated.
        prefixes: Prefix fingerprints of the test records.
        fulltexts: Complete texts of the test records.
        prefix_len: Prefix length; must match the one used for ``prefixes``.

    Returns:
        A ``(kept, removed)`` tuple of record counts. Blank lines are skipped
        and counted in neither.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "text" field.
    """
    kept = 0
    removed = 0
    with open(train_path, encoding="utf-8") as fin, \
            open(output_path, "w", encoding="utf-8") as fout:
        for line in fin:
            if not line.strip():
                continue
            text = json.loads(line)["text"]
            # When both sets come from load_test_fingerprints with the same
            # prefix_len, a full-text hit implies a prefix hit; the second
            # test is kept so the function also works with independently
            # built sets.
            if text[:prefix_len] in prefixes or text in fulltexts:
                removed += 1
            else:
                # Write the raw line so kept records stay byte-identical.
                fout.write(line)
                kept += 1
    return kept, removed


def main():
    """Run the de-leakage over the configured paths and print a summary."""
    test_prefix80, test_fulltext = load_test_fingerprints(TEST_SETS)
    print(f"Test fingerprints: {len(test_prefix80)} prefix-{PREFIX_LEN}, {len(test_fulltext)} full-text")

    kept, removed = remove_leakage(TRAIN, OUTPUT, test_prefix80, test_fulltext)

    total = kept + removed
    print(f"Original: {total}")
    print(f"Removed: {removed}")
    print(f"Kept: {kept}")
    print(f"Output: {OUTPUT}")


if __name__ == "__main__":
    main()