File size: 1,528 Bytes
33ddb61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import json, random
from collections import defaultdict

# Seed the RNG so the shuffle below, and therefore the split, is reproducible.
random.seed(42)

with open('data2/combined_annotations.json', encoding='utf-8') as src:
    all_records = json.load(src)

# Bucket records by source PDF. The PDF id is the part of 'image_file'
# before its last '_p' (the whole value when '_p' does not occur).
pdf_groups = defaultdict(list)
for record in all_records:
    pdf_id, *_ = record['image_file'].rsplit('_p', 1)
    pdf_groups[pdf_id].append(record)

# Shuffle the PDF ids (not the individual records) so that all records of
# one PDF end up in the same split.
pdfs = list(pdf_groups)
random.shuffle(pdfs)

# 70/15/15 split at the PDF level; cut points are truncated to whole PDFs.
n = len(pdfs)
train_end = int(n * 0.70)
val_end = int(n * 0.85)
train_pdfs = pdfs[:train_end]
val_pdfs = pdfs[train_end:val_end]
test_pdfs = pdfs[val_end:]

def flatten(pdf_list):
    """Return the records of every PDF in *pdf_list* as one flat list.

    Records are looked up in the module-level ``pdf_groups`` mapping and
    concatenated in the order the PDF ids appear in *pdf_list*.
    """
    records = []
    for pdf_id in pdf_list:
        records.extend(pdf_groups[pdf_id])
    return records

# Expand each PDF-level split into its flat list of records.
train = flatten(train_pdfs)
val   = flatten(val_pdfs)
test  = flatten(test_pdfs)

# Write each split to its own JSON file. Each file is opened in a `with`
# block so it is flushed and closed as soon as its dump finishes, instead
# of leaving the handle to be closed whenever it is garbage-collected.
for out_path, split_records in (
    ('data_combined/combined_train_v2.json', train),
    ('data_combined/combined_val_v2.json', val),
    ('data_combined/combined_test_v2.json', test),
):
    with open(out_path, 'w', encoding='utf-8') as out_file:
        json.dump(split_records, out_file, ensure_ascii=False, indent=2)

print(f"Train: {len(train)} records | Val: {len(val)} | Test: {len(test)}")

# Verify no contamination: no PDF id may appear in more than one split.
# The overlaps are only reported; a non-zero count does not stop the script.
train_pdfs_set = set(train_pdfs)
val_pdfs_set   = set(val_pdfs)
test_pdfs_set  = set(test_pdfs)
print(f"train∩val overlap:  {len(train_pdfs_set & val_pdfs_set)} PDFs (should be 0)")
print(f"train∩test overlap: {len(train_pdfs_set & test_pdfs_set)} PDFs (should be 0)")
print(f"val∩test overlap:   {len(val_pdfs_set & test_pdfs_set)} PDFs (should be 0)")