File size: 1,226 Bytes
7111e1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import json, os

# Maps any image path to its form subfolder.
# History: an earlier version only handled form1a/form2a and missed
# form3a and form90.
def detect_folder(image_path):
    """Return the name of the form subfolder mentioned in *image_path*.

    The known form names are tried in a fixed order ('form1a', 'form2a',
    'form3a', 'form90'); the first one that occurs as a substring anywhere
    in the path is returned. If none of them occurs, 'form1a' is returned
    as the fallback.
    """
    known_forms = ('form1a', 'form2a', 'form3a', 'form90')
    # next() with a default yields the first match, or the fallback when the
    # generator is exhausted without one.
    return next((name for name in known_forms if name in image_path), 'form1a')

# Rewrite each split's annotation file in place so that every kept entry uses
# the keys 'image_path' / 'text' and an image path of the form
# '<form folder>/<filename>'. Entries without a usable image path or text are
# dropped and counted as skipped.
for split in ['train', 'val']:
    ann_file = f'data/{split}_annotations.json'
    if not os.path.exists(ann_file):
        print(f'SKIP: {ann_file} not found')
        continue

    # Explicit UTF-8 so reading does not depend on the platform's default
    # locale encoding.
    with open(ann_file, encoding='utf-8') as f:
        data = json.load(f)

    # Anything other than a list cannot be normalised entry by entry; leave
    # the file untouched rather than overwrite it with an empty list.
    if not isinstance(data, list):
        print(f'SKIP: {ann_file} does not contain a JSON list')
        continue

    fixed = []
    skipped = 0
    for d in data:
        # A malformed (non-object) entry is skipped instead of aborting the run.
        if not isinstance(d, dict):
            skipped += 1
            continue

        # Support both old key names ('image'/'label') and new ('image_path'/'text')
        image_val = d.get('image') or d.get('image_path', '')
        text_val = d.get('label') or d.get('text', '')

        if not isinstance(image_val, str) or not image_val or not text_val:
            skipped += 1
            continue

        # Treat backslashes as separators too, so Windows-style paths are
        # reduced to their filename even when this runs on POSIX.
        filename = os.path.basename(image_val.replace('\\', '/'))
        folder = detect_folder(image_val)
        fixed.append({'image_path': f'{folder}/{filename}', 'text': text_val})

    # Write to a sibling temp file and swap it in afterwards: opening the
    # annotation file itself with 'w' would truncate it before the dump, so a
    # failure while writing would lose the original data.
    tmp_file = f'{ann_file}.tmp'
    with open(tmp_file, 'w', encoding='utf-8') as f:
        json.dump(fixed, f, indent=2)
    os.replace(tmp_file, ann_file)

    print(f'{split}: {len(fixed)} fixed, {skipped} skipped')

print('Done!')