File size: 909 Bytes
33ddb61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import json
from pathlib import Path

def summarize_records(records):
    """Tally record and box counts for one loaded split.

    A box counts as an entity box when its label id is not 0; a record
    counts as "with entities" when it holds at least one such box.

    Args:
        records: List of dict records. Each may carry a ``box_label_ids``
            list; a missing key or a ``None`` value is treated as no boxes.

    Returns:
        Tuple ``(total, with_labels, total_boxes, entity_boxes)``.
    """
    total = len(records)
    with_labels = 0
    total_boxes = 0
    entity_boxes = 0

    for record in records:
        # `or []` also covers an explicit JSON null, which would otherwise
        # make len() raise TypeError and abort the whole report.
        box_ids = record.get('box_label_ids') or []
        total_boxes += len(box_ids)
        # Single pass per record: the non-zero count doubles as the
        # "has any entity" test.
        entities = sum(1 for lid in box_ids if lid != 0)
        if entities:
            with_labels += 1
            entity_boxes += entities

    return total, with_labels, total_boxes, entity_boxes


# Print a short entity-label report for every split file found in data2/.
for split in ['combined_train.json', 'combined_val.json', 'combined_test.json']:
    path = Path('data2') / split
    if not path.exists():
        # Missing splits are skipped silently so a partial dataset still reports.
        continue

    with open(path, encoding='utf-8') as f:
        records = json.load(f)

    total, with_labels, total_boxes, entity_boxes = summarize_records(records)

    print(f'\n{split}:')
    print(f'  Records: {total} total, {with_labels} with entities')
    print(f'  Boxes: {total_boxes} total, {entity_boxes} entity boxes')
    if total > 0:
        # Guard the division: records may exist without any boxes at all.
        rate = 100 * entity_boxes / total_boxes if total_boxes > 0 else 0
        print(f'  Entity rate: {rate:.2f}%')