FiberGate / tools /check_data.py
AzizMiladi's picture
chore: git mv scripts, UI, dev tools, docs into folders
70c46cc
Raw
History Blame
909 Bytes
import json
from pathlib import Path
def summarize_records(records):
    """Count records and boxes that carry a non-zero (entity) label.

    A box is counted as an entity box when its label id is not ``0``.
    A record whose ``box_label_ids`` entry is missing, ``None`` or empty
    contributes no boxes and is not counted as having entities.

    Args:
        records: Sequence of dict-like records, each optionally holding a
            ``box_label_ids`` list of label ids.

    Returns:
        A ``(total, with_labels, total_boxes, entity_boxes)`` tuple:
        number of records, number of records with at least one non-zero
        label, number of boxes overall, and number of non-zero boxes.
    """
    total = len(records)
    with_labels = 0
    total_boxes = 0
    entity_boxes = 0
    for record in records:
        # `or []` also covers an explicit JSON null, which a plain
        # `.get(key, [])` default would let through as None and crash len().
        box_ids = record.get('box_label_ids') or []
        # Walk the labels once; the count doubles as the "has entities" flag.
        n_entities = sum(1 for lid in box_ids if lid != 0)
        total_boxes += len(box_ids)
        entity_boxes += n_entities
        if n_entities:
            with_labels += 1
    return total, with_labels, total_boxes, entity_boxes


# Report label statistics for every split file found under data2/.
for split in ['combined_train.json', 'combined_val.json', 'combined_test.json']:
    path = Path('data2') / split
    # Splits that have not been generated are skipped without output.
    if not path.exists():
        continue
    with open(path, encoding='utf-8') as f:
        records = json.load(f)
    total, with_labels, total_boxes, entity_boxes = summarize_records(records)
    print(f'\n{split}:')
    print(f' Records: {total} total, {with_labels} with entities')
    print(f' Boxes: {total_boxes} total, {entity_boxes} entity boxes')
    if total > 0:
        # The inner guard avoids dividing by zero when no record has boxes.
        print(f' Entity rate: {100*entity_boxes/total_boxes if total_boxes > 0 else 0:.2f}%')