# Report entity-label statistics for each combined dataset split in data2/.
#
# For every split file that exists, prints the number of records, how many
# records contain at least one non-zero box label id, the number of boxes,
# and the percentage of boxes whose label id is non-zero.
import json
from pathlib import Path


def count_entity_stats(records):
    """Tally box-label statistics over an iterable of record dicts.

    Each record's ``box_label_ids`` value is read as a sequence of label
    ids. An id equal to 0 is counted as a plain box; any other id is
    counted as an entity box. A record whose ``box_label_ids`` is missing
    or ``None`` (JSON ``null``) contributes no boxes instead of raising.

    Args:
        records: Iterable of dict-like records supporting ``.get``.

    Returns:
        A ``(with_labels, total_boxes, entity_boxes)`` tuple: the number of
        records with at least one non-zero id, the number of ids seen, and
        the number of non-zero ids seen.
    """
    with_labels = 0
    total_boxes = 0
    entity_boxes = 0
    for record in records:
        # `or []` also covers an explicit null, which .get()'s default
        # does not (the default only applies when the key is absent).
        box_ids = record.get('box_label_ids') or []
        # Count non-zero ids once; the per-record flag derives from it.
        n_entities = sum(1 for lid in box_ids if lid != 0)
        total_boxes += len(box_ids)
        entity_boxes += n_entities
        if n_entities:
            with_labels += 1
    return with_labels, total_boxes, entity_boxes


for split in ['combined_train.json', 'combined_val.json', 'combined_test.json']:
    path = Path('data2') / split
    # EAFP: a missing split is skipped silently, with no exists()/open() race.
    try:
        with open(path, encoding='utf-8') as f:
            records = json.load(f)
    except FileNotFoundError:
        continue

    total = len(records)
    with_labels, total_boxes, entity_boxes = count_entity_stats(records)

    print(f'\n{split}:')
    print(f' Records: {total} total, {with_labels} with entities')
    print(f' Boxes: {total_boxes} total, {entity_boxes} entity boxes')
    if total > 0:
        # Guard the division: records may exist while holding no boxes at all.
        print(f' Entity rate: {100*entity_boxes/total_boxes if total_boxes > 0 else 0:.2f}%')