Spaces:
Sleeping
Sleeping
File size: 2,250 Bytes
#!/usr/bin/env python3
"""Diagnose logement field extraction failures."""
import json
from pathlib import Path
from collections import Counter
# Check label mappings: list every field label whose name contains "log",
# together with its index in the 'field_labels' list.
# The encoding is passed explicitly so the file is decoded as UTF-8 (like the
# other JSON inputs read by this script) rather than with the platform's
# locale-dependent default, which can garble non-ASCII label names.
with open('data2/label_mappings.json', encoding='utf-8') as f:
    mappings = json.load(f)
labels = mappings['field_labels']
print('Field labels with "log":')
for i, label in enumerate(labels):
    if 'log' in label.lower():
        print(f' {i}: {label}')
# Check sample annotations: count the training records that carry at least one
# box label containing "log" and print details for the first three of them.
print('\n' + '='*60)
print('Sample records with logement fields:')
print('='*60)
data = json.loads(Path('data_combined/combined_train_v2.json').read_text(encoding='utf-8'))
count = 0
for r in data:
    # A missing, null or empty 'box_labels' entry means the record has no match.
    box_labels = r.get('box_labels') or []
    if not any('log' in b.lower() for b in box_labels):
        continue
    count += 1
    if count > 3:  # Only the first 3 matching records are shown in detail
        continue
    print(f'\n Record {count}:')
    print(f' image_file: {r.get("image_file")}')
    print(f' doc_class: {r.get("doc_class")}')
    # Print the logement-related annotations; zip() stops at the shortest of
    # the three parallel lists.
    for label, lid, bbox in zip(box_labels, r.get('box_label_ids',[]), r.get('boxes',[])):
        if 'log' in label.lower():
            print(f' {label} (id={lid}): bbox={bbox}')
    # Print the beginning of the OCR text (at most 300 characters).
    # `or ''` also covers a record whose 'ocr_text' is present but null:
    # dict.get() only applies its default when the key is missing, and
    # len(None) would raise TypeError.
    ocr = r.get('ocr_text') or ''
    if len(ocr) > 300:
        print(f' ocr_text (first 300 chars): {ocr[:300]}...')
    else:
        print(f' ocr_text: {ocr}')
print(f'\nTotal records with logement fields: {count}')
# Check training progress on these fields: print every metric whose name
# contains both "log" (case-insensitive) and "span_f1", for the first and the
# last log entries that include 'eval_macro_span_f1'.
print('\n' + '='*60)
print('Training performance on logement fields:')
print('='*60)
state_path = Path('models/extractor_v3/checkpoint-645/trainer_state.json')
trainer_state = json.loads(state_path.read_text(encoding='utf-8'))
evals = [entry for entry in trainer_state['log_history'] if 'eval_macro_span_f1' in entry]
if evals:
    first, last = evals[0], evals[-1]
    # Same report for both entries; with a single evaluation the two sections
    # print the same values.
    for heading, metrics in (
        ('\nEpoch 1 (first eval):', first),
        ('\nFinal epoch (last eval):', last),
    ):
        print(heading)
        for metric_name, metric_value in sorted(metrics.items()):
            if 'span_f1' in metric_name and 'log' in metric_name.lower():
                print(f' {metric_name}: {metric_value}')
|