FiberGate / tools /debug_logement.py
AzizMiladi's picture
chore: git mv scripts, UI, dev tools, docs into folders
70c46cc
Raw
History Blame
2.25 kB
#!/usr/bin/env python3
"""Diagnose logement field extraction failures."""
import json
from pathlib import Path
from collections import Counter
# Check label mappings: list every field label whose name contains "log",
# together with its index in the `field_labels` list.
# The file is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding (the other JSON inputs of this script are read
# as UTF-8 as well).
with open('data2/label_mappings.json', encoding='utf-8') as f:
    mappings = json.load(f)
labels = mappings['field_labels']
print('Field labels with "log":')
for i, field_label in enumerate(labels):
    # Case-insensitive substring match.
    if 'log' in field_label.lower():
        print(f' {i}: {field_label}')
# Check sample annotations: count the training records that carry at least one
# box label containing "log" and dump details for the first three of them.
print('\n' + '='*60)
print('Sample records with logement fields:')
print('='*60)
data = json.loads(Path('data_combined/combined_train_v2.json').read_text(encoding='utf-8'))
count = 0
for r in data:
    # `or []` covers both a missing key and a key that is present but null.
    box_labels = r.get('box_labels') or []
    if not any('log' in b.lower() for b in box_labels):
        continue
    count += 1
    if count > 3:  # Only the first 3 matches are printed; the rest are just counted.
        continue
    print(f'\n Record {count}:')
    print(f' image_file: {r.get("image_file")}')
    print(f' doc_class: {r.get("doc_class")}')
    # Find logement-related annotations. zip() stops at the shortest input, so
    # a record without ids or boxes prints no annotation lines.
    for label, lid, bbox in zip(box_labels, r.get('box_label_ids') or [], r.get('boxes') or []):
        if 'log' in label.lower():
            print(f' {label} (id={lid}): bbox={bbox}')
    # Print the beginning of the OCR text (truncated to 300 characters).
    # A null `ocr_text` is treated as empty instead of crashing on len(None).
    ocr = r.get('ocr_text') or ''
    if len(ocr) > 300:
        print(f' ocr_text (first 300 chars): {ocr[:300]}...')
    else:
        print(f' ocr_text: {ocr}')
print(f'\nTotal records with logement fields: {count}')
# Check training progress on these fields: compare the "log"-related span-F1
# metrics of the first and the last evaluation entries recorded in the
# checkpoint's trainer_state.json.
print('\n' + '='*60)
print('Training performance on logement fields:')
print('='*60)
trainer_state = json.loads(Path('models/extractor_v3/checkpoint-645/trainer_state.json').read_text(encoding='utf-8'))
# Keep only the log entries that carry evaluation metrics.
evals = [x for x in trainer_state['log_history'] if 'eval_macro_span_f1' in x]
if evals:
    first = evals[0]
    last = evals[-1]
    # Both reports use the same filter, so they share one loop.
    for heading, entry in (
        ('\nEpoch 1 (first eval):', first),
        ('\nFinal epoch (last eval):', last),
    ):
        print(heading)
        for k, v in sorted(entry.items()):
            if 'log' in k.lower() and 'span_f1' in k:
                print(f' {k}: {v}')
else:
    # Make the empty case visible instead of ending silently after the header.
    print('\nNo evaluation entries with eval_macro_span_f1 found in log_history.')