#!/usr/bin/env python3
"""Diagnose logement field extraction failures.

Prints three reports to stdout:

1. the entries of ``field_labels`` (from the label-mappings JSON) whose name
   contains "log",
2. the first few training records that carry at least one such label,
   followed by the total number of matching records,
3. the per-field ``span_f1`` metrics whose key contains "log", taken from
   the first and the last evaluation entries of a trainer-state JSON.

"Logement" matching throughout is a case-insensitive substring test for
"log".
"""
import json
from pathlib import Path
from collections import Counter

# Default input locations, resolved relative to the current working directory.
_MAPPINGS_PATH = 'data2/label_mappings.json'
_TRAIN_PATH = 'data_combined/combined_train_v2.json'
_TRAINER_STATE_PATH = 'models/extractor_v3/checkpoint-645/trainer_state.json'

_MAX_SAMPLES_SHOWN = 3    # number of matching records printed in detail
_OCR_PREVIEW_CHARS = 300  # OCR text longer than this is truncated when printed
_BANNER_WIDTH = 60


def _is_logement(text):
    """Return True when *text* is a string containing "log" (any case)."""
    # Non-string values (e.g. a JSON null label) never match instead of
    # raising AttributeError on ``.lower()``.
    return isinstance(text, str) and 'log' in text.lower()


def _load_json(path):
    """Parse the JSON document stored at *path*, always decoding as UTF-8."""
    return json.loads(Path(path).read_text(encoding='utf-8'))


def _print_banner(title):
    """Print a section title framed by two rules of ``=`` characters."""
    print('\n' + '=' * _BANNER_WIDTH)
    print(title)
    print('=' * _BANNER_WIDTH)


def _report_field_labels(labels):
    """Print the index and name of every label that matches "log"."""
    print('Field labels with "log":')
    for index, label in enumerate(labels):
        if _is_logement(label):
            print(f' {index}: {label}')


def _report_sample_records(records):
    """Print details for the first matching records and return the match count.

    A record matches when any entry of its ``box_labels`` matches "log".
    Only the first ``_MAX_SAMPLES_SHOWN`` matches are printed in detail; all
    matches are counted.
    """
    _print_banner('Sample records with logement fields:')
    count = 0
    for record in records:
        # ``or []`` also covers keys that are present but null.
        box_labels = record.get('box_labels') or []
        if not any(_is_logement(label) for label in box_labels):
            continue
        count += 1
        if count > _MAX_SAMPLES_SHOWN:
            continue

        print(f'\n Record {count}:')
        print(f' image_file: {record.get("image_file")}')
        print(f' doc_class: {record.get("doc_class")}')

        # Logement-related annotations; zip stops at the shortest of the
        # three parallel lists.
        annotations = zip(
            box_labels,
            record.get('box_label_ids') or [],
            record.get('boxes') or [],
        )
        for label, label_id, bbox in annotations:
            if _is_logement(label):
                print(f' {label} (id={label_id}): bbox={bbox}')

        # Preview of the start of the OCR text (not positioned around the
        # field). A null ``ocr_text`` is shown as empty rather than crashing.
        ocr = record.get('ocr_text') or ''
        if len(ocr) > _OCR_PREVIEW_CHARS:
            print(f' ocr_text (first {_OCR_PREVIEW_CHARS} chars): '
                  f'{ocr[:_OCR_PREVIEW_CHARS]}...')
        else:
            print(f' ocr_text: {ocr}')

    print(f'\nTotal records with logement fields: {count}')
    return count


def _report_training(log_history):
    """Print logement ``span_f1`` metrics from the first and last evaluations.

    Evaluation entries are the items of *log_history* that contain the key
    ``eval_macro_span_f1``.
    """
    _print_banner('Training performance on logement fields:')
    evals = [entry for entry in log_history if 'eval_macro_span_f1' in entry]
    if not evals:
        # Say so explicitly instead of leaving the section silently empty.
        print(' No evaluation entries with "eval_macro_span_f1" found.')
        return

    sections = (
        ('\nEpoch 1 (first eval):', evals[0]),
        ('\nFinal epoch (last eval):', evals[-1]),
    )
    for heading, entry in sections:
        print(heading)
        for key, value in sorted(entry.items()):
            if _is_logement(key) and 'span_f1' in key:
                print(f' {key}: {value}')


def main(
    mappings_path=_MAPPINGS_PATH,
    train_path=_TRAIN_PATH,
    trainer_state_path=_TRAINER_STATE_PATH,
):
    """Run the three diagnostic reports and print them to stdout.

    Args:
        mappings_path: JSON file with a ``field_labels`` list.
        train_path: JSON file holding a list of annotated records.
        trainer_state_path: JSON file with a ``log_history`` list.

    Raises:
        OSError: if an input file cannot be read.
        json.JSONDecodeError: if an input file is not valid JSON.
        KeyError: if ``field_labels`` or ``log_history`` is missing.
    """
    _report_field_labels(_load_json(mappings_path)['field_labels'])
    _report_sample_records(_load_json(train_path))
    _report_training(_load_json(trainer_state_path)['log_history'])


if __name__ == '__main__':
    main()