Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """Diagnose logement field extraction failures.""" | |
| import json | |
| from pathlib import Path | |
| from collections import Counter | |
# Check label mappings: list every entry of `field_labels` whose name contains
# "log" (case-insensitive), together with its position in the list.
# The file is opened explicitly as UTF-8 so decoding does not depend on the
# platform's locale default; the other JSON reads in this script already pass
# encoding='utf-8'.
with open('data2/label_mappings.json', encoding='utf-8') as f:
    mappings = json.load(f)
labels = mappings['field_labels']
print('Field labels with "log":')
for idx, name in enumerate(labels):
    if 'log' in name.lower():
        print(f' {idx}: {name}')
# Check sample annotations: scan the combined training set, count the records
# that have at least one box label containing "log" (case-insensitive), and
# print details for the first three of them.
print('\n' + '='*60)
print('Sample records with logement fields:')
print('='*60)
data = json.loads(
    Path('data_combined/combined_train_v2.json').read_text(encoding='utf-8')
)
count = 0
for record in data:
    box_labels = record.get('box_labels')
    # Skip records with no labels, or with no label matching "log".
    if not box_labels or not any('log' in b.lower() for b in box_labels):
        continue
    count += 1
    # Every matching record is counted, but only the first 3 are printed.
    if count > 3:
        continue
    print(f'\n Record {count}:')
    print(f' image_file: {record.get("image_file")}')
    print(f' doc_class: {record.get("doc_class")}')
    # List the matching annotations; labels, ids and boxes are walked in
    # lockstep, so the listing stops at the shortest of the three lists.
    annotations = zip(
        box_labels,
        record.get('box_label_ids', []),
        record.get('boxes', []),
    )
    for name, label_id, box in annotations:
        if 'log' not in name.lower():
            continue
        print(f' {name} (id={label_id}): bbox={box}')
    # Show the start of the record's OCR text, capped at 300 characters.
    text = record.get('ocr_text', '')
    ocr_line = (
        f' ocr_text (first 300 chars): {text[:300]}...'
        if len(text) > 300
        else f' ocr_text: {text}'
    )
    print(ocr_line)
print(f'\nTotal records with logement fields: {count}')
# Check training progress on these fields: read the trainer state saved with
# the checkpoint and print, for the first and the last evaluation entries, all
# metrics whose name contains both "log" (case-insensitive) and "span_f1".
print('\n' + '='*60)
print('Training performance on logement fields:')
print('='*60)
trainer_state = json.loads(
    Path('models/extractor_v3/checkpoint-645/trainer_state.json').read_text(
        encoding='utf-8'
    )
)
# Only log entries that carry the macro span-F1 metric count as evaluations.
evals = [
    entry
    for entry in trainer_state['log_history']
    if 'eval_macro_span_f1' in entry
]
if evals:
    first, last = evals[0], evals[-1]
    report = (
        ('\nEpoch 1 (first eval):', first),
        ('\nFinal epoch (last eval):', last),
    )
    for heading, entry in report:
        print(heading)
        # Metrics are printed in sorted key order.
        for metric, value in sorted(entry.items()):
            if 'span_f1' in metric and 'log' in metric.lower():
                print(f' {metric}: {value}')