File size: 2,250 Bytes
33ddb61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python3
"""Diagnose logement field extraction failures."""
import json
from pathlib import Path
from collections import Counter

# List every field label whose name contains "log" (case-insensitive),
# together with its index in the label mapping.
# The encoding is given explicitly, like the other JSON reads in this script,
# so the result does not depend on the platform's locale encoding.
with open('data2/label_mappings.json', encoding='utf-8') as f:
    mappings = json.load(f)

labels = mappings['field_labels']
print('Field labels with "log":')
for i, label in enumerate(labels):
    if 'log' in label.lower():
        print(f'  {i}: {label}')

# Count the training records that carry at least one box label containing
# "log" (case-insensitive) and print details for the first three of them.
print('\n' + '='*60)
print('Sample records with logement fields:')
print('='*60)

data = json.loads(Path('data_combined/combined_train_v2.json').read_text(encoding='utf-8'))
count = 0
for r in data:
    # `or []` covers both a missing key and a key that is present but null/empty.
    box_labels = r.get('box_labels') or []
    if not any('log' in b.lower() for b in box_labels):
        continue

    count += 1
    if count > 3:  # Only the first 3 matches are printed; the rest are just counted
        continue

    print(f'\n Record {count}:')
    print(f'   image_file: {r.get("image_file")}')
    print(f'   doc_class: {r.get("doc_class")}')

    # Print each matching annotation with its label id and bounding box.
    # zip() stops at the shortest of the three lists.
    for label, lid, bbox in zip(box_labels, r.get('box_label_ids',[]), r.get('boxes',[])):
        if 'log' in label.lower():
            print(f'   {label} (id={lid}): bbox={bbox}')

    # Print the start of the OCR text (at most 300 characters).
    # `or ''` guards against a null ocr_text, which would break len().
    ocr = r.get('ocr_text') or ''
    if len(ocr) > 300:
        print(f'   ocr_text (first 300 chars): {ocr[:300]}...')
    else:
        print(f'   ocr_text: {ocr}')

print(f'\nTotal records with logement fields: {count}')

# Report the span-F1 metrics whose name contains "log" for the first and the
# last evaluation entries recorded in the checkpoint's trainer state.
print('\n' + '='*60)
print('Training performance on logement fields:')
print('='*60)

state_path = Path('models/extractor_v3/checkpoint-645/trainer_state.json')
trainer_state = json.loads(state_path.read_text(encoding='utf-8'))

# Keep only the log entries that carry evaluation metrics.
evals = [entry for entry in trainer_state['log_history'] if 'eval_macro_span_f1' in entry]
if evals:
    snapshots = (
        ('\nEpoch 1 (first eval):', evals[0]),
        ('\nFinal epoch (last eval):', evals[-1]),
    )
    for heading, snapshot in snapshots:
        print(heading)
        # Metrics are listed in sorted key order.
        for metric in sorted(snapshot):
            if 'span_f1' in metric and 'log' in metric.lower():
                print(f'  {metric}: {snapshot[metric]}')