File size: 1,405 Bytes
be5f706
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
"""Verify generated dataset quality."""
import json
from collections import Counter

# Default dataset location and number of leading samples to print/validate.
DATA_PATH = 'data/synthetic_small.jsonl'
PREVIEW_COUNT = 5


def load_samples(path=DATA_PATH):
    """Load one JSON object per non-blank line from a JSONL file.

    Blank or whitespace-only lines (e.g. a trailing empty line) are skipped
    instead of being handed to ``json.loads``, which would reject them.

    Args:
        path: Location of the JSONL file to read (UTF-8).

    Returns:
        A list with one decoded JSON value per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def find_bio_violations(labels):
    """Return the BIO-format problems found in one label sequence.

    An ``I-X`` label is accepted only when the previous label is ``I-X`` or
    ``B-X``. An ``I-`` label in the first position is reported as an ERROR,
    any other bad transition as a WARN.

    Args:
        labels: Sequence of label strings for a single sample.

    Returns:
        A list of message strings, in position order; empty when the
        sequence is well-formed.
    """
    problems = []
    for j, label in enumerate(labels):
        if not label.startswith('I-'):
            continue
        if j == 0:
            problems.append(f'ERROR: First token is {label}')
            continue
        prev = labels[j - 1]
        # Valid predecessors: the same I- tag, or the B- tag of that entity.
        if prev not in (label, 'B-' + label[2:]):
            problems.append(f'WARN: I- without B- at pos {j}: {prev} -> {label}')
    return problems


def label_distribution(samples):
    """Count every label across all samples.

    Args:
        samples: Iterable of mappings that each carry a ``'labels'`` list.

    Returns:
        A list of ``(label, count, percent)`` tuples ordered from most to
        least common; empty when there are no labels at all.
    """
    counts = Counter(label for s in samples for label in s['labels'])
    total = sum(counts.values())
    # `total` is only used when `counts` is non-empty, so it is never zero here.
    return [(label, count, count * 100 / total)
            for label, count in counts.most_common()]


def length_stats(samples):
    """Compute min, max and mean token-sequence length.

    Args:
        samples: Iterable of mappings that each carry a ``'tokens'`` list.

    Returns:
        ``(shortest, longest, mean)``, or ``None`` when there are no samples
        (``min``/``max`` of an empty list would otherwise raise).
    """
    lengths = [len(s['tokens']) for s in samples]
    if not lengths:
        return None
    return min(lengths), max(lengths), sum(lengths) / len(lengths)


def main(path=DATA_PATH, preview=PREVIEW_COUNT):
    """Print a quality report for the dataset stored at ``path``.

    Prints the sample count, the first ``preview`` samples together with any
    BIO problems found in them, the overall label distribution and the
    sequence-length statistics.

    Args:
        path: JSONL file to inspect.
        preview: How many leading samples to print and validate.

    Raises:
        AssertionError: If a previewed sample has a different number of
            tokens and labels.
    """
    samples = load_samples(path)
    print(f'Total samples: {len(samples)}')

    # Check a few samples
    for i, s in enumerate(samples[:preview]):
        print(f'\nSample {i}:')
        print(f'  Tokens: {s["tokens"]}')
        print(f'  Labels: {s["labels"]}')

        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        if len(s['tokens']) != len(s['labels']):
            raise AssertionError(
                f'Mismatch: {len(s["tokens"])} != {len(s["labels"])}')

        for problem in find_bio_violations(s['labels']):
            print(f'  {problem}')

    # Label distribution
    print('\nLabel distribution:')
    for label, count, pct in label_distribution(samples):
        print(f'  {label}: {count} ({pct:.1f}%)')

    # Sequence length stats
    stats = length_stats(samples)
    if stats is None:
        print('\nSequence length: n/a (no samples)')
    else:
        shortest, longest, mean = stats
        print(f'\nSequence length: min={shortest}, max={longest}, avg={mean:.1f}')


if __name__ == '__main__':
    main()