AniFileBERT / verify_data.py
ModerRAS's picture
Add AniFileBERT model and training project
be5f706
raw
history blame
1.41 kB
"""Verify generated dataset quality."""
import json
from collections import Counter
# Default dataset checked when the script is run without overriding the path.
DATA_PATH = 'data/synthetic_small.jsonl'

# Number of leading samples that are printed and structurally validated.
PREVIEW_COUNT = 5


def load_samples(path: str) -> list:
    """Load one JSON object per non-blank line from a JSONL file.

    Args:
        path: Location of the UTF-8 encoded JSONL file.

    Returns:
        The decoded objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (commonly a trailing newline at end of file) are not
        # records; feeding them to json.loads would raise.
        return [json.loads(line) for line in f if line.strip()]


def bio_issues(labels: list) -> list:
    """Return diagnostic messages for BIO-sequence violations in *labels*.

    An ``I-X`` label is accepted only when the previous label is ``I-X`` or
    ``B-X``. An ``I-`` label at position 0 is reported as an error; any other
    violation is reported as a warning.

    Args:
        labels: Label strings of a single sample.

    Returns:
        The messages in positional order; empty when no violation is found.
    """
    issues = []
    for pos, label in enumerate(labels):
        if not label.startswith('I-'):
            continue
        if pos == 0:
            issues.append(f' ERROR: First token is {label}')
            continue
        prev = labels[pos - 1]
        if prev not in (label, 'B-' + label[2:]):
            issues.append(f' WARN: I- without B- at pos {pos}: {prev} -> {label}')
    return issues


def check_sample(index: int, sample: dict) -> None:
    """Print one sample and validate its token/label alignment and BIO tags.

    Args:
        index: Position of the sample in the dataset (used for display only).
        sample: Mapping with ``"tokens"`` and ``"labels"`` sequences.

    Raises:
        AssertionError: If the token and label sequences differ in length.
    """
    tokens = sample['tokens']
    labels = sample['labels']
    print(f'\nSample {index}:')
    print(f' Tokens: {tokens}')
    print(f' Labels: {labels}')
    # Raised explicitly instead of using `assert` so the check still runs
    # under `python -O`; the exception type and message are unchanged.
    if len(tokens) != len(labels):
        raise AssertionError(f'Mismatch: {len(tokens)} != {len(labels)}')
    for message in bio_issues(labels):
        print(message)


def label_distribution(samples: list) -> list:
    """Count labels across all samples.

    Args:
        samples: Mappings that each carry a ``"labels"`` sequence.

    Returns:
        ``(label, count, percentage)`` tuples, most frequent label first.
        Empty when there are no labels at all.
    """
    counts = Counter(label for sample in samples for label in sample['labels'])
    total = sum(counts.values())
    # With no labels the comprehension is empty, so `total` is never used as
    # a zero divisor.
    return [(label, count, count * 100 / total)
            for label, count in counts.most_common()]


def length_stats(samples: list):
    """Return ``(min, max, mean)`` of the token counts, or ``None`` if empty.

    Args:
        samples: Mappings that each carry a ``"tokens"`` sequence.
    """
    lengths = [len(sample['tokens']) for sample in samples]
    if not lengths:
        return None
    return min(lengths), max(lengths), sum(lengths) / len(lengths)


def main(path: str = DATA_PATH) -> None:
    """Run every dataset check against *path* and print a report.

    Args:
        path: JSONL dataset to verify; defaults to ``DATA_PATH``.
    """
    samples = load_samples(path)
    print(f'Total samples: {len(samples)}')

    # Check a few samples
    for index, sample in enumerate(samples[:PREVIEW_COUNT]):
        check_sample(index, sample)

    # Label distribution
    print('\nLabel distribution:')
    for label, count, percent in label_distribution(samples):
        print(f' {label}: {count} ({percent:.1f}%)')

    # Sequence length stats
    stats = length_stats(samples)
    if stats is None:
        # min()/max() and the mean are undefined for an empty dataset.
        print('\nSequence length: n/a (no samples)')
    else:
        shortest, longest, mean = stats
        print(f'\nSequence length: min={shortest}, max={longest}, avg={mean:.1f}')


if __name__ == '__main__':
    main()