File size: 2,748 Bytes

be5f706

"""Validate the fixed data generator produces correct labels."""
import json
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))

from tokenizer import AnimeTokenizer
from data_generator import generate_sample, TEMPLATES

# Tokenizer instance handed to every generate_sample() call in this script.
# build_vocab() is given one single-token placeholder sequence.
tok = AnimeTokenizer()
tok.build_vocab([["test"]])

# Token texts that were previously mislabeled, each paired with the label
# "episode". NOTE(review): nothing in the checks visible in this script
# reads this list - confirm whether it is still needed.
problem_cases = [
    (text, "episode")
    for text in (
        # Words starting with "E" that come from titles / release groups
        "Eighty Six",  # was being mislabeled as episode
        "Evangelion",  # was being mislabeled
        "Erai",        # from Erai-raws, was mislabeled
        # Numbers that appear inside titles
        "86",          # from "86 Eighty Six"
        "100",         # from "100万の命の上に"
        "07",          # possible episode or title number
    )
]

print("Testing specific problem patterns...")
print("=" * 60)

# Single source of truth for the sample count: it drives the loop and the
# report header below (previously the literal was written twice).
num_samples = 5000

# Track label counts
label_counts = {}
# Number of alphabetic "E..." tokens that were tagged B-EPISODE.
suspect_count = 0
for sample_index in range(num_samples):
    sample = generate_sample(tok, TEMPLATES)
    tokens = sample["tokens"]
    labels = sample["labels"]

    # zip() below stops at the shorter sequence, so a generator that emits
    # unequal token/label lists would otherwise go unnoticed.
    if len(tokens) != len(labels):
        print(
            f"LENGTH MISMATCH in sample {sample_index}: "
            f"{len(tokens)} tokens vs {len(labels)} labels"
        )

    for label in labels:
        label_counts[label] = label_counts.get(label, 0) + 1

    # Check for E-starting mislabels: an alphabetic word longer than two
    # characters that starts with "E" and is tagged as the start of an episode.
    for token, label in zip(tokens, labels):
        if len(token) > 2 and token[0].upper() == 'E' and token.isalpha() and label == 'B-EPISODE':
            print(f"POTENTIAL BUG: '{token}' labeled as EPISODE")
            suspect_count += 1
        # Short numeric tokens tagged B-EPISODE are intentionally not flagged:
        # the original branch for them was a no-op ("should only appear in
        # proper episode context"), presumably because the token alone cannot
        # distinguish an episode number from a number in a title.

print(f"E-starting words labeled as EPISODE: {suspect_count}")

print(f"\nLabel distribution from {num_samples} samples:")
total = sum(label_counts.values())
# Most frequent label first; the loop body never runs when no labels were
# collected, so the division by total is safe.
for label, count in sorted(label_counts.items(), key=lambda x: -x[1]):
    print(f"  {label}: {count} ({count*100/total:.1f}%)")

def _orphan_inside_tags(labels):
    """Return the positions of ``I-`` labels that no earlier ``B-`` opens.

    A label ``I-X`` is accepted when a ``B-X`` occurs anywhere before it in
    the same sequence, which also covers the relaxed case where an entity
    continues after a gap of other labels (e.g. ``O``). An ``I-X`` with no
    preceding ``B-X`` at all is reported.

    Args:
        labels: sequence of label strings such as ``"B-EPISODE"``,
            ``"I-EPISODE"`` or ``"O"``.

    Returns:
        List of integer indices into ``labels``, in ascending order.
    """
    opened = set()
    orphans = []
    for position, label in enumerate(labels):
        if label.startswith("B-"):
            opened.add(label[2:])
        elif label.startswith("I-") and label[2:] not in opened:
            orphans.append(position)
    return orphans


# Check for IOB2 validity
print("\nIOB2 validity check...")
errors = 0
for i in range(1000):
    sample = generate_sample(tok, TEMPLATES)
    labels = sample["labels"]
    for j in _orphan_inside_tags(labels):
        if j == 0:
            # Same message the check has always printed for this case.
            print(f"  ERROR: I- at position 0 in sample {i}")
        else:
            # Previously this case fell through a bare `pass` and was never
            # counted, so the reported error total could only reflect
            # position-0 failures.
            print(
                f"  ERROR: {labels[j]} at position {j} in sample {i} "
                f"has no preceding B-{labels[j][2:]}"
            )
        errors += 1

print(f"IOB2 errors found: {errors}")

# Print three freshly generated samples, one "label: token" pair per line,
# so the labelling can be checked by eye.
print("\nSample outputs:")
for sample_no in range(3):
    generated = generate_sample(tok, TEMPLATES)
    print(f"\nSample {sample_no}:")
    words = generated["tokens"]
    tags = generated["labels"]
    for word, tag in zip(words, tags):
        print(f"  {tag}: {word}")

print("\nValidation complete!")