"""Validate the fixed data generator produces correct labels.""" import json import sys import os sys.path.insert(0, os.path.dirname(__file__)) from tokenizer import AnimeTokenizer from data_generator import generate_sample, TEMPLATES tok = AnimeTokenizer() tok.build_vocab([["test"]]) # Check specific problem patterns problem_cases = [ # "E" starting words in titles/groups ("Eighty Six", "episode"), # was being mislabeled as episode ("Evangelion", "episode"), # was being mislabeled ("Erai", "episode"), # from Erai-raws, was mislabeled # Numbers in titles ("86", "episode"), # from "86 Eighty Six" ("100", "episode"), # from "100万の命の上に" ("07", "episode"), # possible episode or title number ] print("Testing specific problem patterns...") print("=" * 60) # Track label counts label_counts = {} for i in range(5000): sample = generate_sample(tok, TEMPLATES) for label in sample["labels"]: label_counts[label] = label_counts.get(label, 0) + 1 # Check for E-starting mislabels for token, label in zip(sample["tokens"], sample["labels"]): # Check E-starting English words if len(token) > 2 and token[0].upper() == 'E' and token.isalpha() and label == 'B-EPISODE': print(f"POTENTIAL BUG: '{token}' labeled as EPISODE") # Check number tokens if token.isdigit() and len(token) <= 2 and label == 'B-EPISODE': # Should only appear in proper episode context pass print(f"\nLabel distribution from {5000} samples:") total = sum(label_counts.values()) for label, count in sorted(label_counts.items(), key=lambda x: -x[1]): print(f" {label}: {count} ({count*100/total:.1f}%)") # Check for IOB2 validity print("\nIOB2 validity check...") errors = 0 for i in range(1000): sample = generate_sample(tok, TEMPLATES) labels = sample["labels"] for j, label in enumerate(labels): if label.startswith("I-"): if j == 0: print(f" ERROR: I- at position 0 in sample {i}") errors += 1 else: prev = labels[j-1] expected = label.replace("I-", "B-") if prev not in (label, expected): # Check if prev is O and there's a B- earlier (spanning O) pass # This is now valid for multi-word entities print(f"IOB2 errors found: {errors}") # Spot-check a few samples print("\nSample outputs:") for i in range(3): sample = generate_sample(tok, TEMPLATES) print(f"\nSample {i}:") for token, label in zip(sample["tokens"], sample["labels"]): print(f" {label}: {token}") print("\nValidation complete!")