ModerRAS
/

AniFileBERT

Token Classification

filename-parsing

Eval Results (legacy)

Model card Files Files and versions

AniFileBERT / validate_fix.py

ModerRAS's picture

Add AniFileBERT model and training project

be5f706 11 days ago

2.75 kB

	"""Validate the fixed data generator produces correct labels."""
	import json
	import sys
	import os
	sys.path.insert(0, os.path.dirname(__file__))

	from tokenizer import AnimeTokenizer
	from data_generator import generate_sample, TEMPLATES

	# Tokenizer handed to generate_sample() by the checks further down.
	# NOTE(review): build_vocab receives one single-token sequence; presumably
	# that is enough for sample generation -- confirm against tokenizer.py.
	tok = AnimeTokenizer()
	tok.build_vocab([["test"]])

	# Strings that were previously mislabeled, grouped by the kind of mistake.
	# NOTE(review): problem_cases is not read by any check visible in this script.
	_E_PREFIXED_WORDS = (
	    "Eighty Six",  # was being mislabeled as episode
	    "Evangelion",  # was being mislabeled
	    "Erai",  # from Erai-raws, was mislabeled
	)
	_NUMBERS_IN_TITLES = (
	    "86",  # from "86 Eighty Six"
	    "100",  # from the title "100万の命の上に"
	    "07",  # possible episode or title number
	)

	# Each entry pairs a problem string with the label name "episode".
	problem_cases = [
	    (text, "episode") for text in _E_PREFIXED_WORDS + _NUMBERS_IN_TITLES
	]

	print("Testing specific problem patterns...")
	print("=" * 60)

	# Number of generated samples scanned below. Named once so the loop and
	# the report line cannot drift apart.
	NUM_DISTRIBUTION_SAMPLES = 5000

	# Tally every label, and flag alphabetic words starting with "E" that were
	# tagged as the start of an episode.
	label_counts = {}
	e_word_mislabels = 0
	misaligned_samples = 0
	for _ in range(NUM_DISTRIBUTION_SAMPLES):
	    sample = generate_sample(tok, TEMPLATES)
	    tokens = sample["tokens"]
	    labels = sample["labels"]

	    for label in labels:
	        label_counts[label] = label_counts.get(label, 0) + 1

	    # zip() below stops at the shorter sequence, so a token/label length
	    # mismatch would otherwise go unnoticed.
	    if len(tokens) != len(labels):
	        misaligned_samples += 1
	        print(f"POTENTIAL BUG: {len(tokens)} tokens but {len(labels)} labels")

	    for token, label in zip(tokens, labels):
	        # Purely alphabetic token, longer than two characters, starting
	        # with "E"/"e".
	        is_e_word = (
	            len(token) > 2 and token[0].upper() == "E" and token.isalpha()
	        )
	        if is_e_word and label == "B-EPISODE":
	            e_word_mislabels += 1
	            print(f"POTENTIAL BUG: '{token}' labeled as EPISODE")

	        # NOTE: one- and two-digit tokens tagged B-EPISODE are not flagged.
	        # Whether they are correct depends on the surrounding context,
	        # which this scan does not inspect.

	print(f"E-word EPISODE mislabels: {e_word_mislabels}")
	print(f"Samples with token/label length mismatch: {misaligned_samples}")

	print(f"\nLabel distribution from {NUM_DISTRIBUTION_SAMPLES} samples:")
	total = sum(label_counts.values())
	# Most frequent label first.
	for label, count in sorted(label_counts.items(), key=lambda item: -item[1]):
	    print(f" {label}: {count} ({count*100/total:.1f}%)")

	# Check for IOB2 validity
	def _iob2_violations(labels):
	    """Describe every I- label in *labels* that does not continue an entity.

	    An ``I-X`` label is accepted when the closest preceding label that is
	    not ``O`` is ``B-X`` or ``I-X``, so an entity whose words are separated
	    by ``O`` tokens stays valid. It is rejected when it is the first
	    label, when only ``O`` labels precede it, or when the closest preceding
	    non-``O`` label belongs to a different entity type.

	    NOTE(review): the "entity may span O tokens" allowance is this
	    script's assumption about multi-word entities -- confirm it against
	    data_generator.

	    Args:
	        labels: Sequence of label strings for one sample.

	    Returns:
	        A list of human-readable descriptions, one per offending position.
	    """
	    problems = []
	    for pos, label in enumerate(labels):
	        if not label.startswith("I-"):
	            continue
	        if pos == 0:
	            problems.append("I- at position 0")
	            continue
	        # Slice off the prefix rather than str.replace(), which would also
	        # rewrite any "I-" occurring inside the entity name.
	        entity = label[2:]
	        # Walk back over O labels to the label that should have opened
	        # (or continued) this entity.
	        opener = pos - 1
	        while opener >= 0 and labels[opener] == "O":
	            opener -= 1
	        if opener >= 0 and labels[opener] in ("B-" + entity, "I-" + entity):
	            continue
	        problems.append(
	            f"{label} at position {pos} does not continue a B-{entity} entity"
	        )
	    return problems


	print("\nIOB2 validity check...")
	errors = 0
	for i in range(1000):
	    sample = generate_sample(tok, TEMPLATES)
	    for problem in _iob2_violations(sample["labels"]):
	        print(f" ERROR: {problem} in sample {i}")
	        errors += 1

	print(f"IOB2 errors found: {errors}")

	# Dump three freshly generated samples, one "label: token" pair per line,
	# so the output can be inspected by eye.
	print("\nSample outputs:")
	for sample_no in range(3):
	    generated = generate_sample(tok, TEMPLATES)
	    print(f"\nSample {sample_no}:")
	    pairs = zip(generated["tokens"], generated["labels"])
	    for piece, tag in pairs:
	        print(f" {tag}: {piece}")

	print("\nValidation complete!")