Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
- Notebooks
- Google Colab
- Kaggle
File size: 2,748 Bytes
# be5f706 (revision id shown by the file viewer)
"""Validate the fixed data generator produces correct labels."""
import json
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from tokenizer import AnimeTokenizer
from data_generator import generate_sample, TEMPLATES
# Tokenizer shared by every generate_sample() call below.
# NOTE(review): the vocabulary is built from a single placeholder sentence;
# presumably the generator only needs an initialised tokenizer, not a real
# vocabulary -- confirm against data_generator.
placeholder_corpus = [["test"]]
tok = AnimeTokenizer()
tok.build_vocab(placeholder_corpus)
# Known-problematic (text, label) pairs kept for reference.
# Every entry is paired with the string "episode"; per the notes below these
# are inputs that used to be mislabeled as an episode.
# NOTE(review): this list is not read anywhere in the visible script.
problem_cases = [
    (text, "episode")
    for text in (
        # Words starting with "E" that appear in titles / release groups
        "Eighty Six",   # was being mislabeled as episode
        "Evangelion",   # was being mislabeled
        "Erai",         # from Erai-raws, was mislabeled
        # Numbers that appear inside titles
        "86",           # from "86 Eighty Six"
        "100",          # from "100万の命の上に"
        "07",           # possible episode or title number
    )
]
print("Testing specific problem patterns...")
print("=" * 60)

# Generate 5000 samples, tallying how often each label occurs and reporting
# alphabetic tokens that start with "E" yet were tagged as an episode start.
label_counts = {}
for _ in range(5000):
    sample = generate_sample(tok, TEMPLATES)

    # Per-label frequency; read by the distribution report further down.
    for tag in sample["labels"]:
        if tag in label_counts:
            label_counts[tag] += 1
        else:
            label_counts[tag] = 1

    for word, tag in zip(sample["tokens"], sample["labels"]):
        # A purely alphabetic word longer than two characters that begins
        # with E/e and carries B-EPISODE is reported as a potential mislabel.
        looks_like_e_word = (
            len(word) > 2 and word[0].upper() == 'E' and word.isalpha()
        )
        if looks_like_e_word and tag == 'B-EPISODE':
            print(f"POTENTIAL BUG: '{word}' labeled as EPISODE")

        # One- or two-digit tokens tagged B-EPISODE are recognised here but
        # deliberately not acted on: nothing in this loop inspects the
        # surrounding context, so the branch is a placeholder.
        if word.isdigit() and len(word) <= 2 and tag == 'B-EPISODE':
            pass
# Report each label's absolute count and its share of all emitted labels,
# most frequent first. sorted() is stable, so labels with equal counts keep
# their first-seen order.
print(f"\nLabel distribution from {5000} samples:")
total = sum(label_counts.values())
ranked_labels = sorted(
    label_counts.items(), key=lambda entry: entry[1], reverse=True
)
for label, count in ranked_labels:
    share = count * 100 / total
    print(f" {label}: {count} ({share:.1f}%)")
# Check for IOB2 validity.
#
# Generates 1000 fresh samples and counts continuation tags ("I-X") that
# cannot belong to any entity:
#   * an I- tag at position 0, or
#   * an I-X tag with no B-X tag anywhere earlier in the same sample.
# An I-X tag whose immediate predecessor is something else (for example "O")
# stays valid as long as a B-X was seen earlier, because multi-word entities
# may span intervening tokens.
print("\nIOB2 validity check...")
errors = 0
for i in range(1000):
    sample = generate_sample(tok, TEMPLATES)
    labels = sample["labels"]
    # Entity types for which a B- tag has been seen so far in this sample.
    opened_entities = set()
    for j, label in enumerate(labels):
        if label.startswith("B-"):
            opened_entities.add(label[2:])
        elif label.startswith("I-"):
            entity = label[2:]
            if j == 0:
                print(f" ERROR: I- at position 0 in sample {i}")
                errors += 1
            elif entity not in opened_entities:
                # Previously this case fell through a bare `pass`, so an
                # orphaned continuation tag was never counted.
                print(
                    f" ERROR: {label} at position {j} without a preceding "
                    f"B-{entity} in sample {i}"
                )
                errors += 1
print(f"IOB2 errors found: {errors}")
# Print three freshly generated samples, one "label: token" pair per line,
# so the output can be inspected by eye.
print("\nSample outputs:")
for sample_index in range(3):
    sample = generate_sample(tok, TEMPLATES)
    print(f"\nSample {sample_index}:")
    tagged_tokens = zip(sample["tokens"], sample["labels"])
    for word, tag in tagged_tokens:
        print(f" {tag}: {word}")

print("\nValidation complete!")
|