Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
"""Validate the fixed data generator produces correct labels.

Run as a script. It draws random samples from the data generator and:

1. prints the label distribution and flags alphabetic words starting with
   "E" that were labeled ``B-EPISODE``;
2. checks every label sequence for IOB2 violations;
3. prints a few samples for manual inspection.
"""
import json
import sys
import os
from collections import Counter

# Put this script's directory on the import path so the sibling modules
# imported in main() resolve regardless of the current working directory.
sys.path.insert(0, os.path.dirname(__file__))

# Number of samples drawn for each phase of the validation.
NUM_SAMPLES = 5000
NUM_IOB2_SAMPLES = 1000
NUM_SPOT_CHECKS = 3

# Check specific problem patterns
# NOTE: reference list only -- none of the checks below reads it.
problem_cases = [
    # "E" starting words in titles/groups
    ("Eighty Six", "episode"),  # was being mislabeled as episode
    ("Evangelion", "episode"),  # was being mislabeled
    ("Erai", "episode"),  # from Erai-raws, was mislabeled
    # Numbers in titles
    ("86", "episode"),  # from "86 Eighty Six"
    ("100", "episode"),  # from "100万の命の上に"
    ("07", "episode"),  # possible episode or title number
]


def mislabeled_e_words(tokens, labels):
    """Return the alphabetic tokens starting with "E" labeled ``B-EPISODE``.

    Only tokens longer than two characters are considered. Short digit
    tokens labeled ``B-EPISODE`` are deliberately not flagged: they can be
    genuine episode numbers.
    """
    return [
        token
        for token, label in zip(tokens, labels)
        if label == "B-EPISODE"
        and len(token) > 2
        and token[0].upper() == "E"
        and token.isalpha()
    ]


def iob2_violations(labels):
    """Return ``(index, label)`` for every ``I-`` tag that continues nothing.

    An ``I-X`` tag is accepted when the previous tag is ``B-X`` or ``I-X``,
    or when the previous tag is ``O`` and a ``B-X`` occurred earlier in the
    sequence (an entity spanning an ``O`` gap). Anything else is a
    violation, including an ``I-`` tag at position 0.
    """
    violations = []
    opened = set()  # entity types for which a B- tag has been seen so far
    for idx, label in enumerate(labels):
        if label.startswith("B-"):
            opened.add(label[2:])
        elif label.startswith("I-"):
            entity = label[2:]
            prev = labels[idx - 1] if idx else None
            continues = prev in (label, "B-" + entity)
            spans_gap = prev == "O" and entity in opened
            if not (continues or spans_gap):
                violations.append((idx, label))
    return violations


def main():
    """Run all validation phases and print their reports to stdout."""
    # Project-local modules are imported here, after the sys.path tweak, so
    # the pure helpers above stay importable without them.
    from tokenizer import AnimeTokenizer
    from data_generator import generate_sample, TEMPLATES

    tok = AnimeTokenizer()
    tok.build_vocab([["test"]])

    print("Testing specific problem patterns...")
    print("=" * 60)

    # Track label counts
    label_counts = Counter()
    for _ in range(NUM_SAMPLES):
        sample = generate_sample(tok, TEMPLATES)
        label_counts.update(sample["labels"])
        # Check for E-starting mislabels
        for token in mislabeled_e_words(sample["tokens"], sample["labels"]):
            print(f"POTENTIAL BUG: '{token}' labeled as EPISODE")

    print(f"\nLabel distribution from {NUM_SAMPLES} samples:")
    total = sum(label_counts.values())
    for label, count in label_counts.most_common():
        print(f" {label}: {count} ({count*100/total:.1f}%)")

    # Check for IOB2 validity
    print("\nIOB2 validity check...")
    errors = 0
    for sample_idx in range(NUM_IOB2_SAMPLES):
        sample = generate_sample(tok, TEMPLATES)
        for pos, label in iob2_violations(sample["labels"]):
            if pos == 0:
                print(f" ERROR: I- at position 0 in sample {sample_idx}")
            else:
                print(
                    f" ERROR: {label} at position {pos} does not continue"
                    f" an entity in sample {sample_idx}"
                )
            errors += 1
    print(f"IOB2 errors found: {errors}")

    # Spot-check a few samples
    print("\nSample outputs:")
    for sample_idx in range(NUM_SPOT_CHECKS):
        sample = generate_sample(tok, TEMPLATES)
        print(f"\nSample {sample_idx}:")
        for token, label in zip(sample["tokens"], sample["labels"]):
            print(f" {label}: {token}")

    print("\nValidation complete!")


if __name__ == "__main__":
    main()