Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```

- Notebooks
- Google Colab
- Kaggle
| """Convert token-level anime filename JSONL datasets to character tokens. | |
| Input records must contain parallel ``tokens`` and ``labels`` arrays. The | |
| converter expands each original token into Unicode code points and projects BIO | |
| labels onto the expanded sequence: | |
| - ``B-X`` keeps ``B-X`` on the first character and uses ``I-X`` afterwards. | |
| - ``I-X`` remains ``I-X`` on every character. | |
| - ``O`` remains ``O`` on every character. | |
| The script streams both input and output so it can process the full DMHY weak | |
| dataset without loading hundreds of MB into memory. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| from collections import Counter | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from statistics import mean | |
| from typing import Iterable | |
# Reserved vocabulary entries. build_vocab() enumerates this tuple to assign
# IDs, so the order here fixes [PAD]=0, [UNK]=1, [CLS]=2, [SEP]=3.
SPECIAL_TOKENS = (
    "[PAD]",
    "[UNK]",
    "[CLS]",
    "[SEP]",
)
def projected_labels(token: str, label: str) -> tuple[list[str], list[str]]:
    """Split one source token into code points and spread its BIO label over them.

    Returns a ``(characters, labels)`` pair of equal length. A ``B-X`` label
    stays ``B-X`` on the first character and becomes ``I-X`` on the rest; any
    other label (``I-X``, ``O``, ...) is repeated unchanged on every
    character. An empty token yields two empty lists.
    """
    pieces = [*token]
    if not pieces:
        return [], []

    # Only a "B-" label changes after the first character; everything else
    # simply repeats itself.
    prefix, dash, entity = label.partition("-")
    if prefix == "B" and dash:
        continuation = f"I-{entity}"
    else:
        continuation = label

    tags = [label if position == 0 else continuation for position in range(len(pieces))]
    return pieces, tags
def convert_record(record: dict) -> dict:
    """Return a character-level copy of one token-level record.

    ``record["tokens"]`` and ``record["labels"]`` must be parallel sequences;
    a ``ValueError`` is raised when their lengths differ. Each token/label is
    coerced to ``str`` and expanded with ``projected_labels``. All other keys
    of ``record`` are carried over untouched, and three bookkeeping fields are
    set: ``tokenizer_variant``, ``source_token_count`` and ``char_token_count``.
    The input dict itself is not modified.
    """
    source_tokens = record["tokens"]
    source_labels = record["labels"]
    n_tokens, n_labels = len(source_tokens), len(source_labels)
    if n_tokens != n_labels:
        raise ValueError(
            f"token/label length mismatch: {n_tokens} tokens, {n_labels} labels"
        )

    expanded_tokens: list[str] = []
    expanded_labels: list[str] = []
    for raw_token, raw_label in zip(source_tokens, source_labels):
        chars, tags = projected_labels(str(raw_token), str(raw_label))
        expanded_tokens += chars
        expanded_labels += tags

    # Unpacking first keeps existing keys in their original position while the
    # later entries overwrite / append, exactly like copy-then-assign.
    return {
        **record,
        "tokens": expanded_tokens,
        "labels": expanded_labels,
        "tokenizer_variant": "char",
        "source_token_count": n_tokens,
        "char_token_count": len(expanded_tokens),
    }
def iter_jsonl(path: Path) -> Iterable[dict]:
    """Lazily yield one JSON object per non-blank line of a JSONL file.

    The file is read line by line so arbitrarily large inputs can be
    processed without being loaded whole. Blank / whitespace-only lines are
    skipped.

    Raises:
        ValueError: if a line is not valid JSON, or is valid JSON but not an
            object. The message carries ``path:line_no`` so the offending
            line can be located.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise survive ``strip()`` and make the first line unparsable.
    with path.open("r", encoding="utf-8-sig") as handle:
        for line_no, line in enumerate(handle, 1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as exc:
                raise ValueError(f"{path}:{line_no}: invalid JSON") from exc
            # Fail here with a location instead of letting a list/number/string
            # blow up later with an unlocated TypeError on ``record["tokens"]``.
            if not isinstance(record, dict):
                raise ValueError(
                    f"{path}:{line_no}: expected a JSON object, got {type(record).__name__}"
                )
            yield record
def build_vocab(counter: Counter[str], max_size: int | None = None) -> dict[str, int]:
    """Build a frequency-sorted vocab with fixed special-token IDs.

    The entries of ``SPECIAL_TOKENS`` always come first, in order, starting at
    ID 0. The remaining IDs are handed out to the tokens of ``counter`` from
    most to least frequent.

    Args:
        counter: Token frequencies.
        max_size: Optional cap on the total vocab size, special tokens
            included. ``None`` means no cap. A cap smaller than the number of
            special tokens still yields all special tokens and nothing else.

    Returns:
        Mapping from token to integer ID.
    """
    vocab = {token: idx for idx, token in enumerate(SPECIAL_TOKENS)}
    for token, _count in counter.most_common():
        # Test the cap against the vocab actually built rather than slicing
        # ``most_common`` up front: a counted token that collides with a
        # special token is skipped below and must not use up a slot.
        if max_size is not None and len(vocab) >= max_size:
            break
        if token not in vocab:
            vocab[token] = len(vocab)
    return vocab
def coverage(counter: Counter[str], vocab: dict[str, int]) -> float:
    """Return the share of counted occurrences whose token is in ``vocab``.

    Occurrences are weighted by their count in ``counter``. When the counts
    sum to zero the result is defined as ``1.0``.
    """
    seen = 0
    known = 0
    for token, count in counter.items():
        seen += count
        if token in vocab:
            known += count
    return known / seen if seen else 1.0
def percentile(values: list[int], pct: float) -> int:
    """Return the element of ``values`` at the given percentile.

    The values are sorted and the element at index
    ``round(pct / 100 * (len - 1))`` is returned; no interpolation is done.
    The index is clamped to the valid range, so ``pct`` below 0 gives the
    minimum and ``pct`` above 100 gives the maximum. An empty list yields 0.
    """
    if not values:
        return 0
    ordered = sorted(values)
    last = len(ordered) - 1
    # Clamp on both sides: without the lower bound a negative ``pct`` produces
    # a negative index, which Python resolves from the END of the list.
    index = min(last, max(0, round((pct / 100) * last)))
    return ordered[index]
def parse_args() -> argparse.Namespace:
    """Parse the converter's command-line options from ``sys.argv``.

    Three path options are mandatory (``--input``, ``--output``,
    ``--vocab-output``); ``--manifest-output`` and the integer options
    ``--max-vocab-size``, ``--limit`` and ``--progress`` are optional.
    """
    cli = argparse.ArgumentParser(description="Convert JSONL token labels to character labels")

    required_paths = (
        ("--input", "Input token-level JSONL"),
        ("--output", "Output character-level JSONL"),
        ("--vocab-output", "Output vocab JSON"),
    )
    for flag, text in required_paths:
        cli.add_argument(flag, required=True, help=text)

    cli.add_argument("--manifest-output", default=None, help="Output manifest JSON")

    # (flag, default, help) for every integer-valued option.
    integer_options = (
        ("--max-vocab-size", None, "Optional vocab cap including special tokens"),
        ("--limit", None, "Convert only the first N records"),
        ("--progress", 50_000, "Print progress every N records"),
    )
    for flag, fallback, text in integer_options:
        cli.add_argument(flag, type=int, default=fallback, help=text)

    return cli.parse_args()
def main() -> None:
    """Convert a token-level JSONL file to character level and write the artefacts.

    Reads the options from ``parse_args``, streams every input record through
    ``convert_record`` into the output JSONL, then writes the vocab JSON and a
    manifest JSON (next to the output as ``<output stem>.manifest.json`` unless
    ``--manifest-output`` is given). The manifest, minus its example records,
    is also printed to stdout.

    Raises:
        ValueError: if ``--input`` and ``--output`` resolve to the same file,
            or if reading or converting a record fails.
    """
    args = parse_args()
    input_path = Path(args.input)
    output_path = Path(args.output)
    vocab_path = Path(args.vocab_output)
    manifest_path = (
        Path(args.manifest_output)
        if args.manifest_output
        else output_path.with_suffix(".manifest.json")
    )

    # The output is opened with "w" (truncating it) before the lazy reader
    # opens the input, so converting a file onto itself would wipe the source.
    if input_path.resolve() == output_path.resolve():
        raise ValueError(f"--output must differ from --input: {input_path}")

    for target in (output_path, vocab_path, manifest_path):
        target.parent.mkdir(parents=True, exist_ok=True)

    char_counter: Counter[str] = Counter()
    label_counter: Counter[str] = Counter()
    row_count = 0
    source_token_count = 0
    char_token_count = 0
    # One int per converted row; needed afterwards for the percentile stats.
    lengths: list[int] = []
    examples: list[dict] = []

    # The in-loop limit check only runs after a row has been written, so a
    # non-positive --limit has to be handled up front to convert nothing.
    convert_nothing = args.limit is not None and args.limit <= 0
    records = () if convert_nothing else iter_jsonl(input_path)

    with output_path.open("w", encoding="utf-8", newline="\n") as out:
        for record in records:
            converted = convert_record(record)
            out.write(json.dumps(converted, ensure_ascii=False, separators=(",", ":")) + "\n")
            row_count += 1
            source_token_count += converted["source_token_count"]
            char_len = converted["char_token_count"]
            char_token_count += char_len
            lengths.append(char_len)
            char_counter.update(converted["tokens"])
            label_counter.update(converted["labels"])
            # Keep the first few converted rows as samples for the manifest.
            if len(examples) < 5:
                examples.append(converted)
            # Stop right after the N-th row without reading any further line.
            if args.limit is not None and row_count >= args.limit:
                break
            if args.progress and row_count % args.progress == 0:
                print(f"converted {row_count:,} rows; unique chars={len(char_counter):,}")

    vocab = build_vocab(char_counter, args.max_vocab_size)
    vocab_path.write_text(json.dumps(vocab, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")

    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "input": str(input_path),
        "output": str(output_path),
        "vocab_output": str(vocab_path),
        "tokenizer_variant": "char",
        "projection": {
            "B-X": "first char keeps B-X; remaining chars become I-X",
            "I-X": "all chars keep I-X",
            "O": "all chars keep O",
        },
        "row_count": row_count,
        "source_token_count": source_token_count,
        "char_token_count": char_token_count,
        "unique_char_count": len(char_counter),
        "vocab_size": len(vocab),
        "max_vocab_size": args.max_vocab_size,
        "vocab_coverage": coverage(char_counter, vocab),
        "label_counts": dict(label_counter),
        "char_length": {
            "min": min(lengths) if lengths else 0,
            "mean": mean(lengths) if lengths else 0,
            "p50": percentile(lengths, 50),
            "p90": percentile(lengths, 90),
            "p95": percentile(lengths, 95),
            "p99": percentile(lengths, 99),
            "max": max(lengths) if lengths else 0,
        },
        "examples": examples,
    }
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    # Echo the summary without the (potentially long) example records.
    print(json.dumps({k: v for k, v in manifest.items() if k != "examples"}, ensure_ascii=False, indent=2))
# Run the converter only when this file is executed as a script, so that
# importing the module does not trigger a conversion.
if __name__ == "__main__":
    main()