Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
| """Build a small fine-tuning set focused on repaired filename structures.""" | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import random | |
| import re | |
| from pathlib import Path | |
| from typing import Iterable, List | |
| from anifilebert.label_repairs import repair_jsonl_item | |
# Case-insensitive matcher for "special" codes in a filename (NCOP, NCED, OP,
# ED, PV, CM, IV), optionally followed by a number, e.g. "NCED02" or "OP_1".
SPECIAL_FOCUS_RE = re.compile(
    r"(?<![A-Za-z0-9])"                  # must not continue an ASCII alphanumeric run
    r"(?:NCOP|NCED|OP|ED|PV|CM|IV)"      # the special code itself
    r"\s*[_\-.]?\s*\d{0,4}"              # optional separator, then up to four digits
    r"(?:[_\-.]?\s*(?:EP?|#)?\d{1,4})?"  # optional second number, e.g. "-E03" or "#3"
    r"(?![A-Za-z0-9])",                  # must not run into another ASCII alphanumeric
    flags=re.IGNORECASE,
)
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the focus-dataset builder.

    Returns:
        The namespace produced by ``argparse`` from ``sys.argv``. ``--input``
        and ``--output`` are required; every other option is an integer with
        a default.
    """
    cli = argparse.ArgumentParser(description="Build repair-focused char JSONL fine-tune data")

    # Required path options.
    required_options = (
        ("--input", "Repaired char JSONL dataset"),
        ("--output", "Output focus JSONL"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)

    # Integer tuning knobs: (flag, default, help text).
    integer_options = (
        ("--context-samples", 50000, "Random non-repaired rows to include for stability"),
        ("--repeat-repaired", 4, "Repeat rows that still trigger a repair pass"),
        ("--repeat-focus", 3, "Repeat rows matching special-code focus patterns"),
        ("--max-focus-rows", 80000, "Maximum dataset rows matching special-code focus patterns"),
        ("--repeat-manual", 24, "Repeat hand-labeled hard cases"),
    )
    for flag, default_value, help_text in integer_options:
        cli.add_argument(flag, type=int, default=default_value, help=help_text)

    cli.add_argument("--seed", type=int, default=42)
    return cli.parse_args()
def char_item(filename: str, spans: List[tuple[str, str]]) -> dict:
    """Build a character-level labelled example from a filename and its spans.

    Every character of ``filename`` becomes one token. Each ``(text, entity)``
    pair in ``spans`` is located in ``filename`` and tagged ``B-<entity>`` on
    its first character and ``I-<entity>`` on the remaining ones; characters
    not covered by any span keep the label ``"O"``.

    Spans are searched from the end of the previously matched span. If the
    text does not occur from there on, the search falls back to its first
    occurrence anywhere in the filename.

    Args:
        filename: The raw filename to label.
        spans: ``(text, entity)`` pairs, normally listed in left-to-right order.

    Returns:
        A dict with the keys ``filename``, ``tokens``, ``labels``,
        ``tokenizer_variant`` and ``source``.

    Raises:
        ValueError: If a span text is empty or cannot be found in ``filename``.
    """
    tokens = list(filename)
    labels = ["O"] * len(tokens)
    cursor = 0
    for text, entity in spans:
        # An empty string is "found" at any position, which would tag an
        # unrelated character (or index past the end), so reject it outright.
        if not text:
            raise ValueError(f"Empty span text for entity {entity!r} in {filename!r}")
        start = filename.find(text, cursor)
        if start < 0:
            # Not present after the previous span: retry from the beginning.
            start = filename.find(text)
        if start < 0:
            raise ValueError(f"Could not find span {text!r} in {filename!r}")
        end = start + len(text)
        labels[start] = f"B-{entity}"
        labels[start + 1:end] = [f"I-{entity}"] * (end - start - 1)
        cursor = end
    return {
        "filename": filename,
        "tokens": tokens,
        "labels": labels,
        "tokenizer_variant": "char",
        "source": "manual_repair_focus",
    }
def manual_cases() -> Iterable[dict]:
    """Yield the hand-labelled hard cases as character-level examples.

    Each entry pairs a filename with its ``(text, entity)`` spans and is
    converted with ``char_item``; items are produced lazily, in table order.
    """
    hand_labelled = (
        (
            "One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264",
            [
                ("One.Piece", "TITLE"),
                ("1110", "EPISODE"),
                ("1080p", "RESOLUTION"),
                ("WEB-DL", "SOURCE"),
            ],
        ),
        (
            "One.Piece.1111.1080p.WEB-DL.AAC2.0.H.264",
            [
                ("One.Piece", "TITLE"),
                ("1111", "EPISODE"),
                ("1080p", "RESOLUTION"),
                ("WEB-DL", "SOURCE"),
            ],
        ),
        (
            "【喵萌奶茶屋】★04月新番★[葬送的芙莉莲][01][1080P][HEVC]",
            [
                ("喵萌奶茶屋", "GROUP"),
                ("葬送的芙莉莲", "TITLE"),
                ("01", "EPISODE"),
                ("1080P", "RESOLUTION"),
                ("HEVC", "SOURCE"),
            ],
        ),
        (
            "【喵萌奶茶屋】★10月新番★[药屋少女的呢喃][02][1080P][HEVC]",
            [
                ("喵萌奶茶屋", "GROUP"),
                ("药屋少女的呢喃", "TITLE"),
                ("02", "EPISODE"),
                ("1080P", "RESOLUTION"),
                ("HEVC", "SOURCE"),
            ],
        ),
        (
            "[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [07][1080P][CHT&JPN][檢索:魔法姊妹露露特莉莉].mp4",
            [
                ("Billion Meta Lab", "GROUP"),
                ("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE"),
                ("07", "EPISODE"),
                ("1080P", "RESOLUTION"),
                ("CHT&JPN", "SOURCE"),
                ("檢索:魔法姊妹露露特莉莉", "SPECIAL"),
            ],
        ),
        (
            "[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [08][1080P][CHT&JPN][检索:魔法姊妹露露特莉莉].mp4",
            [
                ("Billion Meta Lab", "GROUP"),
                ("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE"),
                ("08", "EPISODE"),
                ("1080P", "RESOLUTION"),
                ("CHT&JPN", "SOURCE"),
                ("检索:魔法姊妹露露特莉莉", "SPECIAL"),
            ],
        ),
        (
            "[LoliHouse] Kakuriyo no Yadomeshi Ni - 12 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
            [
                ("LoliHouse", "GROUP"),
                ("Kakuriyo no Yadomeshi", "TITLE"),
                ("Ni", "SEASON"),
                ("12", "EPISODE"),
                ("WebRip", "SOURCE"),
                ("1080p", "RESOLUTION"),
                ("HEVC", "SOURCE"),
                ("AAC", "SOURCE"),
                ("SRTx2", "SOURCE"),
            ],
        ),
        (
            "[LoliHouse] Kakuriyo no Yadomeshi Ni - 13 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
            [
                ("LoliHouse", "GROUP"),
                ("Kakuriyo no Yadomeshi", "TITLE"),
                ("Ni", "SEASON"),
                ("13", "EPISODE"),
                ("WebRip", "SOURCE"),
                ("1080p", "RESOLUTION"),
                ("HEVC", "SOURCE"),
                ("AAC", "SOURCE"),
                ("SRTx2", "SOURCE"),
            ],
        ),
        (
            "[AI-Raws] 炎炎の消防隊 弐ノ章 #13 (BD HEVC 1920x1080 yuv444p10le FLAC)[FC74A2D5].mkv",
            [
                ("AI-Raws", "GROUP"),
                ("炎炎の消防隊", "TITLE"),
                ("弐ノ章", "SEASON"),
                ("13", "EPISODE"),
                ("BD", "SOURCE"),
                ("HEVC", "SOURCE"),
                ("1920x1080", "RESOLUTION"),
                ("FLAC", "SOURCE"),
            ],
        ),
        (
            "[AI-Raws] 炎炎の消防隊 弐ノ章 #01 (BD HEVC 1920x1080 FLAC).mkv",
            [
                ("AI-Raws", "GROUP"),
                ("炎炎の消防隊", "TITLE"),
                ("弐ノ章", "SEASON"),
                ("01", "EPISODE"),
                ("BD", "SOURCE"),
                ("HEVC", "SOURCE"),
                ("1920x1080", "RESOLUTION"),
                ("FLAC", "SOURCE"),
            ],
        ),
        (
            "[DBD-Raws][炎炎消防队 貳之章][01][1080P][BDRip][HEVC-10bit][FLAC]",
            [
                ("DBD-Raws", "GROUP"),
                ("炎炎消防队", "TITLE"),
                ("貳之章", "SEASON"),
                ("01", "EPISODE"),
                ("1080P", "RESOLUTION"),
                ("BDRip", "SOURCE"),
                ("FLAC", "SOURCE"),
            ],
        ),
        (
            "[GM-Team][国漫][逆天邪神 第2季][Against the Gods Ⅱ][2026][04][HEVC][GB][4K].mp4",
            [
                ("GM-Team", "GROUP"),
                ("逆天邪神", "TITLE"),
                ("第2季", "SEASON"),
                ("04", "EPISODE"),
                ("HEVC", "SOURCE"),
                ("GB", "SOURCE"),
                ("4K", "RESOLUTION"),
            ],
        ),
        (
            "[GM-Team][国漫][剑来 第2季][Sword of Coming Ⅱ][2025][04][HEVC][GB][4K]",
            [
                ("GM-Team", "GROUP"),
                ("剑来", "TITLE"),
                ("第2季", "SEASON"),
                ("04", "EPISODE"),
                ("HEVC", "SOURCE"),
                ("GB", "SOURCE"),
                ("4K", "RESOLUTION"),
            ],
        ),
        (
            "[GM-Team][国漫][大主宰 第2季][The Great Ruler Ⅱ][2026][04][HEVC][GB][4K]",
            [
                ("GM-Team", "GROUP"),
                ("大主宰", "TITLE"),
                ("第2季", "SEASON"),
                ("04", "EPISODE"),
                ("HEVC", "SOURCE"),
                ("GB", "SOURCE"),
                ("4K", "RESOLUTION"),
            ],
        ),
        (
            "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
            [
                ("YYDM&VCB-Studio", "GROUP"),
                ("Shinsekai Yori", "TITLE"),
                ("IV05", "SPECIAL"),
                ("1080p", "RESOLUTION"),
                ("x265_aac", "SOURCE"),
            ],
        ),
        (
            "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
            [
                ("YYDM&VCB-Studio", "GROUP"),
                ("Shinsekai Yori", "TITLE"),
                ("NCED02", "SPECIAL"),
                ("1080p", "RESOLUTION"),
                ("x265_flac", "SOURCE"),
            ],
        ),
        (
            "InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
            [
                ("InuYasha", "TITLE"),
                ("NCED02", "SPECIAL"),
                ("BDrip", "SOURCE"),
                ("AV1", "SOURCE"),
                ("DTS", "SOURCE"),
                ("1080p", "RESOLUTION"),
            ],
        ),
        (
            "[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
            [
                ("VCB-Studio", "GROUP"),
                ("Yamada-kun to 7-nin no Majo", "TITLE"),
                ("NCED", "SPECIAL"),
                ("1080p", "RESOLUTION"),
                ("x265_flac", "SOURCE"),
            ],
        ),
    )
    for name, labelled_spans in hand_labelled:
        yield char_item(name, labelled_spans)
def main() -> None:
    """Build the repair-focused fine-tuning JSONL and print a JSON summary.

    The input JSONL is streamed once; blank lines are skipped. Every other
    row is passed to ``repair_jsonl_item`` and sorted into one of three pools:

    * rows for which repairs are reported: all kept, and their filenames
      are recorded as seen;
    * rows whose filename matches ``SPECIAL_FOCUS_RE``: a reservoir sample
      of at most ``--max-focus-rows`` rows;
    * remaining rows whose filename has not been recorded as seen: a
      reservoir sample of at most ``--context-samples`` rows.

    Each reservoir draws its replacement index from the number of rows seen
    *for that pool*, so every eligible row has the same chance of being
    retained. Drawing from the total row count instead would under-replace
    and bias both samples toward rows near the start of the file.

    Repaired, focus and manual rows are repeated according to the
    ``--repeat-*`` options (at least once each), everything is shuffled with
    the seeded RNG, and the result is written to ``--output`` as compact
    JSONL. Parent directories of the output are created if missing.
    """
    args = parse_args()
    rng = random.Random(args.seed)
    input_path = Path(args.input)
    output_path = Path(args.output)

    repaired_rows: List[dict] = []
    focus_rows: List[dict] = []
    reservoir: List[dict] = []
    seen_filenames = set()
    total_rows = 0
    focus_seen = 0    # rows offered to the focus reservoir so far
    context_seen = 0  # rows offered to the context reservoir so far

    with input_path.open("r", encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                continue
            total_rows += 1
            item = json.loads(line)
            # Only the list of repairs is needed; the original item is kept.
            _repaired_item, repairs = repair_jsonl_item(item)
            filename = item.get("filename")

            if repairs:
                repaired_rows.append(item)
                if filename:
                    seen_filenames.add(filename)
                continue

            if filename and SPECIAL_FOCUS_RE.search(filename):
                focus_seen += 1
                if len(focus_rows) < args.max_focus_rows:
                    focus_rows.append(item)
                    seen_filenames.add(filename)
                else:
                    # Reservoir replacement over focus candidates only.
                    index = rng.randrange(focus_seen)
                    if index < args.max_focus_rows:
                        focus_rows[index] = item
                continue

            if filename in seen_filenames:
                continue

            context_seen += 1
            if len(reservoir) < args.context_samples:
                reservoir.append(item)
            else:
                # Reservoir replacement over context candidates only.
                index = rng.randrange(context_seen)
                if index < args.context_samples:
                    reservoir[index] = item

    # Materialise once so the written rows and the summary use the same list.
    manual_items = list(manual_cases())

    rows: List[dict] = []
    for item in repaired_rows:
        rows.extend([item] * max(1, args.repeat_repaired))
    for item in focus_rows:
        rows.extend([item] * max(1, args.repeat_focus))
    rows.extend(reservoir)
    for item in manual_items:
        rows.extend([item] * max(1, args.repeat_manual))
    rng.shuffle(rows)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as handle:
        for item in rows:
            handle.write(json.dumps(item, ensure_ascii=False, separators=(",", ":")) + "\n")

    print(json.dumps({
        "input": str(input_path),
        "output": str(output_path),
        "total_rows": total_rows,
        "repaired_rows": len(repaired_rows),
        "focus_rows": len(focus_rows),
        "context_rows": len(reservoir),
        "manual_rows": len(manual_items),
        "written_rows": len(rows),
    }, ensure_ascii=False, indent=2))
# Script entry point: run the build only when executed directly, not on import.
if __name__ == "__main__":
    main()