Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
| """Smoke tests for annotated DMHY graph dataset helpers.""" | |
| from __future__ import annotations | |
| import tempfile | |
| import json | |
| import subprocess | |
| import sys | |
| import unittest | |
| from pathlib import Path | |
| from tools.annotate_dmhy_prefix_graph import normalize_generated_tokens | |
| from tools.convert_annotated_dmhy_dataset import ( | |
| iter_validated_jsonl, | |
| validate_record, | |
| ) | |
| from tools.convert_to_char_dataset import convert_record | |
class AnnotatedDmhyWorkflowTests(unittest.TestCase):
    """Smoke tests for the annotated DMHY dataset helpers and their CLIs.

    The in-process tests call ``normalize_generated_tokens``,
    ``validate_record``, ``convert_record`` and ``iter_validated_jsonl``
    directly.  The CLI tests launch the ``tools.*`` modules with
    ``python -m`` in a child process and inspect the files they write into a
    temporary directory.
    """

    # ------------------------------------------------------------------
    # Shared helpers (names start with "_" so test runners skip them).
    # ------------------------------------------------------------------

    @staticmethod
    def _run_module(module: str, *args: str) -> "subprocess.CompletedProcess[str]":
        """Run ``python -m <module> <args...>`` with the current interpreter.

        The exit status is not checked here (``check=False``) so that callers
        can assert on ``returncode`` and show ``stderr`` on failure.
        """
        return subprocess.run(
            [sys.executable, "-m", module, *args],
            check=False,
            capture_output=True,
            text=True,
        )

    @staticmethod
    def _read_jsonl(path: Path) -> list:
        """Return the decoded JSON object of every non-blank line in *path*."""
        return [
            json.loads(line)
            for line in path.read_text(encoding="utf-8").splitlines()
            if line.strip()
        ]

    @staticmethod
    def _write_json(path: Path, payload: dict) -> None:
        """Serialize *payload* to *path* as UTF-8 JSON without ASCII escaping."""
        path.write_text(json.dumps(payload, ensure_ascii=False), encoding="utf-8")

    @staticmethod
    def _write_source_list(path: Path, values: list) -> None:
        """Write one ``{"value": ...}`` JSON line per entry, newline-terminated."""
        path.write_text(
            "\n".join(
                json.dumps({"value": value}, ensure_ascii=False) for value in values
            )
            + "\n",
            encoding="utf-8",
        )

    @staticmethod
    def _dag_node(
        node_id: int,
        children: list,
        *,
        incoming_count: int,
        reachable_terminals: int,
        reachable_weight: int,
        terminal: bool = False,
    ) -> dict:
        """Build one DAG node fixture; *children* is a list of (label, target)."""
        return {
            "id": node_id,
            "terminal": terminal,
            "children": [
                {"label": label, "target": target} for label, target in children
            ],
            "incoming_count": incoming_count,
            "reachable_terminals": reachable_terminals,
            "reachable_weight": reachable_weight,
        }

    # ------------------------------------------------------------------
    # In-process helper tests.
    # ------------------------------------------------------------------

    def test_generated_tokens_split_punctuation_and_use_b_only_labels(self) -> None:
        """Punctuation becomes its own ``O`` token and only ``B-``/``O`` labels remain."""
        tokens, labels = normalize_generated_tokens(
            ["[ANi]", " ", "Title-Name", "07"],
            ["B-GROUP", "O", "I-TITLE", "B-EPISODE"],
        )
        self.assertEqual(tokens, ["[", "ANi", "]", " ", "Title", "-", "Name", "07"])
        self.assertEqual(
            labels,
            ["O", "B-GROUP", "O", "O", "B-TITLE", "O", "B-TITLE", "B-EPISODE"],
        )
        self.assertTrue(all(label == "O" or label.startswith("B-") for label in labels))

    def test_preserve_i_labels_keeps_i_on_non_separator_pieces(self) -> None:
        """With ``preserve_i_labels=True`` an ``I-`` label survives the split."""
        tokens, labels = normalize_generated_tokens(
            ["Title-Name"],
            ["I-TITLE"],
            preserve_i_labels=True,
        )
        self.assertEqual(tokens, ["Title", "-", "Name"])
        self.assertEqual(labels, ["I-TITLE", "O", "I-TITLE"])

    def test_validation_rejects_embedded_punctuation(self) -> None:
        """A token containing ``-`` must fail validation with a punctuation error."""
        record = {
            "filename": "Title-Name 07",
            "tokens": ["Title-Name", "07"],
            "labels": ["B-TITLE", "B-EPISODE"],
        }
        with self.assertRaisesRegex(ValueError, "contains punctuation"):
            validate_record(record, Path("sample.jsonl"), 1)

    def test_validation_rejects_embedded_symbol_separator(self) -> None:
        """A token containing the ``×`` symbol must fail validation the same way."""
        record = {
            "filename": "Title 1920×1080 07",
            "tokens": ["Title", "1920×1080", "07"],
            "labels": ["B-TITLE", "B-RESOLUTION", "B-EPISODE"],
        }
        with self.assertRaisesRegex(ValueError, "contains punctuation"):
            validate_record(record, Path("sample.jsonl"), 1)

    def test_b_only_input_converts_to_char_i_labels(self) -> None:
        """A valid B-only record converts to per-character tokens with ``I-`` labels."""
        record = {
            "filename": "Title-Name 07",
            "tokens": ["Title", "-", "Name", " ", "07"],
            "labels": ["B-TITLE", "O", "B-TITLE", "O", "B-EPISODE"],
        }
        # Must not raise: the record is already split on punctuation.
        validate_record(record, Path("sample.jsonl"), 1)
        converted = convert_record(record)
        self.assertIn("I-TITLE", converted["labels"])
        self.assertEqual(converted["tokens"][:5], ["T", "i", "t", "l", "e"])

    def test_iter_validated_jsonl_accepts_generated_shape(self) -> None:
        """A single well-formed JSONL line is yielded back as one row."""
        with tempfile.TemporaryDirectory() as tmpdir:
            path = Path(tmpdir) / "records.jsonl"
            path.write_text(
                '{"filename":"A 01","tokens":["A"," ","01"],"labels":["B-TITLE","O","B-EPISODE"]}\n',
                encoding="utf-8",
            )
            rows = list(iter_validated_jsonl(path))
            self.assertEqual(len(rows), 1)
            self.assertEqual(rows[0]["filename"], "A 01")

    # ------------------------------------------------------------------
    # CLI smoke tests (run the tools as child processes).
    # ------------------------------------------------------------------

    def test_cli_smoke_annotate_then_convert_with_temp_files(self) -> None:
        """Annotate a one-terminal graph, then convert the result to char data."""
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp = Path(tmpdir)
            graph_path = tmp / "graph.json"
            dataset_path = tmp / "dmhy_weak.generated.jsonl"
            char_path = tmp / "dmhy_weak.generated_char.jsonl"
            vocab_path = tmp / "vocab.generated.char.json"
            manifest_path = tmp / "manifest.json"
            self._write_json(
                graph_path,
                {
                    "terminals": [
                        {
                            "terminal_id": "t0",
                            "weight": 1,
                            "value_examples": [
                                "[ANi] Test Show - 01 [1080P][WEB-DL].mkv"
                            ],
                            "suffix_examples": [" [1080P][WEB-DL]"],
                        }
                    ]
                },
            )

            annotate = self._run_module(
                "tools.annotate_dmhy_prefix_graph",
                "--graph",
                str(graph_path),
                "--output",
                str(dataset_path),
                "--patch-output",
                "",
                "--examples-only",
            )
            self.assertEqual(annotate.returncode, 0, annotate.stderr)
            rows = self._read_jsonl(dataset_path)
            self.assertEqual(len(rows), 1)
            self.assertIn("annotations", rows[0])
            # The leading "[" of "[ANi]" is expected as its own unlabeled token.
            self.assertEqual(rows[0]["tokens"][0], "[")
            self.assertEqual(rows[0]["labels"][0], "O")

            convert = self._run_module(
                "tools.convert_annotated_dmhy_dataset",
                "--input",
                str(dataset_path),
                "--output",
                str(char_path),
                "--vocab-output",
                str(vocab_path),
                "--manifest-output",
                str(manifest_path),
                "--progress",
                "0",
            )
            self.assertEqual(convert.returncode, 0, convert.stderr)
            self.assertTrue(char_path.exists())
            self.assertTrue(vocab_path.exists())
            self.assertTrue(manifest_path.exists())

    def test_cli_source_list_mode_expands_beyond_value_examples(self) -> None:
        """With ``--source-list`` and ``--limit 1`` all three ``t0`` rows are emitted."""
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp = Path(tmpdir)
            graph_path = tmp / "graph.json"
            source_path = tmp / "dmhy_list.jsonl"
            dataset_path = tmp / "dmhy_weak.generated.jsonl"
            self._write_json(
                graph_path,
                {
                    "terminals": [
                        {
                            "terminal_id": "t0",
                            "prefix": "[ANi] Full Show - ",
                            "weight": 10,
                            "value_examples": [
                                "[ANi] Full Show - 01 [1080P][WEB-DL].mkv"
                            ],
                            "suffix_examples": ["01 [1080P][WEB-DL]"],
                        },
                        {
                            "terminal_id": "t1",
                            "prefix": "[ANi] Other Show - ",
                            "weight": 10,
                            "value_examples": [
                                "[ANi] Other Show - 01 [1080P][WEB-DL].mkv"
                            ],
                            "suffix_examples": ["01 [1080P][WEB-DL]"],
                        },
                    ]
                },
            )
            self._write_source_list(
                source_path,
                [
                    "[ANi] Full Show - 01 [1080P][WEB-DL].mkv",
                    "[ANi] Full Show - 02 [1080P][WEB-DL].mkv",
                    "[ANi] Full Show - 03 [1080P][WEB-DL].mkv",
                    "[ANi] Other Show - 01 [1080P][WEB-DL].mkv",
                ],
            )

            annotate = self._run_module(
                "tools.annotate_dmhy_prefix_graph",
                "--graph",
                str(graph_path),
                "--source-list",
                str(source_path),
                "--output",
                str(dataset_path),
                "--patch-output",
                "",
                "--limit",
                "1",
            )
            self.assertEqual(annotate.returncode, 0, annotate.stderr)
            rows = self._read_jsonl(dataset_path)
            self.assertEqual(len(rows), 3)
            self.assertEqual(
                [row["filename"] for row in rows],
                [
                    "[ANi] Full Show - 01 [1080P][WEB-DL].mkv",
                    "[ANi] Full Show - 02 [1080P][WEB-DL].mkv",
                    "[ANi] Full Show - 03 [1080P][WEB-DL].mkv",
                ],
            )
            self.assertTrue(all(row["terminal_id"] == "t0" for row in rows))

    def test_cli_examples_only_uses_terminal_value_examples(self) -> None:
        """``--examples-only`` yields just the terminal's own example, not the list."""
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp = Path(tmpdir)
            graph_path = tmp / "graph.json"
            source_path = tmp / "dmhy_list.jsonl"
            dataset_path = tmp / "dmhy_weak.generated.jsonl"
            self._write_json(
                graph_path,
                {
                    "terminals": [
                        {
                            "terminal_id": "t0",
                            "prefix": "[ANi] Example Show - ",
                            "weight": 10,
                            "value_examples": [
                                "[ANi] Example Show - 01 [1080P][WEB-DL].mkv"
                            ],
                            "suffix_examples": ["01 [1080P][WEB-DL]"],
                        }
                    ]
                },
            )
            self._write_source_list(
                source_path,
                [
                    "[ANi] Example Show - 01 [1080P][WEB-DL].mkv",
                    "[ANi] Example Show - 02 [1080P][WEB-DL].mkv",
                ],
            )

            annotate = self._run_module(
                "tools.annotate_dmhy_prefix_graph",
                "--graph",
                str(graph_path),
                "--source-list",
                str(source_path),
                "--output",
                str(dataset_path),
                "--patch-output",
                "",
                "--examples-only",
            )
            self.assertEqual(annotate.returncode, 0, annotate.stderr)
            rows = self._read_jsonl(dataset_path)
            self.assertEqual(len(rows), 1)
            self.assertEqual(rows[0]["filename"], "[ANi] Example Show - 01 [1080P][WEB-DL].mkv")

    def test_cli_dag_annotation_units_include_shared_node_terminals(self) -> None:
        """The shared DAG node (id 3) is reported as one unit covering both terminals."""
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp = Path(tmpdir)
            dag_path = tmp / "dmhy_prefix_dag.json"
            output_path = tmp / "dmhy_prefix_dag.annotation_units.jsonl"
            node = self._dag_node
            # Shape: root 0 -> {1, 2} -> shared node 3 -> terminals {4, 5}.
            self._write_json(
                dag_path,
                {
                    "meta": {"version": "prefix-dag-v1"},
                    "root": 0,
                    "nodes": [
                        node(
                            0,
                            [("A", 1), ("B", 2)],
                            incoming_count=0,
                            reachable_terminals=2,
                            reachable_weight=20,
                        ),
                        node(
                            1,
                            [(" shared", 3)],
                            incoming_count=1,
                            reachable_terminals=1,
                            reachable_weight=10,
                        ),
                        node(
                            2,
                            [(" shared", 3)],
                            incoming_count=1,
                            reachable_terminals=1,
                            reachable_weight=10,
                        ),
                        node(
                            3,
                            [(" 01", 4), (" 02", 5)],
                            incoming_count=2,
                            reachable_terminals=2,
                            reachable_weight=20,
                        ),
                        node(
                            4,
                            [],
                            incoming_count=1,
                            reachable_terminals=1,
                            reachable_weight=10,
                            terminal=True,
                        ),
                        node(
                            5,
                            [],
                            incoming_count=1,
                            reachable_terminals=1,
                            reachable_weight=10,
                            terminal=True,
                        ),
                    ],
                    "terminals": [
                        {
                            "terminal_id": "t0",
                            "node_id": 4,
                            "prefix": "Show A shared 01",
                            "digit_skeleton": "Show A shared <NUM>",
                            "count": 10,
                            "weight": 10,
                            "suffix_examples": [" [1080P][WEB-DL]"],
                            "value_examples": ["Show A shared 01 [1080P][WEB-DL].mkv"],
                            "annotations": {},
                        },
                        {
                            "terminal_id": "t1",
                            "node_id": 5,
                            "prefix": "Show B shared 02",
                            "digit_skeleton": "Show B shared <NUM>",
                            "count": 10,
                            "weight": 10,
                            "suffix_examples": [" [1080P][WEB-DL]"],
                            "value_examples": ["Show B shared 02 [1080P][WEB-DL].mkv"],
                            "annotations": {},
                        },
                    ],
                },
            )

            annotate = self._run_module(
                "tools.annotate_dmhy_prefix_dag",
                "--dag",
                str(dag_path),
                "--output",
                str(output_path),
                "--min-reachable-terminals",
                "2",
                "--min-incoming-count",
                "2",
                "--limit",
                "1",
            )
            self.assertEqual(annotate.returncode, 0, annotate.stderr)
            rows = self._read_jsonl(output_path)
            self.assertEqual(len(rows), 1)
            self.assertEqual(rows[0]["unit_id"], "dag-node-3")
            self.assertEqual(rows[0]["kind"], "shared_suffix")
            self.assertEqual(rows[0]["terminal_ids"], ["t0", "t1"])
            self.assertEqual(
                rows[0]["prefix_examples"],
                ["Show A shared 01", "Show B shared 02"],
            )
            self.assertEqual(rows[0]["common_edge_labels"], [" 01", " 02"])
            self.assertIn("annotations", rows[0])
# Allow running this test module directly as a script; unittest.main()
# discovers and runs the TestCase classes defined in this module.
if __name__ == "__main__":
    unittest.main()