Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
| """Export a status-annotated DMHY filename metadata list.""" | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| import re | |
| import sqlite3 | |
| from collections import Counter | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from tools.dmhy_dataset import VIDEO_EXTENSIONS | |
# Matches one contiguous run of digits.
DIGIT_RE = re.compile(r"\d+")
# Matches, at the end of the text, an opening "[" or "(", 8 or more hex
# characters, and a closing "]" or ")", with optional whitespace on both sides.
TRAILING_HASH_RE = re.compile(r"\s*[\[(][A-Fa-f0-9]{8,}[\])]\s*$")
def normalize_db_path(filename: str) -> str:
    """Turn backslash separators into forward slashes and trim outer whitespace."""
    with_forward_slashes = "/".join(filename.split("\\"))
    return with_forward_slashes.strip()
def strip_video_extension(filename: str) -> tuple[str, str]:
    """Split *filename* into a trimmed stem and a lower-cased extension.

    The split is whatever ``os.path.splitext`` produces; this function does
    not itself check that the extension belongs to a video format.
    """
    parts = os.path.splitext(filename)
    trimmed_stem = parts[0].strip()
    lowered_ext = parts[1].lower()
    return trimmed_stem, lowered_ext
def is_bdmv_stream_path(filename: str) -> bool:
    """Report whether the path contains a ``/BDMV/STREAM/`` segment, ignoring case."""
    uppercased = filename.upper()
    return uppercased.find("/BDMV/STREAM/") >= 0
def digit_skeleton(text: str) -> str:
    """Return *text* with every run of digits replaced by ``<NUM>``.

    Strings that differ only in their numbers therefore share one skeleton.
    """
    # Splitting on the digit runs and re-joining with the placeholder is the
    # same as substituting each run.
    non_digit_pieces = DIGIT_RE.split(text)
    return "<NUM>".join(non_digit_pieces)
def strip_trailing_hash(text: str) -> str:
    """Remove a trailing bracketed hex hash, if present, and right-trim the result."""
    without_hash = TRAILING_HASH_RE.sub("", text)
    return without_hash.rstrip()
def has_trailing_hash(text: str) -> bool:
    """Tell whether ``TRAILING_HASH_RE`` finds a trailing bracketed hash in *text*."""
    # A match object is always truthy; no match yields None.
    return bool(TRAILING_HASH_RE.search(text))
def iter_filenames(db_path: Path, min_id: int, max_id: int):
    """Yield ``(id, filename)`` rows from the ``files`` table in ascending id order.

    The database is opened read-only and the connection is closed when the
    generator is exhausted or closed.

    Args:
        db_path: Location of the SQLite database file.
        min_id: Smallest row id to include (inclusive).
        max_id: Largest row id to include (inclusive).

    Yields:
        One ``(id, filename)`` row per matching record.
    """
    from urllib.parse import quote

    # Percent-encode the path: a literal "?", "#" or "%" in a directory or file
    # name would otherwise be parsed as URI syntax and the open would fail or
    # target a different file. "/", ":" and "\" are left as-is so ordinary
    # POSIX and Windows paths produce the same URI as before.
    db_uri = "file:" + quote(str(db_path), safe="/:\\") + "?mode=ro"
    conn = sqlite3.connect(db_uri, uri=True, timeout=30)
    try:
        # Inside the try so the connection is closed even if the PRAGMA fails.
        conn.execute("PRAGMA query_only=ON")
        yield from conn.execute(
            "SELECT id, filename FROM files WHERE id >= ? AND id <= ? ORDER BY id",
            (min_id, max_id),
        )
    finally:
        conn.close()
def export_list(args: argparse.Namespace) -> None:
    """Write the deduplicated video filename list (JSONL) and its manifest.

    Rows of the ``files`` table with ids from ``args.min_id`` up to the
    effective maximum id are scanned. The effective maximum is ``args.max_id``
    capped at the database's current ``MAX(id)``, or that maximum itself when
    ``args.max_id`` is None. Each filename is normalized and split from its
    extension; it is kept only when the extension is in ``VIDEO_EXTENSIONS``
    and the path is not inside ``/BDMV/STREAM/``. Exact duplicate values are
    collapsed into the first occurrence, which records how many rows it
    represents in ``count``.

    Outputs:
        * ``args.output``: one JSON object per line, sorted by ``value``.
        * A sibling file with the suffix replaced by ``.manifest.json`` that
          describes the export; the same JSON is printed to stdout.

    Args:
        args: Namespace providing ``db``, ``output``, ``min_id`` and ``max_id``.
    """
    from urllib.parse import quote

    db_path = Path(args.db)
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Percent-encode the path so a literal "?", "#" or "%" in it is not parsed
    # as URI syntax; "/", ":" and "\" stay untouched for ordinary paths.
    db_uri = "file:" + quote(str(db_path), safe="/:\\") + "?mode=ro"
    conn = sqlite3.connect(db_uri, uri=True, timeout=30)
    try:
        # Inside the try so the connection is closed even if the PRAGMA fails.
        conn.execute("PRAGMA query_only=ON")
        # MAX(id) is NULL for an empty table; treat that as 0.
        db_max_id = conn.execute("SELECT MAX(id) FROM files").fetchone()[0] or 0
    finally:
        conn.close()
    # Never scan past the id that existed when the export started.
    max_id = db_max_id if args.max_id is None else min(args.max_id, db_max_id)

    record_counts: Counter[str] = Counter()
    stats = {
        "scanned_rows": 0,
        "video_rows": 0,
        "skipped_bdmv_stream_paths": 0,
        "duplicate_exact_strings": 0,
        "rows_with_path": 0,
        "rows_with_trailing_hash": 0,
        "written_rows": 0,
    }
    kept_records: list[dict] = []

    for _, raw_filename in iter_filenames(db_path, args.min_id, max_id):
        stats["scanned_rows"] += 1
        value, ext = strip_video_extension(normalize_db_path(raw_filename))
        if ext not in VIDEO_EXTENSIONS:
            continue
        stats["video_rows"] += 1
        if is_bdmv_stream_path(value):
            stats["skipped_bdmv_stream_paths"] += 1
            continue

        # Count every occurrence; only the first one produces a record.
        record_counts[value] += 1
        if record_counts[value] > 1:
            stats["duplicate_exact_strings"] += 1
            continue

        record = {
            "value": value,
            "uses_path": "/" in value,
            "has_trailing_hash": has_trailing_hash(value),
            "has_digits": DIGIT_RE.search(value) is not None,
            "digit_skeleton": digit_skeleton(value),
        }
        kept_records.append(record)
        if record["uses_path"]:
            stats["rows_with_path"] += 1
        if record["has_trailing_hash"]:
            stats["rows_with_trailing_hash"] += 1

    kept_records.sort(key=lambda record: record["value"])
    stats["written_rows"] = len(kept_records)

    with output_path.open("w", encoding="utf-8", newline="\n") as out:
        for record in kept_records:
            # Final counts are only known after the full scan, so "count" is
            # attached at write time (and therefore serialized last).
            record["count"] = record_counts[record["value"]]
            out.write(json.dumps(record, ensure_ascii=False) + "\n")

    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "source_db": str(db_path),
        "output": str(output_path),
        "min_file_id": args.min_id,
        "last_file_id": max_id,
        "db_max_file_id_at_export_start": db_max_id,
        "dedupe_rule": "normalize path separators to /, strip video extension, skip BDMV/STREAM internals, and keep the first exact value; numeric and trailing-hash differences are preserved and annotated",
        "record_schema": {
            "value": "extensionless normalized DB filename/path string",
            "uses_path": "true when value contains / from the original DB path",
            "has_trailing_hash": "true when value ends with a bracketed 8+ hex character hash",
            "has_digits": "true when value contains at least one digit",
            "digit_skeleton": "value with every contiguous digit run replaced by <NUM>",
            "count": "number of exact DB strings collapsed into this row",
        },
        "sort_order": "Python default Unicode string order",
        "stats": stats,
    }
    # Serialize once and reuse the text for both the file and stdout.
    manifest_text = json.dumps(manifest, ensure_ascii=False, indent=2)
    manifest_path = output_path.with_suffix(".manifest.json")
    manifest_path.write_text(manifest_text, encoding="utf-8")
    print(manifest_text)
def parse_args() -> argparse.Namespace:
    """Parse the exporter's command-line options from ``sys.argv``.

    Options: ``--db`` and ``--output`` (string paths with built-in defaults),
    ``--min-id`` (int, default 1) and ``--max-id`` (int, default None).
    """
    cli = argparse.ArgumentParser(
        description="Export DMHY filename metadata with exact-string dedupe and numeric/hash status fields"
    )
    path_defaults = {
        "--db": r"D:\WorkSpace\Python\dmhy-parser\dmhy_anime.db",
        "--output": r"datasets\AnimeName\dmhy_list.jsonl",
    }
    for flag, default in path_defaults.items():
        cli.add_argument(flag, default=default)

    id_defaults = {"--min-id": 1, "--max-id": None}
    for flag, default in id_defaults.items():
        cli.add_argument(flag, type=int, default=default)

    return cli.parse_args()
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    export_list(parse_args())