Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
- Notebooks
- Google Colab
- Kaggle
File size: 5,605 Bytes
4248c69 beb7665 4248c69 beb7665 4248c69 beb7665 4248c69 beb7665 4248c69 beb7665 4248c69 beb7665 4248c69 beb7665 4248c69 beb7665 4248c69 beb7665 4248c69 beb7665 4248c69 beb7665 4248c69 beb7665 4248c69 beb7665 4248c69 beb7665 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 | """Export a status-annotated DMHY filename metadata list."""
from __future__ import annotations
import argparse
import json
import os
import re
import sqlite3
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from tools.dmhy_dataset import VIDEO_EXTENSIONS
# Matches one contiguous run of digit characters.
DIGIT_RE = re.compile(r"\d+")
# Matches a run of 8 or more hex characters wrapped in [...] or (...) at the
# very end of the string, together with any whitespace around it.
TRAILING_HASH_RE = re.compile(r"\s*[\[(][A-Fa-f0-9]{8,}[\])]\s*$")
def normalize_db_path(filename: str) -> str:
    """Turn backslashes into forward slashes and trim surrounding whitespace."""
    with_forward_slashes = "/".join(filename.split("\\"))
    return with_forward_slashes.strip()
def strip_video_extension(filename: str) -> tuple[str, str]:
    """Split *filename* into a trimmed stem and a lower-cased extension.

    The extension (leading dot included, or ``""`` when absent) is whatever
    ``os.path.splitext`` reports; this function does not itself check that it
    is a video extension.
    """
    parts = os.path.splitext(filename)
    return parts[0].strip(), parts[1].lower()
def is_bdmv_stream_path(filename: str) -> bool:
    """Return True when *filename* lies inside a ``BDMV/STREAM`` directory.

    The comparison is case-insensitive and expects ``/`` as the separator.
    The directory pair is recognised both in the middle of the path and at
    its very start, so a relative path such as ``BDMV/STREAM/00001`` (with no
    parent folder in front) is detected as well.
    """
    upper = filename.upper()
    return upper.startswith("BDMV/STREAM/") or "/BDMV/STREAM/" in upper
def digit_skeleton(text: str) -> str:
    """Collapse every contiguous digit run in *text* to the token ``<NUM>``.

    The result serves as a grouping key: only the non-digit parts of two
    strings have to agree for their skeletons to be equal.
    """
    placeholder = "<NUM>"
    return DIGIT_RE.sub(placeholder, text)
def strip_trailing_hash(text: str) -> str:
    """Drop a trailing bracketed hex hash from *text*, then right-trim it."""
    without_hash = TRAILING_HASH_RE.sub("", text)
    return without_hash.rstrip()
def has_trailing_hash(text: str) -> bool:
    """Tell whether *text* ends with a bracketed hash per ``TRAILING_HASH_RE``."""
    # A successful search yields a match object (always truthy); a miss
    # yields None.
    return bool(TRAILING_HASH_RE.search(text))
def iter_filenames(db_path: Path, min_id: int, max_id: int):
    """Yield ``(id, filename)`` rows from the ``files`` table, ordered by id.

    The database is opened read-only through a SQLite URI and the connection
    is closed once the generator is exhausted or closed.

    Args:
        db_path: Location of the SQLite database file.
        min_id: Lowest ``id`` to include (inclusive).
        max_id: Highest ``id`` to include (inclusive).

    Yields:
        Row tuples ``(id, filename)`` in ascending ``id`` order.
    """
    # "%", "?" and "#" are URI syntax to SQLite; left unescaped in the path
    # they would cut the filename short or swallow the "mode=ro" query.
    # "%" must be escaped first so the other escapes are not re-escaped.
    escaped_path = (
        str(db_path).replace("%", "%25").replace("?", "%3f").replace("#", "%23")
    )
    conn = sqlite3.connect(f"file:{escaped_path}?mode=ro", uri=True, timeout=30)
    try:
        # Inside the try so the connection is closed even if the PRAGMA fails.
        conn.execute("PRAGMA query_only=ON")
        yield from conn.execute(
            "SELECT id, filename FROM files WHERE id >= ? AND id <= ? ORDER BY id",
            (min_id, max_id),
        )
    finally:
        conn.close()
def export_list(args: argparse.Namespace) -> None:
    """Export deduplicated, annotated filename records as JSONL plus a manifest.

    Rows of the ``files`` table with ids between ``args.min_id`` and the
    effective maximum id are normalized, filtered to video extensions,
    stripped of BDMV/STREAM internals and deduplicated on the exact
    extensionless string. The surviving records are sorted by value and
    written one JSON object per line to ``args.output``. A manifest describing
    the run is written next to the output (same name, suffix replaced by
    ``.manifest.json``) and also printed to stdout.

    Args:
        args: Parsed CLI namespace providing ``db``, ``output``, ``min_id``
            and ``max_id`` (``max_id`` may be ``None`` for "up to the end").
    """
    db_path = Path(args.db)
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # "%", "?" and "#" are URI syntax to SQLite; left unescaped in the path
    # they would cut the filename short or swallow the "mode=ro" query.
    # "%" must be escaped first so the other escapes are not re-escaped.
    escaped_db = (
        str(db_path).replace("%", "%25").replace("?", "%3f").replace("#", "%23")
    )
    conn = sqlite3.connect(f"file:{escaped_db}?mode=ro", uri=True, timeout=30)
    try:
        # Inside the try so the connection is closed even if the PRAGMA fails.
        conn.execute("PRAGMA query_only=ON")
        # MAX(id) is NULL for an empty table; fall back to 0 in that case.
        db_max_id = conn.execute("SELECT MAX(id) FROM files").fetchone()[0] or 0
    finally:
        conn.close()
    # Never scan past the highest id that existed when the export started.
    max_id = db_max_id if args.max_id is None else min(args.max_id, db_max_id)

    # Occurrences per exact value; the first occurrence produces the record.
    record_counts: Counter[str] = Counter()
    stats = {
        "scanned_rows": 0,
        "video_rows": 0,
        "skipped_bdmv_stream_paths": 0,
        "duplicate_exact_strings": 0,
        "rows_with_path": 0,
        "rows_with_trailing_hash": 0,
        "written_rows": 0,
    }
    kept_records: list[dict] = []

    for _, raw_filename in iter_filenames(db_path, args.min_id, max_id):
        stats["scanned_rows"] += 1
        normalized_path = normalize_db_path(raw_filename)
        value, ext = strip_video_extension(normalized_path)
        if ext not in VIDEO_EXTENSIONS:
            continue
        stats["video_rows"] += 1
        if is_bdmv_stream_path(value):
            stats["skipped_bdmv_stream_paths"] += 1
            continue
        if value in record_counts:
            # Exact repeat: only bump the counters, keep the first record.
            stats["duplicate_exact_strings"] += 1
            record_counts[value] += 1
            continue
        record_counts[value] = 1
        record = {
            "value": value,
            "uses_path": "/" in value,
            "has_trailing_hash": has_trailing_hash(value),
            "has_digits": DIGIT_RE.search(value) is not None,
            "digit_skeleton": digit_skeleton(value),
        }
        kept_records.append(record)
        if record["uses_path"]:
            stats["rows_with_path"] += 1
        if record["has_trailing_hash"]:
            stats["rows_with_trailing_hash"] += 1

    kept_records.sort(key=lambda record: record["value"])
    stats["written_rows"] = len(kept_records)

    with output_path.open("w", encoding="utf-8", newline="\n") as out:
        for record in kept_records:
            # The final count is only known after the full scan, so it is
            # attached at write time.
            record["count"] = record_counts[record["value"]]
            out.write(json.dumps(record, ensure_ascii=False) + "\n")

    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "source_db": str(db_path),
        "output": str(output_path),
        "min_file_id": args.min_id,
        "last_file_id": max_id,
        "db_max_file_id_at_export_start": db_max_id,
        "dedupe_rule": "normalize path separators to /, strip video extension, skip BDMV/STREAM internals, and keep the first exact value; numeric and trailing-hash differences are preserved and annotated",
        "record_schema": {
            "value": "extensionless normalized DB filename/path string",
            "uses_path": "true when value contains / from the original DB path",
            "has_trailing_hash": "true when value ends with a bracketed 8+ hex character hash",
            "has_digits": "true when value contains at least one digit",
            "digit_skeleton": "value with every contiguous digit run replaced by <NUM>",
            "count": "number of exact DB strings collapsed into this row",
        },
        "sort_order": "Python default Unicode string order",
        "stats": stats,
    }
    manifest_path = output_path.with_suffix(".manifest.json")
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")
    print(json.dumps(manifest, ensure_ascii=False, indent=2))
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Export DMHY filename metadata with exact-string dedupe and numeric/hash status fields"
)
parser.add_argument("--db", default=r"D:\WorkSpace\Python\dmhy-parser\dmhy_anime.db")
parser.add_argument("--output", default=r"datasets\AnimeName\dmhy_list.jsonl")
parser.add_argument("--min-id", type=int, default=1)
parser.add_argument("--max-id", type=int, default=None)
return parser.parse_args()
if __name__ == "__main__":
    # Script entry point: read the CLI options, then run the export.
    cli_args = parse_args()
    export_list(cli_args)
|