Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
File size: 4,454 Bytes
1e1bc1f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | """Append path-shaped char BIO focus examples.
This helper is intentionally small: it builds a handful of deterministic path
examples where leading directories are noise and the parseable entities appear
in later path segments.
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
def char_item(filename: str, spans: list[tuple[str, str]], source: str) -> dict[str, object]:
    """Build one character-level BIO row for ``filename``.

    Every character of ``filename`` becomes one token. Each ``(text, entity)``
    pair in ``spans`` is located in the filename; its first character is
    labelled ``B-<entity>`` and the remaining characters ``I-<entity>``.
    Characters not covered by any span keep the label ``"O"``.

    Spans are resolved left to right: the search for each span starts where
    the previous span ended, so a substring that also occurs earlier in the
    path resolves to the occurrence after the preceding span. If nothing is
    found from that position, the search falls back to the first occurrence
    anywhere in the filename.

    Args:
        filename: The full path or filename to tokenize character by character.
        spans: ``(text, entity)`` pairs to label, in the order they should be
            searched.
        source: Value stored unchanged under the ``"source"`` key.

    Returns:
        A dict with the keys ``"filename"``, ``"tokens"``, ``"labels"``,
        ``"tokenizer_variant"`` (always ``"char"``) and ``"source"``.

    Raises:
        ValueError: If a span text is empty or does not occur in ``filename``.
    """
    tokens = list(filename)
    labels = ["O"] * len(tokens)
    cursor = 0
    for text, entity in spans:
        if not text:
            # str.find("") always succeeds at the cursor, which would tag an
            # unrelated character (or index past the end of the labels).
            raise ValueError(f"empty span text for entity {entity!r} in {filename!r}")
        start = filename.find(text, cursor)
        if start < 0:
            # Not found after the previous span: retry from the beginning.
            start = filename.find(text)
        if start < 0:
            raise ValueError(f"span {text!r} not found in {filename!r}")
        end = start + len(text)
        labels[start] = f"B-{entity}"
        labels[start + 1:end] = [f"I-{entity}"] * (end - start - 1)
        cursor = end
    return {
        "filename": filename,
        "tokens": tokens,
        "labels": labels,
        "tokenizer_variant": "char",
        "source": source,
    }
def build_cases(source: str) -> list[dict[str, object]]:
    """Return the fixed set of path-shaped examples as char-level BIO rows.

    Each fixture pairs a full path with the ``(text, entity)`` spans to label
    in it. The fixtures are passed to ``char_item`` in the order listed, with
    ``source`` forwarded unchanged to every row.
    """
    fixtures = (
        (
            r"Z:\Library\Anime\Shinsekai Yori\Extras\NCED02 [Ma10p_1080p][x265_flac].mkv",
            [
                ("Shinsekai Yori", "TITLE"),
                ("NCED02", "SPECIAL"),
                ("1080p", "RESOLUTION"),
                ("x265_flac", "SOURCE"),
            ],
        ),
        (
            r"O:\115open\Anime\Sousou no Frieren\Season 01\31 [1080P][Baha][WEB-DL].mkv",
            [
                ("Sousou no Frieren", "TITLE"),
                ("Season 01", "SEASON"),
                ("31", "EPISODE"),
                ("1080P", "RESOLUTION"),
                ("Baha", "SOURCE"),
                ("WEB-DL", "SOURCE"),
            ],
        ),
        (
            r"/mnt/media/anime/Bangumi/One Piece/Season 21/1110 [1080p][WEB-DL].mkv",
            [
                ("One Piece", "TITLE"),
                ("Season 21", "SEASON"),
                ("1110", "EPISODE"),
                ("1080p", "RESOLUTION"),
                ("WEB-DL", "SOURCE"),
            ],
        ),
        (
            r"D:\Media\Anime\completed\Witch Watch\S01\15 [1080p][CHS].mkv",
            [
                ("Witch Watch", "TITLE"),
                ("S01", "SEASON"),
                ("15", "EPISODE"),
                ("1080p", "RESOLUTION"),
                ("CHS", "SOURCE"),
            ],
        ),
        (
            r"O:\115open\Anime\Kakuriyo no Yadomeshi\Season 02\12 [WebRip 1080p].mkv",
            [
                ("Kakuriyo no Yadomeshi", "TITLE"),
                ("Season 02", "SEASON"),
                ("12", "EPISODE"),
                ("WebRip", "SOURCE"),
                ("1080p", "RESOLUTION"),
            ],
        ),
        (
            r"C:\Archive\old\misc\One Piece\Season 21\One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264.mkv",
            [
                ("One Piece", "TITLE"),
                ("Season 21", "SEASON"),
                ("1110", "EPISODE"),
                ("1080p", "RESOLUTION"),
                ("WEB-DL", "SOURCE"),
            ],
        ),
    )
    return [char_item(path, entity_spans, source) for path, entity_spans in fixtures]
def main() -> None:
    """Write the path-focus examples to a JSONL file and print a JSON summary.

    Command-line options:
        --output  Destination file (required); parent directories are created.
        --repeat  Number of times the full case list is written (default 96).
        --source  Value stored in each row's ``"source"`` field
                  (default ``"manual_path_focus"``).
        --append  Append to the output file instead of overwriting it.

    Each row is written as one compact JSON object per line. The summary
    printed to stdout reports the number of rows actually written.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--output", required=True)
    parser.add_argument("--repeat", type=int, default=96)
    parser.add_argument("--source", default="manual_path_focus")
    parser.add_argument("--append", action="store_true")
    args = parser.parse_args()

    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    mode = "a" if args.append else "w"
    cases = build_cases(args.source)

    # The cases are identical on every repetition, so serialize them once
    # instead of calling json.dumps repeat * len(cases) times.
    payload = "".join(
        json.dumps(item, ensure_ascii=False, separators=(",", ":")) + "\n"
        for item in cases
    )
    # range() yields nothing for a negative count; clamp so the summary
    # reports the rows actually written rather than a negative number.
    passes = max(args.repeat, 0)

    with output.open(mode, encoding="utf-8") as handle:
        for _ in range(passes):
            handle.write(payload)

    print(
        json.dumps(
            {
                "output": str(output),
                "repeat": args.repeat,
                "case_count": len(cases),
                "written_rows": passes * len(cases),
                "append": args.append,
            },
            ensure_ascii=False,
            indent=2,
        )
    )
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|