"""Append path-shaped char BIO focus examples. This helper is intentionally small: it builds a handful of deterministic path examples where leading directories are noise and the parseable entities appear in later path segments. """ from __future__ import annotations import argparse import json from pathlib import Path def char_item(filename: str, spans: list[tuple[str, str]], source: str) -> dict[str, object]: tokens = list(filename) labels = ["O"] * len(tokens) cursor = 0 for text, entity in spans: start = filename.find(text, cursor) if start < 0: start = filename.find(text) if start < 0: raise ValueError(f"span {text!r} not found in {filename!r}") labels[start] = f"B-{entity}" for index in range(start + 1, start + len(text)): labels[index] = f"I-{entity}" cursor = start + len(text) return { "filename": filename, "tokens": tokens, "labels": labels, "tokenizer_variant": "char", "source": source, } def build_cases(source: str) -> list[dict[str, object]]: return [ char_item( r"Z:\Library\Anime\Shinsekai Yori\Extras\NCED02 [Ma10p_1080p][x265_flac].mkv", [ ("Shinsekai Yori", "TITLE"), ("NCED02", "SPECIAL"), ("1080p", "RESOLUTION"), ("x265_flac", "SOURCE"), ], source, ), char_item( r"O:\115open\Anime\Sousou no Frieren\Season 01\31 [1080P][Baha][WEB-DL].mkv", [ ("Sousou no Frieren", "TITLE"), ("Season 01", "SEASON"), ("31", "EPISODE"), ("1080P", "RESOLUTION"), ("Baha", "SOURCE"), ("WEB-DL", "SOURCE"), ], source, ), char_item( r"/mnt/media/anime/Bangumi/One Piece/Season 21/1110 [1080p][WEB-DL].mkv", [ ("One Piece", "TITLE"), ("Season 21", "SEASON"), ("1110", "EPISODE"), ("1080p", "RESOLUTION"), ("WEB-DL", "SOURCE"), ], source, ), char_item( r"D:\Media\Anime\completed\Witch Watch\S01\15 [1080p][CHS].mkv", [ ("Witch Watch", "TITLE"), ("S01", "SEASON"), ("15", "EPISODE"), ("1080p", "RESOLUTION"), ("CHS", "SOURCE"), ], source, ), char_item( r"O:\115open\Anime\Kakuriyo no Yadomeshi\Season 02\12 [WebRip 1080p].mkv", [ ("Kakuriyo no Yadomeshi", "TITLE"), ("Season 02", "SEASON"), ("12", "EPISODE"), ("WebRip", "SOURCE"), ("1080p", "RESOLUTION"), ], source, ), char_item( r"C:\Archive\old\misc\One Piece\Season 21\One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264.mkv", [ ("One Piece", "TITLE"), ("Season 21", "SEASON"), ("1110", "EPISODE"), ("1080p", "RESOLUTION"), ("WEB-DL", "SOURCE"), ], source, ), ] def main() -> None: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--output", required=True) parser.add_argument("--repeat", type=int, default=96) parser.add_argument("--source", default="manual_path_focus") parser.add_argument("--append", action="store_true") args = parser.parse_args() output = Path(args.output) output.parent.mkdir(parents=True, exist_ok=True) mode = "a" if args.append else "w" cases = build_cases(args.source) with output.open(mode, encoding="utf-8") as handle: for _ in range(args.repeat): for item in cases: handle.write(json.dumps(item, ensure_ascii=False, separators=(",", ":")) + "\n") print( json.dumps( { "output": str(output), "repeat": args.repeat, "case_count": len(cases), "written_rows": args.repeat * len(cases), "append": args.append, }, ensure_ascii=False, indent=2, ) ) if __name__ == "__main__": main()