"""Build a small fine-tuning set focused on repaired filename structures.""" from __future__ import annotations import argparse import json import random import re from pathlib import Path from typing import Iterable, List from anifilebert.label_repairs import repair_jsonl_item SPECIAL_FOCUS_RE = re.compile( r"(? argparse.Namespace: parser = argparse.ArgumentParser(description="Build repair-focused char JSONL fine-tune data") parser.add_argument("--input", required=True, help="Repaired char JSONL dataset") parser.add_argument("--output", required=True, help="Output focus JSONL") parser.add_argument("--context-samples", type=int, default=50000, help="Random non-repaired rows to include for stability") parser.add_argument("--repeat-repaired", type=int, default=4, help="Repeat rows that still trigger a repair pass") parser.add_argument("--repeat-focus", type=int, default=3, help="Repeat rows matching special-code focus patterns") parser.add_argument("--max-focus-rows", type=int, default=80000, help="Maximum dataset rows matching special-code focus patterns") parser.add_argument("--repeat-manual", type=int, default=24, help="Repeat hand-labeled hard cases") parser.add_argument("--seed", type=int, default=42) return parser.parse_args() def char_item(filename: str, spans: List[tuple[str, str]]) -> dict: tokens = list(filename) labels = ["O"] * len(tokens) cursor = 0 for text, entity in spans: start = filename.find(text, cursor) if start < 0: start = filename.find(text) if start < 0: raise ValueError(f"Could not find span {text!r} in {filename!r}") end = start + len(text) labels[start] = f"B-{entity}" for idx in range(start + 1, end): labels[idx] = f"I-{entity}" cursor = end return { "filename": filename, "tokens": tokens, "labels": labels, "tokenizer_variant": "char", "source": "manual_repair_focus", } def manual_cases() -> Iterable[dict]: yield char_item( "One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264", [ ("One.Piece", "TITLE"), ("1110", "EPISODE"), ("1080p", "RESOLUTION"), ("WEB-DL", "SOURCE"), ], ) yield char_item( "One.Piece.1111.1080p.WEB-DL.AAC2.0.H.264", [ ("One.Piece", "TITLE"), ("1111", "EPISODE"), ("1080p", "RESOLUTION"), ("WEB-DL", "SOURCE"), ], ) yield char_item( "【喵萌奶茶屋】★04月新番★[葬送的芙莉莲][01][1080P][HEVC]", [ ("喵萌奶茶屋", "GROUP"), ("葬送的芙莉莲", "TITLE"), ("01", "EPISODE"), ("1080P", "RESOLUTION"), ("HEVC", "SOURCE"), ], ) yield char_item( "【喵萌奶茶屋】★10月新番★[药屋少女的呢喃][02][1080P][HEVC]", [ ("喵萌奶茶屋", "GROUP"), ("药屋少女的呢喃", "TITLE"), ("02", "EPISODE"), ("1080P", "RESOLUTION"), ("HEVC", "SOURCE"), ], ) yield char_item( "[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [07][1080P][CHT&JPN][檢索:魔法姊妹露露特莉莉].mp4", [ ("Billion Meta Lab", "GROUP"), ("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE"), ("07", "EPISODE"), ("1080P", "RESOLUTION"), ("CHT&JPN", "SOURCE"), ("檢索:魔法姊妹露露特莉莉", "SPECIAL"), ], ) yield char_item( "[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [08][1080P][CHT&JPN][检索:魔法姊妹露露特莉莉].mp4", [ ("Billion Meta Lab", "GROUP"), ("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE"), ("08", "EPISODE"), ("1080P", "RESOLUTION"), ("CHT&JPN", "SOURCE"), ("检索:魔法姊妹露露特莉莉", "SPECIAL"), ], ) yield char_item( "[LoliHouse] Kakuriyo no Yadomeshi Ni - 12 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv", [ ("LoliHouse", "GROUP"), ("Kakuriyo no Yadomeshi", "TITLE"), ("Ni", "SEASON"), ("12", "EPISODE"), ("WebRip", "SOURCE"), ("1080p", "RESOLUTION"), ("HEVC", "SOURCE"), ("AAC", "SOURCE"), ("SRTx2", "SOURCE"), ], ) yield char_item( "[LoliHouse] Kakuriyo no Yadomeshi Ni - 13 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv", [ ("LoliHouse", "GROUP"), ("Kakuriyo no Yadomeshi", "TITLE"), ("Ni", "SEASON"), ("13", "EPISODE"), ("WebRip", "SOURCE"), ("1080p", "RESOLUTION"), ("HEVC", "SOURCE"), ("AAC", "SOURCE"), ("SRTx2", "SOURCE"), ], ) yield char_item( "[AI-Raws] 炎炎の消防隊 弐ノ章 #13 (BD HEVC 1920x1080 yuv444p10le FLAC)[FC74A2D5].mkv", [ ("AI-Raws", "GROUP"), ("炎炎の消防隊", "TITLE"), ("弐ノ章", "SEASON"), ("13", "EPISODE"), ("BD", "SOURCE"), ("HEVC", "SOURCE"), ("1920x1080", "RESOLUTION"), ("FLAC", "SOURCE"), ], ) yield char_item( "[AI-Raws] 炎炎の消防隊 弐ノ章 #01 (BD HEVC 1920x1080 FLAC).mkv", [ ("AI-Raws", "GROUP"), ("炎炎の消防隊", "TITLE"), ("弐ノ章", "SEASON"), ("01", "EPISODE"), ("BD", "SOURCE"), ("HEVC", "SOURCE"), ("1920x1080", "RESOLUTION"), ("FLAC", "SOURCE"), ], ) yield char_item( "[DBD-Raws][炎炎消防队 貳之章][01][1080P][BDRip][HEVC-10bit][FLAC]", [ ("DBD-Raws", "GROUP"), ("炎炎消防队", "TITLE"), ("貳之章", "SEASON"), ("01", "EPISODE"), ("1080P", "RESOLUTION"), ("BDRip", "SOURCE"), ("FLAC", "SOURCE"), ], ) yield char_item( "[GM-Team][国漫][逆天邪神 第2季][Against the Gods Ⅱ][2026][04][HEVC][GB][4K].mp4", [ ("GM-Team", "GROUP"), ("逆天邪神", "TITLE"), ("第2季", "SEASON"), ("04", "EPISODE"), ("HEVC", "SOURCE"), ("GB", "SOURCE"), ("4K", "RESOLUTION"), ], ) yield char_item( "[GM-Team][国漫][剑来 第2季][Sword of Coming Ⅱ][2025][04][HEVC][GB][4K]", [ ("GM-Team", "GROUP"), ("剑来", "TITLE"), ("第2季", "SEASON"), ("04", "EPISODE"), ("HEVC", "SOURCE"), ("GB", "SOURCE"), ("4K", "RESOLUTION"), ], ) yield char_item( "[GM-Team][国漫][大主宰 第2季][The Great Ruler Ⅱ][2026][04][HEVC][GB][4K]", [ ("GM-Team", "GROUP"), ("大主宰", "TITLE"), ("第2季", "SEASON"), ("04", "EPISODE"), ("HEVC", "SOURCE"), ("GB", "SOURCE"), ("4K", "RESOLUTION"), ], ) yield char_item( "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv", [ ("YYDM&VCB-Studio", "GROUP"), ("Shinsekai Yori", "TITLE"), ("IV05", "SPECIAL"), ("1080p", "RESOLUTION"), ("x265_aac", "SOURCE"), ], ) yield char_item( "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv", [ ("YYDM&VCB-Studio", "GROUP"), ("Shinsekai Yori", "TITLE"), ("NCED02", "SPECIAL"), ("1080p", "RESOLUTION"), ("x265_flac", "SOURCE"), ], ) yield char_item( "InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi", [ ("InuYasha", "TITLE"), ("NCED02", "SPECIAL"), ("BDrip", "SOURCE"), ("AV1", "SOURCE"), ("DTS", "SOURCE"), ("1080p", "RESOLUTION"), ], ) yield char_item( "[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]", [ ("VCB-Studio", "GROUP"), ("Yamada-kun to 7-nin no Majo", "TITLE"), ("NCED", "SPECIAL"), ("1080p", "RESOLUTION"), ("x265_flac", "SOURCE"), ], ) def main() -> None: args = parse_args() rng = random.Random(args.seed) input_path = Path(args.input) output_path = Path(args.output) repaired_rows: List[dict] = [] focus_rows: List[dict] = [] reservoir: List[dict] = [] seen_filenames = set() total_rows = 0 with input_path.open("r", encoding="utf-8") as handle: for line in handle: if not line.strip(): continue total_rows += 1 item = json.loads(line) _repaired_item, repairs = repair_jsonl_item(item) filename = item.get("filename") if repairs: repaired_rows.append(item) if filename: seen_filenames.add(filename) continue if filename and SPECIAL_FOCUS_RE.search(filename): if len(focus_rows) < args.max_focus_rows: focus_rows.append(item) seen_filenames.add(filename) else: index = rng.randrange(total_rows) if index < args.max_focus_rows: focus_rows[index] = item continue if filename in seen_filenames: continue if len(reservoir) < args.context_samples: reservoir.append(item) else: index = rng.randrange(total_rows) if index < args.context_samples: reservoir[index] = item rows: List[dict] = [] for item in repaired_rows: rows.extend([item] * max(1, args.repeat_repaired)) for item in focus_rows: rows.extend([item] * max(1, args.repeat_focus)) rows.extend(reservoir) for item in manual_cases(): rows.extend([item] * max(1, args.repeat_manual)) rng.shuffle(rows) output_path.parent.mkdir(parents=True, exist_ok=True) with output_path.open("w", encoding="utf-8") as handle: for item in rows: handle.write(json.dumps(item, ensure_ascii=False, separators=(",", ":")) + "\n") print(json.dumps({ "input": str(input_path), "output": str(output_path), "total_rows": total_rows, "repaired_rows": len(repaired_rows), "focus_rows": len(focus_rows), "context_rows": len(reservoir), "manual_rows": len(list(manual_cases())), "written_rows": len(rows), }, ensure_ascii=False, indent=2)) if __name__ == "__main__": main()