# AniFileBERT / tools/build_repair_focus_dataset.py
# ModerRAS: Organize parser modules and tools (8c50d16)
"""Build a small fine-tuning set focused on repaired filename structures."""
from __future__ import annotations
import argparse
import json
import random
import re
from pathlib import Path
from typing import Iterable, List
from anifilebert.label_repairs import repair_jsonl_item
# Case-insensitive matcher for "special" markers in a filename: one of the
# codes NCOP / NCED / OP / ED / PV / CM / IV that is not glued to a preceding
# or following ASCII letter or digit.  The code may carry up to four digits
# (optionally separated by whitespace and one of "_", "-", "."), followed by
# an optional second number that may be prefixed with "E", "EP" or "#".
SPECIAL_FOCUS_RE = re.compile(
    r"(?<![A-Za-z0-9])(?:NCOP|NCED|OP|ED|PV|CM|IV)\s*[_\-.]?\s*\d{0,4}"
    r"(?:[_\-.]?\s*(?:EP?|#)?\d{1,4})?(?![A-Za-z0-9])",
    flags=re.IGNORECASE,
)
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the focus-dataset builder.

    Returns:
        The namespace produced by ``argparse`` with the attributes ``input``,
        ``output``, ``context_samples``, ``repeat_repaired``, ``repeat_focus``,
        ``max_focus_rows``, ``repeat_manual`` and ``seed``.
    """
    cli = argparse.ArgumentParser(description="Build repair-focused char JSONL fine-tune data")

    # Mandatory path options.
    required_paths = (
        ("--input", "Repaired char JSONL dataset"),
        ("--output", "Output focus JSONL"),
    )
    for flag, help_text in required_paths:
        cli.add_argument(flag, required=True, help=help_text)

    # Integer tuning knobs: (flag, default, help).
    int_options = (
        ("--context-samples", 50000, "Random non-repaired rows to include for stability"),
        ("--repeat-repaired", 4, "Repeat rows that still trigger a repair pass"),
        ("--repeat-focus", 3, "Repeat rows matching special-code focus patterns"),
        ("--max-focus-rows", 80000, "Maximum dataset rows matching special-code focus patterns"),
        ("--repeat-manual", 24, "Repeat hand-labeled hard cases"),
    )
    for flag, default, help_text in int_options:
        cli.add_argument(flag, type=int, default=default, help=help_text)

    cli.add_argument("--seed", type=int, default=42)
    return cli.parse_args()
def char_item(filename: str, spans: List[tuple[str, str]]) -> dict:
    """Build a character-level BIO-labeled row for ``filename``.

    Every character of ``filename`` becomes one token.  Each ``(text, entity)``
    pair in ``spans`` is located in ``filename`` and its characters are tagged
    ``B-<entity>`` (first character) and ``I-<entity>`` (remaining characters);
    all other characters keep the ``O`` tag.

    Spans are searched starting right after the previous span, so a substring
    that occurs several times resolves to successive occurrences.  A span that
    is not found after the previous one is searched again from the start of
    the filename.

    Args:
        filename: The filename to tokenize character by character.
        spans: ``(text, entity)`` pairs to label.

    Returns:
        A dict with the keys ``filename``, ``tokens``, ``labels``,
        ``tokenizer_variant`` (always ``"char"``) and ``source`` (always
        ``"manual_repair_focus"``).

    Raises:
        ValueError: If a span text is empty, cannot be found in ``filename``,
            or would overwrite characters already labeled by another span.
    """
    tokens = list(filename)
    labels = ["O"] * len(tokens)
    cursor = 0
    for text, entity in spans:
        # str.find("") "matches" at the cursor, which would tag an unrelated
        # character (or index past the end), so reject empty spans up front.
        if not text:
            raise ValueError(f"Empty span text for entity {entity!r} in {filename!r}")
        start = filename.find(text, cursor)
        if start < 0:
            # Not found after the previous span: retry from the beginning.
            start = filename.find(text)
        if start < 0:
            raise ValueError(f"Could not find span {text!r} in {filename!r}")
        end = start + len(text)
        # The retry above can land on characters an earlier span already
        # claimed; overwriting them would leave a broken B-/I- sequence.
        if any(tag != "O" for tag in labels[start:end]):
            raise ValueError(
                f"Span {text!r} overlaps an already labeled span in {filename!r}"
            )
        labels[start] = f"B-{entity}"
        labels[start + 1:end] = [f"I-{entity}"] * (end - start - 1)
        cursor = end
    return {
        "filename": filename,
        "tokens": tokens,
        "labels": labels,
        "tokenizer_variant": "char",
        "source": "manual_repair_focus",
    }
def manual_cases() -> Iterable[dict]:
    """Yield the hand-labeled hard cases as character-level rows.

    Each entry of the table below pairs a filename with the ordered
    ``(text, entity)`` spans to label; every entry is converted with
    ``char_item`` and yielded in table order.
    """
    hand_labeled = [
        (
            "One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264",
            [
                ("One.Piece", "TITLE"),
                ("1110", "EPISODE"),
                ("1080p", "RESOLUTION"),
                ("WEB-DL", "SOURCE"),
            ],
        ),
        (
            "One.Piece.1111.1080p.WEB-DL.AAC2.0.H.264",
            [
                ("One.Piece", "TITLE"),
                ("1111", "EPISODE"),
                ("1080p", "RESOLUTION"),
                ("WEB-DL", "SOURCE"),
            ],
        ),
        (
            "【喵萌奶茶屋】★04月新番★[葬送的芙莉莲][01][1080P][HEVC]",
            [
                ("喵萌奶茶屋", "GROUP"),
                ("葬送的芙莉莲", "TITLE"),
                ("01", "EPISODE"),
                ("1080P", "RESOLUTION"),
                ("HEVC", "SOURCE"),
            ],
        ),
        (
            "【喵萌奶茶屋】★10月新番★[药屋少女的呢喃][02][1080P][HEVC]",
            [
                ("喵萌奶茶屋", "GROUP"),
                ("药屋少女的呢喃", "TITLE"),
                ("02", "EPISODE"),
                ("1080P", "RESOLUTION"),
                ("HEVC", "SOURCE"),
            ],
        ),
        (
            "[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [07][1080P][CHT&JPN][檢索:魔法姊妹露露特莉莉].mp4",
            [
                ("Billion Meta Lab", "GROUP"),
                ("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE"),
                ("07", "EPISODE"),
                ("1080P", "RESOLUTION"),
                ("CHT&JPN", "SOURCE"),
                ("檢索:魔法姊妹露露特莉莉", "SPECIAL"),
            ],
        ),
        (
            "[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [08][1080P][CHT&JPN][检索:魔法姊妹露露特莉莉].mp4",
            [
                ("Billion Meta Lab", "GROUP"),
                ("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE"),
                ("08", "EPISODE"),
                ("1080P", "RESOLUTION"),
                ("CHT&JPN", "SOURCE"),
                ("检索:魔法姊妹露露特莉莉", "SPECIAL"),
            ],
        ),
        (
            "[LoliHouse] Kakuriyo no Yadomeshi Ni - 12 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
            [
                ("LoliHouse", "GROUP"),
                ("Kakuriyo no Yadomeshi", "TITLE"),
                ("Ni", "SEASON"),
                ("12", "EPISODE"),
                ("WebRip", "SOURCE"),
                ("1080p", "RESOLUTION"),
                ("HEVC", "SOURCE"),
                ("AAC", "SOURCE"),
                ("SRTx2", "SOURCE"),
            ],
        ),
        (
            "[LoliHouse] Kakuriyo no Yadomeshi Ni - 13 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
            [
                ("LoliHouse", "GROUP"),
                ("Kakuriyo no Yadomeshi", "TITLE"),
                ("Ni", "SEASON"),
                ("13", "EPISODE"),
                ("WebRip", "SOURCE"),
                ("1080p", "RESOLUTION"),
                ("HEVC", "SOURCE"),
                ("AAC", "SOURCE"),
                ("SRTx2", "SOURCE"),
            ],
        ),
        (
            "[AI-Raws] 炎炎の消防隊 弐ノ章 #13 (BD HEVC 1920x1080 yuv444p10le FLAC)[FC74A2D5].mkv",
            [
                ("AI-Raws", "GROUP"),
                ("炎炎の消防隊", "TITLE"),
                ("弐ノ章", "SEASON"),
                ("13", "EPISODE"),
                ("BD", "SOURCE"),
                ("HEVC", "SOURCE"),
                ("1920x1080", "RESOLUTION"),
                ("FLAC", "SOURCE"),
            ],
        ),
        (
            "[AI-Raws] 炎炎の消防隊 弐ノ章 #01 (BD HEVC 1920x1080 FLAC).mkv",
            [
                ("AI-Raws", "GROUP"),
                ("炎炎の消防隊", "TITLE"),
                ("弐ノ章", "SEASON"),
                ("01", "EPISODE"),
                ("BD", "SOURCE"),
                ("HEVC", "SOURCE"),
                ("1920x1080", "RESOLUTION"),
                ("FLAC", "SOURCE"),
            ],
        ),
        (
            "[DBD-Raws][炎炎消防队 貳之章][01][1080P][BDRip][HEVC-10bit][FLAC]",
            [
                ("DBD-Raws", "GROUP"),
                ("炎炎消防队", "TITLE"),
                ("貳之章", "SEASON"),
                ("01", "EPISODE"),
                ("1080P", "RESOLUTION"),
                ("BDRip", "SOURCE"),
                ("FLAC", "SOURCE"),
            ],
        ),
        (
            "[GM-Team][国漫][逆天邪神 第2季][Against the Gods Ⅱ][2026][04][HEVC][GB][4K].mp4",
            [
                ("GM-Team", "GROUP"),
                ("逆天邪神", "TITLE"),
                ("第2季", "SEASON"),
                ("04", "EPISODE"),
                ("HEVC", "SOURCE"),
                ("GB", "SOURCE"),
                ("4K", "RESOLUTION"),
            ],
        ),
        (
            "[GM-Team][国漫][剑来 第2季][Sword of Coming Ⅱ][2025][04][HEVC][GB][4K]",
            [
                ("GM-Team", "GROUP"),
                ("剑来", "TITLE"),
                ("第2季", "SEASON"),
                ("04", "EPISODE"),
                ("HEVC", "SOURCE"),
                ("GB", "SOURCE"),
                ("4K", "RESOLUTION"),
            ],
        ),
        (
            "[GM-Team][国漫][大主宰 第2季][The Great Ruler Ⅱ][2026][04][HEVC][GB][4K]",
            [
                ("GM-Team", "GROUP"),
                ("大主宰", "TITLE"),
                ("第2季", "SEASON"),
                ("04", "EPISODE"),
                ("HEVC", "SOURCE"),
                ("GB", "SOURCE"),
                ("4K", "RESOLUTION"),
            ],
        ),
        (
            "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
            [
                ("YYDM&VCB-Studio", "GROUP"),
                ("Shinsekai Yori", "TITLE"),
                ("IV05", "SPECIAL"),
                ("1080p", "RESOLUTION"),
                ("x265_aac", "SOURCE"),
            ],
        ),
        (
            "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
            [
                ("YYDM&VCB-Studio", "GROUP"),
                ("Shinsekai Yori", "TITLE"),
                ("NCED02", "SPECIAL"),
                ("1080p", "RESOLUTION"),
                ("x265_flac", "SOURCE"),
            ],
        ),
        (
            "InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
            [
                ("InuYasha", "TITLE"),
                ("NCED02", "SPECIAL"),
                ("BDrip", "SOURCE"),
                ("AV1", "SOURCE"),
                ("DTS", "SOURCE"),
                ("1080p", "RESOLUTION"),
            ],
        ),
        (
            "[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
            [
                ("VCB-Studio", "GROUP"),
                ("Yamada-kun to 7-nin no Majo", "TITLE"),
                ("NCED", "SPECIAL"),
                ("1080p", "RESOLUTION"),
                ("x265_flac", "SOURCE"),
            ],
        ),
    ]
    for name, labeled_spans in hand_labeled:
        yield char_item(name, labeled_spans)
def _reservoir_offer(
    sample: List[dict],
    item: dict,
    capacity: int,
    seen: int,
    rng: random.Random,
) -> bool:
    """Offer ``item`` to a fixed-size uniform random sample (reservoir sampling).

    ``seen`` is the 1-based count of candidates offered to this particular
    sample so far, including ``item``.  While the sample has free slots the
    item is appended; afterwards it replaces a random slot with probability
    ``capacity / seen``.

    Returns:
        True if the item was appended to a free slot, False otherwise.
    """
    if len(sample) < capacity:
        sample.append(item)
        return True
    slot = rng.randrange(seen)
    if slot < capacity:
        sample[slot] = item
    return False


def main() -> None:
    """Build the repair-focused fine-tuning JSONL file.

    Reads the input JSONL line by line and sorts each non-blank row into one
    of three buckets:

    * rows for which ``repair_jsonl_item`` reports repairs (all are kept);
    * rows whose filename matches ``SPECIAL_FOCUS_RE`` (random sample capped
      at ``--max-focus-rows``);
    * remaining rows whose filename was not recorded for an earlier repaired
      or focus row (random sample capped at ``--context-samples``).

    Repaired, focus and hand-labeled rows are repeated according to the
    ``--repeat-*`` options, everything is shuffled with the seeded RNG and
    written to the output file, and a JSON summary is printed to stdout.
    """
    args = parse_args()
    rng = random.Random(args.seed)
    input_path = Path(args.input)
    output_path = Path(args.output)

    repaired_rows: List[dict] = []
    focus_rows: List[dict] = []
    reservoir: List[dict] = []
    seen_filenames = set()
    total_rows = 0
    # Per-sample candidate counters.  The replacement index must be drawn
    # from the number of candidates offered to *that* sample; drawing it from
    # the global row count makes replacements too rare and skews both samples
    # toward the beginning of the input.
    focus_seen = 0
    context_seen = 0

    with input_path.open("r", encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                continue
            total_rows += 1
            item = json.loads(line)
            # Only the repair list is used; the row is kept as read.
            # NOTE(review): confirm repair_jsonl_item does not mutate `item`.
            _repaired_item, repairs = repair_jsonl_item(item)
            filename = item.get("filename")

            if repairs:
                repaired_rows.append(item)
                if filename:
                    seen_filenames.add(filename)
                continue

            if filename and SPECIAL_FOCUS_RE.search(filename):
                focus_seen += 1
                if _reservoir_offer(focus_rows, item, args.max_focus_rows, focus_seen, rng):
                    seen_filenames.add(filename)
                continue

            if filename in seen_filenames:
                continue
            context_seen += 1
            _reservoir_offer(reservoir, item, args.context_samples, context_seen, rng)

    manual_rows = list(manual_cases())

    # Repeat counts below 1 are clamped so every selected row appears at
    # least once.
    rows: List[dict] = []
    for item in repaired_rows:
        rows.extend([item] * max(1, args.repeat_repaired))
    for item in focus_rows:
        rows.extend([item] * max(1, args.repeat_focus))
    rows.extend(reservoir)
    for item in manual_rows:
        rows.extend([item] * max(1, args.repeat_manual))
    rng.shuffle(rows)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as handle:
        for item in rows:
            handle.write(json.dumps(item, ensure_ascii=False, separators=(",", ":")) + "\n")

    print(json.dumps({
        "input": str(input_path),
        "output": str(output_path),
        "total_rows": total_rows,
        "repaired_rows": len(repaired_rows),
        "focus_rows": len(focus_rows),
        "context_rows": len(reservoir),
        "manual_rows": len(manual_rows),
        "written_rows": len(rows),
    }, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()