Clean special code parsing

Browse files

Files changed (14) hide show

build_repair_focus_dataset.py +65 -0
case_metrics.json +100 -14
data/parser_regression_cases.json +47 -0
datasets/AnimeName +1 -1
dmhy_dataset.py +273 -6
exports/anime_filename_parser.metadata.json +1 -1
exports/anime_filename_parser.onnx +2 -2
inference.py +156 -17
label_repairs.py +5 -1
model.safetensors +1 -1
parse_eval_metrics.json +345 -326
run_metadata.json +12 -12
trainer_eval_metrics.json +9 -9
training_args.bin +1 -1

build_repair_focus_dataset.py CHANGED Viewed

@@ -5,11 +5,18 @@ from __future__ import annotations
 import argparse
 import json
 import random
 from pathlib import Path
 from typing import Iterable, List
 from label_repairs import repair_jsonl_item
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Build repair-focused char JSONL fine-tune data")
@@ -19,6 +26,10 @@ def parse_args() -> argparse.Namespace:
                         help="Random non-repaired rows to include for stability")
     parser.add_argument("--repeat-repaired", type=int, default=4,
                         help="Repeat rows that still trigger a repair pass")
     parser.add_argument("--repeat-manual", type=int, default=24,
                         help="Repeat hand-labeled hard cases")
     parser.add_argument("--seed", type=int, default=42)
@@ -124,6 +135,47 @@ def manual_cases() -> Iterable[dict]:
             ("4K", "RESOLUTION"),
         ],
     )
 def main() -> None:
@@ -133,6 +185,7 @@ def main() -> None:
     output_path = Path(args.output)
     repaired_rows: List[dict] = []
     reservoir: List[dict] = []
     seen_filenames = set()
     total_rows = 0
@@ -150,6 +203,15 @@ def main() -> None:
                 if filename:
                     seen_filenames.add(filename)
                 continue
             if filename in seen_filenames:
                 continue
             if len(reservoir) < args.context_samples:
@@ -162,6 +224,8 @@ def main() -> None:
     rows: List[dict] = []
     for item in repaired_rows:
         rows.extend([item] * max(1, args.repeat_repaired))
     rows.extend(reservoir)
     for item in manual_cases():
         rows.extend([item] * max(1, args.repeat_manual))
@@ -177,6 +241,7 @@ def main() -> None:
         "output": str(output_path),
         "total_rows": total_rows,
         "repaired_rows": len(repaired_rows),
         "context_rows": len(reservoir),
         "manual_rows": len(list(manual_cases())),
         "written_rows": len(rows),

 import argparse
 import json
 import random
+import re
 from pathlib import Path
 from typing import Iterable, List
 from label_repairs import repair_jsonl_item
+# Locates special-code markers (NCOP/NCED/OP/ED/PV/CM/IV, optionally followed by
+# an index and then a second number) anywhere in a filename, as long as the match
+# is not glued to neighbouring ASCII letters or digits.
+# NOTE(review): the index is optional (\d{0,4}) and matching is case-insensitive,
+# so standalone words such as "ed" or a Roman numeral "IV" also match -- confirm
+# that breadth is intended for focus sampling.
+SPECIAL_FOCUS_RE = re.compile(
+    r"(?<![A-Za-z0-9])(?:NCOP|NCED|OP|ED|PV|CM|IV)\s*[_\-.]?\s*\d{0,4}"
+    r"(?:[_\-.]?\s*(?:EP?|#)?\d{1,4})?(?![A-Za-z0-9])",
+    re.I,
+)
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Build repair-focused char JSONL fine-tune data")
                         help="Random non-repaired rows to include for stability")
     parser.add_argument("--repeat-repaired", type=int, default=4,
                         help="Repeat rows that still trigger a repair pass")
+    parser.add_argument("--repeat-focus", type=int, default=3,
+                        help="Repeat rows matching special-code focus patterns")
+    parser.add_argument("--max-focus-rows", type=int, default=80000,
+                        help="Maximum dataset rows matching special-code focus patterns")
     parser.add_argument("--repeat-manual", type=int, default=24,
                         help="Repeat hand-labeled hard cases")
     parser.add_argument("--seed", type=int, default=42)
             ("4K", "RESOLUTION"),
         ],
     )
+    yield char_item(
+        "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
+        [
+            ("YYDM&VCB-Studio", "GROUP"),
+            ("Shinsekai Yori", "TITLE"),
+            ("IV05", "SPECIAL"),
+            ("1080p", "RESOLUTION"),
+            ("x265_aac", "SOURCE"),
+        ],
+    )
+    yield char_item(
+        "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
+        [
+            ("YYDM&VCB-Studio", "GROUP"),
+            ("Shinsekai Yori", "TITLE"),
+            ("NCED02", "SPECIAL"),
+            ("1080p", "RESOLUTION"),
+            ("x265_flac", "SOURCE"),
+        ],
+    )
+    yield char_item(
+        "InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
+        [
+            ("InuYasha", "TITLE"),
+            ("NCED02", "SPECIAL"),
+            ("BDrip", "SOURCE"),
+            ("AV1", "SOURCE"),
+            ("DTS", "SOURCE"),
+            ("1080p", "RESOLUTION"),
+        ],
+    )
+    yield char_item(
+        "[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
+        [
+            ("VCB-Studio", "GROUP"),
+            ("Yamada-kun to 7-nin no Majo", "TITLE"),
+            ("NCED", "SPECIAL"),
+            ("1080p", "RESOLUTION"),
+            ("x265_flac", "SOURCE"),
+        ],
+    )
 def main() -> None:
     output_path = Path(args.output)
     repaired_rows: List[dict] = []
+    focus_rows: List[dict] = []
     reservoir: List[dict] = []
     seen_filenames = set()
     total_rows = 0
                 if filename:
                     seen_filenames.add(filename)
                 continue
+            if filename and SPECIAL_FOCUS_RE.search(filename):
+                if len(focus_rows) < args.max_focus_rows:
+                    focus_rows.append(item)
+                    seen_filenames.add(filename)
+                else:
+                    index = rng.randrange(total_rows)
+                    if index < args.max_focus_rows:
+                        focus_rows[index] = item
+                continue
             if filename in seen_filenames:
                 continue
             if len(reservoir) < args.context_samples:
     rows: List[dict] = []
     for item in repaired_rows:
         rows.extend([item] * max(1, args.repeat_repaired))
+    for item in focus_rows:
+        rows.extend([item] * max(1, args.repeat_focus))
     rows.extend(reservoir)
     for item in manual_cases():
         rows.extend([item] * max(1, args.repeat_manual))
         "output": str(output_path),
         "total_rows": total_rows,
         "repaired_rows": len(repaired_rows),
+        "focus_rows": len(focus_rows),
         "context_rows": len(reservoir),
         "manual_rows": len(list(manual_cases())),
         "written_rows": len(rows),

case_metrics.json CHANGED Viewed

@@ -5,26 +5,26 @@
   "max_length": 128,
   "use_rules": true,
   "constrain_bio": true,
-  "case_count": 22,
-  "full_correct": 22,
   "full_accuracy": 1.0,
   "field_correct": {
-    "group": 19,
-    "title": 22,
-    "episode": 22,
-    "resolution": 22,
-    "source": 15,
     "season": 9,
-    "special": 1
   },
   "field_total": {
-    "group": 19,
-    "title": 22,
-    "episode": 22,
-    "resolution": 22,
-    "source": 15,
     "season": 9,
-    "special": 1
   },
   "field_accuracy": {
     "episode": 1.0,
@@ -476,6 +476,92 @@
         "source": "GB",
         "title": "逆天邪神"
       }
     }
   ]
 }

   "max_length": 128,
   "use_rules": true,
   "constrain_bio": true,
+  "case_count": 26,
+  "full_correct": 26,
   "full_accuracy": 1.0,
   "field_correct": {
+    "group": 22,
+    "title": 26,
+    "episode": 26,
+    "resolution": 26,
+    "source": 19,
     "season": 9,
+    "special": 5
   },
   "field_total": {
+    "group": 22,
+    "title": 26,
+    "episode": 26,
+    "resolution": 26,
+    "source": 19,
     "season": 9,
+    "special": 5
   },
   "field_accuracy": {
     "episode": 1.0,
         "source": "GB",
         "title": "逆天邪神"
       }
+    },
+    {
+      "id": "vcb_special_iv_not_episode",
+      "filename": "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
+      "ok": true,
+      "errors": {},
+      "expected": {
+        "group": "YYDM&VCB-Studio",
+        "title": "Shinsekai Yori",
+        "episode": null,
+        "resolution": "1080p",
+        "source": "x265_aac",
+        "special": "IV05"
+      },
+      "pred": {
+        "episode": null,
+        "group": "YYDM&VCB-Studio",
+        "resolution": "1080p",
+        "source": "x265_aac",
+        "special": "IV05",
+        "title": "Shinsekai Yori"
+      }
+    },
+    {
+      "id": "vcb_nced_not_episode",
+      "filename": "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
+      "ok": true,
+      "errors": {},
+      "expected": {
+        "group": "YYDM&VCB-Studio",
+        "title": "Shinsekai Yori",
+        "episode": null,
+        "resolution": "1080p",
+        "source": "x265_flac",
+        "special": "NCED02"
+      },
+      "pred": {
+        "episode": null,
+        "group": "YYDM&VCB-Studio",
+        "resolution": "1080p",
+        "source": "x265_flac",
+        "special": "NCED02",
+        "title": "Shinsekai Yori"
+      }
+    },
+    {
+      "id": "dot_nced_suffix_not_episode",
+      "filename": "InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
+      "ok": true,
+      "errors": {},
+      "expected": {
+        "title": "InuYasha",
+        "episode": null,
+        "resolution": "1080p",
+        "source": "BDrip",
+        "special": "NCED02"
+      },
+      "pred": {
+        "episode": null,
+        "resolution": "1080p",
+        "source": "BDrip",
+        "special": "NCED02",
+        "title": "InuYasha"
+      }
+    },
+    {
+      "id": "vcb_numeric_title_nced",
+      "filename": "[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
+      "ok": true,
+      "errors": {},
+      "expected": {
+        "group": "VCB-Studio",
+        "title": "Yamada-kun to 7-nin no Majo",
+        "episode": null,
+        "resolution": "1080p",
+        "source": "x265_flac",
+        "special": "NCED"
+      },
+      "pred": {
+        "episode": null,
+        "group": "VCB-Studio",
+        "resolution": "1080p",
+        "source": "x265_flac",
+        "special": "NCED",
+        "title": "Yamada-kun to 7-nin no Majo"
+      }
     }
   ]
 }

data/parser_regression_cases.json CHANGED Viewed

@@ -240,5 +240,52 @@
       "resolution": "4K",
       "source": "GB"
     }
   }
 ]

       "resolution": "4K",
       "source": "GB"
     }
+  },
+  {
+    "id": "vcb_special_iv_not_episode",
+    "filename": "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
+    "expected": {
+      "group": "YYDM&VCB-Studio",
+      "title": "Shinsekai Yori",
+      "episode": null,
+      "resolution": "1080p",
+      "source": "x265_aac",
+      "special": "IV05"
+    }
+  },
+  {
+    "id": "vcb_nced_not_episode",
+    "filename": "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
+    "expected": {
+      "group": "YYDM&VCB-Studio",
+      "title": "Shinsekai Yori",
+      "episode": null,
+      "resolution": "1080p",
+      "source": "x265_flac",
+      "special": "NCED02"
+    }
+  },
+  {
+    "id": "dot_nced_suffix_not_episode",
+    "filename": "InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
+    "expected": {
+      "title": "InuYasha",
+      "episode": null,
+      "resolution": "1080p",
+      "source": "BDrip",
+      "special": "NCED02"
+    }
+  },
+  {
+    "id": "vcb_numeric_title_nced",
+    "filename": "[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
+    "expected": {
+      "group": "VCB-Studio",
+      "title": "Yamada-kun to 7-nin no Majo",
+      "episode": null,
+      "resolution": "1080p",
+      "source": "x265_flac",
+      "special": "NCED"
+    }
   }
 ]

datasets/AnimeName CHANGED Viewed

	@@ -1 +1 @@
1	- Subproject commit ~~004a8c08628b6820fb2d1b59a80fdcfe925ef095~~


1	+ Subproject commit c40cb38963a390a61c6d375409031f8a6c5eb927

dmhy_dataset.py CHANGED Viewed

@@ -33,6 +33,7 @@ NOISE_BRACKETS = {
     "mp4", "mkv", "avi", "webm", "mov", "wmv", "flv", "rmvb", "ts", "m2ts",
     "raw", "raws", "rip", "10bit", "8bit", "hi10p", "ma10p", "ass", "assx2",
     "tc", "sc", "gb", "big5", "cht", "chs", "jpn", "jp", "jap", "eng",
     "繁中", "简中", "繁日", "简日", "日语", "日文", "外挂", "内封", "字幕",
 }
 CATEGORY_BRACKETS = {
@@ -40,7 +41,18 @@ CATEGORY_BRACKETS = {
     "国创", "國創", "中国动漫", "中國動漫", "中国动画", "中國動畫",
 }
-SPECIAL_RE = re.compile(r"^(?:ova\d*|oad\d*|sp\d*|movie|the\s*movie|op|ed|pv|cm|ncop|nced|剧场版|劇場版|特别篇|特別篇)$", re.I)
 SPECIAL_SEARCH_RE = re.compile(r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[:：].+", re.I)
 EPISODE_RE = re.compile(r"^(?:[Ee][Pp]?|#)?(\d{1,4})(?:v\d+|END)?$", re.I)
 SEASON_RE = re.compile(
@@ -72,9 +84,16 @@ SOURCE_RE = re.compile(
     r"^(?:WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|"
     r"Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|"
     r"x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|"
     r"CHS|CHT|BIG5|GB|JPN?|JPSC|JPTC|简[体體]?|繁[体體]?|简日双语|繁日双语|内封|外挂|MSubs?)$",
     re.I,
 )
 GROUP_HINT_RE = re.compile(
     r"(?:字幕|字幕组|字幕組|sub|subs|raws?|fansub|studio|house|team|project|"
     r"loli|ani|baha|vcb|airota|kiss|dmhy|mabors|lilith|ohys|erai|subsplease)",
@@ -148,6 +167,8 @@ def is_explicit_season(token: str) -> bool:
 def episode_number(token: str) -> Optional[int]:
     clean = clean_bracket(token)
     if season_number(clean) is not None:
         return None
     if DIMENSION_RE.match(clean) or DATE_RE.match(clean) or HASH_RE.match(clean):
@@ -197,7 +218,144 @@ def is_source(token: str) -> bool:
 def is_special(token: str) -> bool:
     clean = clean_bracket(token)
-    return bool(SPECIAL_RE.match(clean) or SPECIAL_SEARCH_RE.match(clean))
 def is_category_bracket(token: str) -> bool:
@@ -269,9 +427,13 @@ def trim_title_span(tokens: Sequence[str], start: int, end: int) -> tuple[int, i
 def find_episode_index(tokens: Sequence[str]) -> Optional[int]:
     candidates: list[tuple[int, int]] = []
     for idx, token in enumerate(tokens):
         number = episode_number(token)
         if number is None:
             continue
         clean = clean_bracket(token)
         if idx > 0 and tokens[idx - 1] == "." and re.fullmatch(r"\d+", clean):
             previous_clean = clean_bracket(tokens[idx - 2]) if idx >= 2 else ""
@@ -282,7 +444,8 @@ def find_episode_index(tokens: Sequence[str]) -> Optional[int]:
             score += 4
         if token.startswith("[") or token.startswith("(") or token.startswith("【"):
             score += 3
-        if idx > 0 and tokens[idx - 1] in {"-", "_", "|"}:
             score += 2
         if idx >= len(tokens) // 2:
             score += 1
@@ -325,6 +488,54 @@ def is_context_season_token(tokens: Sequence[str], idx: int, episode_idx: int) -
     return True
 def label_context_season_tokens(
     tokens: Sequence[str],
     categories: List[str],
@@ -347,6 +558,27 @@ def label_context_season_tokens(
             categories[idx] = "season"
 def repair_structured_bracket_title_aliases(
     tokens: Sequence[str],
     categories: List[str],
@@ -385,6 +617,15 @@ def repair_structured_bracket_title_aliases(
 def embedded_bracket_episode(token: str) -> Optional[tuple[str, str, str]]:
     """Split malformed tokens such as '[Group}Title[658]' into title + episode."""
     if episode_number(token) is not None:
         return None
     match = re.match(r"^(?P<prefix>.+?)\[(?P<episode>\d{1,4}(?:v\d+)?)(?P<close>\])?$", token, re.I)
@@ -397,6 +638,8 @@ def embedded_bracket_episode(token: str) -> Optional[tuple[str, str, str]]:
     close = match.group("close") or ""
     if not clean_bracket(prefix):
         return None
     number = int(re.search(r"\d+", episode).group())
     if number == 0 or number > 2000:
         return None
@@ -426,6 +669,7 @@ def finalize_weak_sample(
     categories: Sequence[str],
     tokenizer: AnimeTokenizer,
     require_episode: bool = True,
 ) -> Optional[dict]:
     expanded_tokens, expanded_categories = expand_tokens_and_categories(tokens, categories, tokenizer)
@@ -446,7 +690,7 @@ def finalize_weak_sample(
     labels = assign_iob2(expanded_categories)
     if len(expanded_tokens) != len(labels):
         return None
-    if not any(label.endswith("TITLE") for label in labels):
         return None
     if require_episode and not any(label.endswith("EPISODE") for label in labels):
         return None
@@ -621,17 +865,29 @@ def fallback_no_episode_sample(tokens: Sequence[str], tokenizer: AnimeTokenizer)
             categories.append("source")
             title_allowed = False
             continue
-        if is_special(token):
             categories.append("special")
             title_allowed = False
             continue
         if is_noise_bracket(token):
             categories.append("sep")
             continue
         categories.append("title")
         seen_title = True
-    return finalize_weak_sample(tokens, categories, tokenizer, require_episode=False)
 def bracket_delimiters(token: str) -> tuple[str, str]:
@@ -706,6 +962,13 @@ def expand_tokens_and_categories(
                 expanded_tokens.extend([match.group(1), match.group(2)])
                 expanded_categories.extend(["season", "episode"])
                 continue
         if category in {"group", "title"} and (
             token.startswith("[") or token.startswith("(") or token.startswith("【") or token.startswith("《")
         ):
@@ -757,6 +1020,8 @@ def weak_label_filename(filename: str, tokenizer: AnimeTokenizer) -> Optional[di
             categories[idx] = "resolution"
         elif is_source(token):
             categories[idx] = "source"
         elif is_special(token):
             categories[idx] = "special"
         elif is_explicit_season(token):
@@ -766,8 +1031,10 @@ def weak_label_filename(filename: str, tokenizer: AnimeTokenizer) -> Optional[di
     episode_idx = find_episode_index(tokens)
     if episode_idx is None:
         return fallback_embedded_episode_sample(tokens, tokenizer) or fallback_no_episode_sample(tokens, tokenizer)
     categories[episode_idx] = "episode"
     label_context_season_tokens(tokens, categories, episode_idx)
     repair_structured_bracket_title_aliases(tokens, categories, episode_idx)

     "mp4", "mkv", "avi", "webm", "mov", "wmv", "flv", "rmvb", "ts", "m2ts",
     "raw", "raws", "rip", "10bit", "8bit", "hi10p", "ma10p", "ass", "assx2",
     "tc", "sc", "gb", "big5", "cht", "chs", "jpn", "jp", "jap", "eng",
+    "sdr", "hdr", "hdr10", "uhd", "remux", "tvb", "srt", "srtx2",
     "繁中", "简中", "繁日", "简日", "日语", "日文", "外挂", "内封", "字幕",
 }
 CATEGORY_BRACKETS = {
     "国创", "國創", "中国动漫", "中國動漫", "中国动画", "中國動畫",
 }
+# Whole-token match for special markers: OVA/OAD/SP/OP/ED/PV/CM/NCOP/NCED with an
+# optional numeric suffix, IV with a mandatory one, plus movie / theatrical /
+# special-episode keywords.
+SPECIAL_RE = re.compile(
+    r"^(?:ova\d*|oad\d*|sp\d*|movie|the\s*movie|op\d*|ed\d*|pv\d*|cm\d*|"
+    r"ncop\d*|nced\d*|iv\d+|剧场版|劇場版|特别篇|特別篇)$",
+    re.I,
+)
+# A bare indexable code with no digits attached (e.g. "NCED", "IV").
+SPECIAL_INDEX_BASE_RE = re.compile(r"^(?:NCOP|NCED|OP|ED|PV|CM|IV)$", re.I)
+# An indexable code with its digits attached; digits are optional except for IV,
+# which needs at least one.
+SPECIAL_INDEX_RE = re.compile(r"^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$", re.I)
+# An indexable code optionally followed by separator(s) and an episode-like number;
+# exposes the named groups "special", "sep" and "episode".
+SPECIAL_COMPOSITE_RE = re.compile(
+    r"^(?P<special>(?:(?:NCOP|NCED|OP|ED|PV|CM)\d*|IV\d+))"
+    r"(?:(?P<sep>[\s._-]+)(?P<episode>(?:EP?|#)?\d{1,4}))?$",
+    re.I,
+)
 SPECIAL_SEARCH_RE = re.compile(r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[:：].+", re.I)
 EPISODE_RE = re.compile(r"^(?:[Ee][Pp]?|#)?(\d{1,4})(?:v\d+|END)?$", re.I)
 SEASON_RE = re.compile(
     r"^(?:WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|"
     r"Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|"
     r"x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|"
+    r"SDR|HDR10?|UHD|REMUX|10bit|8bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|"
     r"CHS|CHT|BIG5|GB|JPN?|JPSC|JPTC|简[体體]?|繁[体體]?|简日双语|繁日双语|内封|外挂|MSubs?)$",
     re.I,
 )
+# Unanchored pattern for rip-source, video/audio codec and bit-depth keywords;
+# being unanchored, it can find such a keyword anywhere inside a larger token.
+MEDIA_META_RE = re.compile(
+    r"(?:WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|"
+    r"x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|FLAC|MP3|DTS|Opus|"
+    r"10bit|8bit|Hi10p|Ma10p|YUV\d+P?\d*)",
+    re.I,
+)
 GROUP_HINT_RE = re.compile(
     r"(?:字幕|字幕组|字幕組|sub|subs|raws?|fansub|studio|house|team|project|"
     r"loli|ani|baha|vcb|airota|kiss|dmhy|mabors|lilith|ohys|erai|subsplease)",
 def episode_number(token: str) -> Optional[int]:
     clean = clean_bracket(token)
+    if SPECIAL_INDEX_RE.match(clean):
+        return None
     if season_number(clean) is not None:
         return None
     if DIMENSION_RE.match(clean) or DATE_RE.match(clean) or HASH_RE.match(clean):
 def is_special(token: str) -> bool:
     clean = clean_bracket(token)
+    return bool(
+        SPECIAL_RE.match(clean)
+        or SPECIAL_SEARCH_RE.match(clean)
+        or SPECIAL_COMPOSITE_RE.fullmatch(clean)
+    )
+def is_special_index_base(token: str) -> bool:
+    """Return True when the bracket-stripped token is a bare indexable special code."""
+    cleaned = clean_bracket(token)
+    return SPECIAL_INDEX_BASE_RE.match(cleaned) is not None
+def previous_significant_index(tokens: Sequence[str], idx: int) -> Optional[int]:
+    """Return the index of the nearest non-separator token before ``idx``, or None."""
+    for position in range(idx - 1, -1, -1):
+        if not is_separator_token(tokens[position]):
+            return position
+    return None
+def next_significant_index(tokens: Sequence[str], idx: int) -> Optional[int]:
+    """Return the index of the nearest non-separator token after ``idx``, or None."""
+    for position in range(idx + 1, len(tokens)):
+        if not is_separator_token(tokens[position]):
+            return position
+    return None
+def previous_non_space_index(tokens: Sequence[str], idx: int) -> Optional[int]:
+    """Return the index of the nearest token before ``idx`` that is not pure whitespace."""
+    for position in range(idx - 1, -1, -1):
+        if tokens[position].strip():
+            return position
+    return None
+def is_special_index_continuation(tokens: Sequence[str], idx: int) -> bool:
+    """Return True when ``tokens[idx]`` is a 1-4 digit number following a bare special code.
+
+    Separator tokens between the code and the number are skipped.
+    """
+    digits = clean_bracket(tokens[idx])
+    if re.fullmatch(r"\d{1,4}", digits) is None:
+        return False
+    anchor = previous_significant_index(tokens, idx)
+    if anchor is None:
+        return False
+    return is_special_index_base(tokens[anchor])
+def has_special_index_continuation_after(tokens: Sequence[str], idx: int) -> bool:
+    """Return True when the next non-separator token is a special-index continuation."""
+    follower = next_significant_index(tokens, idx)
+    if follower is None:
+        return False
+    return is_special_index_continuation(tokens, follower)
+def is_special_index_sequence_token(tokens: Sequence[str], idx: int) -> bool:
+    """Return True when ``tokens[idx]`` is either half of a "code + number" special pair."""
+    # The numeric half: a short number that follows a bare special code.
+    if is_special_index_continuation(tokens, idx):
+        return True
+    # The code half: only counts when a number actually follows it.
+    if not is_special_index_base(tokens[idx]):
+        return False
+    return has_special_index_continuation_after(tokens, idx)
+def is_episode_after_special_index(tokens: Sequence[str], idx: int) -> bool:
+    """Return True when an episode-like token directly follows a special code or its index."""
+    if episode_number(clean_bracket(tokens[idx])) is None:
+        return False
+    anchor = previous_significant_index(tokens, idx)
+    if anchor is None:
+        return False
+    # Either the previous token is the numeric half of a split code ("NCED" "02"),
+    # or it is a fused code such as "NCED02".
+    return (
+        is_special_index_continuation(tokens, anchor)
+        or SPECIAL_INDEX_RE.match(clean_bracket(tokens[anchor])) is not None
+    )
+def is_numeric_media_fragment(tokens: Sequence[str], idx: int) -> bool:
+    """Return True when a bare 1-4 digit token reads as media metadata rather than an episode.
+
+    Only the immediately adjacent tokens (``idx - 1`` and ``idx + 1``) are
+    inspected; separator tokens are not skipped.
+    """
+    clean = clean_bracket(tokens[idx])
+    if not re.fullmatch(r"\d{1,4}", clean):
+        return False
+    prev_idx = idx - 1 if idx > 0 else None
+    next_idx = idx + 1 if idx + 1 < len(tokens) else None
+    prev_clean = clean_bracket(tokens[prev_idx]).lower() if prev_idx is not None else ""
+    next_clean = clean_bracket(tokens[next_idx]).lower() if next_idx is not None else ""
+    # Number followed by "bit"/"bits" (a bit-depth tag).
+    if next_clean in {"bit", "bits"}:
+        return True
+    # Number sandwiched between "ma" and "p" (presumably a split "Ma10p" profile tag).
+    if prev_clean == "ma" and next_clean == "p":
+        return True
+    # Number right after an audio-codec name and right before a dot
+    # (presumably a channel layout such as "AAC 2.0" -- confirm against the tokenizer).
+    if prev_clean in {"aac", "flac", "dts", "ddp", "ac3", "mp"} and next_clean == ".":
+        return True
+    # Number right after a dot whose left-hand neighbour is all digits ("N.M").
+    if prev_clean == ".":
+        prev_prev = clean_bracket(tokens[idx - 2]).lower() if idx >= 2 else ""
+        if re.fullmatch(r"\d+", prev_prev):
+            return True
+    return False
+def is_special_index_suffix(tokens: Sequence[str], idx: int) -> bool:
+    """Return True when ``tokens[idx]`` is a 1-4 digit number following a bare special code.
+
+    This is the same predicate as ``is_special_index_continuation``. The extra
+    NCOP/NCED/OP/ED/PV/CM check that used to follow the base-code test could
+    never succeed, because those codes are a subset of what
+    ``is_special_index_base`` already accepts on the same cleaned token. The
+    function therefore delegates instead of duplicating the logic.
+    """
+    return is_special_index_continuation(tokens, idx)
+def is_structural_episode_candidate(tokens: Sequence[str], idx: int, number: int) -> bool:
+    """Return True when the token at ``idx`` has structural evidence of being an episode.
+
+    Evidence, checked in order: an episode/special prefix (EP, E, #, 第, OVA,
+    OAD, SP); a version or END suffix on a bare number; wrapping brackets; a
+    preceding ``-``, ``_`` or ``|`` (whitespace tokens skipped); an immediately
+    preceding ``#`` token; or a parsed ``number`` of at least 100.
+
+    The former trailing "followed by resolution/source/noise" branch was
+    removed: both of its accepting conditions (dash-like predecessor, wrapping
+    brackets) had already returned True above, so it could only return False.
+    """
+    clean = clean_bracket(tokens[idx])
+    if re.match(r"^(?:[Ee][Pp]?|#|第|OVA|OAD|SP)", clean, re.I):
+        return True
+    if re.match(r"^\d{1,4}(?:v\d+|END)$", clean, re.I):
+        return True
+    if has_wrapping_brackets(tokens[idx]):
+        return True
+    prev_idx = previous_non_space_index(tokens, idx)
+    if prev_idx is not None and tokens[prev_idx] in {"-", "_", "|"}:
+        return True
+    if idx > 0 and tokens[idx - 1] == "#":
+        return True
+    return number >= 100
 def is_category_bracket(token: str) -> bool:
 def find_episode_index(tokens: Sequence[str]) -> Optional[int]:
     candidates: list[tuple[int, int]] = []
     for idx, token in enumerate(tokens):
+        if is_special_index_continuation(tokens, idx) or is_numeric_media_fragment(tokens, idx):
+            continue
         number = episode_number(token)
         if number is None:
             continue
+        if not is_structural_episode_candidate(tokens, idx, number):
+            continue
         clean = clean_bracket(token)
         if idx > 0 and tokens[idx - 1] == "." and re.fullmatch(r"\d+", clean):
             previous_clean = clean_bracket(tokens[idx - 2]) if idx >= 2 else ""
             score += 4
         if token.startswith("[") or token.startswith("(") or token.startswith("【"):
             score += 3
+        prev_idx = previous_non_space_index(tokens, idx)
+        if prev_idx is not None and tokens[prev_idx] in {"-", "_", "|"}:
             score += 2
         if idx >= len(tokens) // 2:
             score += 1
     return True
+def split_special_composite(clean: str) -> Optional[tuple[str, Optional[str]]]:
+    """Split a cleaned token into ``(special, episode)``; return None when it is not a composite.
+
+    ``episode`` is None when the token carries only the special code.
+    """
+    found = SPECIAL_COMPOSITE_RE.fullmatch(clean)
+    if found is None:
+        return None
+    return found.group("special", "episode")
+def label_special_composite_contents(token: str, tokenizer: AnimeTokenizer) -> tuple[List[str], List[str]]:
+    """Expand a bracketed special token into sub-tokens with per-piece categories.
+
+    Tokens that do not match the composite pattern are handed to
+    ``label_bracket_contents`` with the "special" category. Otherwise the
+    special part is tokenized and labelled "special", the optional episode part
+    is tokenized and labelled "episode", and bracket delimiters plus any
+    separator pieces are labelled "sep".
+
+    NOTE(review): the separator text matched between the special and episode
+    parts is not emitted here -- confirm that downstream alignment does not
+    need it.
+    """
+    parts = split_special_composite(clean_bracket(token))
+    if parts is None:
+        return label_bracket_contents(token, "special", tokenizer)
+    special_text, episode_text = parts
+    opener, closer = bracket_delimiters(token)
+    out_tokens: List[str] = []
+    out_cats: List[str] = []
+    if opener:
+        out_tokens.append(opener)
+        out_cats.append("sep")
+    segments = [(special_text, "special")]
+    if episode_text:
+        segments.append((episode_text, "episode"))
+    for text, label in segments:
+        for piece in tokenizer.tokenize(text):
+            out_tokens.append(piece)
+            out_cats.append("sep" if is_separator_token(piece) else label)
+    if closer:
+        out_tokens.append(closer)
+        out_cats.append("sep")
+    return out_tokens, out_cats
+def clear_trailing_title_separators(tokens: Sequence[str], categories: List[str]) -> None:
+    """Relabel, in place, the trailing run of separator tokens tagged "title" as "sep".
+
+    Walks backwards from the last entry of ``categories`` and stops at the
+    first position that is not a separator token labelled "title".
+    """
+    for position in range(len(categories) - 1, -1, -1):
+        if not is_separator_token(tokens[position]) or categories[position] != "title":
+            break
+        categories[position] = "sep"
 def label_context_season_tokens(
     tokens: Sequence[str],
     categories: List[str],
             categories[idx] = "season"
+def label_special_index_sequences(tokens: Sequence[str], categories: List[str]) -> None:
+    """Keep NCOP_01 / NCED 16 / IV05 style codes as a single SPECIAL span.
+
+    Mutates ``categories`` in place: for each bare special code followed
+    (separators skipped) by a short number, the code, the number and the
+    separator tokens between them are all labelled "special".
+    """
+    cursor = 0
+    while cursor < len(tokens):
+        follower = None
+        if is_special_index_base(tokens[cursor]):
+            follower = next_significant_index(tokens, cursor)
+        if follower is None or not is_special_index_continuation(tokens, follower):
+            cursor += 1
+            continue
+        for position in range(cursor, follower + 1):
+            # The end points are always relabelled; interior tokens only when separators.
+            if position in (cursor, follower) or is_separator_token(tokens[position]):
+                categories[position] = "special"
+        # Resume after the number so the pair is not re-examined.
+        cursor = follower + 1
 def repair_structured_bracket_title_aliases(
     tokens: Sequence[str],
     categories: List[str],
 def embedded_bracket_episode(token: str) -> Optional[tuple[str, str, str]]:
     """Split malformed tokens such as '[Group}Title[658]' into title + episode."""
+    clean_token = clean_bracket(token)
+    if is_special(token) or SPECIAL_INDEX_RE.match(clean_token) or SPECIAL_COMPOSITE_RE.fullmatch(clean_token):
+        return None
+    if has_wrapping_brackets(token) and (
+        HASH_RE.match(clean_token)
+        or RESOLUTION_SEARCH_RE.search(clean_token)
+        or MEDIA_META_RE.search(clean_token)
+    ):
+        return None
     if episode_number(token) is not None:
         return None
     match = re.match(r"^(?P<prefix>.+?)\[(?P<episode>\d{1,4}(?:v\d+)?)(?P<close>\])?$", token, re.I)
     close = match.group("close") or ""
     if not clean_bracket(prefix):
         return None
+    if SPECIAL_INDEX_BASE_RE.match(clean_bracket(prefix)):
+        return None
     number = int(re.search(r"\d+", episode).group())
     if number == 0 or number > 2000:
         return None
     categories: Sequence[str],
     tokenizer: AnimeTokenizer,
     require_episode: bool = True,
+    require_title: bool = True,
 ) -> Optional[dict]:
     expanded_tokens, expanded_categories = expand_tokens_and_categories(tokens, categories, tokenizer)
     labels = assign_iob2(expanded_categories)
     if len(expanded_tokens) != len(labels):
         return None
+    if require_title and not any(label.endswith("TITLE") for label in labels):
         return None
     if require_episode and not any(label.endswith("EPISODE") for label in labels):
         return None
             categories.append("source")
             title_allowed = False
             continue
+        if is_special_index_sequence_token(tokens, idx) or is_special(token):
+            clear_trailing_title_separators(tokens, categories)
             categories.append("special")
             title_allowed = False
             continue
         if is_noise_bracket(token):
             categories.append("sep")
             continue
+        if seen_title and not title_allowed:
+            categories.append("sep")
+            continue
         categories.append("title")
         seen_title = True
+    label_special_index_sequences(tokens, categories)
+    require_title = any(category == "title" for category in categories)
+    return finalize_weak_sample(
+        tokens,
+        categories,
+        tokenizer,
+        require_episode=False,
+        require_title=require_title,
+    )
 def bracket_delimiters(token: str) -> tuple[str, str]:
                 expanded_tokens.extend([match.group(1), match.group(2)])
                 expanded_categories.extend(["season", "episode"])
                 continue
+        if category == "special" and (
+            token.startswith("[") or token.startswith("(") or token.startswith("【") or token.startswith("《")
+        ):
+            split_tokens, split_categories = label_special_composite_contents(token, tokenizer)
+            expanded_tokens.extend(split_tokens)
+            expanded_categories.extend(split_categories)
+            continue
         if category in {"group", "title"} and (
             token.startswith("[") or token.startswith("(") or token.startswith("【") or token.startswith("《")
         ):
             categories[idx] = "resolution"
         elif is_source(token):
             categories[idx] = "source"
+        elif is_special_index_sequence_token(tokens, idx):
+            categories[idx] = "special"
         elif is_special(token):
             categories[idx] = "special"
         elif is_explicit_season(token):
     episode_idx = find_episode_index(tokens)
     if episode_idx is None:
+        label_special_index_sequences(tokens, categories)
         return fallback_embedded_episode_sample(tokens, tokenizer) or fallback_no_episode_sample(tokens, tokenizer)
     categories[episode_idx] = "episode"
+    label_special_index_sequences(tokens, categories)
     label_context_season_tokens(tokens, categories, episode_idx)
     repair_structured_bracket_title_aliases(tokens, categories, episode_idx)

exports/anime_filename_parser.metadata.json CHANGED Viewed

@@ -8,5 +8,5 @@
     128,
     15
   ],
-  "max_abs_diff": 5.65648078918457e-05
 }

     128,
     15
   ],
+  "max_abs_diff": 2.6702880859375e-05
 }

exports/anime_filename_parser.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6d967c5c2305e6737c9e791956a174655deebef2cfa477e081890ebddd56e004
-size 19633926

 version https://git-lfs.github.com/spec/v1
+oid sha256:28ac9b1e17d0e70f31a986a1d677513d97e77748ccdf96c8d77245cadc54fa4e
+size 19652184

inference.py CHANGED Viewed

@@ -270,7 +270,9 @@ RESOLUTION_RE = re.compile(r"(?<![A-Za-z0-9])(?:\d{3,4}[pP]|\d[Kk]|\d{3,4}[xX×]
 SOURCE_TOKEN_PATTERN = (
     r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|"
     r"Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|"
-    r"CHS|CHT|GB|BIG5|JPN?|繁中|简中"
 )
 SOURCE_RE = re.compile(rf"\b(?:{SOURCE_TOKEN_PATTERN})\b", re.I)
 SOURCE_TAG_RE = re.compile(
@@ -281,6 +283,16 @@ SPECIAL_TAG_RE = re.compile(
     r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[:：].+",
     re.I,
 )
 EPISODE_PATTERNS = [
     ("season_episode", re.compile(r"[Ss]\d{1,2}[Ee](?P<ep>\d{1,4})(?:v\d+)?", re.I)),
     ("dash_episode", re.compile(r"(?:^|[\s._])[-_]\s*(?P<ep>\d{1,4})(?:v\d+)?(?=$|[\s._\-\]\)】》\[])")),
@@ -327,7 +339,8 @@ TRAILING_SEQUEL_MARKER_RE = re.compile(
 NOISE_META_RE = re.compile(
     r"^(?:\d{3,4}[pP]|\d[Kk]|WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|"
     r"HDTV|Netflix|NF|AMZN|Baha|CR|HEVC|AVC|AV1|x26[45]|h\.?26[45]|AAC.*|FLAC|MP3|DTS|"
-    r"Opus|ASS.*|CHS|CHT|BIG5|GB|JPN?|MP4|MKV|繁中|简中|内封|外挂)$",
     re.I,
 )
 DATE_RE = re.compile(r"^(?:19|20)\d{2}(?:[.\-_年]?(?:0?[1-9]|1[0-2]))?(?:[.\-_月]?(?:0?[1-9]|[12]\d|3[01]))?日?$")
@@ -386,10 +399,91 @@ def looks_like_episode_or_meta(text: str) -> bool:
         or SOURCE_TAG_RE.fullmatch(clean)
         or SOURCE_RE.search(clean)
         or SPECIAL_TAG_RE.search(clean)
         or NOISE_META_RE.search(clean)
     )
 def looks_like_structural_group(text: str, filename: str, bracket_end: int) -> bool:
     """Heuristic for short leading release-group brackets not in the name list."""
     if looks_like_group(text):
@@ -445,18 +539,23 @@ def apply_rule_assists(filename: str, result: Dict) -> Dict:
     source_matches = source_candidates(filename)
     current_source = repaired.get("source")
     preferred_source = source_matches[0] if source_matches else None
-    if source_matches and (
         not current_source
-        or not SOURCE_RE.fullmatch(str(current_source))
-        or len(str(current_source)) <= 3 and str(current_source).lower() not in {"nf", "cr"}
         or (
-            preferred_source
-            and str(current_source).lower().replace("_", "-") in {"web-dl", "webdl", "webrip", "web-rip"}
-            and preferred_source.lower().replace("_", "-") not in {"web-dl", "webdl", "webrip", "web-rip"}
         )
     ):
         repaired["source"] = preferred_source
     if not repaired.get("special"):
         for text, _start, _end in brackets:
             clean = text.strip()
@@ -471,6 +570,11 @@ def apply_rule_assists(filename: str, result: Dict) -> Dict:
     ):
         repaired["episode"] = episode
     if repaired.get("season") is None:
         match = SEASON_RE.search(filename)
         if match:
@@ -506,6 +610,12 @@ def apply_rule_assists(filename: str, result: Dict) -> Dict:
     if repaired.get("title") and repaired.get("season") is not None:
         repaired["title"] = strip_trailing_season_from_title(repaired["title"], repaired["season"])
     return repaired
@@ -551,6 +661,10 @@ def structural_sequel_marker(
         if marker.lower() == "ni" and "Kakuriyo no Yadomeshi Ni" not in prefix:
             continue
         return marker, value
     return None
@@ -566,10 +680,12 @@ def normalize_source_text(text: str) -> str:
 def source_priority(source: str) -> int:
     normalized = source.lower().replace("_", "-").replace(" ", "")
     parts = re.split(r"[&+/,]", normalized)
-    if any(part in {"nf", "netflix", "amzn", "baha", "cr", "abema", "dsnp", "u-next", "hulu", "at-x"} for part in parts):
         return 90
-    if any(part in {"web-dl", "webdl", "webrip", "web-rip", "bdrip", "bluray", "bdmv", "bd", "dvdrip", "dvd", "tvrip", "hdtv"} for part in parts):
-        return 60
     if len(parts) > 1:
         return 40
     return 20
@@ -662,13 +778,30 @@ def best_structural_episode(filename: str) -> Optional[int]:
             ep = int(ep_text)
             if ep == 0 or ep > 2000:
                 continue
-            context = filename[max(0, match.start() - 5):match.end() + 5]
             if RESOLUTION_RE.search(context) or re.search(r"AAC|DDP|AC3|H\.?26[45]|x26[45]", context, re.I):
                 continue
             priority = priorities[name]
             if 1 <= ep <= 200:
                 priority += 20
-            candidates.append((priority, match.start(), ep))
     if not candidates:
         return None
     return max(candidates, key=lambda item: (item[0], item[1]))[2]
@@ -686,9 +819,9 @@ def plausible_episode_context(filename: str, episode: int) -> bool:
         rf"(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)0*{episode}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》])",
         rf"(?:^|[\s._\-\[\(【《])0*{episode}(?:v\d+)?(?=[\s._\-\]\)】》\[]+(?:\d{{3,4}}[pP]|WEB|BD|BluRay|HDTV|NF|AMZN|CR|Baha))",
     ]
-    return any(re.search(pattern, filename, re.I) for pattern in patterns) or bool(
-        re.search(rf"(?:^|[\s._\-\[\(【《])(?:{re.escape(ep_text)}|{re.escape(padded)})(?=$|[\s._\-\]\)】》])", filename)
-    )
 def strip_trailing_season_from_title(title: str, season: int) -> str:
@@ -762,7 +895,13 @@ def infer_title_span(filename: str, group: Optional[str], episode: Optional[int]
         for text, bracket_start, _bracket_end in bracket_parts(filename):
             if bracket_start <= start:
                 continue
-            if NOISE_META_RE.search(text) or RESOLUTION_RE.search(text) or SOURCE_RE.search(text):
                 end = bracket_start
                 break

 SOURCE_TOKEN_PATTERN = (
     r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|"
     r"Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|"
+    r"x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|"
+    r"SDR|HDR10?|UHD|REMUX|10bit|8bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|"
+    r"CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中"
 )
 SOURCE_RE = re.compile(rf"\b(?:{SOURCE_TOKEN_PATTERN})\b", re.I)
 SOURCE_TAG_RE = re.compile(
     r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[:：].+",
     re.I,
 )
+SPECIAL_CODE_RE = re.compile(  # whole-string test for an already-normalized special code
+    r"^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$|^(?:OVA|OAD|SP)\d*$",  # digits optional, except IV which requires them
+    re.I,  # codes are matched case-insensitively
+)
+SPECIAL_CODE_INLINE_RE = re.compile(  # finds special codes embedded in a longer string
+    r"(?<![A-Za-z0-9])"  # must not be glued to a preceding ASCII letter or digit
+    r"(?P<code>(?:NCOP|NCED)(?:[\s._-]*\d{1,4})?|(?:OP|ED|PV|CM)\d{1,4}|IV\d{1,4})"  # bare NCOP/NCED allowed; OP/ED/PV/CM/IV need 1-4 digits
+    r"(?![A-Za-z0-9])",  # must not be glued to a following ASCII letter or digit
+    re.I,
+)
 EPISODE_PATTERNS = [
     ("season_episode", re.compile(r"[Ss]\d{1,2}[Ee](?P<ep>\d{1,4})(?:v\d+)?", re.I)),
     ("dash_episode", re.compile(r"(?:^|[\s._])[-_]\s*(?P<ep>\d{1,4})(?:v\d+)?(?=$|[\s._\-\]\)】》\[])")),
 NOISE_META_RE = re.compile(
     r"^(?:\d{3,4}[pP]|\d[Kk]|WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|"
     r"HDTV|Netflix|NF|AMZN|Baha|CR|HEVC|AVC|AV1|x26[45]|h\.?26[45]|AAC.*|FLAC|MP3|DTS|"
+    r"Opus|SDR|HDR10?|UHD|REMUX|10bit|8bit|Hi10p|Ma10p|ASS.*|SRT.*|CHS|CHT|BIG5|GB|JPN?|"
+    r"JPSC|JPTC|MP4|MKV|繁中|简中|内封|外挂)$",
     re.I,
 )
 DATE_RE = re.compile(r"^(?:19|20)\d{2}(?:[.\-_年]?(?:0?[1-9]|1[0-2]))?(?:[.\-_月]?(?:0?[1-9]|[12]\d|3[01]))?日?$")
         or SOURCE_TAG_RE.fullmatch(clean)
         or SOURCE_RE.search(clean)
         or SPECIAL_TAG_RE.search(clean)
+        or SPECIAL_CODE_RE.fullmatch(normalized)
         or NOISE_META_RE.search(clean)
     )
def normalize_special_code(text: str) -> str:
    """Return *text* with all whitespace, dots, underscores and hyphens removed.

    The result is the compact spelling of a special code, e.g. ``"NC-OP 01"``
    becomes ``"NCOP01"``. Letter case is left untouched.
    """
    trimmed = text.strip()
    separator_runs = re.compile(r"[\s._-]+")
    return separator_runs.sub("", trimmed)
def special_code_spans(filename: str) -> List[Tuple[str, int, int]]:
    """Collect every special code found in *filename* together with its span.

    Two sources are scanned: each bracketed segment reported by
    ``bracket_parts`` (kept when its normalized text is a special code), and
    every inline hit of ``SPECIAL_CODE_INLINE_RE``. Bracket entries carry the
    offsets reported by ``bracket_parts``; inline entries carry the offsets of
    the matched code itself.

    Returns:
        ``(normalized_code, start, end)`` tuples ordered by ``(start, end)``,
        with entries that share the same span and the same code (compared
        case-insensitively) reported only once.
    """
    found: List[Tuple[str, int, int]] = []

    for inner, begin, finish in bracket_parts(filename):
        code = normalize_special_code(inner)
        if SPECIAL_CODE_RE.fullmatch(code):
            found.append((code, begin, finish))

    for hit in SPECIAL_CODE_INLINE_RE.finditer(filename):
        code = normalize_special_code(hit.group("code"))
        if SPECIAL_CODE_RE.fullmatch(code):
            found.append((code, hit.start("code"), hit.end("code")))

    # list.sort is stable, so entries with equal spans keep discovery order.
    found.sort(key=lambda entry: (entry[1], entry[2]))

    unique: List[Tuple[str, int, int]] = []
    already_seen = set()
    for code, begin, finish in found:
        identity = (code.lower(), begin, finish)
        if identity not in already_seen:
            already_seen.add(identity)
            unique.append((code, begin, finish))
    return unique
def special_code_brackets(filename: str) -> List[Tuple[str, int, int]]:
    """Return the bracketed segments of *filename* that are special codes.

    Each entry is ``(stripped_text, start, end)``, using the offsets reported
    by ``bracket_parts``. A segment qualifies when its normalized text fully
    matches ``SPECIAL_CODE_RE``.
    """
    matches: List[Tuple[str, int, int]] = []
    for inner, begin, finish in bracket_parts(filename):
        if SPECIAL_CODE_RE.fullmatch(normalize_special_code(inner)):
            matches.append((inner.strip(), begin, finish))
    return matches
def span_is_inside_special_code(filename: str, start: int, end: int) -> bool:
    """Tell whether ``[start, end)`` lies within any special-code span of *filename*.

    The span counts as inside when it does not extend past either edge of a
    span returned by ``special_code_spans``.
    """
    for _code, special_start, special_end in special_code_spans(filename):
        if special_start <= start and end <= special_end:
            return True
    return False
def has_non_special_episode_context(filename: str, episode: int) -> bool:
    """Check whether *episode* is still supported once special-code brackets are blanked.

    Every span from ``special_code_brackets`` is overwritten with spaces, so
    all other offsets stay where they were. The episode counts as
    independently supported only if the masked name both passes
    ``plausible_episode_context`` and yields the same number from
    ``best_structural_episode``.
    """
    masked = filename
    for _text, begin, finish in reversed(special_code_brackets(filename)):
        blank = " " * (finish - begin)
        masked = f"{masked[:begin]}{blank}{masked[finish:]}"
    if not plausible_episode_context(masked, episode):
        return False
    return best_structural_episode(masked) == episode
def episode_comes_only_from_special_code(filename: str, episode: Optional[int]) -> bool:
    """Report whether *episode* is nothing more than the number of a special code.

    Args:
        filename: The release name being parsed.
        episode: The candidate episode number, or ``None`` when there is none.

    Returns:
        ``True`` when some special code in *filename* ends in exactly this
        number (leading zeros ignored) and ``has_non_special_episode_context``
        finds no other support for it. ``False`` when *episode* is ``None``,
        when the name has no special codes, or when no code carries the number.
    """
    if episode is None:
        return False
    specials = special_code_spans(filename)
    if not specials:
        return False
    episode_number = int(episode)
    # Anchor on the code's whole numeric tail. Without the lookbehind a plain
    # suffix test lets episode 1 match codes such as "NCOP11" or "ED21".
    numeric_tail = re.compile(rf"(?<!\d)0*{re.escape(str(episode_number))}$")
    if any(numeric_tail.search(code) for code, _start, _end in specials):
        return not has_non_special_episode_context(filename, episode_number)
    return False
def strip_title_special_codes(title: str, special: Optional[str] = None) -> str:
    """Remove trailing special-code markers from *title*.

    Three passes run in order:

    1. Trailing bracketed codes such as ``[NCOP1]`` or ``(OVA)`` are removed
       repeatedly until none is left at the end.
    2. One trailing bare ``NCOP``/``NCED``/``OP``/``ED``/``PV``/``CM`` word
       (with optional digits) is removed.
    3. If *special* is a letters-plus-digits special code, a trailing word
       equal to its letter prefix is removed.

    After each removal, trailing and leading spaces, tabs, hyphens,
    underscores and dots are trimmed. Matching is case-insensitive.

    Returns:
        The cleaned title, or the original *title* if cleaning leaves nothing.
    """
    trim_chars = " \t-_."
    bracketed_tail = (
        r"\s*[\[\(【《]\s*(?:(?:NCOP|NCED|OP|ED|PV|CM)\d*|IV\d+|(?:OVA|OAD|SP)\d*)\s*[\]\)】》]\s*$"
    )

    result = title.strip()
    previous = None
    # Peel stacked trailing brackets one at a time until a pass changes nothing.
    while previous != result:
        previous = result
        result = re.sub(bracketed_tail, "", result, flags=re.I).strip(trim_chars)

    result = re.sub(r"\s+(?:NCOP|NCED|OP|ED|PV|CM)\d*$", "", result, flags=re.I)
    result = result.strip(trim_chars)

    if special:
        compact = re.sub(r"[\s._-]+", "", str(special).strip())
        lettered = re.fullmatch(r"([A-Za-z]+)\d+", compact)
        if lettered and SPECIAL_CODE_RE.fullmatch(compact):
            stem = re.escape(lettered.group(1))
            result = re.sub(rf"\s+{stem}$", "", result, flags=re.I).strip(trim_chars)

    if result:
        return result
    return title
 def looks_like_structural_group(text: str, filename: str, bracket_end: int) -> bool:
     """Heuristic for short leading release-group brackets not in the name list."""
     if looks_like_group(text):
     source_matches = source_candidates(filename)
     current_source = repaired.get("source")
     preferred_source = source_matches[0] if source_matches else None
+    if preferred_source and (
         not current_source
+        or source_priority(preferred_source) > source_priority(str(current_source))
         or (
+            source_priority(preferred_source) == source_priority(str(current_source))
+            and preferred_source.lower() != str(current_source).lower()
         )
     ):
         repaired["source"] = preferred_source
+    special_spans = special_code_spans(filename)
+    current_special = repaired.get("special")
+    if special_spans:
+        preferred_special = special_spans[0][0]
+        current_normalized = normalize_special_code(str(current_special)) if current_special else ""
+        if not current_special or preferred_special.lower().startswith(current_normalized.lower()):
+            repaired["special"] = preferred_special
     if not repaired.get("special"):
         for text, _start, _end in brackets:
             clean = text.strip()
     ):
         repaired["episode"] = episode
+    if repaired.get("episode") is not None and not plausible_episode_context(filename, int(repaired["episode"])):
+        repaired["episode"] = episode
+    if episode_comes_only_from_special_code(filename, repaired.get("episode")):
+        repaired["episode"] = None
     if repaired.get("season") is None:
         match = SEASON_RE.search(filename)
         if match:
     if repaired.get("title") and repaired.get("season") is not None:
         repaired["title"] = strip_trailing_season_from_title(repaired["title"], repaired["season"])
+    if repaired.get("episode") is None and repaired.get("group") and repaired.get("special"):
+        inferred_title = infer_title_span(filename, repaired.get("group"), None)
+        if inferred_title:
+            repaired["title"] = inferred_title
+    if repaired.get("title"):
+        repaired["title"] = strip_title_special_codes(repaired["title"], repaired.get("special"))
     return repaired
         if marker.lower() == "ni" and "Kakuriyo no Yadomeshi Ni" not in prefix:
             continue
         return marker, value
+    numeric_tail = re.search(r"(?:^|[\s._-])(?P<season>[2-9])$", prefix)
+    if numeric_tail:
+        return numeric_tail.group("season"), int(numeric_tail.group("season"))
     return None
 def source_priority(source: str) -> int:
     """Score a source tag so that competing candidates can be ranked.

     The tag is lower-cased, underscores become hyphens, spaces are dropped,
     and the result is split on ``&``, ``+``, ``/`` and ``,``. The first tier
     with a matching part decides the score:

     * 90 - a streaming platform or release type (``nf``, ``web-dl``, ``bd`` ...)
     * 70 - a subtitle/language tag (``chs``, ``big5``, ``jp``/``jpn`` ...)
     * 20 - a codec, audio, bit-depth or subtitle-format tag, even when
       several are combined
     * 40 - any other multi-part tag
     * 20 - everything else

     Args:
         source: The raw source text.

     Returns:
         The priority; higher values rank first.
     """
     normalized = source.lower().replace("_", "-").replace(" ", "")
     parts = re.split(r"[&+/,]", normalized)

     release_tags = {
         "nf", "netflix", "amzn", "baha", "cr", "abema", "dsnp", "u-next", "hulu", "at-x",
         "web-dl", "webdl", "webrip", "web-rip", "bdrip", "bluray", "bdmv", "bd",
         "dvdrip", "dvd", "tvrip", "hdtv",
     }
     # "jp" is listed next to "jpn" because SOURCE_TOKEN_PATTERN accepts both
     # spellings (``JPN?``); without it a bare "JP" tag fell to the lowest tier.
     language_tags = {"chs", "cht", "gb", "big5", "jp", "jpn", "jpsc", "jptc", "繁中", "简中"}
     codec_tags = {
         "x264", "x265", "h.264", "h264", "h.265", "h265", "hevc", "avc", "av1",
         "aac", "flac", "mp3", "dts", "opus", "10bit", "8bit", "hi10p", "ma10p",
         "srt", "srtx2", "ass", "assx2",
     }

     if any(part in release_tags for part in parts):
         return 90
     if any(part in language_tags for part in parts):
         return 70
     # Checked before the multi-part rule so codec combinations do not score 40.
     if any(part in codec_tags for part in parts):
         return 20
     if len(parts) > 1:
         return 40
     return 20
             ep = int(ep_text)
             if ep == 0 or ep > 2000:
                 continue
+            ep_start = match.start("ep")
+            ep_end = match.end("ep")
+            if span_is_inside_special_code(filename, ep_start, ep_end):
+                continue
+            if name == "generic_episode":
+                tail = filename[ep_end:]
+                if re.match(r"[-_][A-Za-z]", tail):
+                    continue
+                if not re.match(
+                    r"(?:$|[\]\)】》]|[\s._-]+(?:"
+                    r"\[[^\]]*(?:\d{3,4}[pP]|WEB|BD|BluRay|HDTV|NF|AMZN|CR|Baha|Ma10p|x26|HEVC|AVC)|"
+                    r"\d{3,4}[pP]|WEB|BD|BluRay|HDTV|NF|AMZN|CR|Baha|Ma10p|x26|HEVC|AVC|mkv|mp4|avi"
+                    r"))",
+                    tail,
+                    re.I,
+                ):
+                    continue
+            context = filename[max(0, ep_start - 5):ep_end + 5]
             if RESOLUTION_RE.search(context) or re.search(r"AAC|DDP|AC3|H\.?26[45]|x26[45]", context, re.I):
                 continue
             priority = priorities[name]
             if 1 <= ep <= 200:
                 priority += 20
+            candidates.append((priority, ep_start, ep))
     if not candidates:
         return None
     return max(candidates, key=lambda item: (item[0], item[1]))[2]
         rf"(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)0*{episode}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》])",
         rf"(?:^|[\s._\-\[\(【《])0*{episode}(?:v\d+)?(?=[\s._\-\]\)】》\[]+(?:\d{{3,4}}[pP]|WEB|BD|BluRay|HDTV|NF|AMZN|CR|Baha))",
     ]
+    if any(re.search(pattern, filename, re.I) for pattern in patterns):
+        return True
+    return bool(re.search(rf"(?:^|[\s._-])(?:{re.escape(ep_text)}|{re.escape(padded)})(?:v\d+)?$", filename, re.I))
 def strip_trailing_season_from_title(title: str, season: int) -> str:
         for text, bracket_start, _bracket_end in bracket_parts(filename):
             if bracket_start <= start:
                 continue
+            if (
+                NOISE_META_RE.search(text)
+                or RESOLUTION_RE.search(text)
+                or SOURCE_RE.search(text)
+                or SPECIAL_TAG_RE.search(text)
+                or SPECIAL_CODE_RE.fullmatch(re.sub(r"[\s._-]+", "", text.strip()))
+            ):
                 end = bracket_start
                 break

label_repairs.py CHANGED Viewed

@@ -117,6 +117,10 @@ SPECIAL_TAG_RE = re.compile(
     r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[:：].+",
     re.I,
 )
 READING_MARKER_RE = re.compile(
     r"(?<![A-Za-z0-9])"
@@ -373,7 +377,7 @@ def repair_structural_meta_labels(
         if not clean:
             continue
-        if SPECIAL_TAG_RE.fullmatch(clean):
             indices = token_indices_for_span(offsets, inner_start, inner_end)
             if safe_to_overwrite_meta(labels, indices) and label_span_if_changed(labels, indices, "SPECIAL"):
                 repairs.append(LabelRepair("special", clean, 0, inner_start, inner_end))

     r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[:：].+",
     re.I,
 )
+SPECIAL_CODE_RE = re.compile(  # whole-string test for a special code such as NCOP1, ED, IV2 or OVA
+    r"^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$|^(?:OVA|OAD|SP)\d*$",  # digits optional, except IV which requires them
+    re.I,  # codes are matched case-insensitively
+)
 READING_MARKER_RE = re.compile(
     r"(?<![A-Za-z0-9])"
         if not clean:
             continue
+        if SPECIAL_TAG_RE.fullmatch(clean) or SPECIAL_CODE_RE.fullmatch(clean):
             indices = token_indices_for_span(offsets, inner_start, inner_end)
             if safe_to_overwrite_meta(labels, indices) and label_span_if_changed(labels, indices, "SPECIAL"):
                 repairs.append(LabelRepair("special", clean, 0, inner_start, inner_end))

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:347b2f619fd63a71804c4742a069b20acd0cde870fc03cc2ac0f175b06586b72
 size 19142604

 version https://git-lfs.github.com/spec/v1
+oid sha256:9f251f8d4bbb750ba3bfd6fceffbec32eff3f32e9f07820bdab48294052d15a5
 size 19142604

parse_eval_metrics.json CHANGED Viewed

@@ -1,563 +1,582 @@
 {
-  "sample_count": 2048,
   "field_accuracy": {
-    "group": 0.99951171875,
-    "title": 0.99755859375,
-    "season": 0.99609375,
-    "episode": 0.998046875,
     "resolution": 1.0,
-    "source": 0.99853515625,
-    "special": 0.9990234375
   },
   "field_correct": {
-    "group": 2047,
-    "title": 2043,
-    "season": 2040,
-    "episode": 2044,
-    "resolution": 2048,
-    "source": 2045,
-    "special": 2046
   },
   "field_total": {
-    "group": 2048,
-    "title": 2048,
-    "season": 2048,
-    "episode": 2048,
-    "resolution": 2048,
-    "source": 2048,
-    "special": 2048
   },
-  "full_match_accuracy": 0.99072265625,
-  "full_match_correct": 2029,
-  "full_match_total": 2048,
   "failures": [
     {
-      "filename": "[ig]Itai no wa Iya nano de Bougyoryoku ni Kyokufuri Shitai to Omoimasu[WebRip 1920x1080 AVC YUV420 8Bit 1080p  AAC].03.TC",
       "errors": {
-        "episode": {
-          "gold": "3",
-          "pred": null
         }
       },
       "gold": {
-        "group": "ig",
-        "title": "Itai no wa Iya nano de Bougyoryoku ni Kyokufuri Shitai to Omoimasu",
         "season": null,
-        "episode": 3,
-        "resolution": "1080p",
-        "source": "WebRip",
         "special": null
       },
       "pred": {
-        "group": "ig",
-        "title": "Itai no wa Iya nano de Bougyoryoku ni Kyokufuri Shitai to Omoimasu",
         "season": null,
         "episode": null,
-        "resolution": "1080p",
-        "source": "WebRip",
         "special": null
       }
     },
     {
-      "filename": "[YYDM-11FANS][Nanana's Buried Treasure][preview][09][BDrip][720P][X264-10bit_AAC][34D29ED6]",
       "errors": {
-        "special": {
-          "gold": "ed",
-          "pred": null
         }
       },
       "gold": {
-        "group": "YYDM-11FANS",
-        "title": "Nanana's Buried Treasure",
         "season": null,
-        "episode": 9,
-        "resolution": "720P",
-        "source": "BDrip",
         "special": "ED"
       },
       "pred": {
-        "group": "YYDM-11FANS",
-        "title": "Nanana's Buried Treasure",
         "season": null,
-        "episode": 9,
-        "resolution": "720P",
-        "source": "BDrip",
-        "special": null
       }
     },
     {
-      "filename": "[Moozzi2] Madou King Granzort Saigo no Magical Taisen OVA - 01 [ 1990 ] (BD 1440x1080 x.264 Flac)",
       "errors": {
         "title": {
-          "gold": "madou king granzort saigo no magical taisen ova",
-          "pred": "madou king granzort saigo no magical taisen ova - 01 [ 1990"
         },
         "episode": {
-          "gold": "1",
-          "pred": "1990"
         }
       },
       "gold": {
-        "group": "Moozzi2",
-        "title": "Madou King Granzort Saigo no Magical Taisen OVA",
         "season": null,
-        "episode": 1,
-        "resolution": "1440x1080",
-        "source": "BD",
-        "special": "OVA"
       },
       "pred": {
-        "group": "Moozzi2",
-        "title": "Madou King Granzort Saigo no Magical Taisen OVA - 01 [ 1990 ",
         "season": null,
-        "episode": 1990,
-        "resolution": "1440x1080",
-        "source": "BD",
-        "special": "OVA"
       }
     },
     {
-      "filename": "[64bitsub][Tensui no Sakuna-hime][08][BDRIP_1920x1080][AVC_FLAC_SUP]",
       "errors": {
-        "source": {
-          "gold": "flac",
-          "pred": "avc-flac"
         }
       },
       "gold": {
-        "group": "64bitsub",
-        "title": "Tensui no Sakuna-hime",
         "season": null,
-        "episode": 8,
-        "resolution": "1920x1080",
-        "source": "FLAC",
-        "special": null
       },
       "pred": {
-        "group": "64bitsub",
-        "title": "Tensui no Sakuna-hime",
         "season": null,
-        "episode": 8,
-        "resolution": "1920x1080",
-        "source": "AVC_FLAC",
-        "special": null
       }
     },
     {
-      "filename": "[VCB-Studio] Shingeki no Kyojin Movie 3 Kakusei no Houkou [Teaser_S3][Ma10p_1080p][x265_flac]",
       "errors": {
         "season": {
           "gold": null,
-          "pred": "3"
         }
       },
       "gold": {
         "group": "VCB-Studio",
-        "title": "Shingeki no Kyojin Movie 3 Kakusei no Houkou [Teaser_S3",
         "season": null,
-        "episode": 3,
         "resolution": "1080p",
         "source": "x265_flac",
-        "special": "Movie"
       },
       "pred": {
         "group": "VCB-Studio",
-        "title": "Shingeki no Kyojin Movie 3 Kakusei no Houkou [Teaser_S3",
-        "season": 3,
-        "episode": 3,
         "resolution": "1080p",
         "source": "x265_flac",
-        "special": "Movie"
       }
     },
     {
-      "filename": "FF：U ファイナルファンタジー：アンリミテッド ～異界の章～ #15 「ジェーン～うごきだすうみパズル」(DVD 640x480 DivX5 QB98 120fps lameVBR)[CRC_5FA44899]",
       "errors": {
-        "source": {
-          "gold": "cr",
-          "pred": "dvd"
         }
       },
       "gold": {
-        "group": null,
-        "title": "FF：U ファイナルファンタジー：アンリミテッド ～異界の章～",
         "season": null,
-        "episode": 15,
-        "resolution": "640x480",
-        "source": "CR",
-        "special": null
       },
       "pred": {
-        "group": null,
-        "title": "FF：U ファイナルファンタジー：アンリミテッド ～異界の章～",
         "season": null,
-        "episode": 15,
-        "resolution": "640x480",
-        "source": "DVD",
-        "special": null
       }
     },
     {
-      "filename": "[OVA]GALLFORCE ガルフォース2 宇宙章 vol2 [DESTRUCTION]",
       "errors": {
-        "title": {
-          "gold": "gallforce ガルフォース2 宇宙章 vol",
-          "pred": "gallforce ガルフォース2 宇宙"
         }
       },
       "gold": {
-        "group": "OVA",
-        "title": "GALLFORCE ガルフォース2 宇宙章 vol",
         "season": null,
-        "episode": 2,
-        "resolution": null,
-        "source": null,
-        "special": "OVA"
       },
       "pred": {
-        "group": "OVA",
-        "title": "GALLFORCE ガルフォース2 宇宙",
         "season": null,
         "episode": 2,
-        "resolution": null,
-        "source": null,
-        "special": "OVA"
       }
     },
     {
-      "filename": "[病毒].[Fosky_Fansub][Virus_Buster_Serge][DVDrip][12][H264_AAC][640x480][GB&BIG5][F77551D0](ED2000.COM)",
       "errors": {
-        "special": {
-          "gold": "ed",
-          "pred": "e"
         }
       },
       "gold": {
-        "group": "病毒",
-        "title": "Fosky_Fansub",
         "season": null,
-        "episode": 12,
-        "resolution": "640x480",
-        "source": "DVDrip",
-        "special": "ED"
       },
       "pred": {
-        "group": "病毒",
-        "title": "Fosky_Fansub",
         "season": null,
-        "episode": 12,
-        "resolution": "640x480",
-        "source": "DVDrip",
-        "special": "E"
       }
     },
     {
-      "filename": "[DBD-Raws][Shadows House S1][Gekijou][18][1080P][BDRip][HEVC-10bit][FLAC]",
       "errors": {
-        "season": {
           "gold": null,
-          "pred": "1"
         }
       },
       "gold": {
         "group": "DBD-Raws",
-        "title": "Shadows House",
         "season": null,
-        "episode": 18,
         "resolution": "1080P",
         "source": "BDRip",
-        "special": null
       },
       "pred": {
         "group": "DBD-Raws",
-        "title": "Shadows House",
-        "season": 1,
-        "episode": 18,
         "resolution": "1080P",
         "source": "BDRip",
-        "special": null
       }
     },
     {
-      "filename": "Girls und Panzer - 10.5 (BD 1280x720 AVC AACx2)",
       "errors": {
-        "season": {
-          "gold": "10",
-          "pred": "1"
         }
       },
       "gold": {
-        "group": null,
-        "title": "Girls und Panzer - 10.5",
-        "season": 10,
-        "episode": 5,
-        "resolution": "1280x720",
-        "source": "BD",
-        "special": null
       },
       "pred": {
-        "group": null,
-        "title": "Girls und Panzer - 10.5",
-        "season": 1,
-        "episode": 5,
-        "resolution": "1280x720",
-        "source": "BD",
-        "special": null
       }
     },
     {
-      "filename": "[POPGO&SumiSora&TxxZ] Ginga Eiyuu Densetsu Die Neue These - Seiran 14 (BDRip 1080P X265 Main10p TrueHDX2 Chap)[A4E18C32]",
       "errors": {
-        "group": {
           "gold": null,
-          "pred": "popgo&sumisora&txxz"
-        },
-        "title": {
-          "gold": "popgo&sumisora&txxz",
-          "pred": "ginga eiyuu densetsu die neue these - seiran 14"
         }
       },
       "gold": {
-        "group": null,
-        "title": "POPGO&SumiSora&TxxZ",
         "season": null,
-        "episode": 14,
-        "resolution": "1080P",
-        "source": "BDRip",
-        "special": null
       },
       "pred": {
-        "group": "POPGO&SumiSora&TxxZ",
-        "title": "Ginga Eiyuu Densetsu Die Neue These - Seiran 14",
-        "season": null,
-        "episode": 14,
-        "resolution": "1080P",
-        "source": "BDRip",
-        "special": null
       }
     },
     {
-      "filename": "[アニメ BD] Serial Experiments Lain 映像特典 「trailer 01」 （1440x1080 x264 AAC 2ch）",
       "errors": {
-        "title": {
-          "gold": "serial experiments lain 映像特典 「trailer 01」",
-          "pred": "serial experiments lain 映像特典 「trailer"
-        },
         "episode": {
-          "gold": "2",
-          "pred": "1"
         }
       },
       "gold": {
-        "group": "アニメ BD",
-        "title": "Serial Experiments Lain 映像特典 「trailer 01」",
         "season": null,
-        "episode": 2,
-        "resolution": "1440x1080",
-        "source": "BD",
-        "special": null
       },
       "pred": {
-        "group": "アニメ BD",
-        "title": "Serial Experiments Lain 映像特典 「trailer",
         "season": null,
-        "episode": 1,
-        "resolution": "1440x1080",
-        "source": "BD",
-        "special": null
       }
     },
     {
-      "filename": "[AJZ＆BLU][God Eater][05][BIG5][v2] (2)",
       "errors": {
         "episode": {
-          "gold": "2",
-          "pred": "5"
         }
       },
       "gold": {
-        "group": "AJZ＆BLU",
-        "title": "God Eater",
         "season": null,
-        "episode": 2,
-        "resolution": null,
-        "source": "BIG5",
-        "special": null
       },
       "pred": {
-        "group": "AJZ＆BLU",
-        "title": "God Eater",
         "season": null,
-        "episode": 5,
-        "resolution": null,
-        "source": "BIG5",
-        "special": null
       }
     },
     {
-      "filename": "(アニメ) YAT安心！宇宙旅行 第1期 第07話 「サバイバル！野生のカネア」 (LD 640x480 WMV9 QB90 24fps)",
       "errors": {
         "season": {
-          "gold": null,
-          "pred": "1"
         }
       },
       "gold": {
-        "group": "アニメ",
-        "title": "YAT安心！宇宙旅行",
-        "season": null,
-        "episode": 7,
-        "resolution": "640x480",
-        "source": null,
-        "special": null
       },
       "pred": {
-        "group": "アニメ",
-        "title": "YAT安心！宇宙旅行",
-        "season": 1,
-        "episode": 7,
-        "resolution": "640x480",
-        "source": null,
-        "special": null
       }
     },
     {
-      "filename": "Lord El-Melloi II-sei no Jikenbo 06 [1AAC021C]",
       "errors": {
-        "source": {
-          "gold": "aac",
-          "pred": null
         }
       },
       "gold": {
-        "group": null,
-        "title": "Lord El-Melloi II-sei no Jikenbo",
         "season": null,
-        "episode": 6,
-        "resolution": null,
-        "source": "AAC",
-        "special": null
       },
       "pred": {
-        "group": null,
-        "title": "Lord El-Melloi II-sei no Jikenbo",
         "season": null,
-        "episode": 6,
-        "resolution": null,
-        "source": null,
-        "special": null
       }
     },
     {
-      "filename": "[Skymoon-Raws] Mashle 2nd Season - 01(13) [ViuTV][WEB-DL][1080p][AVC AAC]",
       "errors": {
-        "title": {
-          "gold": "mashle 2nd season - 01",
-          "pred": "mashle 2nd season"
-        },
-        "season": {
-          "gold": "2",
-          "pred": "1"
         }
       },
       "gold": {
-        "group": "Skymoon-Raws",
-        "title": "Mashle 2nd Season - 01",
-        "season": 2,
-        "episode": 13,
-        "resolution": "1080p",
-        "source": "WEB-DL",
-        "special": null
       },
       "pred": {
-        "group": "Skymoon-Raws",
-        "title": "Mashle 2nd Season",
         "season": 1,
-        "episode": 13,
-        "resolution": "1080p",
-        "source": "WEB-DL",
-        "special": null
       }
     },
     {
-      "filename": "【CXRAW】【S17】【Power Rangers RPM】【30】【End Game】【x264 Hi10p AAC】【MP4】",
       "errors": {
-        "season": {
           "gold": null,
-          "pred": "17"
         }
       },
       "gold": {
-        "group": "CXRAW",
-        "title": "S17",
         "season": null,
-        "episode": 30,
-        "resolution": null,
-        "source": "AAC",
-        "special": null
       },
       "pred": {
-        "group": "CXRAW",
-        "title": "S17",
-        "season": 17,
-        "episode": 30,
-        "resolution": null,
-        "source": "AAC",
-        "special": null
       }
     },
     {
-      "filename": "(アニメ) YAT安心！宇宙旅行 第1期 第24話 「モーレツ！かあちゃん珍道中」 (LD 640x480 WMV9 QB90 24fps)",
       "errors": {
-        "season": {
           "gold": null,
-          "pred": "1"
         }
       },
       "gold": {
-        "group": "アニメ",
-        "title": "YAT安心！宇宙旅行",
         "season": null,
-        "episode": 24,
-        "resolution": "640x480",
-        "source": null,
-        "special": null
       },
       "pred": {
-        "group": "アニメ",
-        "title": "YAT安心！宇宙旅行",
-        "season": 1,
-        "episode": 24,
-        "resolution": "640x480",
-        "source": null,
-        "special": null
       }
     },
     {
-      "filename": "[Snow-Raws] アイドルマスター シンデレラガールズ劇場 第2期 SP17 (DVD 1280x720 HEVC-YUV420P10 FLAC)",
       "errors": {
-        "season": {
-          "gold": null,
-          "pred": "2"
         }
       },
       "gold": {
-        "group": "Snow-Raws",
-        "title": "アイドルマスター シンデレラガールズ劇場 第2期 SP17",
         "season": null,
-        "episode": 17,
-        "resolution": "1280x720",
-        "source": "DVD",
-        "special": "SP"
       },
       "pred": {
-        "group": "Snow-Raws",
-        "title": "アイドルマスター シンデレラガールズ劇場 第2期 SP17",
-        "season": 2,
-        "episode": 17,
-        "resolution": "1280x720",
-        "source": "DVD",
-        "special": "SP"
       }
     }
   ]

 {
+  "sample_count": 512,
   "field_accuracy": {
+    "group": 1.0,
+    "title": 0.974609375,
+    "season": 0.98046875,
+    "episode": 0.806640625,
     "resolution": 1.0,
+    "source": 0.998046875,
+    "special": 0.96875
   },
   "field_correct": {
+    "group": 512,
+    "title": 499,
+    "season": 502,
+    "episode": 413,
+    "resolution": 512,
+    "source": 511,
+    "special": 496
   },
   "field_total": {
+    "group": 512,
+    "title": 512,
+    "season": 512,
+    "episode": 512,
+    "resolution": 512,
+    "source": 512,
+    "special": 512
   },
+  "full_match_accuracy": 0.751953125,
+  "full_match_correct": 385,
+  "full_match_total": 512,
   "failures": [
     {
+      "filename": "[ReinForce] Sword Art Online II - ED3 (BDRip 1920x1080 x264 FLAC)",
       "errors": {
+        "season": {
+          "gold": null,
+          "pred": "2"
         }
       },
       "gold": {
+        "group": "ReinForce",
+        "title": "Sword Art Online II",
         "season": null,
+        "episode": null,
+        "resolution": "1920x1080",
+        "source": "BDRip",
+        "special": "ED3"
+      },
+      "pred": {
+        "group": "ReinForce",
+        "title": "Sword Art Online II",
+        "season": 2,
+        "episode": null,
+        "resolution": "1920x1080",
+        "source": "BDRip",
+        "special": "ED3"
+      }
+    },
+    {
+      "filename": "[アニメ DVD] 銀装騎攻オーディアン ACT.06 特典映像 川田&榎本トーク (DVD 640x480 WMV9 QB90 30fps MP3 192kbps)",
+      "errors": {
+        "title": {
+          "gold": "銀装騎攻オーディアン act.06 特典映像 川田&榎本トーク",
+          "pred": "銀装騎攻オーディアン act.06 特典映像 川田"
+        }
+      },
+      "gold": {
+        "group": "アニメ DVD",
+        "title": "銀装騎攻オーディアン ACT.06 特典映像 川田&榎本トーク",
+        "season": null,
+        "episode": null,
+        "resolution": "640x480",
+        "source": "DVD",
         "special": null
       },
       "pred": {
+        "group": "アニメ DVD",
+        "title": "銀装騎攻オーディアン ACT.06 特典映像 川田",
         "season": null,
         "episode": null,
+        "resolution": "640x480",
+        "source": "DVD",
         "special": null
       }
     },
     {
+      "filename": "05-ラディアン 第2シリーズ_ED",
       "errors": {
+        "title": {
+          "gold": "05-ラディアン 第2シリーズ",
+          "pred": "05-ラディアン 第2"
         }
       },
       "gold": {
+        "group": null,
+        "title": "05-ラディアン 第2シリーズ",
         "season": null,
+        "episode": null,
+        "resolution": null,
+        "source": null,
         "special": "ED"
       },
       "pred": {
+        "group": null,
+        "title": "05-ラディアン 第2",
         "season": null,
+        "episode": null,
+        "resolution": null,
+        "source": null,
+        "special": "ED"
       }
     },
     {
+      "filename": "[A.A] hinotori 03",
       "errors": {
         "title": {
+          "gold": "hinotori 03",
+          "pred": "hinotori"
         },
         "episode": {
+          "gold": null,
+          "pred": "3"
         }
       },
       "gold": {
+        "group": "A.A",
+        "title": "hinotori 03",
         "season": null,
+        "episode": null,
+        "resolution": null,
+        "source": null,
+        "special": null
       },
       "pred": {
+        "group": "A.A",
+        "title": "hinotori",
         "season": null,
+        "episode": 3,
+        "resolution": null,
+        "source": null,
+        "special": null
       }
     },
     {
+      "filename": "[Nekomoe kissaten] Azur Lane Bisoku Zenshin! [ED][05][BDRip 1080p HEVC-10bit FLAC]",
       "errors": {
+        "episode": {
+          "gold": null,
+          "pred": "5"
         }
       },
       "gold": {
+        "group": "Nekomoe kissaten",
+        "title": "Azur Lane Bisoku Zenshin! [ED",
         "season": null,
+        "episode": null,
+        "resolution": "1080p",
+        "source": "BDRip",
+        "special": "05"
       },
       "pred": {
+        "group": "Nekomoe kissaten",
+        "title": "Azur Lane Bisoku Zenshin! [ED",
         "season": null,
+        "episode": 5,
+        "resolution": "1080p",
+        "source": "BDRip",
+        "special": "05"
       }
     },
     {
+      "filename": "[VCB-Studio] Danmachi IV [10][Ma10p_1080p][x265_flac]",
       "errors": {
         "season": {
           "gold": null,
+          "pred": "4"
+        },
+        "episode": {
+          "gold": null,
+          "pred": "10"
         }
       },
       "gold": {
         "group": "VCB-Studio",
+        "title": "Danmachi",
         "season": null,
+        "episode": null,
         "resolution": "1080p",
         "source": "x265_flac",
+        "special": "10"
       },
       "pred": {
         "group": "VCB-Studio",
+        "title": "Danmachi",
+        "season": 4,
+        "episode": 10,
         "resolution": "1080p",
         "source": "x265_flac",
+        "special": "10"
       }
     },
     {
+      "filename": "[FZSD&DBD-Raws][King of Prism Dramatic Prism.1][PV][12][1080P][BDRip][HEVC-10bit][FLAC]",
       "errors": {
+        "episode": {
+          "gold": null,
+          "pred": "12"
         }
       },
       "gold": {
+        "group": "FZSD&DBD-Raws",
+        "title": "King of Prism Dramatic Prism.1",
         "season": null,
+        "episode": null,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "12"
       },
       "pred": {
+        "group": "FZSD&DBD-Raws",
+        "title": "King of Prism Dramatic Prism.1",
         "season": null,
+        "episode": 12,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "12"
       }
     },
     {
+      "filename": "[SAIO-Raws] Wakaokami wa Shougakusei! PV 02 [BD 1920x1080 HEVC-10bit OPUS]",
       "errors": {
+        "episode": {
+          "gold": null,
+          "pred": "2"
         }
       },
       "gold": {
+        "group": "SAIO-Raws",
+        "title": "Wakaokami wa Shougakusei! PV 02",
         "season": null,
+        "episode": null,
+        "resolution": "1920x1080",
+        "source": "BD",
+        "special": "PV 02"
       },
       "pred": {
+        "group": "SAIO-Raws",
+        "title": "Wakaokami wa Shougakusei! PV 02",
         "season": null,
         "episode": 2,
+        "resolution": "1920x1080",
+        "source": "BD",
+        "special": "PV 02"
       }
     },
     {
+      "filename": "[DBD-Raws][Hime-sama Goumon no Jikan Desu][PV][01][1080P][BDRip][HEVC-10bit][FLAC]",
       "errors": {
+        "episode": {
+          "gold": null,
+          "pred": "1"
         }
       },
       "gold": {
+        "group": "DBD-Raws",
+        "title": "Hime-sama Goumon no Jikan Desu",
         "season": null,
+        "episode": null,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "01"
       },
       "pred": {
+        "group": "DBD-Raws",
+        "title": "Hime-sama Goumon no Jikan Desu",
         "season": null,
+        "episode": 1,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "01"
       }
     },
     {
+      "filename": "[DBD-Raws][Tenshi no 3P!][PV][03][1080P][BDRip][HEVC-10bit][FLAC]",
       "errors": {
+        "episode": {
           "gold": null,
+          "pred": "3"
         }
       },
       "gold": {
         "group": "DBD-Raws",
+        "title": "Tenshi no 3P!",
         "season": null,
+        "episode": null,
         "resolution": "1080P",
         "source": "BDRip",
+        "special": "03"
       },
       "pred": {
         "group": "DBD-Raws",
+        "title": "Tenshi no 3P!",
+        "season": null,
+        "episode": 3,
         "resolution": "1080P",
         "source": "BDRip",
+        "special": "03"
       }
     },
     {
+      "filename": "[Suzu-Kaze] DanMachi IV 21 [WebRip 1920x1080 HEVC YUV420P10 AAC]",
       "errors": {
+        "episode": {
+          "gold": null,
+          "pred": "21"
         }
       },
       "gold": {
+        "group": "Suzu-Kaze",
+        "title": "DanMachi IV 21",
+        "season": null,
+        "episode": null,
+        "resolution": "1920x1080",
+        "source": "WebRip",
+        "special": "IV 21"
       },
       "pred": {
+        "group": "Suzu-Kaze",
+        "title": "DanMachi IV 21",
+        "season": null,
+        "episode": 21,
+        "resolution": "1920x1080",
+        "source": "WebRip",
+        "special": "IV 21"
       }
     },
     {
+      "filename": "[VCB-Studio] Log Horizon 2 [IV03][Ma10p_1080p][x265_aac]",
       "errors": {
+        "season": {
           "gold": null,
+          "pred": "2"
         }
       },
       "gold": {
+        "group": "VCB-Studio",
+        "title": "Log Horizon 2",
         "season": null,
+        "episode": null,
+        "resolution": "1080p",
+        "source": "x265_aac",
+        "special": "IV03"
       },
       "pred": {
+        "group": "VCB-Studio",
+        "title": "Log Horizon 2",
+        "season": 2,
+        "episode": null,
+        "resolution": "1080p",
+        "source": "x265_aac",
+        "special": "IV03"
       }
     },
     {
+      "filename": "[DBD-Raws][Mahou Shoujo Lyrical Nanoha The Movie 2nd A's][PV][06][1080P][BDRip][HEVC-10bit][FLAC]",
       "errors": {
         "episode": {
+          "gold": null,
+          "pred": "6"
         }
       },
       "gold": {
+        "group": "DBD-Raws",
+        "title": "Mahou Shoujo Lyrical Nanoha The Movie 2nd A's",
         "season": null,
+        "episode": null,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "06"
       },
       "pred": {
+        "group": "DBD-Raws",
+        "title": "Mahou Shoujo Lyrical Nanoha The Movie 2nd A's",
         "season": null,
+        "episode": 6,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "06"
       }
     },
     {
+      "filename": "[DBD-Raws][Hana wa Saku, Shura no Gotoku][PV][11][1080P][BDRip][HEVC-10bit][FLAC]",
       "errors": {
         "episode": {
+          "gold": null,
+          "pred": "11"
         }
       },
       "gold": {
+        "group": "DBD-Raws",
+        "title": "Hana wa Saku, Shura no Gotoku",
         "season": null,
+        "episode": null,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "11"
       },
       "pred": {
+        "group": "DBD-Raws",
+        "title": "Hana wa Saku, Shura no Gotoku",
         "season": null,
+        "episode": 11,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "11"
       }
     },
     {
+      "filename": "[Seed-Raws] Strike the Blood IV - OVA Vol.01 Menu 02 (BD 1280x720 AVC AAC)",
       "errors": {
         "season": {
+          "gold": "4",
+          "pred": null
         }
       },
       "gold": {
+        "group": "Seed-Raws",
+        "title": "Strike the Blood IV - OVA Vol.01 Menu 02",
+        "season": 4,
+        "episode": null,
+        "resolution": "1280x720",
+        "source": "BD",
+        "special": "OVA"
       },
       "pred": {
+        "group": "Seed-Raws",
+        "title": "Strike the Blood IV - OVA Vol.01 Menu 02",
+        "season": null,
+        "episode": null,
+        "resolution": "1280x720",
+        "source": "BD",
+        "special": "OVA"
       }
     },
     {
+      "filename": "[DBD-Raws][Hametsu no Oukoku][PV][05][1080P][BDRip][HEVC-10bit][FLAC]",
       "errors": {
+        "episode": {
+          "gold": null,
+          "pred": "5"
         }
       },
       "gold": {
+        "group": "DBD-Raws",
+        "title": "Hametsu no Oukoku",
         "season": null,
+        "episode": null,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "05"
       },
       "pred": {
+        "group": "DBD-Raws",
+        "title": "Hametsu no Oukoku",
         "season": null,
+        "episode": 5,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "05"
       }
     },
     {
+      "filename": "[DBD-Raws][Tate no Yuusha no Nariagari S1][PV][03][1080P][BDRip][HEVC-10bit][FLAC]",
       "errors": {
+        "episode": {
+          "gold": null,
+          "pred": "3"
         }
       },
       "gold": {
+        "group": "DBD-Raws",
+        "title": "Tate no Yuusha no Nariagari",
+        "season": 1,
+        "episode": null,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "03"
       },
       "pred": {
+        "group": "DBD-Raws",
+        "title": "Tate no Yuusha no Nariagari",
         "season": 1,
+        "episode": 3,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "03"
       }
     },
     {
+      "filename": "[DBD-Raws][Kimi no Iro][PV][12][1080P][BDRip][HEVC-10bit][FLAC]",
       "errors": {
+        "episode": {
           "gold": null,
+          "pred": "12"
         }
       },
       "gold": {
+        "group": "DBD-Raws",
+        "title": "Kimi no Iro",
         "season": null,
+        "episode": null,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "12"
       },
       "pred": {
+        "group": "DBD-Raws",
+        "title": "Kimi no Iro",
+        "season": null,
+        "episode": 12,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "12"
       }
     },
     {
+      "filename": "[DBD-Raws][Hime-sama Goumon no Jikan Desu][PV][02][1080P][BDRip][HEVC-10bit][FLAC]",
       "errors": {
+        "episode": {
           "gold": null,
+          "pred": "2"
         }
       },
       "gold": {
+        "group": "DBD-Raws",
+        "title": "Hime-sama Goumon no Jikan Desu",
         "season": null,
+        "episode": null,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "02"
       },
       "pred": {
+        "group": "DBD-Raws",
+        "title": "Hime-sama Goumon no Jikan Desu",
+        "season": null,
+        "episode": 2,
+        "resolution": "1080P",
+        "source": "BDRip",
+        "special": "02"
       }
     },
     {
+      "filename": "Mahou.no.Angel.Sweet.Mint.TV.1990.DVDRip-Hi.x264.AC3.1024.EP21-nezumi",
       "errors": {
+        "title": {
+          "gold": "mahou.no.angel.sweet.mint.tv.1990. -hi. .ac",
+          "pred": "mahou.no.angel.sweet.mint.tv.1 -h"
         }
       },
       "gold": {
+        "group": null,
+        "title": "Mahou.no.Angel.Sweet.Mint.TV.1990. -Hi. .AC",
         "season": null,
+        "episode": 21,
+        "resolution": null,
+        "source": "DVDRip",
+        "special": null
       },
       "pred": {
+        "group": null,
+        "title": "Mahou.no.Angel.Sweet.Mint.TV.1 -H",
+        "season": null,
+        "episode": 21,
+        "resolution": null,
+        "source": "DVDRip",
+        "special": null
       }
     }
   ]

run_metadata.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "experiment_name": "dmhy-char-guoman-relabel",
-  "data_file": "datasets/AnimeName/dmhy_weak_char.jsonl",
   "tokenizer_variant": "char",
   "vocab_file": "datasets/AnimeName/vocab.char.json",
   "vocab_size": 6199,
@@ -9,15 +9,15 @@
   "num_hidden_layers": 4,
   "num_attention_heads": 8,
   "intermediate_size": 1024,
-  "train_samples": 619361,
-  "eval_samples": 12641,
-  "epochs": 2.0,
-  "batch_size": 256,
-  "learning_rate": 8e-05,
-  "warmup_steps": 300,
-  "seed": 52,
-  "device": "cuda",
-  "fp16": true,
   "gradient_accumulation_steps": 1,
-  "dataloader_num_workers": 4
 }

 {
+  "experiment_name": "dmhy-char-special-focus2",
+  "data_file": "data/repair_focus_char.jsonl",
   "tokenizer_variant": "char",
   "vocab_file": "datasets/AnimeName/vocab.char.json",
   "vocab_size": 6199,
   "num_hidden_layers": 4,
   "num_attention_heads": 8,
   "intermediate_size": 1024,
+  "train_samples": 68939,
+  "eval_samples": 3629,
+  "epochs": 1.0,
+  "batch_size": 64,
+  "learning_rate": 3e-05,
+  "warmup_steps": 50,
+  "seed": 75,
+  "device": "cpu",
+  "fp16": false,
   "gradient_accumulation_steps": 1,
+  "dataloader_num_workers": 0
 }

trainer_eval_metrics.json CHANGED Viewed

@@ -1,11 +1,11 @@
 {
-  "eval_loss": 0.005763721186667681,
-  "eval_precision": 0.9921522239605195,
-  "eval_recall": 0.9946191314105016,
-  "eval_f1": 0.9933841461473317,
-  "eval_accuracy": 0.9980711558885925,
-  "eval_runtime": 45.558,
-  "eval_samples_per_second": 277.471,
-  "eval_steps_per_second": 1.098,
-  "epoch": 2.0
 }

 {
+  "eval_loss": 0.03365034610033035,
+  "eval_precision": 0.9612760834670947,
+  "eval_recall": 0.9719629960236955,
+  "eval_f1": 0.9665900012105072,
+  "eval_accuracy": 0.990421109705404,
+  "eval_runtime": 13.2008,
+  "eval_samples_per_second": 274.908,
+  "eval_steps_per_second": 4.318,
+  "epoch": 1.0
 }

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f01503ec029ec161063c2d78a00732c80072525b8d258c7c717b2e21f4f55d93
 size 5265

 version https://git-lfs.github.com/spec/v1
+oid sha256:b23b375ad7f991bc460e29c07b8250afa09ec2d62bad255e0fc6125f0982c56d
 size 5265