Implement schema v2 anime filename labels

Browse files

Files changed (16) hide show

anifilebert/config.py +8 -13
anifilebert/dataset.py +2 -1
anifilebert/inference.py +99 -35
anifilebert/label_repairs.py +11 -9
anifilebert/labels.py +205 -0
anifilebert/model.py +116 -0
anifilebert/train.py +54 -9
data/parser_regression_cases.json +21 -0
label_schema.json +80 -0
tools/build_path_focus_dataset.py +40 -12
tools/build_path_prefix_dataset.py +110 -12
tools/build_repair_focus_dataset.py +29 -18
tools/evaluate_parser_cases.py +15 -1
tools/rust_dmhy_template_apply/src/main.rs +886 -235
tools/virtual_dataset_generator/src/bin/case_combo_generator.rs +129 -50
tools/virtual_dataset_generator/src/main.rs +477 -94

anifilebert/config.py CHANGED Viewed

@@ -4,7 +4,9 @@ All hyperparameters are centralized here for easy tuning.
 """
-from dataclasses import dataclass, field
 @dataclass
@@ -50,24 +52,17 @@ class Config:
     cls_token: str = "[CLS]"
     sep_token: str = "[SEP]"
-    # BIO label scheme (8 entity types + O)
     label2id: dict = None
     id2label: dict = None
     def __post_init__(self):
         if self.label2id is None:
-            self.label2id = {
-                "O": 0,
-                "B-TITLE": 1, "I-TITLE": 2,
-                "B-SEASON": 3, "I-SEASON": 4,
-                "B-EPISODE": 5, "I-EPISODE": 6,
-                "B-SPECIAL": 7, "I-SPECIAL": 8,
-                "B-GROUP": 9, "I-GROUP": 10,
-                "B-RESOLUTION": 11, "I-RESOLUTION": 12,
-                "B-SOURCE": 13, "I-SOURCE": 14,
-            }
         if self.id2label is None:
-            self.id2label = {v: k for k, v in self.label2id.items()}
     @property
     def num_labels(self) -> int:

 """
+from dataclasses import dataclass
+from .labels import LABEL_SCHEMA_VERSION, make_id2label, make_label2id
 @dataclass
     cls_token: str = "[CLS]"
     sep_token: str = "[SEP]"
+    # BIO label scheme
+    label_schema_version: int = LABEL_SCHEMA_VERSION
     label2id: dict = None
     id2label: dict = None
     def __post_init__(self):
+        using_default_labels = self.label2id is None
         if self.label2id is None:
+            self.label2id = make_label2id()
         if self.id2label is None:
+            self.id2label = make_id2label() if using_default_labels else {v: k for k, v in self.label2id.items()}
     @property
     def num_labels(self) -> int:

anifilebert/dataset.py CHANGED Viewed

@@ -14,6 +14,7 @@ from typing import Dict, List, Optional, Sequence, Tuple
 from .config import Config
 from .label_repairs import repair_sequel_season_labels
 from .tokenizer import AnimeTokenizer
@@ -33,7 +34,7 @@ def encode_token_classification_values(
     input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
     label_ids: List[int] = [-100]
-    label_ids.extend(label2id.get(label, 0) for label in labels)
     label_ids.append(-100)
     attention_mask = [1] * len(input_ids)

 from .config import Config
 from .label_repairs import repair_sequel_season_labels
+from .labels import canonical_bio_label
 from .tokenizer import AnimeTokenizer
     input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
     label_ids: List[int] = [-100]
+    label_ids.extend(label2id.get(canonical_bio_label(str(label)), 0) for label in labels)
     label_ids.append(-100)
     attention_mask = [1] * len(input_ids)

anifilebert/inference.py CHANGED Viewed

@@ -19,6 +19,7 @@ import torch
 from .config import Config
 from .label_repairs import season_marker_number
 from .model import load_model
 from .tokenizer import AnimeTokenizer, load_tokenizer
@@ -289,6 +290,55 @@ def constrained_bio_decode(emissions: torch.Tensor, id2label: Dict[int, str]) ->
     return decoded
 def postprocess(
     tokens: List[str],
     labels: List[str],
@@ -300,53 +350,68 @@ def postprocess(
     Merges consecutive B- / I- tokens of the same entity type,
     then extracts structured fields.
     """
-    result: Dict = {
-        "title": None,
-        "season": None,
-        "episode": None,
-        "group": None,
-        "resolution": None,
-        "source": None,
-        "special": None,
-    }
     entities = labels_to_entities(tokens, labels, tokenizer)
     grouped_entities: Dict[str, List[str]] = {}
-    for entity_type, text in entities:
         grouped_entities.setdefault(entity_type, []).append(text)
-    title_fragments = [
-        cleaned for text in grouped_entities.get("TITLE", [])
-        if (cleaned := normalize_field_text(text))
-    ]
-    if title_fragments:
-        result["title"] = " ".join(title_fragments)
     for text in grouped_entities.get("SEASON", []):
             season_num = extract_season_number(text)
             if season_num is not None:
                 result["season"] = season_num
     for text in grouped_entities.get("EPISODE", []):
-            ep_num = extract_episode_number(text)
-            if ep_num is not None:
-                if result["episode"] is None:
-                    result["episode"] = ep_num
     for text in grouped_entities.get("GROUP", []):
-            group = normalize_field_text(text)
-            if result["group"] is None:
-                result["group"] = group
     for text in grouped_entities.get("SPECIAL", []):
-            special = normalize_field_text(text)
-            result["special"] = special
     for text in grouped_entities.get("RESOLUTION", []):
-            res = extract_resolution(text)
-            if res:
-                result["resolution"] = res
     result["source"] = choose_thin_source(grouped_entities.get("SOURCE", []))
@@ -359,6 +424,7 @@ def postprocess(
         or "月番" in result["title"]
     ):
         result["title"] = new_show_title
     search_special = extract_bracketed_search_special(whole_text)
     if search_special is not None:
@@ -375,6 +441,8 @@ def postprocess(
                 "resolution": None,
                 "source": None,
                 "special": standalone_special,
             }
         )
@@ -406,9 +474,7 @@ def parse_filename(
     # Tokenize
     tokens = tokenizer.tokenize(filename)
     if not tokens:
-        return {"title": None, "season": None, "episode": None,
-                "group": None, "resolution": None, "source": None,
-                "special": None}
     # Convert to input IDs
     input_ids = tokenizer.convert_tokens_to_ids(tokens)
@@ -451,9 +517,7 @@ def parse_filename(
     # Truncate real tokens if we had to truncate
     available = min(real_token_count, max_length - 2)
     if available <= 0:
-        return {"title": None, "season": None, "episode": None,
-                "group": None, "resolution": None, "source": None,
-                "special": None}
     with torch.no_grad():
         logits = model(input_ids=input_tensor, attention_mask=mask_tensor).logits

 from .config import Config
 from .label_repairs import season_marker_number
+from .labels import is_file_title_entity, is_path_title_entity, title_entity_priority, title_language
 from .model import load_model
 from .tokenizer import AnimeTokenizer, load_tokenizer
     return decoded
+def empty_parse_result() -> Dict:
+    return {
+        "title": None,
+        "season": None,
+        "episode": None,
+        "group": None,
+        "resolution": None,
+        "source": None,
+        "special": None,
+        "title_candidates": [],
+        "tags": [],
+    }
+def append_unique(values: List[str], value: str) -> None:
+    if value and value not in values:
+        values.append(value)
+def infer_title_kind(text: str) -> str:
+    has_latin = any(ch.isascii() and ch.isalpha() for ch in text)
+    has_han = any("\u4e00" <= ch <= "\u9fff" for ch in text)
+    has_kana = any("\u3040" <= ch <= "\u30ff" or "\u31f0" <= ch <= "\u31ff" for ch in text)
+    if has_kana:
+        return "jpn"
+    if has_latin and has_han:
+        return "mixed"
+    if has_han:
+        return "chs"
+    if has_latin:
+        return "latin"
+    return "mixed"
+def append_title_candidate(result: Dict, text: str, entity: Optional[str], source: str) -> None:
+    if not text:
+        return
+    kind = title_language(entity).lower() if entity else infer_title_kind(text)
+    candidate = {"text": text, "kind": kind, "source": source}
+    if candidate not in result["title_candidates"]:
+        result["title_candidates"].append(candidate)
+def choose_title_span(spans: List[Tuple[str, str, int]]) -> Optional[str]:
+    if not spans:
+        return None
+    return min(spans, key=lambda item: (title_entity_priority(item[0]), item[2]))[1]
 def postprocess(
     tokens: List[str],
     labels: List[str],
     Merges consecutive B- / I- tokens of the same entity type,
     then extracts structured fields.
     """
+    result: Dict = empty_parse_result()
     entities = labels_to_entities(tokens, labels, tokenizer)
     grouped_entities: Dict[str, List[str]] = {}
+    file_title_spans: List[Tuple[str, str, int]] = []
+    path_title_spans: List[Tuple[str, str, int]] = []
+    for index, (entity_type, text) in enumerate(entities):
         grouped_entities.setdefault(entity_type, []).append(text)
+        title = normalize_field_text(text)
+        if not title:
+            continue
+        if is_file_title_entity(entity_type):
+            file_title_spans.append((entity_type, title, index))
+        elif is_path_title_entity(entity_type):
+            path_title_spans.append((entity_type, title, index))
+    for entity, title, _index in file_title_spans:
+        append_title_candidate(result, title, entity, "file")
+    for entity, title, _index in path_title_spans:
+        append_title_candidate(result, title, entity, "path")
+    if file_title_spans and all(entity == "TITLE" for entity, _title, _index in file_title_spans):
+        result["title"] = " ".join(title for _entity, title, _index in file_title_spans)
+    else:
+        result["title"] = choose_title_span(file_title_spans) or choose_title_span(path_title_spans)
     for text in grouped_entities.get("SEASON", []):
+        season_num = extract_season_number(text)
+        if season_num is not None:
+            result["season"] = season_num
+            break
+    if result["season"] is None:
+        for text in grouped_entities.get("PATH_SEASON", []):
             season_num = extract_season_number(text)
             if season_num is not None:
                 result["season"] = season_num
+                break
     for text in grouped_entities.get("EPISODE", []):
+        ep_num = extract_episode_number(text)
+        if ep_num is not None:
+            if result["episode"] is None:
+                result["episode"] = ep_num
     for text in grouped_entities.get("GROUP", []):
+        group = normalize_field_text(text)
+        if result["group"] is None:
+            result["group"] = group
     for text in grouped_entities.get("SPECIAL", []):
+        special = normalize_field_text(text)
+        result["special"] = special
     for text in grouped_entities.get("RESOLUTION", []):
+        res = extract_resolution(text)
+        if res:
+            result["resolution"] = res
+    for text in grouped_entities.get("TAG", []):
+        tag = normalize_field_text(text)
+        append_unique(result["tags"], tag)
     result["source"] = choose_thin_source(grouped_entities.get("SOURCE", []))
         or "月番" in result["title"]
     ):
         result["title"] = new_show_title
+        append_title_candidate(result, new_show_title, None, "file")
     search_special = extract_bracketed_search_special(whole_text)
     if search_special is not None:
                 "resolution": None,
                 "source": None,
                 "special": standalone_special,
+                "title_candidates": [],
+                "tags": [],
             }
         )
     # Tokenize
     tokens = tokenizer.tokenize(filename)
     if not tokens:
+        return empty_parse_result()
     # Convert to input IDs
     input_ids = tokenizer.convert_tokens_to_ids(tokens)
     # Truncate real tokens if we had to truncate
     available = min(real_token_count, max_length - 2)
     if available <= 0:
+        return empty_parse_result()
     with torch.no_grad():
         logits = model(input_ids=input_tensor, attention_mask=mask_tensor).logits

anifilebert/label_repairs.py CHANGED Viewed

@@ -6,6 +6,8 @@ import re
 from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional, Sequence, Tuple
 SEPARATOR_CHARS = set(" \t-_.|~～")
@@ -282,7 +284,7 @@ def find_sequel_season_markers(text: str) -> List[LabelRepair]:
 def labels_have_season_before(labels: Sequence[str], offsets: Sequence[Tuple[int, int]], marker_start: int) -> bool:
-    return any(label.endswith("SEASON") and end <= marker_start for label, (_start, end) in zip(labels, offsets))
 def token_indices_for_span(offsets: Sequence[Tuple[int, int]], start: int, end: int) -> List[int]:
@@ -293,7 +295,7 @@ def token_indices_for_span(offsets: Sequence[Tuple[int, int]], start: int, end:
 def label_span(labels: List[str], indices: Sequence[int], entity: str) -> None:
-    previous_is_same_entity = bool(indices) and indices[0] > 0 and labels[indices[0] - 1].endswith(entity)
     first = not previous_is_same_entity
     for idx in indices:
         labels[idx] = f"B-{entity}" if first else f"I-{entity}"
@@ -301,7 +303,7 @@ def label_span(labels: List[str], indices: Sequence[int], entity: str) -> None:
 def label_span_if_changed(labels: List[str], indices: Sequence[int], entity: str) -> bool:
-    previous_is_same_entity = bool(indices) and indices[0] > 0 and labels[indices[0] - 1].endswith(entity)
     first_label = f"I-{entity}" if previous_is_same_entity else f"B-{entity}"
     expected = [first_label] + [f"I-{entity}"] * max(0, len(indices) - 1)
     if [labels[idx] for idx in indices] == expected:
@@ -314,7 +316,7 @@ def safe_to_overwrite_meta(labels: Sequence[str], indices: Sequence[int]) -> boo
     if not indices:
         return False
     return not any(
-        labels[idx].endswith(("GROUP", "EPISODE", "SEASON"))
         for idx in indices
     )
@@ -328,12 +330,12 @@ def mark_adjacent_title_separators_o(
         return
     idx = marker_indices[0] - 1
-    while idx >= 0 and "".join(tokens[idx]).strip() == "" and labels[idx].endswith("TITLE"):
         labels[idx] = "O"
         idx -= 1
     idx = marker_indices[-1] + 1
-    while idx < len(tokens) and tokens[idx] in SEPARATOR_CHARS and labels[idx].endswith("TITLE"):
         labels[idx] = "O"
         idx += 1
@@ -341,7 +343,7 @@ def mark_adjacent_title_separators_o(
 def first_episode_end(labels: Sequence[str], offsets: Sequence[Tuple[int, int]], text: str) -> int:
     ends = [
         end for label, (_start, end) in zip(labels, offsets)
-        if label.endswith("EPISODE")
     ]
     if ends:
         return min(ends)
@@ -465,11 +467,11 @@ def repair_known_label_issues(
                 continue
             existing = [repaired_labels[idx] for idx in indices]
             if any(
-                label.endswith(("GROUP", "EPISODE", "RESOLUTION", "SOURCE", "SPECIAL"))
                 for label in existing
             ):
                 continue
-            if not any(label.endswith("TITLE") for label in existing):
                 continue
             label_span(repaired_labels, indices, "SEASON")

 from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional, Sequence, Tuple
+from .labels import is_same_entity_label, is_season_like_label, is_title_like_label, label_entity
 SEPARATOR_CHARS = set(" \t-_.|~～")
 def labels_have_season_before(labels: Sequence[str], offsets: Sequence[Tuple[int, int]], marker_start: int) -> bool:
+    return any(is_season_like_label(label) and end <= marker_start for label, (_start, end) in zip(labels, offsets))
 def token_indices_for_span(offsets: Sequence[Tuple[int, int]], start: int, end: int) -> List[int]:
 def label_span(labels: List[str], indices: Sequence[int], entity: str) -> None:
+    previous_is_same_entity = bool(indices) and indices[0] > 0 and is_same_entity_label(labels[indices[0] - 1], entity)
     first = not previous_is_same_entity
     for idx in indices:
         labels[idx] = f"B-{entity}" if first else f"I-{entity}"
 def label_span_if_changed(labels: List[str], indices: Sequence[int], entity: str) -> bool:
+    previous_is_same_entity = bool(indices) and indices[0] > 0 and is_same_entity_label(labels[indices[0] - 1], entity)
     first_label = f"I-{entity}" if previous_is_same_entity else f"B-{entity}"
     expected = [first_label] + [f"I-{entity}"] * max(0, len(indices) - 1)
     if [labels[idx] for idx in indices] == expected:
     if not indices:
         return False
     return not any(
+        label_entity(labels[idx]) in {"GROUP", "EPISODE", "SEASON", "PATH_SEASON"}
         for idx in indices
     )
         return
     idx = marker_indices[0] - 1
+    while idx >= 0 and "".join(tokens[idx]).strip() == "" and is_title_like_label(labels[idx]):
         labels[idx] = "O"
         idx -= 1
     idx = marker_indices[-1] + 1
+    while idx < len(tokens) and tokens[idx] in SEPARATOR_CHARS and is_title_like_label(labels[idx]):
         labels[idx] = "O"
         idx += 1
 def first_episode_end(labels: Sequence[str], offsets: Sequence[Tuple[int, int]], text: str) -> int:
     ends = [
         end for label, (_start, end) in zip(labels, offsets)
+        if label_entity(label) == "EPISODE"
     ]
     if ends:
         return min(ends)
                 continue
             existing = [repaired_labels[idx] for idx in indices]
             if any(
+                label_entity(label) in {"GROUP", "EPISODE", "RESOLUTION", "SOURCE", "SPECIAL", "TAG", "PATH_SEASON"}
                 for label in existing
             ):
                 continue
+            if not any(is_title_like_label(label) for label in existing):
                 continue
             label_span(repaired_labels, indices, "SEASON")

anifilebert/labels.py ADDED Viewed

	@@ -0,0 +1,205 @@

+"""Shared BIO label schema and helpers."""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Dict, Optional, Tuple
+LABEL_SCHEMA_VERSION = 2
+TITLE_SUFFIXES = ("CHS", "CHT", "JPN", "LATIN", "MIXED")
+TITLE_PRIORITY = ("CHS", "CHT", "JPN", "MIXED", "LATIN")
+FILE_TITLE_ENTITIES = tuple(f"TITLE_{suffix}" for suffix in TITLE_SUFFIXES)
+PATH_TITLE_ENTITIES = tuple(f"PATH_TITLE_{suffix}" for suffix in TITLE_SUFFIXES)
+TITLE_ENTITIES = FILE_TITLE_ENTITIES + PATH_TITLE_ENTITIES
+TITLE_LIKE_ENTITIES = TITLE_ENTITIES + ("TITLE",)
+SEASON_LIKE_ENTITIES = ("SEASON", "PATH_SEASON")
+DEFAULT_TITLE_ENTITY = "TITLE_MIXED"
+_FALLBACK_LABELS = (
+    "O",
+    "B-TITLE_CHS",
+    "I-TITLE_CHS",
+    "B-TITLE_CHT",
+    "I-TITLE_CHT",
+    "B-TITLE_JPN",
+    "I-TITLE_JPN",
+    "B-TITLE_LATIN",
+    "I-TITLE_LATIN",
+    "B-TITLE_MIXED",
+    "I-TITLE_MIXED",
+    "B-PATH_TITLE_CHS",
+    "I-PATH_TITLE_CHS",
+    "B-PATH_TITLE_CHT",
+    "I-PATH_TITLE_CHT",
+    "B-PATH_TITLE_JPN",
+    "I-PATH_TITLE_JPN",
+    "B-PATH_TITLE_LATIN",
+    "I-PATH_TITLE_LATIN",
+    "B-PATH_TITLE_MIXED",
+    "I-PATH_TITLE_MIXED",
+    "B-PATH_SEASON",
+    "I-PATH_SEASON",
+    "B-SEASON",
+    "I-SEASON",
+    "B-EPISODE",
+    "I-EPISODE",
+    "B-SPECIAL",
+    "I-SPECIAL",
+    "B-GROUP",
+    "I-GROUP",
+    "B-RESOLUTION",
+    "I-RESOLUTION",
+    "B-SOURCE",
+    "I-SOURCE",
+    "B-TAG",
+    "I-TAG",
+)
+def _load_schema_labels() -> Tuple[str, ...]:
+    schema_path = Path(__file__).resolve().parents[1] / "label_schema.json"
+    try:
+        with schema_path.open("r", encoding="utf-8") as fh:
+            payload = json.load(fh)
+    except OSError:
+        return _FALLBACK_LABELS
+    labels = payload.get("labels")
+    if not isinstance(labels, list) or not labels:
+        return _FALLBACK_LABELS
+    if not all(isinstance(label, str) and label for label in labels):
+        return _FALLBACK_LABELS
+    return tuple(labels)
+LABELS = _load_schema_labels()
+LEGACY_15_LABELS = (
+    "O",
+    "B-TITLE",
+    "I-TITLE",
+    "B-SEASON",
+    "I-SEASON",
+    "B-EPISODE",
+    "I-EPISODE",
+    "B-SPECIAL",
+    "I-SPECIAL",
+    "B-GROUP",
+    "I-GROUP",
+    "B-RESOLUTION",
+    "I-RESOLUTION",
+    "B-SOURCE",
+    "I-SOURCE",
+)
+LABEL2ID = {label: idx for idx, label in enumerate(LABELS)}
+ID2LABEL = {idx: label for idx, label in enumerate(LABELS)}
+def make_label2id() -> Dict[str, int]:
+    return dict(LABEL2ID)
+def make_id2label() -> Dict[int, str]:
+    return dict(ID2LABEL)
+def split_bio_label(label: str) -> Tuple[Optional[str], Optional[str]]:
+    if not isinstance(label, str) or label == "O":
+        return None, None
+    prefix, sep, entity = label.partition("-")
+    if sep != "-" or prefix not in {"B", "I"} or not entity:
+        return None, None
+    return prefix, entity
+def label_entity(label: str) -> Optional[str]:
+    return split_bio_label(label)[1]
+def canonical_entity(entity: str) -> str:
+    return DEFAULT_TITLE_ENTITY if entity == "TITLE" else entity
+def canonical_bio_label(label: str) -> str:
+    prefix, entity = split_bio_label(label)
+    if prefix is None or entity is None:
+        return "O" if label == "O" else label
+    return f"{prefix}-{canonical_entity(entity)}"
+def is_title_entity(entity: Optional[str]) -> bool:
+    return entity in TITLE_LIKE_ENTITIES
+def is_file_title_entity(entity: Optional[str]) -> bool:
+    return entity in FILE_TITLE_ENTITIES or entity == "TITLE"
+def is_path_title_entity(entity: Optional[str]) -> bool:
+    return entity in PATH_TITLE_ENTITIES
+def is_title_like_label(label: str) -> bool:
+    return is_title_entity(label_entity(label))
+def is_season_entity(entity: Optional[str]) -> bool:
+    return entity in SEASON_LIKE_ENTITIES
+def is_season_like_label(label: str) -> bool:
+    return is_season_entity(label_entity(label))
+def is_same_entity_label(label: str, entity: str) -> bool:
+    return label_entity(label) == entity
+def title_language(entity: Optional[str]) -> str:
+    if entity == "TITLE":
+        return "MIXED"
+    if not entity:
+        return "MIXED"
+    if entity.startswith("PATH_TITLE_"):
+        return entity.removeprefix("PATH_TITLE_")
+    if entity.startswith("TITLE_"):
+        return entity.removeprefix("TITLE_")
+    return "MIXED"
+def title_entity_priority(entity: Optional[str]) -> Tuple[int, int]:
+    language = title_language(entity)
+    language_rank = TITLE_PRIORITY.index(language) if language in TITLE_PRIORITY else len(TITLE_PRIORITY)
+    path_rank = 1 if is_path_title_entity(entity) else 0
+    return path_rank, language_rank
+def label_migration_sources(target_label: str) -> Tuple[str, ...]:
+    """Return old-label candidates that can initialize a target label row."""
+    if target_label == "O":
+        return ("O",)
+    prefix, entity = split_bio_label(target_label)
+    if prefix is None or entity is None:
+        return (target_label,)
+    sources = [target_label]
+    if is_title_entity(entity):
+        sources.append(f"{prefix}-TITLE")
+    elif entity == "PATH_SEASON":
+        sources.append(f"{prefix}-SEASON")
+    return tuple(dict.fromkeys(sources))
+def infer_legacy_id2label(num_labels: int) -> Optional[Dict[int, str]]:
+    if num_labels == len(LEGACY_15_LABELS):
+        return {idx: label for idx, label in enumerate(LEGACY_15_LABELS)}
+    if num_labels == len(LABELS):
+        return make_id2label()
+    return None

anifilebert/model.py CHANGED Viewed

@@ -18,6 +18,7 @@ from transformers.modeling_outputs import TokenClassifierOutput
 from transformers.modeling_utils import PreTrainedModel
 from .config import Config
 class LinearChainCRF(nn.Module):
@@ -266,6 +267,7 @@ def build_bert_config(config: Config) -> BertConfig:
         attention_probs_dropout_prob=config.attention_probs_dropout_prob,
         id2label=config.id2label,
         label2id=config.label2id,
     )
@@ -314,6 +316,120 @@ def load_model(model_dir: str, model_head: Optional[str] = None) -> PreTrainedMo
     return BertForTokenClassification.from_pretrained(model_dir)
 def save_model_head_config(model: PreTrainedModel, model_head: str) -> None:
     """Persist the selected head in config.json for later auto-loading."""
     head = normalize_model_head(model_head)

 from transformers.modeling_utils import PreTrainedModel
 from .config import Config
+from .labels import infer_legacy_id2label, label_migration_sources
 class LinearChainCRF(nn.Module):
         attention_probs_dropout_prob=config.attention_probs_dropout_prob,
         id2label=config.id2label,
         label2id=config.label2id,
+        label_schema_version=config.label_schema_version,
     )
     return BertForTokenClassification.from_pretrained(model_dir)
+def _model_id2label_for_migration(model: PreTrainedModel) -> dict[int, str]:
+    raw_id2label = getattr(model.config, "id2label", None) or {}
+    normalized = {int(label_id): str(label) for label_id, label in raw_id2label.items()}
+    classifier = getattr(model, "classifier", None)
+    out_features = getattr(classifier, "out_features", None)
+    if out_features is not None and len(normalized) != int(out_features):
+        inferred = infer_legacy_id2label(int(out_features))
+        if inferred is not None:
+            return inferred
+    return normalized
+def migrate_token_classifier_labels(
+    model: PreTrainedModel,
+    target_label2id: dict[str, int],
+    target_id2label: dict[int, str],
+) -> dict[str, object]:
+    """
+    Expand or reorder token-classification label rows for the shared schema.
+    Exact labels are copied by name. Legacy 15-label TITLE rows initialize all
+    title-like rows, and legacy SEASON rows initialize PATH_SEASON.
+    """
+    classifier = getattr(model, "classifier", None)
+    if classifier is None or not isinstance(classifier, nn.Linear):
+        return {"changed": False, "reason": "no_linear_classifier"}
+    target_id2label = {int(label_id): str(label) for label_id, label in target_id2label.items()}
+    target_label2id = {str(label): int(label_id) for label, label_id in target_label2id.items()}
+    old_id2label = _model_id2label_for_migration(model)
+    old_label2id = {label: label_id for label_id, label in old_id2label.items()}
+    old_num_labels = int(classifier.out_features)
+    new_num_labels = len(target_label2id)
+    same_schema = (
+        old_num_labels == new_num_labels
+        and all(old_id2label.get(idx) == target_id2label.get(idx) for idx in range(new_num_labels))
+    )
+    if same_schema:
+        model.config.num_labels = new_num_labels
+        model.config.id2label = target_id2label
+        model.config.label2id = target_label2id
+        return {"changed": False, "copied": new_num_labels, "target_labels": new_num_labels}
+    old_weight = classifier.weight.detach()
+    old_bias = classifier.bias.detach() if classifier.bias is not None else None
+    new_classifier = nn.Linear(
+        classifier.in_features,
+        new_num_labels,
+        bias=classifier.bias is not None,
+        device=old_weight.device,
+        dtype=old_weight.dtype,
+    )
+    nn.init.normal_(
+        new_classifier.weight,
+        mean=0.0,
+        std=getattr(model.config, "initializer_range", 0.02),
+    )
+    if new_classifier.bias is not None:
+        nn.init.zeros_(new_classifier.bias)
+    row_sources: dict[int, int] = {}
+    copied = 0
+    for target_label, target_id in target_label2id.items():
+        for source_label in label_migration_sources(target_label):
+            source_id = old_label2id.get(source_label)
+            if source_id is None or source_id >= old_num_labels:
+                continue
+            new_classifier.weight.data[target_id].copy_(old_weight[source_id])
+            if new_classifier.bias is not None and old_bias is not None:
+                new_classifier.bias.data[target_id].copy_(old_bias[source_id])
+            row_sources[target_id] = source_id
+            copied += 1
+            break
+    model.classifier = new_classifier
+    model.num_labels = new_num_labels
+    model.config.num_labels = new_num_labels
+    model.config.id2label = target_id2label
+    model.config.label2id = target_label2id
+    if hasattr(model, "crf"):
+        old_crf = model.crf
+        new_crf = LinearChainCRF(new_num_labels, target_id2label).to(
+            device=old_weight.device,
+            dtype=old_weight.dtype,
+        )
+        nn.init.zeros_(new_crf.start_transitions)
+        nn.init.zeros_(new_crf.end_transitions)
+        nn.init.zeros_(new_crf.transitions)
+        with torch.no_grad():
+            for target_id, source_id in row_sources.items():
+                if source_id < old_crf.start_transitions.shape[0]:
+                    new_crf.start_transitions[target_id].copy_(old_crf.start_transitions[source_id])
+                    new_crf.end_transitions[target_id].copy_(old_crf.end_transitions[source_id])
+            for target_to_id, source_to_id in row_sources.items():
+                for target_from_id, source_from_id in row_sources.items():
+                    if (
+                        source_from_id < old_crf.transitions.shape[0]
+                        and source_to_id < old_crf.transitions.shape[1]
+                    ):
+                        new_crf.transitions[target_from_id, target_to_id].copy_(
+                            old_crf.transitions[source_from_id, source_to_id]
+                        )
+        model.crf = new_crf
+    return {
+        "changed": True,
+        "source_labels": old_num_labels,
+        "target_labels": new_num_labels,
+        "copied": copied,
+    }
 def save_model_head_config(model: PreTrainedModel, model_head: str) -> None:
     """Persist the selected head in config.json for later auto-loading."""
     head = normalize_model_head(model_head)

anifilebert/train.py CHANGED Viewed

@@ -33,9 +33,22 @@ from seqeval.metrics import classification_report, accuracy_score, f1_score, pre
 from .config import Config
 from .tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
-from .model import create_model, print_model_summary, count_parameters, load_model, save_model_head_config
 from .dataset import AnimeItemsDataset, EncodedAnimeDataset, labels_for_tokenizer
 from .inference import parse_filename, postprocess
 from .virtual_dataset import DatasetRangeView, ShardedEncodedDataset
@@ -329,13 +342,23 @@ def extract_entities_from_labels(tokens: Sequence[str], labels: Sequence[str]) -
     active_tokens: List[str] = []
     for token, label in zip(tokens, labels):
         if label.startswith("B-"):
             if active_entity and active_tokens:
                 entities.setdefault(active_entity, []).append("".join(active_tokens))
-            active_entity = label[2:]
             active_tokens = [str(token)]
-        elif label.startswith("I-") and active_entity == label[2:]:
-            active_tokens.append(str(token))
         else:
             if active_entity and active_tokens:
                 entities.setdefault(active_entity, []).append("".join(active_tokens))
@@ -358,6 +381,7 @@ def char_item_from_spans(filename: str, spans: Sequence[tuple[str, str]], source
     for text, entity in spans:
         if not text:
             continue
         start = filename.find(text, cursor)
         if start < 0:
             start = filename.find(text)
@@ -386,6 +410,7 @@ def entity_keep_probability(entity: str) -> float:
         "SPECIAL": 0.3,
         "RESOLUTION": 0.65,
         "SOURCE": 0.65,
     }.get(entity, 0.5)
@@ -397,6 +422,7 @@ def build_partial_augmented_item(item: Dict, max_chars: int) -> List[Dict]:
     special = next((value.strip() for value in entities.get("SPECIAL", []) if value.strip()), None)
     resolution = next((value.strip() for value in entities.get("RESOLUTION", []) if value.strip()), None)
     source = next((value.strip() for value in entities.get("SOURCE", []) if value.strip()), None)
     specs: List[tuple[str, List[tuple[str, str]]]] = []
     if title:
@@ -418,6 +444,8 @@ def build_partial_augmented_item(item: Dict, max_chars: int) -> List[Dict]:
         specs.append((special, [(special, "SPECIAL")]))
     if title and special:
         specs.append((f"{title} - {special}", [(title, "TITLE"), (special, "SPECIAL")]))
     augmented: List[Dict] = []
     for text, spans in specs:
@@ -432,7 +460,7 @@ def build_permutation_augmented_item(item: Dict, rng: random.Random, max_chars:
     entities = extract_entities_from_labels(item.get("tokens", []), item.get("labels", []))
     available = [
         entity
-        for entity in ("GROUP", "TITLE", "SEASON", "EPISODE", "SPECIAL", "RESOLUTION", "SOURCE")
         if entities.get(entity)
     ]
     if not available:
@@ -458,7 +486,7 @@ def build_permutation_augmented_item(item: Dict, rng: random.Random, max_chars:
         if not values:
             continue
         value = rng.choice(values)
-        if entity in {"GROUP", "EPISODE", "SPECIAL", "RESOLUTION", "SOURCE"} and rng.random() < 0.35:
             parts.append(f"[{value}]")
         else:
             parts.append(value)
@@ -1018,6 +1046,13 @@ def augment_training_data(
 def normalize_field_value(field: str, value) -> Optional[str]:
     if value is None:
         return None
     if field in {"episode", "season"}:
         try:
             return str(int(value))
@@ -1056,9 +1091,10 @@ def parse_exact_metrics(
         gold_labels = gold_labels[:available]
         gold = postprocess(tokens, gold_labels, tokenizer=tokenizer)
         gold_entities = {label.split("-", 1)[1] for label in gold_labels if label.startswith(("B-", "I-"))}
-        for optional_field, entity in (("episode", "EPISODE"), ("season", "SEASON")):
-            if entity not in gold_entities:
-                gold[optional_field] = None
         pred = parse_filename(
             filename,
             model,
@@ -1329,9 +1365,17 @@ def main():
                 f"  Remapped token embeddings: copied {copied:,}/{config.vocab_size:,} "
                 f"tokens from init checkpoint"
             )
         model.config.num_labels = config.num_labels
         model.config.id2label = config.id2label
         model.config.label2id = config.label2id
     else:
         print("Creating model...")
         selected_model_head = "linear" if args.model_head == "auto" else args.model_head
@@ -1525,6 +1569,7 @@ def main():
     # Set proper label mappings in model config before saving
     model.config.id2label = config.id2label
     model.config.label2id = config.label2id
     model.config.tokenizer_variant = tokenizer_variant
     model.config.max_seq_length = config.max_seq_length
     save_model_head_config(model, selected_model_head)

 from .config import Config
 from .tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
+from .model import (
+    create_model,
+    print_model_summary,
+    count_parameters,
+    load_model,
+    migrate_token_classifier_labels,
+    save_model_head_config,
+)
 from .dataset import AnimeItemsDataset, EncodedAnimeDataset, labels_for_tokenizer
 from .inference import parse_filename, postprocess
+from .labels import (
+    canonical_entity,
+    canonical_bio_label,
+    is_season_like_label,
+    is_title_entity,
+)
 from .virtual_dataset import DatasetRangeView, ShardedEncodedDataset
     active_tokens: List[str] = []
     for token, label in zip(tokens, labels):
+        label = canonical_bio_label(str(label))
         if label.startswith("B-"):
             if active_entity and active_tokens:
                 entities.setdefault(active_entity, []).append("".join(active_tokens))
+            entity = label[2:]
+            active_entity = "TITLE" if is_title_entity(entity) else ("SEASON" if entity == "PATH_SEASON" else entity)
             active_tokens = [str(token)]
+        elif label.startswith("I-"):
+            entity = label[2:]
+            entity = "TITLE" if is_title_entity(entity) else ("SEASON" if entity == "PATH_SEASON" else entity)
+            if active_entity == entity:
+                active_tokens.append(str(token))
+            else:
+                if active_entity and active_tokens:
+                    entities.setdefault(active_entity, []).append("".join(active_tokens))
+                active_entity = entity
+                active_tokens = [str(token)]
         else:
             if active_entity and active_tokens:
                 entities.setdefault(active_entity, []).append("".join(active_tokens))
     for text, entity in spans:
         if not text:
             continue
+        entity = canonical_entity(entity)
         start = filename.find(text, cursor)
         if start < 0:
             start = filename.find(text)
         "SPECIAL": 0.3,
         "RESOLUTION": 0.65,
         "SOURCE": 0.65,
+        "TAG": 0.35,
     }.get(entity, 0.5)
     special = next((value.strip() for value in entities.get("SPECIAL", []) if value.strip()), None)
     resolution = next((value.strip() for value in entities.get("RESOLUTION", []) if value.strip()), None)
     source = next((value.strip() for value in entities.get("SOURCE", []) if value.strip()), None)
+    tag = next((value.strip() for value in entities.get("TAG", []) if value.strip()), None)
     specs: List[tuple[str, List[tuple[str, str]]]] = []
     if title:
         specs.append((special, [(special, "SPECIAL")]))
     if title and special:
         specs.append((f"{title} - {special}", [(title, "TITLE"), (special, "SPECIAL")]))
+    if title and tag:
+        specs.append((f"{title} [{tag}]", [(title, "TITLE"), (tag, "TAG")]))
     augmented: List[Dict] = []
     for text, spans in specs:
     entities = extract_entities_from_labels(item.get("tokens", []), item.get("labels", []))
     available = [
         entity
+        for entity in ("GROUP", "TITLE", "SEASON", "EPISODE", "SPECIAL", "RESOLUTION", "SOURCE", "TAG")
         if entities.get(entity)
     ]
     if not available:
         if not values:
             continue
         value = rng.choice(values)
+        if entity in {"GROUP", "EPISODE", "SPECIAL", "RESOLUTION", "SOURCE", "TAG"} and rng.random() < 0.35:
             parts.append(f"[{value}]")
         else:
             parts.append(value)
 def normalize_field_value(field: str, value) -> Optional[str]:
     if value is None:
         return None
+    if isinstance(value, list):
+        normalized_items = [
+            normalize_field_value(field, item)
+            for item in value
+            if item is not None
+        ]
+        return "|".join(item for item in normalized_items if item)
     if field in {"episode", "season"}:
         try:
             return str(int(value))
         gold_labels = gold_labels[:available]
         gold = postprocess(tokens, gold_labels, tokenizer=tokenizer)
         gold_entities = {label.split("-", 1)[1] for label in gold_labels if label.startswith(("B-", "I-"))}
+        if "EPISODE" not in gold_entities:
+            gold["episode"] = None
+        if not any(is_season_like_label(label) for label in gold_labels):
+            gold["season"] = None
         pred = parse_filename(
             filename,
             model,
                 f"  Remapped token embeddings: copied {copied:,}/{config.vocab_size:,} "
                 f"tokens from init checkpoint"
             )
+        migration = migrate_token_classifier_labels(model, config.label2id, config.id2label)
+        if migration.get("changed"):
+            print(
+                "  Migrated token classifier labels: "
+                f"{migration.get('source_labels')} -> {migration.get('target_labels')} "
+                f"(copied {migration.get('copied')} rows)"
+            )
         model.config.num_labels = config.num_labels
         model.config.id2label = config.id2label
         model.config.label2id = config.label2id
+        model.config.label_schema_version = config.label_schema_version
     else:
         print("Creating model...")
         selected_model_head = "linear" if args.model_head == "auto" else args.model_head
     # Set proper label mappings in model config before saving
     model.config.id2label = config.id2label
     model.config.label2id = config.label2id
+    model.config.label_schema_version = config.label_schema_version
     model.config.tokenizer_variant = tokenizer_variant
     model.config.max_seq_length = config.max_seq_length
     save_model_head_config(model, selected_model_head)

data/parser_regression_cases.json CHANGED Viewed

@@ -110,6 +110,7 @@
     "id": "long_running_episode",
     "filename": "One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264",
     "expected": {
       "title": "One.Piece",
       "episode": 1110,
       "resolution": "1080p",
@@ -241,6 +242,26 @@
       "source": "GB"
     }
   },
   {
     "id": "vcb_special_iv_not_episode",
     "filename": "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",

     "id": "long_running_episode",
     "filename": "One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264",
     "expected": {
+      "group": null,
       "title": "One.Piece",
       "episode": 1110,
       "resolution": "1080p",
       "source": "GB"
     }
   },
+  {
+    "id": "path_sousou_dir_season_episode",
+    "filename": "/mnt/media/anime/Sousou no Frieren/Season 01/31.mkv",
+    "expected": {
+      "group": null,
+      "title": "Sousou no Frieren",
+      "season": 1,
+      "episode": 31
+    }
+  },
+  {
+    "id": "path_generic_title_numeric_season_episode",
+    "filename": "/mnt/media/anime/Title/01/03.mkv",
+    "expected": {
+      "group": null,
+      "title": "Title",
+      "season": 1,
+      "episode": 3
+    }
+  },
   {
     "id": "vcb_special_iv_not_episode",
     "filename": "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",

label_schema.json ADDED Viewed

	@@ -0,0 +1,80 @@

+{
+  "version": 2,
+  "labels": [
+    "O",
+    "B-TITLE_CHS",
+    "I-TITLE_CHS",
+    "B-TITLE_CHT",
+    "I-TITLE_CHT",
+    "B-TITLE_JPN",
+    "I-TITLE_JPN",
+    "B-TITLE_LATIN",
+    "I-TITLE_LATIN",
+    "B-TITLE_MIXED",
+    "I-TITLE_MIXED",
+    "B-PATH_TITLE_CHS",
+    "I-PATH_TITLE_CHS",
+    "B-PATH_TITLE_CHT",
+    "I-PATH_TITLE_CHT",
+    "B-PATH_TITLE_JPN",
+    "I-PATH_TITLE_JPN",
+    "B-PATH_TITLE_LATIN",
+    "I-PATH_TITLE_LATIN",
+    "B-PATH_TITLE_MIXED",
+    "I-PATH_TITLE_MIXED",
+    "B-PATH_SEASON",
+    "I-PATH_SEASON",
+    "B-SEASON",
+    "I-SEASON",
+    "B-EPISODE",
+    "I-EPISODE",
+    "B-SPECIAL",
+    "I-SPECIAL",
+    "B-GROUP",
+    "I-GROUP",
+    "B-RESOLUTION",
+    "I-RESOLUTION",
+    "B-SOURCE",
+    "I-SOURCE",
+    "B-TAG",
+    "I-TAG"
+  ],
+  "title_entities": [
+    "TITLE_CHS",
+    "TITLE_CHT",
+    "TITLE_JPN",
+    "TITLE_LATIN",
+    "TITLE_MIXED",
+    "PATH_TITLE_CHS",
+    "PATH_TITLE_CHT",
+    "PATH_TITLE_JPN",
+    "PATH_TITLE_LATIN",
+    "PATH_TITLE_MIXED"
+  ],
+  "file_title_entities": [
+    "TITLE_CHS",
+    "TITLE_CHT",
+    "TITLE_JPN",
+    "TITLE_LATIN",
+    "TITLE_MIXED"
+  ],
+  "path_title_entities": [
+    "PATH_TITLE_CHS",
+    "PATH_TITLE_CHT",
+    "PATH_TITLE_JPN",
+    "PATH_TITLE_LATIN",
+    "PATH_TITLE_MIXED"
+  ],
+  "title_priority": [
+    "CHS",
+    "CHT",
+    "JPN",
+    "MIXED",
+    "LATIN"
+  ],
+  "notes": {
+    "PATH_SEASON": "Season value extracted from a directory/path segment. File-level SEASON wins when both are present.",
+    "TAG": "Non-key side tags such as 国漫, 日漫, 剧场版, Gekijouban, Movie, TV, and years.",
+    "TITLE_LATIN": "Latin-script titles, including English aliases and romaji."
+  }
+}

tools/build_path_focus_dataset.py CHANGED Viewed

@@ -12,11 +12,20 @@ import json
 from pathlib import Path
 def char_item(filename: str, spans: list[tuple[str, str]], source: str) -> dict[str, object]:
     tokens = list(filename)
     labels = ["O"] * len(tokens)
     cursor = 0
     for text, entity in spans:
         start = filename.find(text, cursor)
         if start < 0:
             start = filename.find(text)
@@ -40,7 +49,7 @@ def build_cases(source: str) -> list[dict[str, object]]:
         char_item(
             r"Z:\Library\Anime\Shinsekai Yori\Extras\NCED02 [Ma10p_1080p][x265_flac].mkv",
             [
-                ("Shinsekai Yori", "TITLE"),
                 ("NCED02", "SPECIAL"),
                 ("1080p", "RESOLUTION"),
                 ("x265_flac", "SOURCE"),
@@ -50,8 +59,8 @@ def build_cases(source: str) -> list[dict[str, object]]:
         char_item(
             r"O:\115open\Anime\Sousou no Frieren\Season 01\31 [1080P][Baha][WEB-DL].mkv",
             [
-                ("Sousou no Frieren", "TITLE"),
-                ("Season 01", "SEASON"),
                 ("31", "EPISODE"),
                 ("1080P", "RESOLUTION"),
                 ("Baha", "SOURCE"),
@@ -59,11 +68,29 @@ def build_cases(source: str) -> list[dict[str, object]]:
             ],
             source,
         ),
         char_item(
             r"/mnt/media/anime/Bangumi/One Piece/Season 21/1110 [1080p][WEB-DL].mkv",
             [
-                ("One Piece", "TITLE"),
-                ("Season 21", "SEASON"),
                 ("1110", "EPISODE"),
                 ("1080p", "RESOLUTION"),
                 ("WEB-DL", "SOURCE"),
@@ -73,19 +100,19 @@ def build_cases(source: str) -> list[dict[str, object]]:
         char_item(
             r"D:\Media\Anime\completed\Witch Watch\S01\15 [1080p][CHS].mkv",
             [
-                ("Witch Watch", "TITLE"),
-                ("S01", "SEASON"),
                 ("15", "EPISODE"),
                 ("1080p", "RESOLUTION"),
-                ("CHS", "SOURCE"),
             ],
             source,
         ),
         char_item(
             r"O:\115open\Anime\Kakuriyo no Yadomeshi\Season 02\12 [WebRip 1080p].mkv",
             [
-                ("Kakuriyo no Yadomeshi", "TITLE"),
-                ("Season 02", "SEASON"),
                 ("12", "EPISODE"),
                 ("WebRip", "SOURCE"),
                 ("1080p", "RESOLUTION"),
@@ -95,8 +122,9 @@ def build_cases(source: str) -> list[dict[str, object]]:
         char_item(
             r"C:\Archive\old\misc\One Piece\Season 21\One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264.mkv",
             [
-                ("One Piece", "TITLE"),
-                ("Season 21", "SEASON"),
                 ("1110", "EPISODE"),
                 ("1080p", "RESOLUTION"),
                 ("WEB-DL", "SOURCE"),

 from pathlib import Path
+def canonical_entity(entity: str) -> str:
+    if entity == "TITLE":
+        return "TITLE_MIXED"
+    if entity == "PATH_TITLE":
+        return "PATH_TITLE_MIXED"
+    return entity
 def char_item(filename: str, spans: list[tuple[str, str]], source: str) -> dict[str, object]:
     tokens = list(filename)
     labels = ["O"] * len(tokens)
     cursor = 0
     for text, entity in spans:
+        entity = canonical_entity(entity)
         start = filename.find(text, cursor)
         if start < 0:
             start = filename.find(text)
         char_item(
             r"Z:\Library\Anime\Shinsekai Yori\Extras\NCED02 [Ma10p_1080p][x265_flac].mkv",
             [
+                ("Shinsekai Yori", "PATH_TITLE_LATIN"),
                 ("NCED02", "SPECIAL"),
                 ("1080p", "RESOLUTION"),
                 ("x265_flac", "SOURCE"),
         char_item(
             r"O:\115open\Anime\Sousou no Frieren\Season 01\31 [1080P][Baha][WEB-DL].mkv",
             [
+                ("Sousou no Frieren", "PATH_TITLE_LATIN"),
+                ("Season 01", "PATH_SEASON"),
                 ("31", "EPISODE"),
                 ("1080P", "RESOLUTION"),
                 ("Baha", "SOURCE"),
             ],
             source,
         ),
+        char_item(
+            r"/mnt/media/anime/Sousou no Frieren/Season 01/31.mkv",
+            [
+                ("Sousou no Frieren", "PATH_TITLE_LATIN"),
+                ("Season 01", "PATH_SEASON"),
+                ("31", "EPISODE"),
+            ],
+            source,
+        ),
+        char_item(
+            r"/mnt/media/anime/Title/01/03.mkv",
+            [
+                ("Title", "PATH_TITLE_LATIN"),
+                ("01", "PATH_SEASON"),
+                ("03", "EPISODE"),
+            ],
+            source,
+        ),
         char_item(
             r"/mnt/media/anime/Bangumi/One Piece/Season 21/1110 [1080p][WEB-DL].mkv",
             [
+                ("One Piece", "PATH_TITLE_LATIN"),
+                ("Season 21", "PATH_SEASON"),
                 ("1110", "EPISODE"),
                 ("1080p", "RESOLUTION"),
                 ("WEB-DL", "SOURCE"),
         char_item(
             r"D:\Media\Anime\completed\Witch Watch\S01\15 [1080p][CHS].mkv",
             [
+                ("Witch Watch", "PATH_TITLE_LATIN"),
+                ("S01", "PATH_SEASON"),
                 ("15", "EPISODE"),
                 ("1080p", "RESOLUTION"),
+                ("CHS", "TAG"),
             ],
             source,
         ),
         char_item(
             r"O:\115open\Anime\Kakuriyo no Yadomeshi\Season 02\12 [WebRip 1080p].mkv",
             [
+                ("Kakuriyo no Yadomeshi", "PATH_TITLE_LATIN"),
+                ("Season 02", "PATH_SEASON"),
                 ("12", "EPISODE"),
                 ("WebRip", "SOURCE"),
                 ("1080p", "RESOLUTION"),
         char_item(
             r"C:\Archive\old\misc\One Piece\Season 21\One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264.mkv",
             [
+                ("One Piece", "PATH_TITLE_LATIN"),
+                ("Season 21", "PATH_SEASON"),
+                ("One.Piece", "TITLE_LATIN"),
                 ("1110", "EPISODE"),
                 ("1080p", "RESOLUTION"),
                 ("WEB-DL", "SOURCE"),

tools/build_path_prefix_dataset.py CHANGED Viewed

@@ -2,9 +2,9 @@
 The generated rows look like:
-    noise/noise/TITLE/Season 01/03 [1080P][WEB-DL].mkv
-Prefix directories are always labeled ``O``. The title directory, season
 directory, episode/special filename stem, and optional meta tags keep their BIO
 labels so the model learns to ignore library paths without relying on runtime
 path stripping.
@@ -22,14 +22,23 @@ from statistics import mean
 from typing import Iterable, Optional
 ENTITY_NAMES = {
-    "TITLE",
     "SEASON",
     "EPISODE",
     "SPECIAL",
     "RESOLUTION",
     "SOURCE",
     "GROUP",
 }
 PREFIX_COMPONENTS = {
@@ -97,6 +106,51 @@ def iter_jsonl(path: Path) -> Iterable[dict]:
                 raise ValueError(f"{path}:{line_no}: invalid JSON") from exc
 def extract_entities(tokens: list[str], labels: list[str]) -> dict[str, list[str]]:
     entities: dict[str, list[str]] = {name: [] for name in ENTITY_NAMES}
     active_entity: Optional[str] = None
@@ -105,7 +159,7 @@ def extract_entities(tokens: list[str], labels: list[str]) -> dict[str, list[str
     def flush() -> None:
         nonlocal active_entity, active_tokens
         if active_entity and active_tokens:
-            entities.setdefault(active_entity, []).append("".join(active_tokens).strip())
         active_entity = None
         active_tokens = []
@@ -114,10 +168,10 @@ def extract_entities(tokens: list[str], labels: list[str]) -> dict[str, list[str
         token = str(token)
         if label.startswith("B-"):
             flush()
-            active_entity = label.split("-", 1)[1]
             active_tokens = [token]
         elif label.startswith("I-"):
-            entity = label.split("-", 1)[1]
             if active_entity == entity:
                 active_tokens.append(token)
             else:
@@ -141,6 +195,43 @@ def choose_entity(entities: dict[str, list[str]], name: str, rng: random.Random)
     return rng.choice(values)
 def choose_group(
     entities: dict[str, list[str]],
     rng: random.Random,
@@ -171,10 +262,10 @@ def season_text(value: Optional[str], rng: random.Random) -> str:
         number = first_ascii_number(value)
         variants = [value.strip()]
         if number is not None:
-            variants.extend([f"Season {number}", f"Season {number:02}", f"S{number:02}", f"第{number}季"])
         return rng.choice(variants)
     number = rng.choice([1, 1, 1, 2])
-    return rng.choice([f"Season {number}", f"Season {number:02}", f"S{number:02}", f"第{number}季"])
 def episode_text(value: str, rng: random.Random) -> str:
@@ -219,6 +310,12 @@ def append_meta(
         if source and rng.random() < 0.75:
             pieces.extend([("[", None), (source.strip(), "SOURCE"), ("]", None)])
 def build_path_row(
     record: dict,
@@ -236,9 +333,10 @@ def build_path_row(
     if len(tokens) != len(labels):
         return None
     entities = extract_entities(tokens, labels)
-    title = choose_entity(entities, "TITLE", rng)
-    if not title:
         return None
     group = choose_group(entities, rng, max_group_length)
     if require_group and not group:
         return None
@@ -251,8 +349,8 @@ def build_path_row(
     style = rng.choice(styles)
     separator = "\\" if style == "windows" else "/"
     components = prefix_components(style, rng)
-    components.append([(title, "TITLE")])
-    components.append([(season_text(choose_entity(entities, "SEASON", rng), rng), "SEASON")])
     endpoint_pieces: list[tuple[str, Optional[str]]] = []
     if group and rng.random() < group_prefix_prob:

 The generated rows look like:
+    noise/noise/PATH_TITLE_LATIN/PATH_SEASON/03 [1080P][WEB-DL].mkv
+Prefix directories are always labeled ``O``. The path-title directory, path-season
 directory, episode/special filename stem, and optional meta tags keep their BIO
 labels so the model learns to ignore library paths without relying on runtime
 path stripping.
 from typing import Iterable, Optional
+TITLE_SUFFIXES = ("CHS", "CHT", "JPN", "LATIN", "MIXED")
+FILE_TITLE_ENTITIES = tuple(f"TITLE_{suffix}" for suffix in TITLE_SUFFIXES)
+PATH_TITLE_ENTITIES = tuple(f"PATH_TITLE_{suffix}" for suffix in TITLE_SUFFIXES)
 ENTITY_NAMES = {
+    *FILE_TITLE_ENTITIES,
+    *PATH_TITLE_ENTITIES,
+    "PATH_SEASON",
     "SEASON",
     "EPISODE",
     "SPECIAL",
     "RESOLUTION",
     "SOURCE",
     "GROUP",
+    "TAG",
+    "TITLE",
+    "PATH_TITLE",
 }
 PREFIX_COMPONENTS = {
                 raise ValueError(f"{path}:{line_no}: invalid JSON") from exc
+def canonical_entity(entity: str) -> Optional[str]:
+    if entity == "TITLE":
+        return "TITLE_MIXED"
+    if entity == "PATH_TITLE":
+        return "PATH_TITLE_MIXED"
+    if entity in ENTITY_NAMES:
+        return entity
+    return None
+def file_title_to_path_title(entity: str) -> Optional[str]:
+    if entity.startswith("TITLE_"):
+        return "PATH_TITLE_" + entity.removeprefix("TITLE_")
+    return None
+def path_title_to_file_title(entity: str) -> Optional[str]:
+    if entity.startswith("PATH_TITLE_"):
+        return "TITLE_" + entity.removeprefix("PATH_TITLE_")
+    return None
+def append_entity_value(entities: dict[str, list[str]], entity: str, value: str) -> None:
+    value = value.strip()
+    if not value:
+        return
+    def append_unique(target_entity: str) -> None:
+        values = entities.setdefault(target_entity, [])
+        if value not in values:
+            values.append(value)
+    append_unique(entity)
+    path_title = file_title_to_path_title(entity)
+    if path_title:
+        append_unique(path_title)
+    file_title = path_title_to_file_title(entity)
+    if file_title:
+        append_unique(file_title)
+    if entity == "SEASON":
+        append_unique("PATH_SEASON")
+    elif entity == "PATH_SEASON":
+        append_unique("SEASON")
 def extract_entities(tokens: list[str], labels: list[str]) -> dict[str, list[str]]:
     entities: dict[str, list[str]] = {name: [] for name in ENTITY_NAMES}
     active_entity: Optional[str] = None
     def flush() -> None:
         nonlocal active_entity, active_tokens
         if active_entity and active_tokens:
+            append_entity_value(entities, active_entity, "".join(active_tokens))
         active_entity = None
         active_tokens = []
         token = str(token)
         if label.startswith("B-"):
             flush()
+            active_entity = canonical_entity(label.split("-", 1)[1])
             active_tokens = [token]
         elif label.startswith("I-"):
+            entity = canonical_entity(label.split("-", 1)[1])
             if active_entity == entity:
                 active_tokens.append(token)
             else:
     return rng.choice(values)
+def choose_path_title(entities: dict[str, list[str]], rng: random.Random) -> Optional[tuple[str, str]]:
+    candidates: list[tuple[str, str]] = []
+    seen: set[tuple[str, str]] = set()
+    for entity in PATH_TITLE_ENTITIES:
+        for value in entities.get(entity, []):
+            value = value.strip()
+            key = (value, entity)
+            if value and key not in seen:
+                candidates.append(key)
+                seen.add(key)
+    for entity in FILE_TITLE_ENTITIES:
+        path_entity = file_title_to_path_title(entity)
+        if path_entity is None:
+            continue
+        for value in entities.get(entity, []):
+            value = value.strip()
+            key = (value, path_entity)
+            if value and key not in seen:
+                candidates.append(key)
+                seen.add(key)
+    if not candidates:
+        return None
+    return rng.choice(candidates)
+def choose_path_season_value(entities: dict[str, list[str]], rng: random.Random) -> Optional[str]:
+    values = [
+        value.strip()
+        for entity in ("PATH_SEASON", "SEASON")
+        for value in entities.get(entity, [])
+        if value.strip()
+    ]
+    if not values:
+        return None
+    return rng.choice(values)
 def choose_group(
     entities: dict[str, list[str]],
     rng: random.Random,
         number = first_ascii_number(value)
         variants = [value.strip()]
         if number is not None:
+            variants.extend([f"{number:02}", f"Season {number}", f"Season {number:02}", f"S{number:02}", f"第{number}季"])
         return rng.choice(variants)
     number = rng.choice([1, 1, 1, 2])
+    return rng.choice([f"{number:02}", f"Season {number}", f"Season {number:02}", f"S{number:02}", f"第{number}季"])
 def episode_text(value: str, rng: random.Random) -> str:
         if source and rng.random() < 0.75:
             pieces.extend([("[", None), (source.strip(), "SOURCE"), ("]", None)])
+    tag_values = list(entities.get("TAG", []))
+    rng.shuffle(tag_values)
+    for tag in tag_values[:1]:
+        if tag and rng.random() < 0.60:
+            pieces.extend([("[", None), (tag.strip(), "TAG"), ("]", None)])
 def build_path_row(
     record: dict,
     if len(tokens) != len(labels):
         return None
     entities = extract_entities(tokens, labels)
+    title_choice = choose_path_title(entities, rng)
+    if not title_choice:
         return None
+    title, path_title_entity = title_choice
     group = choose_group(entities, rng, max_group_length)
     if require_group and not group:
         return None
     style = rng.choice(styles)
     separator = "\\" if style == "windows" else "/"
     components = prefix_components(style, rng)
+    components.append([(title, path_title_entity)])
+    components.append([(season_text(choose_path_season_value(entities, rng), rng), "PATH_SEASON")])
     endpoint_pieces: list[tuple[str, Optional[str]]] = []
     if group and rng.random() < group_prefix_prob:

tools/build_repair_focus_dataset.py CHANGED Viewed

@@ -64,7 +64,7 @@ def manual_cases() -> Iterable[dict]:
     yield char_item(
         "One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264",
         [
-            ("One.Piece", "TITLE"),
             ("1110", "EPISODE"),
             ("1080p", "RESOLUTION"),
             ("WEB-DL", "SOURCE"),
@@ -73,7 +73,7 @@ def manual_cases() -> Iterable[dict]:
     yield char_item(
         "One.Piece.1111.1080p.WEB-DL.AAC2.0.H.264",
         [
-            ("One.Piece", "TITLE"),
             ("1111", "EPISODE"),
             ("1080p", "RESOLUTION"),
             ("WEB-DL", "SOURCE"),
@@ -83,7 +83,8 @@ def manual_cases() -> Iterable[dict]:
         "【喵萌奶茶屋】★04月新番★[葬送的芙莉莲][01][1080P][HEVC]",
         [
             ("喵萌奶茶屋", "GROUP"),
-            ("葬送的芙莉莲", "TITLE"),
             ("01", "EPISODE"),
             ("1080P", "RESOLUTION"),
             ("HEVC", "SOURCE"),
@@ -93,7 +94,8 @@ def manual_cases() -> Iterable[dict]:
         "【喵萌奶茶屋】★10月新番★[药屋少女的呢喃][02][1080P][HEVC]",
         [
             ("喵萌奶茶屋", "GROUP"),
-            ("药屋少女的呢喃", "TITLE"),
             ("02", "EPISODE"),
             ("1080P", "RESOLUTION"),
             ("HEVC", "SOURCE"),
@@ -103,7 +105,7 @@ def manual_cases() -> Iterable[dict]:
         "[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [07][1080P][CHT&JPN][檢索：魔法姊妹露露特莉莉].mp4",
         [
             ("Billion Meta Lab", "GROUP"),
-            ("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE"),
             ("07", "EPISODE"),
             ("1080P", "RESOLUTION"),
             ("CHT&JPN", "SOURCE"),
@@ -114,7 +116,7 @@ def manual_cases() -> Iterable[dict]:
         "[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [08][1080P][CHT&JPN][检索：魔法姊妹露露特莉莉].mp4",
         [
             ("Billion Meta Lab", "GROUP"),
-            ("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE"),
             ("08", "EPISODE"),
             ("1080P", "RESOLUTION"),
             ("CHT&JPN", "SOURCE"),
@@ -125,7 +127,7 @@ def manual_cases() -> Iterable[dict]:
         "[LoliHouse] Kakuriyo no Yadomeshi Ni - 12 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
         [
             ("LoliHouse", "GROUP"),
-            ("Kakuriyo no Yadomeshi", "TITLE"),
             ("Ni", "SEASON"),
             ("12", "EPISODE"),
             ("WebRip", "SOURCE"),
@@ -139,7 +141,7 @@ def manual_cases() -> Iterable[dict]:
         "[LoliHouse] Kakuriyo no Yadomeshi Ni - 13 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
         [
             ("LoliHouse", "GROUP"),
-            ("Kakuriyo no Yadomeshi", "TITLE"),
             ("Ni", "SEASON"),
             ("13", "EPISODE"),
             ("WebRip", "SOURCE"),
@@ -153,7 +155,7 @@ def manual_cases() -> Iterable[dict]:
         "[AI-Raws] 炎炎の消防隊 弐ノ章 #13 (BD HEVC 1920x1080 yuv444p10le FLAC)[FC74A2D5].mkv",
         [
             ("AI-Raws", "GROUP"),
-            ("炎炎の消防隊", "TITLE"),
             ("弐ノ章", "SEASON"),
             ("13", "EPISODE"),
             ("BD", "SOURCE"),
@@ -166,7 +168,7 @@ def manual_cases() -> Iterable[dict]:
         "[AI-Raws] 炎炎の消防隊 弐ノ章 #01 (BD HEVC 1920x1080 FLAC).mkv",
         [
             ("AI-Raws", "GROUP"),
-            ("炎炎の消防隊", "TITLE"),
             ("弐ノ章", "SEASON"),
             ("01", "EPISODE"),
             ("BD", "SOURCE"),
@@ -179,7 +181,7 @@ def manual_cases() -> Iterable[dict]:
         "[DBD-Raws][炎炎消防队 貳之章][01][1080P][BDRip][HEVC-10bit][FLAC]",
         [
             ("DBD-Raws", "GROUP"),
-            ("炎炎消防队", "TITLE"),
             ("貳之章", "SEASON"),
             ("01", "EPISODE"),
             ("1080P", "RESOLUTION"),
@@ -191,8 +193,11 @@ def manual_cases() -> Iterable[dict]:
         "[GM-Team][国漫][逆天邪神 第2季][Against the Gods Ⅱ][2026][04][HEVC][GB][4K].mp4",
         [
             ("GM-Team", "GROUP"),
-            ("逆天邪神", "TITLE"),
             ("第2季", "SEASON"),
             ("04", "EPISODE"),
             ("HEVC", "SOURCE"),
             ("GB", "SOURCE"),
@@ -203,8 +208,11 @@ def manual_cases() -> Iterable[dict]:
         "[GM-Team][国漫][剑来 第2季][Sword of Coming Ⅱ][2025][04][HEVC][GB][4K]",
         [
             ("GM-Team", "GROUP"),
-            ("剑来", "TITLE"),
             ("第2季", "SEASON"),
             ("04", "EPISODE"),
             ("HEVC", "SOURCE"),
             ("GB", "SOURCE"),
@@ -215,8 +223,11 @@ def manual_cases() -> Iterable[dict]:
         "[GM-Team][国漫][大主宰 第2季][The Great Ruler Ⅱ][2026][04][HEVC][GB][4K]",
         [
             ("GM-Team", "GROUP"),
-            ("大主宰", "TITLE"),
             ("第2季", "SEASON"),
             ("04", "EPISODE"),
             ("HEVC", "SOURCE"),
             ("GB", "SOURCE"),
@@ -227,7 +238,7 @@ def manual_cases() -> Iterable[dict]:
         "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
         [
             ("YYDM&VCB-Studio", "GROUP"),
-            ("Shinsekai Yori", "TITLE"),
             ("IV05", "SPECIAL"),
             ("1080p", "RESOLUTION"),
             ("x265_aac", "SOURCE"),
@@ -237,7 +248,7 @@ def manual_cases() -> Iterable[dict]:
         "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
         [
             ("YYDM&VCB-Studio", "GROUP"),
-            ("Shinsekai Yori", "TITLE"),
             ("NCED02", "SPECIAL"),
             ("1080p", "RESOLUTION"),
             ("x265_flac", "SOURCE"),
@@ -246,7 +257,7 @@ def manual_cases() -> Iterable[dict]:
     yield char_item(
         "InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
         [
-            ("InuYasha", "TITLE"),
             ("NCED02", "SPECIAL"),
             ("BDrip", "SOURCE"),
             ("AV1", "SOURCE"),
@@ -258,7 +269,7 @@ def manual_cases() -> Iterable[dict]:
         "[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
         [
             ("VCB-Studio", "GROUP"),
-            ("Yamada-kun to 7-nin no Majo", "TITLE"),
             ("NCED", "SPECIAL"),
             ("1080p", "RESOLUTION"),
             ("x265_flac", "SOURCE"),

     yield char_item(
         "One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264",
         [
+            ("One.Piece", "TITLE_LATIN"),
             ("1110", "EPISODE"),
             ("1080p", "RESOLUTION"),
             ("WEB-DL", "SOURCE"),
     yield char_item(
         "One.Piece.1111.1080p.WEB-DL.AAC2.0.H.264",
         [
+            ("One.Piece", "TITLE_LATIN"),
             ("1111", "EPISODE"),
             ("1080p", "RESOLUTION"),
             ("WEB-DL", "SOURCE"),
         "【喵萌奶茶屋】★04月新番★[葬送的芙莉莲][01][1080P][HEVC]",
         [
             ("喵萌奶茶屋", "GROUP"),
+            ("★04月新番★", "TAG"),
+            ("葬送的芙莉莲", "TITLE_CHS"),
             ("01", "EPISODE"),
             ("1080P", "RESOLUTION"),
             ("HEVC", "SOURCE"),
         "【喵萌奶茶屋】★10月新番★[药屋少女的呢喃][02][1080P][HEVC]",
         [
             ("喵萌奶茶屋", "GROUP"),
+            ("★10月新番★", "TAG"),
+            ("药屋少女的呢喃", "TITLE_CHS"),
             ("02", "EPISODE"),
             ("1080P", "RESOLUTION"),
             ("HEVC", "SOURCE"),
         "[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [07][1080P][CHT&JPN][檢索：魔法姊妹露露特莉莉].mp4",
         [
             ("Billion Meta Lab", "GROUP"),
+            ("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE_MIXED"),
             ("07", "EPISODE"),
             ("1080P", "RESOLUTION"),
             ("CHT&JPN", "SOURCE"),
         "[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [08][1080P][CHT&JPN][检索：魔法姊妹露露特莉莉].mp4",
         [
             ("Billion Meta Lab", "GROUP"),
+            ("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE_MIXED"),
             ("08", "EPISODE"),
             ("1080P", "RESOLUTION"),
             ("CHT&JPN", "SOURCE"),
         "[LoliHouse] Kakuriyo no Yadomeshi Ni - 12 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
         [
             ("LoliHouse", "GROUP"),
+            ("Kakuriyo no Yadomeshi", "TITLE_LATIN"),
             ("Ni", "SEASON"),
             ("12", "EPISODE"),
             ("WebRip", "SOURCE"),
         "[LoliHouse] Kakuriyo no Yadomeshi Ni - 13 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
         [
             ("LoliHouse", "GROUP"),
+            ("Kakuriyo no Yadomeshi", "TITLE_LATIN"),
             ("Ni", "SEASON"),
             ("13", "EPISODE"),
             ("WebRip", "SOURCE"),
         "[AI-Raws] 炎炎の消防隊 弐ノ章 #13 (BD HEVC 1920x1080 yuv444p10le FLAC)[FC74A2D5].mkv",
         [
             ("AI-Raws", "GROUP"),
+            ("炎炎の消防隊", "TITLE_JPN"),
             ("弐ノ章", "SEASON"),
             ("13", "EPISODE"),
             ("BD", "SOURCE"),
         "[AI-Raws] 炎炎の消防隊 弐ノ章 #01 (BD HEVC 1920x1080 FLAC).mkv",
         [
             ("AI-Raws", "GROUP"),
+            ("炎炎の消防隊", "TITLE_JPN"),
             ("弐ノ章", "SEASON"),
             ("01", "EPISODE"),
             ("BD", "SOURCE"),
         "[DBD-Raws][炎炎消防队 貳之章][01][1080P][BDRip][HEVC-10bit][FLAC]",
         [
             ("DBD-Raws", "GROUP"),
+            ("炎炎消防队", "TITLE_CHS"),
             ("貳之章", "SEASON"),
             ("01", "EPISODE"),
             ("1080P", "RESOLUTION"),
         "[GM-Team][国漫][逆天邪神 第2季][Against the Gods Ⅱ][2026][04][HEVC][GB][4K].mp4",
         [
             ("GM-Team", "GROUP"),
+            ("国漫", "TAG"),
+            ("逆天邪神", "TITLE_CHS"),
             ("第2季", "SEASON"),
+            ("Against the Gods Ⅱ", "TITLE_LATIN"),
+            ("2026", "TAG"),
             ("04", "EPISODE"),
             ("HEVC", "SOURCE"),
             ("GB", "SOURCE"),
         "[GM-Team][国漫][剑来 第2季][Sword of Coming Ⅱ][2025][04][HEVC][GB][4K]",
         [
             ("GM-Team", "GROUP"),
+            ("国漫", "TAG"),
+            ("剑来", "TITLE_CHS"),
             ("第2季", "SEASON"),
+            ("Sword of Coming Ⅱ", "TITLE_LATIN"),
+            ("2025", "TAG"),
             ("04", "EPISODE"),
             ("HEVC", "SOURCE"),
             ("GB", "SOURCE"),
         "[GM-Team][国漫][大主宰 第2季][The Great Ruler Ⅱ][2026][04][HEVC][GB][4K]",
         [
             ("GM-Team", "GROUP"),
+            ("国漫", "TAG"),
+            ("大主宰", "TITLE_CHS"),
             ("第2季", "SEASON"),
+            ("The Great Ruler Ⅱ", "TITLE_LATIN"),
+            ("2026", "TAG"),
             ("04", "EPISODE"),
             ("HEVC", "SOURCE"),
             ("GB", "SOURCE"),
         "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
         [
             ("YYDM&VCB-Studio", "GROUP"),
+            ("Shinsekai Yori", "TITLE_LATIN"),
             ("IV05", "SPECIAL"),
             ("1080p", "RESOLUTION"),
             ("x265_aac", "SOURCE"),
         "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
         [
             ("YYDM&VCB-Studio", "GROUP"),
+            ("Shinsekai Yori", "TITLE_LATIN"),
             ("NCED02", "SPECIAL"),
             ("1080p", "RESOLUTION"),
             ("x265_flac", "SOURCE"),
     yield char_item(
         "InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
         [
+            ("InuYasha", "TITLE_LATIN"),
             ("NCED02", "SPECIAL"),
             ("BDrip", "SOURCE"),
             ("AV1", "SOURCE"),
         "[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
         [
             ("VCB-Studio", "GROUP"),
+            ("Yamada-kun to 7-nin no Majo", "TITLE_LATIN"),
             ("NCED", "SPECIAL"),
             ("1080p", "RESOLUTION"),
             ("x265_flac", "SOURCE"),

tools/evaluate_parser_cases.py CHANGED Viewed

@@ -20,6 +20,13 @@ DEFAULT_OUTPUT_FILE = os.path.join("reports", "case_metrics.json")
 def normalize_field_value(field: str, value) -> Optional[str]:
     if value is None:
         return None
     if field in {"episode", "season"}:
         try:
             return str(int(value))
@@ -45,11 +52,12 @@ def evaluate_cases(
     tokenizer_variant: Optional[str],
     max_length: Optional[int],
     constrain_bio: bool,
 ) -> Dict:
     cfg = Config()
     tokenizer = load_tokenizer(model_dir, tokenizer_variant)
     model = load_model(model_dir)
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model.to(device)
     model.eval()
@@ -108,6 +116,7 @@ def evaluate_cases(
         "tokenizer_variant": getattr(tokenizer, "tokenizer_variant", "regex"),
         "max_length": resolved_max_length,
         "constrain_bio": constrain_bio,
         "case_count": len(cases),
         "full_correct": full_correct,
         "full_accuracy": full_correct / len(cases) if cases else 0.0,
@@ -124,6 +133,7 @@ def evaluate_case_modes(
     case_file: str,
     tokenizer_variant: Optional[str],
     max_length: Optional[int],
 ) -> Dict:
     modes = {
         "model_only": {"constrain_bio": False},
@@ -136,6 +146,7 @@ def evaluate_case_modes(
             tokenizer_variant=tokenizer_variant,
             max_length=max_length,
             constrain_bio=settings["constrain_bio"],
         )
         for name, settings in modes.items()
     }
@@ -168,6 +179,7 @@ def main() -> None:
     parser.add_argument("--output", default=DEFAULT_OUTPUT_FILE, help="JSON output path")
     parser.add_argument("--mode", choices=["all", "model-only", "normalized-only"], default="all")
     parser.add_argument("--no-constrained-bio", action="store_true")
     args = parser.parse_args()
     if args.mode == "all" and not args.no_constrained_bio:
@@ -176,6 +188,7 @@ def main() -> None:
             case_file=args.case_file,
             tokenizer_variant=args.tokenizer,
             max_length=args.max_length,
         )
         for name in ("model_only", "normalized_only"):
             print_metrics(name, metrics["modes"][name])
@@ -188,6 +201,7 @@ def main() -> None:
             tokenizer_variant=args.tokenizer,
             max_length=args.max_length,
             constrain_bio=constrain_bio,
         )
         print_metrics(args.mode, metrics)

 def normalize_field_value(field: str, value) -> Optional[str]:
     if value is None:
         return None
+    if isinstance(value, list):
+        normalized_items = [
+            normalize_field_value(field, item)
+            for item in value
+            if item is not None
+        ]
+        return "|".join(item for item in normalized_items if item)
     if field in {"episode", "season"}:
         try:
             return str(int(value))
     tokenizer_variant: Optional[str],
     max_length: Optional[int],
     constrain_bio: bool,
+    force_cpu: bool = False,
 ) -> Dict:
     cfg = Config()
     tokenizer = load_tokenizer(model_dir, tokenizer_variant)
     model = load_model(model_dir)
+    device = torch.device("cpu" if force_cpu else ("cuda" if torch.cuda.is_available() else "cpu"))
     model.to(device)
     model.eval()
         "tokenizer_variant": getattr(tokenizer, "tokenizer_variant", "regex"),
         "max_length": resolved_max_length,
         "constrain_bio": constrain_bio,
+        "device": str(device),
         "case_count": len(cases),
         "full_correct": full_correct,
         "full_accuracy": full_correct / len(cases) if cases else 0.0,
     case_file: str,
     tokenizer_variant: Optional[str],
     max_length: Optional[int],
+    force_cpu: bool = False,
 ) -> Dict:
     modes = {
         "model_only": {"constrain_bio": False},
             tokenizer_variant=tokenizer_variant,
             max_length=max_length,
             constrain_bio=settings["constrain_bio"],
+            force_cpu=force_cpu,
         )
         for name, settings in modes.items()
     }
     parser.add_argument("--output", default=DEFAULT_OUTPUT_FILE, help="JSON output path")
     parser.add_argument("--mode", choices=["all", "model-only", "normalized-only"], default="all")
     parser.add_argument("--no-constrained-bio", action="store_true")
+    parser.add_argument("--cpu", action="store_true", help="Force CPU evaluation")
     args = parser.parse_args()
     if args.mode == "all" and not args.no_constrained_bio:
             case_file=args.case_file,
             tokenizer_variant=args.tokenizer,
             max_length=args.max_length,
+            force_cpu=args.cpu,
         )
         for name in ("model_only", "normalized_only"):
             print_metrics(name, metrics["modes"][name])
             tokenizer_variant=args.tokenizer,
             max_length=args.max_length,
             constrain_bio=constrain_bio,
+            force_cpu=args.cpu,
         )
         print_metrics(args.mode, metrics)

tools/rust_dmhy_template_apply/src/main.rs CHANGED Viewed

@@ -135,6 +135,7 @@ struct Group {
 struct Stats {
     seen: usize,
     skipped_encoding_noise: usize,
     trimmed_parent_path: usize,
     skipped_no_recipe: usize,
     skipped_sample_cap: usize,
@@ -161,6 +162,8 @@ enum Processed {
     Skipped {
         reason: &'static str,
         trimmed_parent: bool,
     },
 }
@@ -176,8 +179,7 @@ static EPISODE_WITH_SUFFIX_RE: Lazy<Regex> = Lazy::new(|| {
 });
 static EPISODE_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:\.\d{1,2})?(?:END)?$").unwrap());
-static DECIMAL_EPISODE_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"^\d{1,3}\.\d{1,2}$").unwrap());
 static NUMERIC_TITLE_PREFIX_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^\d{1,3}(?:[./-]\d{1,3})?$").unwrap());
 static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
@@ -198,9 +200,8 @@ static SEASON_RE: Lazy<Regex> = Lazy::new(|| {
 });
 static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
-static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap()
-});
 static CJK_EPISODE_EMBEDDED_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^(.+?)(第?\d{1,4}[话話回集])(.{0,32})$").unwrap());
 static CJK_TITLE_TRAILING_EPISODE_RE: Lazy<Regex> =
@@ -213,10 +214,10 @@ static WORD_ORDINAL_SEASON_TOKEN_RE: Lazy<Regex> = Lazy::new(|| {
     Regex::new(r"(?i)^(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|Eighth|Ninth|Tenth)$")
         .unwrap()
 });
-static SEASON_WORD_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"(?i)^(?:Season|Saison)$").unwrap());
-static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap());
 static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
 static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
     Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:BD)?Menu(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:BD[-_. ]?)?Spot(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
@@ -226,7 +227,10 @@ static VOLUME_RE: Lazy<Regex> =
 static DATE_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap());
 static DATE_RANGE_MIXED_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}\s*[-~]\s*(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap()
 });
 static CJK_DATE_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}年\d{1,2}月\d{1,2}日$").unwrap());
@@ -278,6 +282,12 @@ static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
 static SIMPLE_EPISODE_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}$").unwrap());
 static SPECIAL_SPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_.-]+").unwrap());
 fn main() -> Result<()> {
     let args = Args::parse();
@@ -333,6 +343,10 @@ fn main() -> Result<()> {
     let mut label_counts: HashMap<String, usize> = HashMap::new();
     let mut template_counts: HashMap<String, usize> = HashMap::new();
     let mut examples = Vec::new();
     let mut writer = BufWriter::new(File::create(&args.output)?);
     for item in processed {
         match item {
@@ -359,17 +373,40 @@ fn main() -> Result<()> {
             Processed::Skipped {
                 reason,
                 trimmed_parent,
             } => {
                 if trimmed_parent {
                     stats.trimmed_parent_path += 1;
                 }
                 match reason {
                     "encoding_noise" => stats.skipped_encoding_noise += 1,
                     "no_recipe" => stats.skipped_no_recipe += 1,
                     "sample_cap" => stats.skipped_sample_cap += 1,
                     "role_mismatch" => stats.skipped_role_mismatch += 1,
                     "low_frequency_audit_warning" => {
-                        stats.skipped_low_frequency_audit_warning += 1
                     }
                     _ => {}
                 }
@@ -412,6 +449,9 @@ fn main() -> Result<()> {
         "label_counts": label_counts,
         "top_template_counts": top_template_counts,
         "examples": examples,
         "implementation": "rust_dmhy_template_apply"
     });
     fs::write(
@@ -452,8 +492,8 @@ fn load_whitelist_lines(path: &PathBuf) -> Result<Vec<String>> {
     if !path.exists() {
         return Ok(Vec::new());
     }
-    let file = File::open(path)
-        .with_context(|| format!("failed to open whitelist {}", path.display()))?;
     let mut lines = Vec::new();
     for line in BufReader::new(file).lines() {
         let line = line?;
@@ -544,6 +584,7 @@ fn run_cluster(args: &Args) -> Result<()> {
         if !args.keep_encoding_noise
             && (has_encoding_noise(&original)
                 || has_non_anime_noise(&original)
                 || has_abstract_path_noise(&original))
         {
             skipped_encoding_noise += 1;
@@ -762,6 +803,7 @@ fn run_low_frequency_audit(args: &Args) -> Result<()> {
         if !args.keep_encoding_noise
             && (has_encoding_noise(&original)
                 || has_non_anime_noise(&original)
                 || has_abstract_path_noise(&original))
         {
             continue;
@@ -921,6 +963,7 @@ fn run_rich_annotations(args: &Args) -> Result<()> {
             if !args.keep_encoding_noise
                 && (has_encoding_noise(original)
                     || has_non_anime_noise(original)
                     || has_abstract_path_noise(original))
             {
                 return None;
@@ -987,6 +1030,7 @@ fn rich_segment(segment: &str, index: usize, is_leaf: bool) -> Value {
     let (key, tokens, _classes, groups) = template_key_for_filename(segment);
     let suggested = suggested_roles(&key);
     let roles = adjust_contextual_roles(&tokens, &groups, &suggested);
     let candidates = rich_candidates_for_segment(segment, &tokens, &groups, &roles, is_leaf);
     json!({
         "index": index,
@@ -1024,7 +1068,8 @@ fn rich_candidates_for_segment(
             continue;
         }
         output.push(json!({
-            "role": fine_title_role(segment, &text, is_leaf, candidate_index, title_ranges.len()),
             "coarse_role": "TITLE",
             "text": text,
             "group_start": start,
@@ -1032,7 +1077,7 @@ fn rich_candidates_for_segment(
         }));
     }
     for (group_index, role) in roles.iter().enumerate() {
-        if role == "TITLE" || role == "O" || role == "HASH" {
             continue;
         }
         let text = group_text(tokens, &groups[group_index]);
@@ -1054,6 +1099,21 @@ fn rich_candidates_for_segment(
     output
 }
 fn candidate_text(tokens: &[String], groups: &[Group], start: usize, end: usize) -> String {
     let Some(first) = groups.get(start).and_then(|group| group.indices.first()) else {
         return String::new();
@@ -1101,6 +1161,8 @@ fn fine_non_title_role(role: &str) -> &'static str {
         "GROUP" => "RELEASE_GROUP",
         "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => "EPISODE",
         "SEASON" => "SEASON",
         "SPECIAL" | "VOLUME" => "SPECIAL",
         "RESOLUTION" => "RESOLUTION",
         "SOURCE" => "SOURCE",
@@ -1139,11 +1201,11 @@ fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> {
 fn audit_warnings(record: &Record) -> Vec<String> {
     let mut warnings = Vec::new();
-    let title_texts = entity_texts(&record.tokens, &record.labels, "TITLE");
     let title_spans = title_texts.len();
     if title_spans == 0 {
         warnings.push("no_title".to_string());
-    } else if title_spans > 1 {
         warnings.push("multiple_title_spans".to_string());
     }
     if !title_texts.is_empty() && title_texts.iter().all(|title| generic_title_text(title)) {
@@ -1186,14 +1248,16 @@ fn audit_warnings(record: &Record) -> Vec<String> {
         warnings.push("encoding_noise_survived".to_string());
     }
     for (index, token) in record.tokens.iter().enumerate() {
-        let entity = record.labels.get(index).and_then(|label| label_entity(label));
         let cleaned = strip_wrapper(token);
         if HASH_RE.is_match(token) && record.labels.get(index).is_some_and(|label| label != "O") {
             warnings.push("hash_labeled".to_string());
             break;
         }
-        if EPISODE_VERSION_RE.is_match(&compact_for_classify(&cleaned))
-            && entity != Some("EPISODE")
         {
             warnings.push("episode_version_missing_label".to_string());
         }
@@ -1213,18 +1277,23 @@ fn label_entity(label: &str) -> Option<&str> {
         .or_else(|| label.strip_prefix("I-"))
 }
-fn entity_texts(tokens: &[String], labels: &[String], target: &str) -> Vec<String> {
     let mut spans = Vec::new();
     let mut current = String::new();
     for (token, label) in tokens.iter().zip(labels.iter()) {
-        let entity = label_entity(label);
-        if entity == Some(target) {
             current.push_str(token);
-        } else if !current.trim().is_empty() {
-            spans.push(current.trim().to_string());
-            current.clear();
         } else {
             current.clear();
         }
     }
     if !current.trim().is_empty() {
@@ -1233,11 +1302,28 @@ fn entity_texts(tokens: &[String], labels: &[String], target: &str) -> Vec<Strin
     spans
 }
 fn generic_title_text(text: &str) -> bool {
     matches!(
         text.trim().to_ascii_lowercase().as_str(),
-        "tv"
-            | "movie"
             | "mov"
             | "sample"
             | "commercial"
@@ -1297,6 +1383,14 @@ fn process_filename(
     recipes: &HashMap<String, Recipe>,
     sample_counters: &HashMap<String, AtomicUsize>,
 ) -> Processed {
     if !args.keep_encoding_noise
         && (has_encoding_noise(original)
             || has_non_anime_noise(original)
@@ -1305,6 +1399,8 @@ fn process_filename(
         return Processed::Skipped {
             reason: "encoding_noise",
             trimmed_parent: false,
         };
     }
     let (training_filename, trimmed_parent) = training_filename_for(original);
@@ -1315,6 +1411,8 @@ fn process_filename(
             return Processed::Skipped {
                 reason: "no_recipe",
                 trimmed_parent,
             }
         }
     };
@@ -1324,6 +1422,8 @@ fn process_filename(
             return Processed::Skipped {
                 reason: "sample_cap",
                 trimmed_parent,
             };
         }
     }
@@ -1331,6 +1431,8 @@ fn process_filename(
         return Processed::Skipped {
             reason: "role_mismatch",
             trimmed_parent,
         };
     }
     let mut record = match dmhy_record(&training_filename, &recipe.template_id, &recipe.roles) {
@@ -1339,6 +1441,8 @@ fn process_filename(
             return Processed::Skipped {
                 reason: "role_mismatch",
                 trimmed_parent,
             }
         }
     };
@@ -1347,6 +1451,8 @@ fn process_filename(
         return Processed::Skipped {
             reason: "low_frequency_audit_warning",
             trimmed_parent,
         };
     }
     if trimmed_parent {
@@ -1768,9 +1874,49 @@ fn suggested_roles(template: &str) -> Vec<String> {
     roles
 }
 fn filename_has_title(filename: &str) -> bool {
     let (key, _, _, _) = template_key_for_filename(filename);
-    suggested_roles(&key).iter().any(|role| role == "TITLE")
 }
 fn training_filename_for(original: &str) -> (String, bool) {
@@ -1785,21 +1931,13 @@ fn training_filename_for(original: &str) -> (String, bool) {
                 && path_segment_starts_with_episode(parts[parts.len() - 1])
                 && !leaf_has_full_title_after_episode(parts[parts.len() - 1])))
     {
-        if let Some(parent) = parts[..parts.len() - 1]
-            .iter()
-            .rev()
-            .find(|part| {
-                let trimmed = trim_parent_title_segment(part);
-                filename_has_title(&trimmed) && !path_segment_is_media_noise(&trimmed)
-            })
-        {
             let parent = trim_parent_title_segment(parent.trim());
             return (
-                format!(
-                    "{} {}",
-                    parent,
-                    parts[parts.len() - 1].trim()
-                ),
                 true,
             );
         }
@@ -1895,13 +2033,12 @@ fn has_encoding_noise(value: &str) -> bool {
         return true;
     }
     let markers = [
-        "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛",
-        "楀", "箷", "绲", "刔", "鏃", "湪", "鏍", "犲", "儚", "鐗", "吀", "铦", "躲",
-        "伄", "椋", "伓", "姘", "帽", "娆", "洖", "浜", "堝", "澶", "湴", "鐒",
-        "銇", "銈", "銉", "偅", "偗", "儱", "儫", "兗", "仧", "鏉变", "鍠靛",
-        "銉熴", "銈︺", "瀵掕", "潐楦", "常涔", "涓歖", "缁堟", "湯鍒",
-        "瀵诲", "線浣", "曟柟", "瓒呴", "绁炪", "偘銉", "兇銈", "銉砡",
-        "銉砕", "杩风", "硦澶", "銇淬", "仧銉", "銉嗐", "偅銈", "銈躲",
     ];
     let marker_hits = markers
         .iter()
@@ -1912,7 +2049,8 @@ fn has_encoding_noise(value: &str) -> bool {
         .filter(|ch| ('\u{ff61}'..='\u{ff9f}').contains(ch))
         .count();
     let latin_mojibake = value.split_whitespace().any(|part| {
-        part.chars().any(|ch| matches!(ch, '帽' | '茅' | '脳' | '锛'))
             && part.chars().any(|ch| ch.is_ascii_alphabetic())
     });
     marker_hits >= 2 || (marker_hits >= 1 && halfwidth_hits >= 1) || latin_mojibake
@@ -1920,7 +2058,9 @@ fn has_encoding_noise(value: &str) -> bool {
 fn has_non_anime_noise(value: &str) -> bool {
     let normalized = value.replace('\\', "/").trim().to_ascii_lowercase();
-    normalized == "mtv" || normalized.starts_with("mtv/") || normalized.contains("/mtv/")
         || value.contains("[旅游")
         || value.contains("[旅游番")
         || normalized.contains("tokyo deep")
@@ -1935,6 +2075,166 @@ fn normalized_path_segment(value: &str) -> String {
         .to_ascii_lowercase()
 }
 fn path_segment_is_episodeish(value: &str) -> bool {
     let (_, _, _, groups) = template_key_for_filename(value);
     let structural: Vec<&String> = groups
@@ -1943,14 +2243,12 @@ fn path_segment_is_episodeish(value: &str) -> bool {
         .filter(|item| item.as_str() != "SEP")
         .collect();
     !structural.is_empty()
-        && structural
-            .iter()
-            .all(|item| {
-                item.starts_with("EPISODE")
-                    || item.as_str() == "SPECIAL"
-                    || item.as_str() == "VOLUME"
-                    || item.as_str() == "BRACKET_VOLUME"
-            })
 }
 fn path_segment_starts_with_episode(value: &str) -> bool {
@@ -2042,12 +2340,14 @@ fn has_abstract_path_noise(value: &str) -> bool {
 fn role_label(role: &str) -> String {
     let entity = match role {
         "GROUP" => Some("GROUP"),
-        "TITLE" => Some("TITLE"),
         "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => Some("EPISODE"),
         "SEASON" => Some("SEASON"),
         "SPECIAL" | "VOLUME" => Some("SPECIAL"),
         "RESOLUTION" => Some("RESOLUTION"),
         "SOURCE" => Some("SOURCE"),
         _ => None,
     };
     entity.map_or_else(|| "O".to_string(), |entity| format!("B-{entity}"))
@@ -2390,6 +2690,44 @@ fn looks_like_release_group(text: &str) -> bool {
         || normalized.contains("字幕組")
 }
 const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
     &["SPY", "x", "FAMILY"],
     &["Spy", "x", "Family"],
@@ -2517,7 +2855,8 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
         });
         if !first_is_known_group {
             if let Some(groupish_index) = (1..groups.len()).find(|&index| {
-                output[index] == "TITLE" && looks_like_release_group(&group_text(tokens, &groups[index]))
             }) {
                 output[0] = "TITLE".to_string();
                 output[groupish_index] = "GROUP".to_string();
@@ -2622,9 +2961,14 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
         }
         if roles[index].starts_with("EPISODE")
             && index >= 2
-            && matches!(group_text(tokens, &groups[index - 1]).as_str(), "×" | "x" | "X")
             && output[index - 2] == "TITLE"
-            && !roles[index + 1..].iter().any(|role| role.starts_with("EPISODE"))
         {
             output[index] = "TITLE".to_string();
             if let Some(next_text_index) = (index + 1..roles.len()).find(|&cursor| {
@@ -2635,7 +2979,9 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
             continue;
         }
         if roles[index].starts_with("EPISODE")
-            && !output[..index].iter().any(|role| role.starts_with("EPISODE"))
             && group_text(
                 tokens,
                 &groups[(0..index)
@@ -2648,36 +2994,48 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
             output[index] = "TITLE".to_string();
             continue;
         }
-        if output[index] == "TITLE"
-            && matches!(text.as_str(), "中日" | "日中" | "英日" | "日英")
         {
             let next_source_lang = (index + 1..roles.len())
                 .find(|&cursor| groups[cursor].class_name != "SEP")
                 .is_some_and(|cursor| {
-                    output[cursor] == "SOURCE"
-                        && group_text(tokens, &groups[cursor]).contains('语')
                 });
             if next_source_lang {
                 output[index] = "SOURCE".to_string();
                 continue;
             }
         }
         if roles[index].starts_with("EPISODE")
             && index >= 1
             && output[index - 1] == "TITLE"
             && groups[index - 1].class_name != "SEP"
             && text.chars().all(|ch| ch.is_ascii_digit())
-            && (text.len() <= 2
-                || (text.len() <= 3
-                    && group_text(tokens, &groups[index - 1])
-                        .chars()
-                        .any(|ch| !ch.is_ascii())
-                    && !group_text(tokens, &groups[index - 1]).ends_with('第')))
             && roles[index + 1..]
                 .iter()
                 .any(|role| role.starts_with("EPISODE"))
         {
-            output[index] = "TITLE".to_string();
             continue;
         }
         if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) {
@@ -2715,17 +3073,19 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
                 && output[index - 1] == "TITLE"
                 && groups[index - 1].class_name != "SEP"
                 && text.chars().all(|ch| ch.is_ascii_digit())
-                && (text.len() <= 2
-                    || (text.len() <= 3
-                        && group_text(tokens, &groups[index - 1])
-                            .chars()
-                            .any(|ch| !ch.is_ascii())
-                        && !group_text(tokens, &groups[index - 1]).ends_with('第')))
                 && roles[index + 1..]
                     .iter()
                     .any(|role| role.starts_with("EPISODE"))
             {
-                output[index] = "TITLE".to_string();
                 continue;
             }
             if !output[..index].iter().any(|role| role == "TITLE")
@@ -2759,31 +3119,43 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
                 && previous_text.len() <= 48
                 && previous_text.chars().any(|ch| ch.is_alphabetic())
                 && text.chars().all(|ch| ch.is_ascii_digit())
-                && text.len() <= 3
                 && !(index + 2 < roles.len()
                     && groups[index + 1].class_name == "SEP"
                     && group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode"))
                 && (next_episode
                     || (next_special
                         && (text.parse::<u16>().is_ok_and(|value| value >= 100)
                             || (previous_text.len() <= 4
                                 && previous_text.is_ascii()
-                                && previous_text
-                                    .chars()
-                                    .all(|ch| ch.is_ascii_alphabetic())))))
             {
-                output[index] = "TITLE".to_string();
                 continue;
             }
         }
         if roles[index].starts_with("EPISODE")
             && (text.chars().all(|ch| ch.is_ascii_digit())
-                || matches!(
-                    classify_atom(&text).as_str(),
-                    "EPISODE" | "EPISODE_VERSION"
-                ))
             && output[..index].iter().any(|role| role == "SPECIAL")
-            && !output[..index].iter().any(|role| role.starts_with("EPISODE"))
         {
             let previous_structural = (0..index)
                 .rev()
@@ -2863,9 +3235,10 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
         }
         if roles[index] == "TITLE"
             && matches!(text.to_ascii_uppercase().as_str(), "TV" | "TV版")
-            && output.iter().enumerate().any(|(other, role)| {
-                other != index && role == "TITLE"
-            })
         {
             output[index] = "O".to_string();
             continue;
@@ -2881,9 +3254,7 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
             continue;
         }
         if output[index] == "TITLE" && text.eq_ignore_ascii_case("Creditless") {
-            let later_special = output[index + 1..]
-                .iter()
-                .any(|role| role == "SPECIAL");
             if later_special {
                 output[index] = "SPECIAL".to_string();
                 continue;
@@ -2896,7 +3267,9 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
         }
         if output[index] == "O"
             && groups[index].class_name == "TEXT"
-            && roles[index + 1..].iter().any(|role| role.starts_with("EPISODE"))
             && text.chars().any(|ch| ch.is_alphabetic())
             && !ep_markers.contains(&text.as_str())
         {
@@ -3010,8 +3383,7 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
             if matches!(
                 previous_real_text.to_ascii_lowercase().as_str(),
                 "lesson" | "part" | "no"
-            )
-            {
                 output[index] = "O".to_string();
                 continue;
             }
@@ -3022,13 +3394,12 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
                 continue;
             }
             if output[..index].iter().any(|role| role == "TITLE")
-                && (output[..index]
                     .iter()
-                    .enumerate()
-                    .any(|(cursor, role)| {
-                        role == "TITLE" && is_special_title_phrase(&group_text(tokens, &groups[cursor]))
-                    }))
-                && !output[..index].iter().any(|role| role.starts_with("EPISODE"))
                 && text.chars().all(|ch| ch.is_ascii_digit())
                 && text.len() <= 3
             {
@@ -3061,7 +3432,7 @@ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
     let mut candidates = Vec::new();
     let mut index = 0;
     while index < roles.len() {
-        if roles[index] != "TITLE" {
             index += 1;
             continue;
         }
@@ -3069,7 +3440,7 @@ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
         index += 1;
         loop {
             if index < roles.len()
-                && roles[index] == "TITLE"
                 && !(groups[index - 1].class_name == "BRACKET_TEXT"
                     && groups[index].class_name == "BRACKET_TEXT")
             {
@@ -3079,7 +3450,7 @@ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
             if index + 1 < roles.len()
                 && roles[index] == "O"
                 && groups[index].class_name == "SEP"
-                && roles[index + 1] == "TITLE"
             {
                 index += 2;
                 continue;
@@ -3106,7 +3477,7 @@ fn enforce_single_title_candidate(
             role.starts_with("EPISODE")
                 || matches!(
                     role.as_str(),
-                    "SEASON" | "SPECIAL" | "SOURCE" | "RESOLUTION"
                 )
         })
         .unwrap_or(roles.len());
@@ -3115,30 +3486,42 @@ fn enforce_single_title_candidate(
         .copied()
         .filter(|(_, end)| *end <= first_anchor)
         .collect();
-    let selected_pool = if before_anchor.is_empty() {
         &candidates
     } else {
         &before_anchor
     };
-    let selected = selected_pool
-    .iter()
-    .max_by_key(|(start, end)| {
-        (
-            title_candidate_score(tokens, groups, *start, *end),
-            *end,
             end - start,
-        )
-    })
-    .copied()
-    .unwrap();
     let mut output = roles.to_vec();
     let mut dropped = Vec::new();
     for (start, end) in candidates {
-        if (start, end) == selected {
             continue;
         }
         for index in start..end {
-            if output[index] == "TITLE" {
                 output[index] = "O".to_string();
                 dropped.push(index.to_string());
             }
@@ -3147,6 +3530,26 @@ fn enforce_single_title_candidate(
     (output, dropped)
 }
 fn title_candidate_score(tokens: &[String], groups: &[Group], start: usize, end: usize) -> isize {
     let text = (start..end)
         .filter(|&index| roles_candidate_text_group(&groups[index]))
@@ -3284,6 +3687,13 @@ fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
         if let Some(caps) = CJK_TITLE_TRAILING_EPISODE_RE.captures(&piece) {
             let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
             let episode = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
             if !before.is_empty() {
                 output_pieces.push(before.to_string());
                 labels.push("B-TITLE".to_string());
@@ -3371,8 +3781,9 @@ fn project_refined_tokens(
                     | "SOURCE"
                     | "RESOLUTION"
                     | "SEASON"
             ) {
-                if role == "SEASON" {
                     if let Some((pieces, labels)) = split_season_token(token) {
                         output_tokens.extend(pieces);
                         output_labels.extend(labels);
@@ -3417,13 +3828,13 @@ fn project_refined_tokens(
                     output_labels.extend(labels);
                 }
             } else {
-                if role == "TITLE" && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集")
                 {
                     output_tokens.push(token.clone());
                     output_labels.push("O".to_string());
                     continue;
                 }
-                if role == "TITLE" && token.ends_with('第') && token.chars().count() > 1 {
                     let trimmed = token.trim_end_matches('第').to_string();
                     let (pieces, labels) = normalize_generated_tokens(
                         &[trimmed, "第".to_string()],
@@ -3433,7 +3844,7 @@ fn project_refined_tokens(
                     output_labels.extend(labels);
                     continue;
                 }
-                if role == "TITLE" {
                     let (pieces, labels) = normalize_title_token(token);
                     output_tokens.extend(pieces);
                     output_labels.extend(labels);
@@ -3451,17 +3862,17 @@ fn project_refined_tokens(
 fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
     let joiners = [
-        " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！", "?",
-        "？", ";", "；", ",", "，", "、", "。", "～", "~", "－", "+", "＋", "(", ")",
-        "（", "）", "[", "]", "【", "】", "<", ">", "＜", "＞", "｢", "｣", "「", "」", "《", "》",
-        "☆", "♪", "`", "@", "‐", "‑", "–", "—", "−", "$", "＄", "∽", "꞉", "♥",
     ];
     let title_terminal_punctuation = ["!", "！", "?", "？"];
     let entity_joiners = [
-        " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！", "?",
-        "？", ";", "；", ",", "，", "、", "。", "～", "~", "－", "+", "＋", "(", ")",
-        "（", "���", "[", "]", "【", "】", "<", ">", "＜", "＞", "｢", "｣", "「", "」", "《", "》",
-        "☆", "♪", "`", "@", "&", "＆", "‐", "‑", "–", "—", "−", "$", "＄", "∽", "꞉", "♥",
     ];
     let mut output = labels.to_vec();
     for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
@@ -3498,7 +3909,8 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
                     .any(|item| item.eq_ignore_ascii_case("lupin"));
             if nearby_lupin
                 && next_number.is_some_and(|cursor| {
-                    tokens[cursor].chars().all(|ch| ch.is_ascii_digit()) && tokens[cursor].len() <= 2
                 })
             {
                 output[index] = "B-SEASON".to_string();
@@ -3515,20 +3927,21 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
             let mut cursor = index + 1;
             while cursor < tokens.len() {
                 output[cursor] = "O".to_string();
-                if matches!(tokens[cursor].as_str(), "」" | "｣" | "\"" | "'") && cursor > index + 1 {
                     break;
                 }
                 cursor += 1;
             }
             continue;
         }
-        if label == "B-TITLE" && matches!(token.as_str(), "中日" | "日中" | "英日" | "日英") {
-            let next_word = (index + 1..tokens.len()).find(|&cursor| {
-                tokens[cursor].chars().any(|ch| ch.is_alphanumeric())
-            });
-            if next_word.is_some_and(|cursor| {
-                labels[cursor] == "B-SOURCE" && tokens[cursor].contains('语')
-            }) {
                 output[index] = "B-SOURCE".to_string();
                 continue;
             }
@@ -3549,15 +3962,15 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
                         .chars()
                         .any(|ch| ch.is_alphanumeric() || ('\u{4e00}'..='\u{9fff}').contains(&ch))
             });
-            let later_episode = (index + 1..tokens.len()).any(|cursor| labels[cursor] == "B-EPISODE");
             if previous_title_word.is_none() && later_episode {
                 output[index] = "B-SEASON".to_string();
                 continue;
             }
-            let previous_word = previous_title_word.map(|cursor| tokens[cursor].to_ascii_lowercase());
-            if previous_title_word.is_some()
-                && !matches!(previous_word.as_deref(), Some("lupin"))
-            {
                 output[index] = "B-SEASON".to_string();
                 continue;
             }
@@ -3617,14 +4030,13 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
                 continue;
             }
             if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
-                && next_non_space
-                    .is_some_and(|cursor| {
-                        matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集")
-                            || tokens[cursor].starts_with('话')
-                            || tokens[cursor].starts_with('話')
-                            || tokens[cursor].starts_with('回')
-                            || tokens[cursor].starts_with('集')
-                    })
             {
                 if let Some(cursor) = previous_non_space {
                     output[cursor] = "B-EPISODE".to_string();
@@ -3675,13 +4087,16 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
             let followed_by_title_word = (index + 1..tokens.len())
                 .find(|&cursor| {
                     !joiners.contains(&tokens[cursor].as_str())
-                        && !matches!(tokens[cursor].as_str(), "-" | "－" | "," | "，" | ":" | "：")
                 })
                 .is_some_and(|cursor| {
-                    !matches!(tokens[cursor].as_str(), "[" | "【" | "(" | "（" | "]" | "】")
-                        && output
-                            .get(cursor)
-                            .is_some_and(|label| label == "B-TITLE")
                         && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
                 });
             if followed_by_title_word && matches!(previous_word.as_deref(), Some("movie" | "part"))
@@ -3715,17 +4130,16 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
                 continue;
             }
         }
-        if label == "O"
-            && token.chars().all(|ch| ch.is_ascii_digit())
-            && token.len() <= 3
-        {
             let previous_non_space = (0..index)
                 .rev()
                 .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
             let next_non_space = (index + 1..tokens.len())
                 .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
-            if previous_non_space.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "[" | "【"))
-                && next_non_space.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "]" | "】"))
                 && output[..index].iter().any(|label| label == "B-TITLE")
                 && output[index + 1..]
                     .iter()
@@ -3734,7 +4148,8 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
                 output[index] = "B-EPISODE".to_string();
                 continue;
             }
-            if previous_non_space.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "-" | "－"))
                 && output[..index].iter().any(|label| label == "B-TITLE")
                 && output[index + 1..]
                     .iter()
@@ -3763,8 +4178,9 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
             let next_non_space = (index + 1..tokens.len())
                 .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
             if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
-                && next_non_space
-                    .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集"))
             {
                 if let Some(cursor) = previous_non_space {
                     output[cursor] = "B-EPISODE".to_string();
@@ -3783,8 +4199,7 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
                 if left_title {
                     output[index] = "B-TITLE".to_string();
                     if let Some(next_word) = (index + 1..tokens.len()).find(|&cursor| {
-                        labels[cursor] == "O"
-                            && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
                     }) {
                         output[next_word] = "B-TITLE".to_string();
                     }
@@ -3848,8 +4263,10 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
                 output[index] = "B-TITLE".to_string();
             }
         }
-        if matches!(token.as_str(), "]" | "】" | ")" | "）" | ">" | "＞" | "｣" | "」")
-            && index > 0
             && output[index - 1] == "B-TITLE"
             && title_span_has_labeled_opener(&tokens[..index], &output[..index], token)
         {
@@ -3885,16 +4302,105 @@ fn closer_matches_opener(closer: &str, opener: &str) -> bool {
     )
 }
 fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> {
     let (key, tokens, _classes, groups) = template_key_for_filename(filename);
     if groups.len() != roles.len() {
         return None;
     }
     let roles = adjust_contextual_roles(&tokens, &groups, roles);
     let (roles, dropped) = enforce_single_title_candidate(&tokens, &groups, &roles);
     let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
     let (tokens, labels) = repair_compact_sxe_tokens(tokens, labels);
     let labels = smooth_title_spans(&tokens, &labels);
     if tokens.len() != labels.len() {
         return None;
     }
@@ -3918,13 +4424,37 @@ fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Re
 mod tests {
     use super::*;
-    fn labels_for(filename: &str) -> Vec<(String, String)> {
         let (key, _, _, _) = template_key_for_filename(filename);
         let roles = suggested_roles(&key);
         let record = dmhy_record(filename, "tpl_test", &roles).unwrap();
         record.tokens.into_iter().zip(record.labels).collect()
     }
     #[test]
     fn rich_title_candidates_keep_readable_spacing() {
         let row = rich_annotation_for(
@@ -3937,10 +4467,93 @@ mod tests {
         );
     }
     #[test]
     fn required_regressions() {
         let title_91 = labels_for("Title 91 EP 01 [1080p]");
-        assert!(title_91.contains(&("91".to_string(), "B-TITLE".to_string())));
         assert!(title_91.contains(&("EP".to_string(), "O".to_string())));
         assert!(title_91.contains(&("01".to_string(), "B-EPISODE".to_string())));
@@ -3989,9 +4602,7 @@ mod tests {
         assert!(!episode_version_title.contains(&("10v2".to_string(), "B-TITLE".to_string())));
         let episode_version_lang =
             labels_for("[GalaxyRailroad-888] Yu-Gi-Oh! GO RUSH !! [043v2_GB]");
-        assert!(
-            episode_version_lang.contains(&("043v2".to_string(), "B-EPISODE".to_string()))
-        );
         assert!(episode_version_lang.contains(&("GB".to_string(), "B-SOURCE".to_string())));
         let cursed = labels_for("[Coalgirls]_C3-Cube_x_Cursed_x_Curious_01_[8E416230]");
@@ -4034,11 +4645,13 @@ mod tests {
         let music_title =
             labels_for("[アニメ BD] うたの☆プリンスさまっ♪ マジLOVE2000%  第01話「ポワゾンKISS」(1920x1080 x264 Hi10p AAC)");
         assert!(music_title.contains(&("♪".to_string(), "B-TITLE".to_string())));
-        let cm_version = labels_for("[U2-Rip]Inari, Konkon, Koi Iroha[CMv2][Hi10p_1080p][x264_flac]");
         assert!(cm_version.contains(&("CMv2".to_string(), "B-SPECIAL".to_string())));
         assert!(!cm_version.contains(&("CMv2".to_string(), "B-TITLE".to_string())));
-        let hdma_block =
-            labels_for("[Niconeiko Works] Gekijouban Violet Evergarden [1080P_Ma10p_DTS-HDMA][CM01]");
         assert!(hdma_block.contains(&("Gekijouban".to_string(), "B-TITLE".to_string())));
         assert!(hdma_block.contains(&("1080P".to_string(), "B-RESOLUTION".to_string())));
         assert!(hdma_block.contains(&("HDMA".to_string(), "B-SOURCE".to_string())));
@@ -4068,14 +4681,14 @@ mod tests {
         assert!(!zom.contains(&("100".to_string(), "B-EPISODE".to_string())));
         assert!(zom.contains(&("Animatics02".to_string(), "B-SPECIAL".to_string())));
-        let sky = labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
-        assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
-        assert!(!sky.contains(&("海贼王".to_string(), "B-TITLE".to_string())));
         assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
-        let happy = labels_for(
-            "My.Happy.Marriage.S01E01.The.Meeting.1080p.NF.WEB-DL.AAC2.0.H.264-VARYG",
-        );
         assert!(happy.contains(&("01".to_string(), "B-SEASON".to_string())));
         assert!(happy.contains(&("01".to_string(), "B-EPISODE".to_string())));
         assert!(!happy.contains(&("0".to_string(), "B-EPISODE".to_string())));
@@ -4091,8 +4704,9 @@ mod tests {
         assert!(!akira.contains(&("AVC".to_string(), "B-TITLE".to_string())));
         assert!(akira.contains(&("AVC".to_string(), "B-SOURCE".to_string())));
-        let doraemon =
-            labels_for("[DORASUB][DORAEMON1979][1998.03.07][WEB][1998x1080][AVC][简日]哆啦A梦归来了");
         assert!(doraemon.contains(&("DORAEMON1979".to_string(), "B-TITLE".to_string())));
         assert!(doraemon.contains(&("WEB".to_string(), "B-SOURCE".to_string())));
         assert!(!doraemon.contains(&("WEB".to_string(), "B-TITLE".to_string())));
@@ -4114,8 +4728,9 @@ mod tests {
         assert!(bang_season.contains(&("01".to_string(), "B-EPISODE".to_string())));
         assert!(!bang_season.contains(&("01".to_string(), "B-SEASON".to_string())));
-        let basket =
-            labels_for("[Nekomoe kissaten&VCB-Studio] Fruits Basket 1st Season [24][1080p][x264_aac][sc]");
         assert!(basket.contains(&("Fruits".to_string(), "B-TITLE".to_string())));
         assert!(basket.contains(&("1st".to_string(), "B-SEASON".to_string())));
         assert!(basket.contains(&("Season".to_string(), "B-SEASON".to_string())));
@@ -4131,14 +4746,17 @@ mod tests {
         assert!(full.contains(&("01".to_string(), "B-EPISODE".to_string())));
         assert!(!full.contains(&("01".to_string(), "B-TITLE".to_string())));
-        let r18 = labels_for("[HYSUB]Skirt no Naka wa Kedamono Deshita.[01_R18][BIG5_MP4][1280X720]");
         assert!(r18.contains(&("01".to_string(), "B-EPISODE".to_string())));
         assert!(!r18.contains(&("01".to_string(), "B-TITLE".to_string())));
         let ddp = labels_for("Akuma.Kun.S01E02.1080p.NF.WEB-DL.DDP5.1.H.264");
         assert!(ddp.contains(&("02".to_string(), "B-EPISODE".to_string())));
         assert!(!ddp.contains(&("1".to_string(), "B-EPISODE".to_string())));
-        assert!(ddp.iter().any(|(token, label)| token.starts_with("DDP") && label == "B-SOURCE"));
         let aac_space = labels_for("Bleach S01E02 AAC 2.0 H.264");
         assert!(aac_space.contains(&("02".to_string(), "B-EPISODE".to_string())));
@@ -4156,7 +4774,8 @@ mod tests {
         assert!(air_episode.contains(&("Air".to_string(), "B-TITLE".to_string())));
         assert!(air_episode.contains(&("01".to_string(), "B-EPISODE".to_string())));
-        let decimal_episode = labels_for("[HoneyGod] Usagi Drop [02.5][x264_10bit][粤日双语][BDrip_1080p]");
         assert!(decimal_episode.contains(&("02".to_string(), "B-EPISODE".to_string())));
         assert!(decimal_episode.contains(&(".".to_string(), "B-EPISODE".to_string())));
         assert!(decimal_episode.contains(&("5".to_string(), "B-EPISODE".to_string())));
@@ -4202,7 +4821,8 @@ mod tests {
         assert!(gundam.contains(&("00".to_string(), "B-TITLE".to_string())));
         assert!(gundam.contains(&("01".to_string(), "B-EPISODE".to_string())));
-        let spy = labels_for("[Studio GreenTea] Spy x Family [38][WebRip][HEVC-10bit 1080p AAC ASSx2]");
         assert!(spy.contains(&("Studio".to_string(), "B-GROUP".to_string())));
         assert!(spy.contains(&("Spy".to_string(), "B-TITLE".to_string())));
         assert!(spy.contains(&("x".to_string(), "B-TITLE".to_string())));
@@ -4210,14 +4830,17 @@ mod tests {
         assert!(spy.contains(&("38".to_string(), "B-EPISODE".to_string())));
         assert!(!spy.contains(&("Spy".to_string(), "B-SPECIAL".to_string())));
-        let spy_s3 = labels_for("[Feibanyama] SPY x FAMILY S3 - 01 [IQIYI WebRip 2160p HEVC-10bit OPUS Multi-Subs]");
         assert!(spy_s3.contains(&("Feibanyama".to_string(), "B-GROUP".to_string())));
         assert!(spy_s3.contains(&("SPY".to_string(), "B-TITLE".to_string())));
         assert!(spy_s3.contains(&("FAMILY".to_string(), "B-TITLE".to_string())));
         assert!(spy_s3.contains(&("3".to_string(), "B-SEASON".to_string())));
         assert!(spy_s3.contains(&("01".to_string(), "B-EPISODE".to_string())));
-        let slime = labels_for("[Nekomoe kissaten&VCB-Studio] Slime 300 [Menu01][Ma10p_1080p][x265_flac]");
         assert!(slime.contains(&("Slime".to_string(), "B-TITLE".to_string())));
         assert!(
             slime.contains(&("300".to_string(), "B-TITLE".to_string())),
@@ -4296,7 +4919,8 @@ mod tests {
         assert!(was_trimmed);
         assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
-        let plain_season_dir = "Season 1/[Kamigami] Junjou Romantica 1 - 01 [BD 1280x720 x264 AAC Sub(Chs,Jap)]";
         let (trimmed, was_trimmed) = training_filename_for(plain_season_dir);
         assert!(was_trimmed);
         assert_eq!(
@@ -4311,12 +4935,17 @@ mod tests {
             "[Airota&ANK-Raws] 亜人ちゃんは語りたい (BDrip 1920x1080 HEVC-YUV420P10 FLAC SUP)/Menu (Vol.1)";
         let (trimmed, was_trimmed) = training_filename_for(menu_parent);
         assert!(was_trimmed);
-        assert_eq!(trimmed, "[Airota&ANK-Raws] 亜人ちゃんは語りたい Menu (Vol.1)");
         assert!(has_encoding_noise(
             "[4K_SDR][DBD-Raws&HKG瀛楀箷绲刔[鏃ュ湪鏍″湌][01][2160P]"
         ));
-        assert!(has_encoding_noise("ATRI -My Dear Moments-/娆″洖浜堝憡 EP01 Log01"));
         assert!(has_encoding_noise(
             "[2002-2003] Mew Mew_鏉变含鍠靛柕(鏉变含銉熴儱銈︺儫銉ャ偊)_TV"
         ));
@@ -4373,7 +5002,8 @@ mod tests {
             "Season 4 E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
         );
-        let najica = "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦)_TV/SourceUnknown.RMVB.640x480.twHard/01";
         let (trimmed, was_trimmed) = training_filename_for(najica);
         assert!(was_trimmed);
         assert_eq!(trimmed, "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦) 01");
@@ -4385,10 +5015,7 @@ mod tests {
         let galient = "[1984-1986] Galient_機甲界(機甲界ガリアン)_TV.OVA/[1984-1985] Galient_機甲界(機甲界ガリアン)_TV/DVDRip.MKV.720x480.ruSub.左右黑邊保留/01";
         let (trimmed, was_trimmed) = training_filename_for(galient);
         assert!(was_trimmed);
-        assert_eq!(
-            trimmed,
-            "[1984-1985] Galient_機甲界(機甲界ガリアン) 01"
-        );
         let galient_labels = labels_for(&trimmed);
         assert!(galient_labels.contains(&("Galient".to_string(), "B-TITLE".to_string())));
         assert!(!galient_labels.contains(&("TV".to_string(), "B-TITLE".to_string())));
@@ -4397,9 +5024,13 @@ mod tests {
         let nced = "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs]/NCED";
         let (trimmed, was_trimmed) = training_filename_for(nced);
         assert!(was_trimmed);
-        assert_eq!(trimmed, "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs] NCED");
-        let sakura = "Card Captor Sakura Chinese/魔卡少女樱(台配国语)/第01集 小樱与不可思议的魔法书";
         let (trimmed, was_trimmed) = training_filename_for(sakura);
         assert!(was_trimmed);
         assert_eq!(
@@ -4418,8 +5049,9 @@ mod tests {
         assert!(volume.contains(&("MENU02".to_string(), "B-SPECIAL".to_string())));
         assert!(!volume.contains(&("01".to_string(), "B-EPISODE".to_string())));
-        let aria_notice =
-            labels_for("[KNA-Subs&ANK-Raws] 緋弾のアリアAA 番宣1 (BDrip 1920x1080 HEVC-YUV420P10 FLAC)");
         assert!(aria_notice.contains(&("緋弾のアリア".to_string(), "B-TITLE".to_string())));
         assert!(aria_notice.contains(&("番宣".to_string(), "B-SPECIAL".to_string())));
         assert!(aria_notice.contains(&("1".to_string(), "B-SPECIAL".to_string())));
@@ -4465,7 +5097,9 @@ mod tests {
         assert!(!mahoro.contains(&("Full".to_string(), "B-TITLE".to_string())));
         assert!(mahoro.contains(&("01".to_string(), "B-EPISODE".to_string())));
-        let kitaro = labels_for("[1985.10-1988.02] Kitaro_鬼太郎 第3期(ゲゲゲの鬼太郎)_TV 036 異次元妖怪かまなり");
         assert!(kitaro.contains(&("Kitaro".to_string(), "B-TITLE".to_string())));
         assert!(kitaro.contains(&("3".to_string(), "B-SEASON".to_string())));
         assert!(kitaro.contains(&("036".to_string(), "B-EPISODE".to_string())));
@@ -4521,7 +5155,8 @@ mod tests {
         assert!(ghiblies.contains(&("2".to_string(), "B-TITLE".to_string())));
         assert!(!ghiblies.contains(&("2".to_string(), "B-EPISODE".to_string())));
-        let tv_spot = labels_for("[RUELL-Next] Fruits Basket TV Spot 1 (DVD 768x576 x264 AAC) [49531416]");
         assert!(tv_spot.contains(&("TV".to_string(), "B-SPECIAL".to_string())));
         assert!(tv_spot.contains(&("1".to_string(), "B-SPECIAL".to_string())));
         assert!(!tv_spot.contains(&("1".to_string(), "B-EPISODE".to_string())));
@@ -4536,18 +5171,21 @@ mod tests {
         assert!(hi10_source.contains(&("Hi10".to_string(), "B-SOURCE".to_string())));
         assert!(!hi10_source.contains(&("Hi10".to_string(), "B-GROUP".to_string())));
-        let souten =
-            labels_for("[苍天之拳].[Fosky_Fansub][Souten_No_Ken][DVDRIP][01][H.264_FLAC][848x480][CDD495FC]");
         assert!(souten.contains(&("Fosky".to_string(), "B-GROUP".to_string())));
         assert!(!souten.contains(&("苍天之拳".to_string(), "B-GROUP".to_string())));
         assert!(souten.contains(&("Souten".to_string(), "B-TITLE".to_string())));
-        let bonjour =
-            labels_for("(2014Q4) Bonjour♪恋味パティスリー 第01話 「Lesson 1」 (1280x720 x265 10bit AAC)");
         assert!(bonjour.contains(&("01".to_string(), "B-EPISODE".to_string())));
         assert!(!bonjour.contains(&("1".to_string(), "B-EPISODE".to_string())));
-        let durarara = labels_for("[VCB-Studio] Durarara!!×2 Ketsu [Menu01][Ma10p_1080p][x265_flac]");
         assert!(durarara.contains(&("Durarara".to_string(), "B-TITLE".to_string())));
         assert!(durarara.contains(&("2".to_string(), "B-TITLE".to_string())));
         assert!(!durarara.contains(&("2".to_string(), "B-EPISODE".to_string())));
@@ -4567,13 +5205,15 @@ mod tests {
         assert!(bleach_movie.contains(&("3".to_string(), "B-TITLE".to_string())));
         assert!(!bleach_movie.contains(&("3".to_string(), "B-EPISODE".to_string())));
-        let conan_movie =
-            labels_for("[DBD-Raws][Detective Conan Movie 27 The Million-Dollar Pentagram][PV][01][1080P]");
         assert!(conan_movie.contains(&("27".to_string(), "B-TITLE".to_string())));
         assert!(conan_movie.contains(&("PV".to_string(), "B-SPECIAL".to_string())));
-        let madoka_movie =
-            labels_for("[DBD-Raws][Puella Magi Madoka Magica the Movie 01 Beginnings][NCED][1080P]");
         assert!(madoka_movie.contains(&("01".to_string(), "B-TITLE".to_string())));
         assert!(madoka_movie.contains(&("Beginnings".to_string(), "B-TITLE".to_string())));
@@ -4593,7 +5233,8 @@ mod tests {
         assert!(lapis.contains(&("꞉".to_string(), "B-TITLE".to_string())));
         assert!(lapis.contains(&("LiGHTs".to_string(), "B-TITLE".to_string())));
-        let rezero = labels_for("TVアニメ『Re：ゼロから始める異世界生活』第10話「鬼がかったやり方」予告");
         assert!(!rezero.contains(&("TV".to_string(), "B-TITLE".to_string())));
         assert!(!rezero.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
         assert!(rezero.contains(&("Re".to_string(), "B-TITLE".to_string())));
@@ -4604,9 +5245,8 @@ mod tests {
         assert!(!shark.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
         assert!(shark.contains(&("おでかけ子ザメ".to_string(), "B-TITLE".to_string())));
-        let creditless = labels_for(
-            "[ANK-Raws] デート・ア・ライブⅡ Creditless ED (Bdrip 1920x1080 HEVC FLAC)",
-        );
         assert!(creditless.contains(&("Creditless".to_string(), "B-SPECIAL".to_string())));
         assert!(creditless.contains(&("ED".to_string(), "B-SPECIAL".to_string())));
@@ -4614,7 +5254,9 @@ mod tests {
         assert!(no_number.contains(&("081".to_string(), "B-EPISODE".to_string())));
         assert!(!no_number.contains(&("1".to_string(), "B-EPISODE".to_string())));
-        let bilingual = labels_for("辉夜大小姐想让我告白~天才们的恋爱头脑战~.S2-01.中日双语.云光字幕组.[1080p]");
         assert!(bilingual.contains(&("中日".to_string(), "B-SOURCE".to_string())));
         assert!(!bilingual.contains(&("中日".to_string(), "B-TITLE".to_string())));
@@ -4639,7 +5281,8 @@ mod tests {
         assert!(one_room.contains(&("Second".to_string(), "B-SEASON".to_string())));
         assert!(one_room.contains(&("Season".to_string(), "B-SEASON".to_string())));
-        let jade = labels_for("[GM-Team][国漫][诛仙 第2季][Jade Dynasty Ⅱ][2024][12][AVC][GB][1080P]");
         assert!(jade.contains(&("Jade".to_string(), "B-TITLE".to_string())));
         assert!(jade.contains(&("Dynasty".to_string(), "B-TITLE".to_string())));
         assert!(jade.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
@@ -4662,7 +5305,8 @@ mod tests {
         assert!(fox.contains(&("Fox".to_string(), "B-TITLE".to_string())));
         assert!(fox.contains(&("Ⅷ".to_string(), "B-SEASON".to_string())));
-        let kage = labels_for("[LKSUB][Kage no Jitsuryokusha ni Naritakute! 2nd Season][03][GB][720P]");
         assert!(kage.contains(&("2nd".to_string(), "B-SEASON".to_string())));
         assert!(kage.contains(&(" ".to_string(), "B-SEASON".to_string())));
         assert!(kage.contains(&("Season".to_string(), "B-SEASON".to_string())));
@@ -4677,15 +5321,19 @@ mod tests {
         assert!(date_live_special.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
         assert!(date_live_special.contains(&("CM01".to_string(), "B-SPECIAL".to_string())));
-        let lupin_part =
-            labels_for("[SnowDream][Part 5_Lupin Sansei Part 5][01][BIG5][720P]");
         assert!(lupin_part.contains(&("Lupin".to_string(), "B-TITLE".to_string())));
         assert!(lupin_part.contains(&("Sansei".to_string(), "B-TITLE".to_string())));
         assert!(!lupin_part.contains(&("Part".to_string(), "B-TITLE".to_string())));
         assert!(lupin_part.contains(&("5".to_string(), "B-SEASON".to_string())));
         assert!(!lupin_part.contains(&("5".to_string(), "B-SPECIAL".to_string())));
-        let roman_leaf = dmhy_record("Ⅰ 001 魯邦燃起了鬥志", "tpl_test", &suggested_roles("TEXT SEP EPISODE SEP TEXT")).unwrap();
         assert!(roman_leaf
             .tokens
             .iter()
@@ -4735,11 +5383,14 @@ mod tests {
         assert!(ajin_movie.contains(&("Ajin".to_string(), "B-TITLE".to_string())));
         assert!(ajin_movie.contains(&("01".to_string(), "B-SPECIAL".to_string())));
-        let eien = labels_for("[Nekomoe kissaten&LoliHouse] Eien no 831 [WebRip 1080p HEVC-10bit AAC ASSx2]");
         assert!(eien.contains(&("Eien".to_string(), "B-TITLE".to_string())));
         assert!(eien.contains(&("831".to_string(), "B-TITLE".to_string())));
-        let ep_only = dmhy_record("Ep.25", "tpl_test", &suggested_roles("TEXT SEP EPISODE")).unwrap();
         assert!(audit_warnings(&ep_only).contains(&"no_title".to_string()));
     }
 }

 struct Stats {
     seen: usize,
     skipped_encoding_noise: usize,
+    skipped_music_audio_collection: usize,
     trimmed_parent_path: usize,
     skipped_no_recipe: usize,
     skipped_sample_cap: usize,
     Skipped {
         reason: &'static str,
         trimmed_parent: bool,
+        example: Option<String>,
+        warnings: Vec<String>,
     },
 }
 });
 static EPISODE_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:\.\d{1,2})?(?:END)?$").unwrap());
+static DECIMAL_EPISODE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\d{1,3}\.\d{1,2}$").unwrap());
 static NUMERIC_TITLE_PREFIX_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^\d{1,3}(?:[./-]\d{1,3})?$").unwrap());
 static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
 });
 static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
+static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap());
 static CJK_EPISODE_EMBEDDED_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^(.+?)(第?\d{1,4}[话話回集])(.{0,32})$").unwrap());
 static CJK_TITLE_TRAILING_EPISODE_RE: Lazy<Regex> =
     Regex::new(r"(?i)^(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|Eighth|Ninth|Tenth)$")
         .unwrap()
 });
+static SEASON_WORD_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^(?:Season|Saison)$").unwrap());
+static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap()
+});
 static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
 static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
     Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:BD)?Menu(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:BD[-_. ]?)?Spot(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
 static DATE_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap());
 static DATE_RANGE_MIXED_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(
+        r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}\s*[-~]\s*(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$",
+    )
+    .unwrap()
 });
 static CJK_DATE_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}年\d{1,2}月\d{1,2}日$").unwrap());
 static SIMPLE_EPISODE_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}$").unwrap());
 static SPECIAL_SPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_.-]+").unwrap());
+static MUSIC_COLLECTION_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(
+        r"(?i)(?:^|[^A-Z0-9])(?:MUSIC\s*CLIP|MUSIC\s+COLLECTION|SOUNDTRACK|OST|CHARACTER\s+SONG|DRAMA\s+CD|CD\s+ALBUM|BONUS\s+CD)(?:$|[^A-Z0-9])",
+    )
+    .unwrap()
+});
 fn main() -> Result<()> {
     let args = Args::parse();
     let mut label_counts: HashMap<String, usize> = HashMap::new();
     let mut template_counts: HashMap<String, usize> = HashMap::new();
     let mut examples = Vec::new();
+    let mut skipped_music_audio_collection_examples = Vec::new();
+    let mut skipped_low_frequency_audit_warning_counts: HashMap<String, usize> = HashMap::new();
+    let mut skipped_low_frequency_audit_warning_examples: HashMap<String, Vec<String>> =
+        HashMap::new();
     let mut writer = BufWriter::new(File::create(&args.output)?);
     for item in processed {
         match item {
             Processed::Skipped {
                 reason,
                 trimmed_parent,
+                example,
+                warnings,
             } => {
                 if trimmed_parent {
                     stats.trimmed_parent_path += 1;
                 }
                 match reason {
                     "encoding_noise" => stats.skipped_encoding_noise += 1,
+                    "music_audio_collection" => {
+                        stats.skipped_music_audio_collection += 1;
+                        if let Some(example) = example {
+                            if skipped_music_audio_collection_examples.len() < 20 {
+                                skipped_music_audio_collection_examples.push(example);
+                            }
+                        }
+                    }
                     "no_recipe" => stats.skipped_no_recipe += 1,
                     "sample_cap" => stats.skipped_sample_cap += 1,
                     "role_mismatch" => stats.skipped_role_mismatch += 1,
                     "low_frequency_audit_warning" => {
+                        stats.skipped_low_frequency_audit_warning += 1;
+                        for warning in warnings {
+                            *skipped_low_frequency_audit_warning_counts
+                                .entry(warning.clone())
+                                .or_default() += 1;
+                            if let Some(example) = example.as_ref() {
+                                let bucket = skipped_low_frequency_audit_warning_examples
+                                    .entry(warning)
+                                    .or_default();
+                                if bucket.len() < 10 {
+                                    bucket.push(example.clone());
+                                }
+                            }
+                        }
                     }
                     _ => {}
                 }
         "label_counts": label_counts,
         "top_template_counts": top_template_counts,
         "examples": examples,
+        "skipped_music_audio_collection_examples": skipped_music_audio_collection_examples,
+        "skipped_low_frequency_audit_warning_counts": skipped_low_frequency_audit_warning_counts,
+        "skipped_low_frequency_audit_warning_examples": skipped_low_frequency_audit_warning_examples,
         "implementation": "rust_dmhy_template_apply"
     });
     fs::write(
     if !path.exists() {
         return Ok(Vec::new());
     }
+    let file =
+        File::open(path).with_context(|| format!("failed to open whitelist {}", path.display()))?;
     let mut lines = Vec::new();
     for line in BufReader::new(file).lines() {
         let line = line?;
         if !args.keep_encoding_noise
             && (has_encoding_noise(&original)
                 || has_non_anime_noise(&original)
+                || has_music_collection_noise(&original)
                 || has_abstract_path_noise(&original))
         {
             skipped_encoding_noise += 1;
         if !args.keep_encoding_noise
             && (has_encoding_noise(&original)
                 || has_non_anime_noise(&original)
+                || has_music_collection_noise(&original)
                 || has_abstract_path_noise(&original))
         {
             continue;
             if !args.keep_encoding_noise
                 && (has_encoding_noise(original)
                     || has_non_anime_noise(original)
+                    || has_music_collection_noise(original)
                     || has_abstract_path_noise(original))
             {
                 return None;
     let (key, tokens, _classes, groups) = template_key_for_filename(segment);
     let suggested = suggested_roles(&key);
     let roles = adjust_contextual_roles(&tokens, &groups, &suggested);
+    let roles = refine_semantic_roles(&tokens, &groups, &roles);
     let candidates = rich_candidates_for_segment(segment, &tokens, &groups, &roles, is_leaf);
     json!({
         "index": index,
             continue;
         }
         output.push(json!({
+            "role": fine_title_role_for_candidate(&roles, start, end)
+                .unwrap_or_else(|| fine_title_role(segment, &text, is_leaf, candidate_index, title_ranges.len()).to_string()),
             "coarse_role": "TITLE",
             "text": text,
             "group_start": start,
         }));
     }
     for (group_index, role) in roles.iter().enumerate() {
+        if is_title_role(role) || role == "O" || role == "HASH" {
             continue;
         }
         let text = group_text(tokens, &groups[group_index]);
     output
 }
+fn fine_title_role_for_candidate(roles: &[String], start: usize, end: usize) -> Option<String> {
+    let mut entities: Vec<&str> = roles[start..end]
+        .iter()
+        .filter_map(|role| title_entity_from_role(role))
+        .filter(|entity| *entity != "TITLE")
+        .collect();
+    entities.sort();
+    entities.dedup();
+    match entities.len() {
+        0 => None,
+        1 => Some(entities[0].to_string()),
+        _ => Some("TITLE_MIXED".to_string()),
+    }
+}
 fn candidate_text(tokens: &[String], groups: &[Group], start: usize, end: usize) -> String {
     let Some(first) = groups.get(start).and_then(|group| group.indices.first()) else {
         return String::new();
         "GROUP" => "RELEASE_GROUP",
         "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => "EPISODE",
         "SEASON" => "SEASON",
+        "PATH_SEASON" => "PATH_SEASON",
+        "TAG" => "TAG",
         "SPECIAL" | "VOLUME" => "SPECIAL",
         "RESOLUTION" => "RESOLUTION",
         "SOURCE" => "SOURCE",
 fn audit_warnings(record: &Record) -> Vec<String> {
     let mut warnings = Vec::new();
+    let title_texts = title_entity_texts(&record.tokens, &record.labels);
     let title_spans = title_texts.len();
     if title_spans == 0 {
         warnings.push("no_title".to_string());
+    } else if repeated_title_entity_spans(&record.labels) {
         warnings.push("multiple_title_spans".to_string());
     }
     if !title_texts.is_empty() && title_texts.iter().all(|title| generic_title_text(title)) {
         warnings.push("encoding_noise_survived".to_string());
     }
     for (index, token) in record.tokens.iter().enumerate() {
+        let entity = record
+            .labels
+            .get(index)
+            .and_then(|label| label_entity(label));
         let cleaned = strip_wrapper(token);
         if HASH_RE.is_match(token) && record.labels.get(index).is_some_and(|label| label != "O") {
             warnings.push("hash_labeled".to_string());
             break;
         }
+        if EPISODE_VERSION_RE.is_match(&compact_for_classify(&cleaned)) && entity != Some("EPISODE")
         {
             warnings.push("episode_version_missing_label".to_string());
         }
         .or_else(|| label.strip_prefix("I-"))
 }
+fn title_entity_texts(tokens: &[String], labels: &[String]) -> Vec<String> {
     let mut spans = Vec::new();
     let mut current = String::new();
+    let mut current_entity: Option<String> = None;
     for (token, label) in tokens.iter().zip(labels.iter()) {
+        let entity = label_entity(label).filter(|entity| is_title_entity(entity));
+        if entity.is_some() && current_entity.as_deref() == entity {
             current.push_str(token);
         } else {
+            if !current.trim().is_empty() {
+                spans.push(current.trim().to_string());
+            }
             current.clear();
+            current_entity = entity.map(str::to_string);
+            if entity.is_some() {
+                current.push_str(token);
+            }
         }
     }
     if !current.trim().is_empty() {
     spans
 }
+fn repeated_title_entity_spans(labels: &[String]) -> bool {
+    let mut seen = HashSet::new();
+    let mut previous: Option<String> = None;
+    for label in labels {
+        let entity = label_entity(label)
+            .filter(|entity| is_title_entity(entity))
+            .map(str::to_string);
+        if entity.is_some() && entity != previous {
+            let entity = entity.clone().unwrap();
+            if !seen.insert(entity) {
+                return true;
+            }
+        }
+        previous = entity;
+    }
+    false
+}
 fn generic_title_text(text: &str) -> bool {
     matches!(
         text.trim().to_ascii_lowercase().as_str(),
+        "tv" | "movie"
             | "mov"
             | "sample"
             | "commercial"
     recipes: &HashMap<String, Recipe>,
     sample_counters: &HashMap<String, AtomicUsize>,
 ) -> Processed {
+    if !args.keep_encoding_noise && has_music_collection_noise(original) {
+        return Processed::Skipped {
+            reason: "music_audio_collection",
+            trimmed_parent: false,
+            example: Some(original.to_string()),
+            warnings: Vec::new(),
+        };
+    }
     if !args.keep_encoding_noise
         && (has_encoding_noise(original)
             || has_non_anime_noise(original)
         return Processed::Skipped {
             reason: "encoding_noise",
             trimmed_parent: false,
+            example: None,
+            warnings: Vec::new(),
         };
     }
     let (training_filename, trimmed_parent) = training_filename_for(original);
             return Processed::Skipped {
                 reason: "no_recipe",
                 trimmed_parent,
+                example: None,
+                warnings: Vec::new(),
             }
         }
     };
             return Processed::Skipped {
                 reason: "sample_cap",
                 trimmed_parent,
+                example: None,
+                warnings: Vec::new(),
             };
         }
     }
         return Processed::Skipped {
             reason: "role_mismatch",
             trimmed_parent,
+            example: None,
+            warnings: Vec::new(),
         };
     }
     let mut record = match dmhy_record(&training_filename, &recipe.template_id, &recipe.roles) {
             return Processed::Skipped {
                 reason: "role_mismatch",
                 trimmed_parent,
+                example: None,
+                warnings: Vec::new(),
             }
         }
     };
         return Processed::Skipped {
             reason: "low_frequency_audit_warning",
             trimmed_parent,
+            example: Some(record.filename.clone()),
+            warnings,
         };
     }
     if trimmed_parent {
     roles
 }
+fn refine_semantic_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
+    let mut output = roles.to_vec();
+    let mut segment_end = groups
+        .iter()
+        .position(|group| group.class_name == "PATH")
+        .unwrap_or(groups.len());
+    let mut is_path_segment = segment_end < groups.len();
+    for index in 0..groups.len() {
+        if groups[index].class_name == "PATH" {
+            segment_end = groups[index + 1..]
+                .iter()
+                .position(|group| group.class_name == "PATH")
+                .map(|offset| index + 1 + offset)
+                .unwrap_or(groups.len());
+            is_path_segment = segment_end < groups.len();
+            continue;
+        }
+        let text = group_text(tokens, &groups[index]);
+        let bracketed = is_bracket_group(&groups[index]);
+        if is_category_tag_text(&text, bracketed, is_path_segment)
+            && matches!(output[index].as_str(), "O" | "TITLE" | "GROUP" | "SPECIAL")
+        {
+            output[index] = "TAG".to_string();
+            continue;
+        }
+        if output[index] == "SEASON" && is_path_segment {
+            output[index] = "PATH_SEASON".to_string();
+            continue;
+        }
+        if output[index] == "TITLE" {
+            output[index] = title_role_for_text(&text, is_path_segment);
+        }
+    }
+    output
+}
 fn filename_has_title(filename: &str) -> bool {
     let (key, _, _, _) = template_key_for_filename(filename);
+    suggested_roles(&key).iter().any(|role| is_title_role(role))
 }
 fn training_filename_for(original: &str) -> (String, bool) {
                 && path_segment_starts_with_episode(parts[parts.len() - 1])
                 && !leaf_has_full_title_after_episode(parts[parts.len() - 1])))
     {
+        if let Some(parent) = parts[..parts.len() - 1].iter().rev().find(|part| {
+            let trimmed = trim_parent_title_segment(part);
+            filename_has_title(&trimmed) && !path_segment_is_media_noise(&trimmed)
+        }) {
             let parent = trim_parent_title_segment(parent.trim());
             return (
+                format!("{} {}", parent, parts[parts.len() - 1].trim()),
                 true,
             );
         }
         return true;
     }
     let markers = [
+        "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛", "楀", "箷",
+        "绲", "刔", "鏃", "湪", "鏍", "犲", "儚", "鐗", "吀", "铦", "躲", "伄", "椋", "伓", "姘",
+        "帽", "娆", "洖", "浜", "堝", "澶", "湴", "鐒", "銇", "銈", "銉", "偅", "偗", "儱", "儫",
+        "兗", "仧", "鏉变", "鍠靛", "銉熴", "銈︺", "瀵掕", "潐楦", "常涔", "涓歖", "缁堟", "湯鍒",
+        "瀵诲", "線浣", "曟柟", "瓒呴", "绁炪", "偘銉", "兇銈", "銉砡", "銉砕", "杩风", "硦澶",
+        "銇淬", "仧銉", "銉嗐", "偅銈", "銈躲",
     ];
     let marker_hits = markers
         .iter()
         .filter(|ch| ('\u{ff61}'..='\u{ff9f}').contains(ch))
         .count();
     let latin_mojibake = value.split_whitespace().any(|part| {
+        part.chars()
+            .any(|ch| matches!(ch, '帽' | '茅' | '脳' | '锛'))
             && part.chars().any(|ch| ch.is_ascii_alphabetic())
     });
     marker_hits >= 2 || (marker_hits >= 1 && halfwidth_hits >= 1) || latin_mojibake
 fn has_non_anime_noise(value: &str) -> bool {
     let normalized = value.replace('\\', "/").trim().to_ascii_lowercase();
+    normalized == "mtv"
+        || normalized.starts_with("mtv/")
+        || normalized.contains("/mtv/")
         || value.contains("[旅游")
         || value.contains("[旅游番")
         || normalized.contains("tokyo deep")
         .to_ascii_lowercase()
 }
+fn normalized_tag_text(value: &str) -> String {
+    value
+        .replace(['_', '.', '-', '・'], " ")
+        .split_whitespace()
+        .collect::<Vec<_>>()
+        .join(" ")
+        .trim()
+        .to_ascii_lowercase()
+}
+fn compact_tag_text(value: &str) -> String {
+    value
+        .chars()
+        .filter(|ch| ch.is_alphanumeric())
+        .collect::<String>()
+        .to_ascii_lowercase()
+}
+fn is_bracket_group(group: &Group) -> bool {
+    group.class_name.starts_with("BRACKET_")
+}
+fn is_category_tag_text(text: &str, bracketed: bool, path_segment: bool) -> bool {
+    let cleaned = strip_wrapper(text);
+    let trimmed = cleaned.trim();
+    if trimmed.is_empty() {
+        return false;
+    }
+    if (bracketed || path_segment) && (DATE_RE.is_match(trimmed) || YEAR_RANGE_RE.is_match(trimmed))
+    {
+        return true;
+    }
+    if (bracketed || path_segment)
+        && matches!(
+            trimmed,
+            "国漫" | "國漫" | "日漫" | "剧场版" | "劇場版" | "新番"
+        )
+    {
+        return true;
+    }
+    if (bracketed || path_segment)
+        && (trimmed.ends_with("月新番") || trimmed.ends_with("月新番合集"))
+    {
+        return true;
+    }
+    let normalized = normalized_tag_text(trimmed);
+    (bracketed || path_segment)
+        && matches!(
+            normalized.as_str(),
+            "anime" | "gekijouban" | "movie" | "movies" | "the movie" | "tv" | "tv series"
+        )
+}
+fn has_music_collection_noise(value: &str) -> bool {
+    let normalized = value
+        .replace(['_', '.', '-', '・', '/', '\\'], " ")
+        .split_whitespace()
+        .collect::<Vec<_>>()
+        .join(" ");
+    let compact = compact_tag_text(value);
+    MUSIC_COLLECTION_RE.is_match(&normalized) || compact.contains("musicclip")
+}
+fn is_title_role(role: &str) -> bool {
+    role == "TITLE" || role.starts_with("TITLE_") || role.starts_with("PATH_TITLE_")
+}
+fn is_path_title_role(role: &str) -> bool {
+    role.starts_with("PATH_TITLE_")
+}
+fn title_entity_from_role(role: &str) -> Option<&str> {
+    if role == "TITLE" {
+        Some("TITLE")
+    } else if role.starts_with("TITLE_") || role.starts_with("PATH_TITLE_") {
+        Some(role)
+    } else {
+        None
+    }
+}
+fn is_title_entity(entity: &str) -> bool {
+    entity == "TITLE"
+        || matches!(
+            entity,
+            "TITLE_CHS"
+                | "TITLE_CHT"
+                | "TITLE_JPN"
+                | "TITLE_LATIN"
+                | "TITLE_MIXED"
+                | "PATH_TITLE_CHS"
+                | "PATH_TITLE_CHT"
+                | "PATH_TITLE_JPN"
+                | "PATH_TITLE_LATIN"
+                | "PATH_TITLE_MIXED"
+        )
+}
+fn is_title_label(label: &str) -> bool {
+    label_entity(label).is_some_and(is_title_entity)
+}
+fn title_language_suffix(text: &str) -> &'static str {
+    let mut has_latin = false;
+    let mut has_han = false;
+    let mut has_kana = false;
+    for ch in text.chars() {
+        if ch.is_ascii_alphabetic() {
+            has_latin = true;
+        } else if ('\u{3040}'..='\u{30ff}').contains(&ch) || ('\u{31f0}'..='\u{31ff}').contains(&ch)
+        {
+            has_kana = true;
+        } else if ('\u{4e00}'..='\u{9fff}').contains(&ch) {
+            has_han = true;
+        }
+    }
+    if has_kana {
+        return "JPN";
+    }
+    if has_latin && has_han {
+        return "MIXED";
+    }
+    if has_han {
+        return cjk_title_language_suffix(text);
+    }
+    if has_latin {
+        return "LATIN";
+    }
+    "MIXED"
+}
+fn cjk_title_language_suffix(text: &str) -> &'static str {
+    let japanese_markers = [
+        '々', 'ヶ', '君', '戦', '気', '辺', '沢', '桜', '竜', '広', '処', '歩', '黒', '円',
+    ];
+    if text.chars().any(|ch| japanese_markers.contains(&ch)) {
+        return "JPN";
+    }
+    let simplified_markers = [
+        '国', '剧', '场', '农', '闲', '汉', '龙', '门', '击', '战', '体', '后', '爱', '边', '声',
+        '岛', '学', '万',
+    ];
+    if text.chars().any(|ch| simplified_markers.contains(&ch)) {
+        return "CHS";
+    }
+    let traditional_markers = [
+        '國', '劇', '場', '農', '閒', '漢', '龍', '門', '擊', '戰', '體', '後', '愛', '邊', '聲',
+        '島', '學', '萬', '縛', '異', '臺', '灣', '搖', '滾',
+    ];
+    if text.chars().any(|ch| traditional_markers.contains(&ch)) {
+        return "CHT";
+    }
+    "CHS"
+}
+fn title_role_for_text(text: &str, path_title: bool) -> String {
+    let prefix = if path_title { "PATH_TITLE" } else { "TITLE" };
+    format!("{prefix}_{}", title_language_suffix(text))
+}
 fn path_segment_is_episodeish(value: &str) -> bool {
     let (_, _, _, groups) = template_key_for_filename(value);
     let structural: Vec<&String> = groups
         .filter(|item| item.as_str() != "SEP")
         .collect();
     !structural.is_empty()
+        && structural.iter().all(|item| {
+            item.starts_with("EPISODE")
+                || item.as_str() == "SPECIAL"
+                || item.as_str() == "VOLUME"
+                || item.as_str() == "BRACKET_VOLUME"
+        })
 }
 fn path_segment_starts_with_episode(value: &str) -> bool {
 fn role_label(role: &str) -> String {
     let entity = match role {
         "GROUP" => Some("GROUP"),
+        role if is_title_role(role) => Some("TITLE"),
         "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => Some("EPISODE"),
         "SEASON" => Some("SEASON"),
+        "PATH_SEASON" => Some("PATH_SEASON"),
         "SPECIAL" | "VOLUME" => Some("SPECIAL"),
         "RESOLUTION" => Some("RESOLUTION"),
         "SOURCE" => Some("SOURCE"),
+        "TAG" => Some("TAG"),
         _ => None,
     };
     entity.map_or_else(|| "O".to_string(), |entity| format!("B-{entity}"))
         || normalized.contains("字幕組")
 }
+fn title_context_before(
+    tokens: &[String],
+    groups: &[Group],
+    roles: &[String],
+    index: usize,
+) -> String {
+    (0..index)
+        .filter(|&cursor| roles[cursor] == "TITLE")
+        .map(|cursor| group_text(tokens, &groups[cursor]))
+        .collect::<Vec<_>>()
+        .join(" ")
+}
+fn short_number_title_exception(context: &str, number: &str) -> bool {
+    let normalized = normalized_tag_text(context);
+    let compact = compact_tag_text(context);
+    matches!(
+        (normalized.as_str(), number),
+        ("kamisama hajimemashita", "2") | ("ghiblies episode", "2") | ("r", "15")
+    ) || (normalized.contains("91 days") && number == "91")
+        || (context.contains("銀河鉄道") && number == "999")
+        || compact.contains("highschooldd")
+        || (context.contains("機動戦士ガンダム") && number == "00")
+}
+fn group_followed_by_quote(tokens: &[String], groups: &[Group], index: usize) -> bool {
+    let Some(last_token) = groups.get(index).and_then(|group| group.indices.last()) else {
+        return false;
+    };
+    for token in &tokens[*last_token + 1..] {
+        if token.chars().all(char::is_whitespace) {
+            continue;
+        }
+        return matches!(token.as_str(), "「" | "｢" | "\"" | "'");
+    }
+    false
+}
 const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
     &["SPY", "x", "FAMILY"],
     &["Spy", "x", "Family"],
         });
         if !first_is_known_group {
             if let Some(groupish_index) = (1..groups.len()).find(|&index| {
+                output[index] == "TITLE"
+                    && looks_like_release_group(&group_text(tokens, &groups[index]))
             }) {
                 output[0] = "TITLE".to_string();
                 output[groupish_index] = "GROUP".to_string();
         }
         if roles[index].starts_with("EPISODE")
             && index >= 2
+            && matches!(
+                group_text(tokens, &groups[index - 1]).as_str(),
+                "×" | "x" | "X"
+            )
             && output[index - 2] == "TITLE"
+            && !roles[index + 1..]
+                .iter()
+                .any(|role| role.starts_with("EPISODE"))
         {
             output[index] = "TITLE".to_string();
             if let Some(next_text_index) = (index + 1..roles.len()).find(|&cursor| {
             continue;
         }
         if roles[index].starts_with("EPISODE")
+            && !output[..index]
+                .iter()
+                .any(|role| role.starts_with("EPISODE"))
             && group_text(
                 tokens,
                 &groups[(0..index)
             output[index] = "TITLE".to_string();
             continue;
         }
+        if output[index] == "TITLE" && matches!(text.as_str(), "中日" | "日中" | "英日" | "日英")
         {
             let next_source_lang = (index + 1..roles.len())
                 .find(|&cursor| groups[cursor].class_name != "SEP")
                 .is_some_and(|cursor| {
+                    output[cursor] == "SOURCE" && group_text(tokens, &groups[cursor]).contains('语')
                 });
             if next_source_lang {
                 output[index] = "SOURCE".to_string();
                 continue;
             }
         }
+        if roles[index].starts_with("EPISODE")
+            && index >= 1
+            && output[..index].iter().any(|role| role == "TITLE")
+            && text.chars().all(|ch| ch.is_ascii_digit())
+            && short_number_title_exception(
+                &title_context_before(tokens, groups, &output, index),
+                &text,
+            )
+        {
+            output[index] = "TITLE".to_string();
+            continue;
+        }
         if roles[index].starts_with("EPISODE")
             && index >= 1
             && output[index - 1] == "TITLE"
             && groups[index - 1].class_name != "SEP"
             && text.chars().all(|ch| ch.is_ascii_digit())
+            && text.len() <= 2
             && roles[index + 1..]
                 .iter()
                 .any(|role| role.starts_with("EPISODE"))
+            && !group_followed_by_quote(tokens, groups, index)
         {
+            let context = title_context_before(tokens, groups, &output, index);
+            output[index] = if short_number_title_exception(&context, &text) {
+                "TITLE"
+            } else {
+                "SEASON"
+            }
+            .to_string();
             continue;
         }
         if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) {
                 && output[index - 1] == "TITLE"
                 && groups[index - 1].class_name != "SEP"
                 && text.chars().all(|ch| ch.is_ascii_digit())
+                && text.len() <= 2
                 && roles[index + 1..]
                     .iter()
                     .any(|role| role.starts_with("EPISODE"))
+                && !group_followed_by_quote(tokens, groups, index)
             {
+                let context = title_context_before(tokens, groups, &output, index);
+                output[index] = if short_number_title_exception(&context, &text) {
+                    "TITLE"
+                } else {
+                    "SEASON"
+                }
+                .to_string();
                 continue;
             }
             if !output[..index].iter().any(|role| role == "TITLE")
                 && previous_text.len() <= 48
                 && previous_text.chars().any(|ch| ch.is_alphabetic())
                 && text.chars().all(|ch| ch.is_ascii_digit())
+                && text.len() <= 2
                 && !(index + 2 < roles.len()
                     && groups[index + 1].class_name == "SEP"
                     && group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode"))
+                && !(index + 1 < roles.len()
+                    && groups[index + 1].class_name == "SEP"
+                    && group_text(tokens, &groups[index + 1])
+                        .chars()
+                        .any(|ch| matches!(ch, '「' | '｢' | '"' | '\'')))
+                && !group_followed_by_quote(tokens, groups, index)
                 && (next_episode
                     || (next_special
                         && (text.parse::<u16>().is_ok_and(|value| value >= 100)
                             || (previous_text.len() <= 4
                                 && previous_text.is_ascii()
+                                && previous_text.chars().all(|ch| ch.is_ascii_alphabetic())))))
             {
+                output[index] = if next_episode
+                    && !short_number_title_exception(
+                        &title_context_before(tokens, groups, &output, index),
+                        &text,
+                    ) {
+                    "SEASON"
+                } else {
+                    "TITLE"
+                }
+                .to_string();
                 continue;
             }
         }
         if roles[index].starts_with("EPISODE")
             && (text.chars().all(|ch| ch.is_ascii_digit())
+                || matches!(classify_atom(&text).as_str(), "EPISODE" | "EPISODE_VERSION"))
             && output[..index].iter().any(|role| role == "SPECIAL")
+            && !output[..index]
+                .iter()
+                .any(|role| role.starts_with("EPISODE"))
         {
             let previous_structural = (0..index)
                 .rev()
         }
         if roles[index] == "TITLE"
             && matches!(text.to_ascii_uppercase().as_str(), "TV" | "TV版")
+            && output
+                .iter()
+                .enumerate()
+                .any(|(other, role)| other != index && role == "TITLE")
         {
             output[index] = "O".to_string();
             continue;
             continue;
         }
         if output[index] == "TITLE" && text.eq_ignore_ascii_case("Creditless") {
+            let later_special = output[index + 1..].iter().any(|role| role == "SPECIAL");
             if later_special {
                 output[index] = "SPECIAL".to_string();
                 continue;
         }
         if output[index] == "O"
             && groups[index].class_name == "TEXT"
+            && roles[index + 1..]
+                .iter()
+                .any(|role| role.starts_with("EPISODE"))
             && text.chars().any(|ch| ch.is_alphabetic())
             && !ep_markers.contains(&text.as_str())
         {
             if matches!(
                 previous_real_text.to_ascii_lowercase().as_str(),
                 "lesson" | "part" | "no"
+            ) {
                 output[index] = "O".to_string();
                 continue;
             }
                 continue;
             }
             if output[..index].iter().any(|role| role == "TITLE")
+                && (output[..index].iter().enumerate().any(|(cursor, role)| {
+                    role == "TITLE" && is_special_title_phrase(&group_text(tokens, &groups[cursor]))
+                }))
+                && !output[..index]
                     .iter()
+                    .any(|role| role.starts_with("EPISODE"))
                 && text.chars().all(|ch| ch.is_ascii_digit())
                 && text.len() <= 3
             {
     let mut candidates = Vec::new();
     let mut index = 0;
     while index < roles.len() {
+        if !is_title_role(&roles[index]) {
             index += 1;
             continue;
         }
         index += 1;
         loop {
             if index < roles.len()
+                && is_title_role(&roles[index])
                 && !(groups[index - 1].class_name == "BRACKET_TEXT"
                     && groups[index].class_name == "BRACKET_TEXT")
             {
             if index + 1 < roles.len()
                 && roles[index] == "O"
                 && groups[index].class_name == "SEP"
+                && is_title_role(&roles[index + 1])
             {
                 index += 2;
                 continue;
             role.starts_with("EPISODE")
                 || matches!(
                     role.as_str(),
+                    "SEASON" | "PATH_SEASON" | "SPECIAL" | "SOURCE" | "RESOLUTION"
                 )
         })
         .unwrap_or(roles.len());
         .copied()
         .filter(|(_, end)| *end <= first_anchor)
         .collect();
+    let before_anchor_only_path_titles = !before_anchor.is_empty()
+        && before_anchor.iter().all(|(start, end)| {
+            (*start..*end)
+                .all(|index| !is_title_role(&roles[index]) || is_path_title_role(&roles[index]))
+        });
+    let selected_pool = if before_anchor.is_empty() || before_anchor_only_path_titles {
         &candidates
     } else {
         &before_anchor
     };
+    let mut selected_by_kind: HashMap<String, ((usize, usize), (isize, usize, usize))> =
+        HashMap::new();
+    for (start, end) in selected_pool.iter().copied() {
+        let score = (
+            title_candidate_score(tokens, groups, start, end),
+            end,
             end - start,
+        );
+        let key = title_candidate_key(tokens, groups, roles, start, end);
+        match selected_by_kind.get(&key) {
+            Some((_, best_score)) if *best_score >= score => {}
+            _ => {
+                selected_by_kind.insert(key, ((start, end), score));
+            }
+        }
+    }
+    let selected: HashSet<(usize, usize)> =
+        selected_by_kind.values().map(|(range, _)| *range).collect();
     let mut output = roles.to_vec();
     let mut dropped = Vec::new();
     for (start, end) in candidates {
+        if selected.contains(&(start, end)) {
             continue;
         }
         for index in start..end {
+            if is_title_role(&output[index]) {
                 output[index] = "O".to_string();
                 dropped.push(index.to_string());
             }
     (output, dropped)
 }
+fn title_candidate_key(
+    tokens: &[String],
+    groups: &[Group],
+    roles: &[String],
+    start: usize,
+    end: usize,
+) -> String {
+    let mut entities: Vec<String> = (start..end)
+        .filter_map(|index| title_entity_from_role(&roles[index]).map(str::to_string))
+        .filter(|entity| entity != "TITLE")
+        .collect();
+    entities.sort();
+    entities.dedup();
+    if entities.is_empty() {
+        let text = candidate_text(tokens, groups, start, end);
+        return title_role_for_text(&text, false);
+    }
+    entities.join("+")
+}
 fn title_candidate_score(tokens: &[String], groups: &[Group], start: usize, end: usize) -> isize {
     let text = (start..end)
         .filter(|&index| roles_candidate_text_group(&groups[index]))
         if let Some(caps) = CJK_TITLE_TRAILING_EPISODE_RE.captures(&piece) {
             let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
             let episode = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
+            if before.contains("銀河鉄道") && episode == "999" {
+                output_pieces.push(before.to_string());
+                labels.push("B-TITLE".to_string());
+                output_pieces.push(episode.to_string());
+                labels.push("B-TITLE".to_string());
+                continue;
+            }
             if !before.is_empty() {
                 output_pieces.push(before.to_string());
                 labels.push("B-TITLE".to_string());
                     | "SOURCE"
                     | "RESOLUTION"
                     | "SEASON"
+                    | "PATH_SEASON"
             ) {
+                if matches!(role, "SEASON" | "PATH_SEASON") {
                     if let Some((pieces, labels)) = split_season_token(token) {
                         output_tokens.extend(pieces);
                         output_labels.extend(labels);
                     output_labels.extend(labels);
                 }
             } else {
+                if is_title_role(role) && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集")
                 {
                     output_tokens.push(token.clone());
                     output_labels.push("O".to_string());
                     continue;
                 }
+                if is_title_role(role) && token.ends_with('第') && token.chars().count() > 1 {
                     let trimmed = token.trim_end_matches('第').to_string();
                     let (pieces, labels) = normalize_generated_tokens(
                         &[trimmed, "第".to_string()],
                     output_labels.extend(labels);
                     continue;
                 }
+                if is_title_role(role) {
                     let (pieces, labels) = normalize_title_token(token);
                     output_tokens.extend(pieces);
                     output_labels.extend(labels);
 fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
     let joiners = [
+        " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！", "?", "？",
+        ";", "；", ",", "，", "、", "。", "～", "~", "－", "+", "＋", "(", ")", "（", "）", "[",
+        "]", "【", "】", "<", ">", "＜", "＞", "｢", "｣", "「", "」", "《", "》", "☆", "♪", "`",
+        "@", "‐", "‑", "–", "—", "−", "$", "＄", "∽", "꞉", "♥",
     ];
     let title_terminal_punctuation = ["!", "！", "?", "？"];
     let entity_joiners = [
+        " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！", "?", "？",
+        ";", "；", ",", "，", "、", "。", "～", "~", "－", "+", "＋", "(", ")", "（", "）", "[",
+        "]", "【", "】", "<", ">", "＜", "＞", "｢", "｣", "「", "」", "《", "》", "☆", "♪", "`",
+        "@", "&", "＆", "‐", "‑", "–", "—", "−", "$", "＄", "∽", "꞉", "♥",
     ];
     let mut output = labels.to_vec();
     for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
                     .any(|item| item.eq_ignore_ascii_case("lupin"));
             if nearby_lupin
                 && next_number.is_some_and(|cursor| {
+                    tokens[cursor].chars().all(|ch| ch.is_ascii_digit())
+                        && tokens[cursor].len() <= 2
                 })
             {
                 output[index] = "B-SEASON".to_string();
             let mut cursor = index + 1;
             while cursor < tokens.len() {
                 output[cursor] = "O".to_string();
+                if matches!(tokens[cursor].as_str(), "」" | "｣" | "\"" | "'") && cursor > index + 1
+                {
                     break;
                 }
                 cursor += 1;
             }
             continue;
         }
+        if label == "B-TITLE" && matches!(token.as_str(), "中日" | "日中" | "英日" | "日英")
+        {
+            let next_word = (index + 1..tokens.len())
+                .find(|&cursor| tokens[cursor].chars().any(|ch| ch.is_alphanumeric()));
+            if next_word
+                .is_some_and(|cursor| labels[cursor] == "B-SOURCE" && tokens[cursor].contains('语'))
+            {
                 output[index] = "B-SOURCE".to_string();
                 continue;
             }
                         .chars()
                         .any(|ch| ch.is_alphanumeric() || ('\u{4e00}'..='\u{9fff}').contains(&ch))
             });
+            let later_episode =
+                (index + 1..tokens.len()).any(|cursor| labels[cursor] == "B-EPISODE");
             if previous_title_word.is_none() && later_episode {
                 output[index] = "B-SEASON".to_string();
                 continue;
             }
+            let previous_word =
+                previous_title_word.map(|cursor| tokens[cursor].to_ascii_lowercase());
+            if previous_title_word.is_some() && !matches!(previous_word.as_deref(), Some("lupin")) {
                 output[index] = "B-SEASON".to_string();
                 continue;
             }
                 continue;
             }
             if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
+                && next_non_space.is_some_and(|cursor| {
+                    matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集")
+                        || tokens[cursor].starts_with('话')
+                        || tokens[cursor].starts_with('話')
+                        || tokens[cursor].starts_with('回')
+                        || tokens[cursor].starts_with('集')
+                })
             {
                 if let Some(cursor) = previous_non_space {
                     output[cursor] = "B-EPISODE".to_string();
             let followed_by_title_word = (index + 1..tokens.len())
                 .find(|&cursor| {
                     !joiners.contains(&tokens[cursor].as_str())
+                        && !matches!(
+                            tokens[cursor].as_str(),
+                            "-" | "－" | "," | "，" | ":" | "："
+                        )
                 })
                 .is_some_and(|cursor| {
+                    !matches!(
+                        tokens[cursor].as_str(),
+                        "[" | "【" | "(" | "（" | "]" | "】"
+                    ) && output.get(cursor).is_some_and(|label| label == "B-TITLE")
                         && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
                 });
             if followed_by_title_word && matches!(previous_word.as_deref(), Some("movie" | "part"))
                 continue;
             }
         }
+        if label == "O" && token.chars().all(|ch| ch.is_ascii_digit()) && token.len() <= 3 {
             let previous_non_space = (0..index)
                 .rev()
                 .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
             let next_non_space = (index + 1..tokens.len())
                 .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
+            if previous_non_space
+                .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "[" | "【"))
+                && next_non_space
+                    .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "]" | "】"))
                 && output[..index].iter().any(|label| label == "B-TITLE")
                 && output[index + 1..]
                     .iter()
                 output[index] = "B-EPISODE".to_string();
                 continue;
             }
+            if previous_non_space
+                .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "-" | "－"))
                 && output[..index].iter().any(|label| label == "B-TITLE")
                 && output[index + 1..]
                     .iter()
             let next_non_space = (index + 1..tokens.len())
                 .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
             if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
+                && next_non_space.is_some_and(|cursor| {
+                    matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集")
+                })
             {
                 if let Some(cursor) = previous_non_space {
                     output[cursor] = "B-EPISODE".to_string();
                 if left_title {
                     output[index] = "B-TITLE".to_string();
                     if let Some(next_word) = (index + 1..tokens.len()).find(|&cursor| {
+                        labels[cursor] == "O" && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
                     }) {
                         output[next_word] = "B-TITLE".to_string();
                     }
                 output[index] = "B-TITLE".to_string();
             }
         }
+        if matches!(
+            token.as_str(),
+            "]" | "】" | ")" | "）" | ">" | "＞" | "｣" | "」"
+        ) && index > 0
             && output[index - 1] == "B-TITLE"
             && title_span_has_labeled_opener(&tokens[..index], &output[..index], token)
         {
     )
 }
+fn retag_semantic_labels(tokens: &[String], labels: &[String]) -> Vec<String> {
+    let last_path = tokens
+        .iter()
+        .rposition(|token| token == "/" || token == "\\");
+    let mut output = labels.to_vec();
+    for index in 0..labels.len() {
+        let Some(entity) = label_entity(&labels[index]) else {
+            continue;
+        };
+        let prefix = if labels[index].starts_with("I-") {
+            "I"
+        } else {
+            "B"
+        };
+        if entity == "TITLE" {
+            let path_title = last_path.is_some_and(|path_index| index < path_index);
+            let suffix = title_suffix_for_label_index(tokens, labels, index);
+            output[index] = format!(
+                "{prefix}-{}_{}",
+                if path_title { "PATH_TITLE" } else { "TITLE" },
+                suffix
+            );
+        } else if entity == "SEASON" && last_path.is_some_and(|path_index| index < path_index) {
+            output[index] = format!("{prefix}-PATH_SEASON");
+        }
+    }
+    output
+}
+fn title_suffix_for_label_index(
+    tokens: &[String],
+    labels: &[String],
+    index: usize,
+) -> &'static str {
+    if let Some(suffix) = direct_title_suffix(&tokens[index]) {
+        return suffix;
+    }
+    let left = nearest_title_suffix(tokens, labels, index, true);
+    let right = nearest_title_suffix(tokens, labels, index, false);
+    match (left, right) {
+        (Some(left), Some(right)) if left == right => left,
+        (Some(left), None) => left,
+        (None, Some(right)) => right,
+        _ => "MIXED",
+    }
+}
+fn nearest_title_suffix(
+    tokens: &[String],
+    labels: &[String],
+    index: usize,
+    search_left: bool,
+) -> Option<&'static str> {
+    let mut cursor = index as isize;
+    loop {
+        cursor += if search_left { -1 } else { 1 };
+        if cursor < 0 || cursor as usize >= tokens.len() {
+            return None;
+        }
+        let cursor = cursor as usize;
+        if !is_title_label(&labels[cursor]) {
+            if tokens[cursor]
+                .chars()
+                .all(|ch| ch.is_whitespace() || !ch.is_alphanumeric())
+            {
+                continue;
+            }
+            return None;
+        }
+        if let Some(suffix) = direct_title_suffix(&tokens[cursor]) {
+            return Some(suffix);
+        }
+    }
+}
+fn direct_title_suffix(token: &str) -> Option<&'static str> {
+    if !token.chars().any(|ch| {
+        ch.is_ascii_alphabetic()
+            || ('\u{3040}'..='\u{30ff}').contains(&ch)
+            || ('\u{31f0}'..='\u{31ff}').contains(&ch)
+            || ('\u{4e00}'..='\u{9fff}').contains(&ch)
+    }) {
+        return None;
+    }
+    Some(title_language_suffix(token))
+}
 fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> {
     let (key, tokens, _classes, groups) = template_key_for_filename(filename);
     if groups.len() != roles.len() {
         return None;
     }
     let roles = adjust_contextual_roles(&tokens, &groups, roles);
+    let roles = refine_semantic_roles(&tokens, &groups, &roles);
     let (roles, dropped) = enforce_single_title_candidate(&tokens, &groups, &roles);
     let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
     let (tokens, labels) = repair_compact_sxe_tokens(tokens, labels);
     let labels = smooth_title_spans(&tokens, &labels);
+    let labels = retag_semantic_labels(&tokens, &labels);
     if tokens.len() != labels.len() {
         return None;
     }
 mod tests {
     use super::*;
+    fn schema_labels_for(filename: &str) -> Vec<(String, String)> {
         let (key, _, _, _) = template_key_for_filename(filename);
         let roles = suggested_roles(&key);
         let record = dmhy_record(filename, "tpl_test", &roles).unwrap();
         record.tokens.into_iter().zip(record.labels).collect()
     }
+    fn labels_for(filename: &str) -> Vec<(String, String)> {
+        schema_labels_for(filename)
+            .into_iter()
+            .map(|(token, label)| (token, legacy_label(&label)))
+            .collect()
+    }
+    fn legacy_label(label: &str) -> String {
+        let Some(entity) = label_entity(label) else {
+            return label.to_string();
+        };
+        let prefix = if label.starts_with("I-") { "I" } else { "B" };
+        if is_title_entity(entity) {
+            return format!("{prefix}-TITLE");
+        }
+        if entity == "PATH_SEASON" {
+            return format!("{prefix}-SEASON");
+        }
+        if entity == "TAG" {
+            return format!("{prefix}-SPECIAL");
+        }
+        label.to_string()
+    }
     #[test]
     fn rich_title_candidates_keep_readable_spacing() {
         let row = rich_annotation_for(
         );
     }
+    #[test]
+    fn semantic_schema_roles_cover_multilingual_tags_paths_and_music_skips() {
+        let gm = schema_labels_for(
+            "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4",
+        );
+        assert!(gm.contains(&("GM".to_string(), "B-GROUP".to_string())));
+        assert!(gm.contains(&("国漫".to_string(), "B-TAG".to_string())));
+        assert!(gm.contains(&("神印王座".to_string(), "B-TITLE_CHS".to_string())));
+        assert!(gm.contains(&("Throne".to_string(), "B-TITLE_LATIN".to_string())));
+        assert!(gm.contains(&("Seal".to_string(), "B-TITLE_LATIN".to_string())));
+        assert!(gm.contains(&("2022".to_string(), "B-TAG".to_string())));
+        assert!(gm.contains(&("200".to_string(), "B-EPISODE".to_string())));
+        let sky = schema_labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
+        assert!(sky.contains(&("Skytree".to_string(), "B-GROUP".to_string())));
+        assert!(sky.contains(&("海贼王".to_string(), "B-TITLE_CHS".to_string())));
+        assert!(sky.contains(&("One".to_string(), "B-TITLE_LATIN".to_string())));
+        assert!(sky.contains(&("Piece".to_string(), "B-TITLE_LATIN".to_string())));
+        assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
+        let farming = schema_labels_for("異世界悠閒農家 2 - 06");
+        assert!(farming.contains(&("異世界悠閒農家".to_string(), "B-TITLE_CHT".to_string())));
+        assert!(farming.contains(&("2".to_string(), "B-SEASON".to_string())));
+        assert!(farming.contains(&("06".to_string(), "B-EPISODE".to_string())));
+        let hanako = schema_labels_for("地縛少年花子君 2 - 13");
+        assert!(hanako.contains(&("地縛少年花子君".to_string(), "B-TITLE_JPN".to_string())));
+        assert!(hanako.contains(&("2".to_string(), "B-SEASON".to_string())));
+        assert!(hanako.contains(&("13".to_string(), "B-EPISODE".to_string())));
+        let one_piece = schema_labels_for("One.Piece.1110");
+        assert!(one_piece.contains(&("One".to_string(), "B-TITLE_LATIN".to_string())));
+        assert!(one_piece.contains(&("Piece".to_string(), "B-TITLE_LATIN".to_string())));
+        assert!(one_piece.contains(&("1110".to_string(), "B-EPISODE".to_string())));
+        assert!(!one_piece.contains(&("1110".to_string(), "B-SEASON".to_string())));
+        let nekomoe_prefix = schema_labels_for("[喵萌奶茶屋][7月新番][Lycoris Recoil][01][1080P]");
+        assert!(nekomoe_prefix.contains(&("喵萌奶茶屋".to_string(), "B-GROUP".to_string())));
+        assert!(nekomoe_prefix.contains(&("7月新番".to_string(), "B-TAG".to_string())));
+        assert!(nekomoe_prefix.contains(&("Lycoris".to_string(), "B-TITLE_LATIN".to_string())));
+        let subtitle_group = schema_labels_for("[桜都字幕组][Title][01][1080P]");
+        assert!(subtitle_group.contains(&("桜都字幕组".to_string(), "B-GROUP".to_string())));
+        let path = schema_labels_for("海贼王/Season 2/One Piece - 01 [1080P]");
+        assert!(path.contains(&("海贼王".to_string(), "B-PATH_TITLE_CHS".to_string())));
+        assert!(path.contains(&("2".to_string(), "B-PATH_SEASON".to_string())));
+        assert!(path.contains(&("One".to_string(), "B-TITLE_LATIN".to_string())));
+        assert!(path.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        let tags = schema_labels_for("[日漫][剧场版][Movie][TV][2024][Title][01][1080P]");
+        assert!(tags.contains(&("日漫".to_string(), "B-TAG".to_string())));
+        assert!(tags.contains(&("剧场版".to_string(), "B-TAG".to_string())));
+        assert!(tags.contains(&("Movie".to_string(), "B-TAG".to_string())));
+        assert!(tags.contains(&("TV".to_string(), "B-TAG".to_string())));
+        assert!(tags.contains(&("2024".to_string(), "B-TAG".to_string())));
+        assert!(tags.contains(&("Title".to_string(), "B-TITLE_LATIN".to_string())));
+        for skipped in [
+            "[Group] Title OST [FLAC]",
+            "[Group] Title MUSICCLIP [BDRip]",
+            "[Group] Title Music Collection [FLAC]",
+            "[Group] Title Character Song [MP3]",
+            "[Group] Title Drama CD [FLAC]",
+            "[Group] Title CD Album [FLAC]",
+            "[Group] Title Bonus CD [FLAC]",
+            "[Group] Title Soundtrack [FLAC]",
+        ] {
+            assert!(has_music_collection_noise(skipped), "{skipped}");
+        }
+        for preserved in [
+            "[Group] Title OP [FLAC]",
+            "[Group] Title ED [FLAC]",
+            "[Group] Title NCOP [FLAC]",
+            "[Group] Title NCED [FLAC]",
+            "[Group] Title PV [1080P]",
+            "[Group] Title CM [1080P]",
+            "[Group] Title Menu [1080P]",
+            "[Group] Title Trailer [1080P]",
+        ] {
+            assert!(!has_music_collection_noise(preserved), "{preserved}");
+        }
+    }
     #[test]
     fn required_regressions() {
         let title_91 = labels_for("Title 91 EP 01 [1080p]");
+        assert!(title_91.contains(&("91".to_string(), "B-SEASON".to_string())));
         assert!(title_91.contains(&("EP".to_string(), "O".to_string())));
         assert!(title_91.contains(&("01".to_string(), "B-EPISODE".to_string())));
         assert!(!episode_version_title.contains(&("10v2".to_string(), "B-TITLE".to_string())));
         let episode_version_lang =
             labels_for("[GalaxyRailroad-888] Yu-Gi-Oh! GO RUSH !! [043v2_GB]");
+        assert!(episode_version_lang.contains(&("043v2".to_string(), "B-EPISODE".to_string())));
         assert!(episode_version_lang.contains(&("GB".to_string(), "B-SOURCE".to_string())));
         let cursed = labels_for("[Coalgirls]_C3-Cube_x_Cursed_x_Curious_01_[8E416230]");
         let music_title =
             labels_for("[アニメ BD] うたの☆プリンスさまっ♪ マジLOVE2000%  第01話「ポワゾンKISS」(1920x1080 x264 Hi10p AAC)");
         assert!(music_title.contains(&("♪".to_string(), "B-TITLE".to_string())));
+        let cm_version =
+            labels_for("[U2-Rip]Inari, Konkon, Koi Iroha[CMv2][Hi10p_1080p][x264_flac]");
         assert!(cm_version.contains(&("CMv2".to_string(), "B-SPECIAL".to_string())));
         assert!(!cm_version.contains(&("CMv2".to_string(), "B-TITLE".to_string())));
+        let hdma_block = labels_for(
+            "[Niconeiko Works] Gekijouban Violet Evergarden [1080P_Ma10p_DTS-HDMA][CM01]",
+        );
         assert!(hdma_block.contains(&("Gekijouban".to_string(), "B-TITLE".to_string())));
         assert!(hdma_block.contains(&("1080P".to_string(), "B-RESOLUTION".to_string())));
         assert!(hdma_block.contains(&("HDMA".to_string(), "B-SOURCE".to_string())));
         assert!(!zom.contains(&("100".to_string(), "B-EPISODE".to_string())));
         assert!(zom.contains(&("Animatics02".to_string(), "B-SPECIAL".to_string())));
+        let sky = schema_labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
+        assert!(sky.contains(&("海贼王".to_string(), "B-TITLE_CHS".to_string())));
+        assert!(sky.contains(&("One".to_string(), "B-TITLE_LATIN".to_string())));
+        assert!(sky.contains(&("Piece".to_string(), "B-TITLE_LATIN".to_string())));
         assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
+        let happy =
+            labels_for("My.Happy.Marriage.S01E01.The.Meeting.1080p.NF.WEB-DL.AAC2.0.H.264-VARYG");
         assert!(happy.contains(&("01".to_string(), "B-SEASON".to_string())));
         assert!(happy.contains(&("01".to_string(), "B-EPISODE".to_string())));
         assert!(!happy.contains(&("0".to_string(), "B-EPISODE".to_string())));
         assert!(!akira.contains(&("AVC".to_string(), "B-TITLE".to_string())));
         assert!(akira.contains(&("AVC".to_string(), "B-SOURCE".to_string())));
+        let doraemon = labels_for(
+            "[DORASUB][DORAEMON1979][1998.03.07][WEB][1998x1080][AVC][简日]哆啦A梦归来了",
+        );
         assert!(doraemon.contains(&("DORAEMON1979".to_string(), "B-TITLE".to_string())));
         assert!(doraemon.contains(&("WEB".to_string(), "B-SOURCE".to_string())));
         assert!(!doraemon.contains(&("WEB".to_string(), "B-TITLE".to_string())));
         assert!(bang_season.contains(&("01".to_string(), "B-EPISODE".to_string())));
         assert!(!bang_season.contains(&("01".to_string(), "B-SEASON".to_string())));
+        let basket = labels_for(
+            "[Nekomoe kissaten&VCB-Studio] Fruits Basket 1st Season [24][1080p][x264_aac][sc]",
+        );
         assert!(basket.contains(&("Fruits".to_string(), "B-TITLE".to_string())));
         assert!(basket.contains(&("1st".to_string(), "B-SEASON".to_string())));
         assert!(basket.contains(&("Season".to_string(), "B-SEASON".to_string())));
         assert!(full.contains(&("01".to_string(), "B-EPISODE".to_string())));
         assert!(!full.contains(&("01".to_string(), "B-TITLE".to_string())));
+        let r18 =
+            labels_for("[HYSUB]Skirt no Naka wa Kedamono Deshita.[01_R18][BIG5_MP4][1280X720]");
         assert!(r18.contains(&("01".to_string(), "B-EPISODE".to_string())));
         assert!(!r18.contains(&("01".to_string(), "B-TITLE".to_string())));
         let ddp = labels_for("Akuma.Kun.S01E02.1080p.NF.WEB-DL.DDP5.1.H.264");
         assert!(ddp.contains(&("02".to_string(), "B-EPISODE".to_string())));
         assert!(!ddp.contains(&("1".to_string(), "B-EPISODE".to_string())));
+        assert!(ddp
+            .iter()
+            .any(|(token, label)| token.starts_with("DDP") && label == "B-SOURCE"));
         let aac_space = labels_for("Bleach S01E02 AAC 2.0 H.264");
         assert!(aac_space.contains(&("02".to_string(), "B-EPISODE".to_string())));
         assert!(air_episode.contains(&("Air".to_string(), "B-TITLE".to_string())));
         assert!(air_episode.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        let decimal_episode =
+            labels_for("[HoneyGod] Usagi Drop [02.5][x264_10bit][粤日双语][BDrip_1080p]");
         assert!(decimal_episode.contains(&("02".to_string(), "B-EPISODE".to_string())));
         assert!(decimal_episode.contains(&(".".to_string(), "B-EPISODE".to_string())));
         assert!(decimal_episode.contains(&("5".to_string(), "B-EPISODE".to_string())));
         assert!(gundam.contains(&("00".to_string(), "B-TITLE".to_string())));
         assert!(gundam.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        let spy =
+            labels_for("[Studio GreenTea] Spy x Family [38][WebRip][HEVC-10bit 1080p AAC ASSx2]");
         assert!(spy.contains(&("Studio".to_string(), "B-GROUP".to_string())));
         assert!(spy.contains(&("Spy".to_string(), "B-TITLE".to_string())));
         assert!(spy.contains(&("x".to_string(), "B-TITLE".to_string())));
         assert!(spy.contains(&("38".to_string(), "B-EPISODE".to_string())));
         assert!(!spy.contains(&("Spy".to_string(), "B-SPECIAL".to_string())));
+        let spy_s3 = labels_for(
+            "[Feibanyama] SPY x FAMILY S3 - 01 [IQIYI WebRip 2160p HEVC-10bit OPUS Multi-Subs]",
+        );
         assert!(spy_s3.contains(&("Feibanyama".to_string(), "B-GROUP".to_string())));
         assert!(spy_s3.contains(&("SPY".to_string(), "B-TITLE".to_string())));
         assert!(spy_s3.contains(&("FAMILY".to_string(), "B-TITLE".to_string())));
         assert!(spy_s3.contains(&("3".to_string(), "B-SEASON".to_string())));
         assert!(spy_s3.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        let slime =
+            labels_for("[Nekomoe kissaten&VCB-Studio] Slime 300 [Menu01][Ma10p_1080p][x265_flac]");
         assert!(slime.contains(&("Slime".to_string(), "B-TITLE".to_string())));
         assert!(
             slime.contains(&("300".to_string(), "B-TITLE".to_string())),
         assert!(was_trimmed);
         assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
+        let plain_season_dir =
+            "Season 1/[Kamigami] Junjou Romantica 1 - 01 [BD 1280x720 x264 AAC Sub(Chs,Jap)]";
         let (trimmed, was_trimmed) = training_filename_for(plain_season_dir);
         assert!(was_trimmed);
         assert_eq!(
             "[Airota&ANK-Raws] 亜人ちゃんは語りたい (BDrip 1920x1080 HEVC-YUV420P10 FLAC SUP)/Menu (Vol.1)";
         let (trimmed, was_trimmed) = training_filename_for(menu_parent);
         assert!(was_trimmed);
+        assert_eq!(
+            trimmed,
+            "[Airota&ANK-Raws] 亜人ちゃんは語りたい Menu (Vol.1)"
+        );
         assert!(has_encoding_noise(
             "[4K_SDR][DBD-Raws&HKG瀛楀箷绲刔[鏃ュ湪鏍″湌][01][2160P]"
         ));
+        assert!(has_encoding_noise(
+            "ATRI -My Dear Moments-/娆″洖浜堝憡 EP01 Log01"
+        ));
         assert!(has_encoding_noise(
             "[2002-2003] Mew Mew_鏉变含鍠靛柕(鏉变含銉熴儱銈︺儫銉ャ偊)_TV"
         ));
             "Season 4 E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
         );
+        let najica =
+            "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦)_TV/SourceUnknown.RMVB.640x480.twHard/01";
         let (trimmed, was_trimmed) = training_filename_for(najica);
         assert!(was_trimmed);
         assert_eq!(trimmed, "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦) 01");
         let galient = "[1984-1986] Galient_機甲界(機甲界ガリアン)_TV.OVA/[1984-1985] Galient_機甲界(機甲界ガリアン)_TV/DVDRip.MKV.720x480.ruSub.左右黑邊保留/01";
         let (trimmed, was_trimmed) = training_filename_for(galient);
         assert!(was_trimmed);
+        assert_eq!(trimmed, "[1984-1985] Galient_機甲界(機甲界ガリアン) 01");
         let galient_labels = labels_for(&trimmed);
         assert!(galient_labels.contains(&("Galient".to_string(), "B-TITLE".to_string())));
         assert!(!galient_labels.contains(&("TV".to_string(), "B-TITLE".to_string())));
         let nced = "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs]/NCED";
         let (trimmed, was_trimmed) = training_filename_for(nced);
         assert!(was_trimmed);
+        assert_eq!(
+            trimmed,
+            "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs] NCED"
+        );
+        let sakura =
+            "Card Captor Sakura Chinese/魔卡少女樱(台配国语)/第01集 小樱与不可思议的魔法书";
         let (trimmed, was_trimmed) = training_filename_for(sakura);
         assert!(was_trimmed);
         assert_eq!(
         assert!(volume.contains(&("MENU02".to_string(), "B-SPECIAL".to_string())));
         assert!(!volume.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        let aria_notice = labels_for(
+            "[KNA-Subs&ANK-Raws] 緋弾のアリアAA 番宣1 (BDrip 1920x1080 HEVC-YUV420P10 FLAC)",
+        );
         assert!(aria_notice.contains(&("緋弾のアリア".to_string(), "B-TITLE".to_string())));
         assert!(aria_notice.contains(&("番宣".to_string(), "B-SPECIAL".to_string())));
         assert!(aria_notice.contains(&("1".to_string(), "B-SPECIAL".to_string())));
         assert!(!mahoro.contains(&("Full".to_string(), "B-TITLE".to_string())));
         assert!(mahoro.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        let kitaro = labels_for(
+            "[1985.10-1988.02] Kitaro_鬼太郎 第3期(ゲゲゲの鬼太郎)_TV 036 異次元妖怪かまなり",
+        );
         assert!(kitaro.contains(&("Kitaro".to_string(), "B-TITLE".to_string())));
         assert!(kitaro.contains(&("3".to_string(), "B-SEASON".to_string())));
         assert!(kitaro.contains(&("036".to_string(), "B-EPISODE".to_string())));
         assert!(ghiblies.contains(&("2".to_string(), "B-TITLE".to_string())));
         assert!(!ghiblies.contains(&("2".to_string(), "B-EPISODE".to_string())));
+        let tv_spot =
+            labels_for("[RUELL-Next] Fruits Basket TV Spot 1 (DVD 768x576 x264 AAC) [49531416]");
         assert!(tv_spot.contains(&("TV".to_string(), "B-SPECIAL".to_string())));
         assert!(tv_spot.contains(&("1".to_string(), "B-SPECIAL".to_string())));
         assert!(!tv_spot.contains(&("1".to_string(), "B-EPISODE".to_string())));
         assert!(hi10_source.contains(&("Hi10".to_string(), "B-SOURCE".to_string())));
         assert!(!hi10_source.contains(&("Hi10".to_string(), "B-GROUP".to_string())));
+        let souten = labels_for(
+            "[苍天之拳].[Fosky_Fansub][Souten_No_Ken][DVDRIP][01][H.264_FLAC][848x480][CDD495FC]",
+        );
         assert!(souten.contains(&("Fosky".to_string(), "B-GROUP".to_string())));
         assert!(!souten.contains(&("苍天之拳".to_string(), "B-GROUP".to_string())));
         assert!(souten.contains(&("Souten".to_string(), "B-TITLE".to_string())));
+        let bonjour = labels_for(
+            "(2014Q4) Bonjour♪恋味パティスリー 第01話 「Lesson 1」 (1280x720 x265 10bit AAC)",
+        );
         assert!(bonjour.contains(&("01".to_string(), "B-EPISODE".to_string())));
         assert!(!bonjour.contains(&("1".to_string(), "B-EPISODE".to_string())));
+        let durarara =
+            labels_for("[VCB-Studio] Durarara!!×2 Ketsu [Menu01][Ma10p_1080p][x265_flac]");
         assert!(durarara.contains(&("Durarara".to_string(), "B-TITLE".to_string())));
         assert!(durarara.contains(&("2".to_string(), "B-TITLE".to_string())));
         assert!(!durarara.contains(&("2".to_string(), "B-EPISODE".to_string())));
         assert!(bleach_movie.contains(&("3".to_string(), "B-TITLE".to_string())));
         assert!(!bleach_movie.contains(&("3".to_string(), "B-EPISODE".to_string())));
+        let conan_movie = labels_for(
+            "[DBD-Raws][Detective Conan Movie 27 The Million-Dollar Pentagram][PV][01][1080P]",
+        );
         assert!(conan_movie.contains(&("27".to_string(), "B-TITLE".to_string())));
         assert!(conan_movie.contains(&("PV".to_string(), "B-SPECIAL".to_string())));
+        let madoka_movie = labels_for(
+            "[DBD-Raws][Puella Magi Madoka Magica the Movie 01 Beginnings][NCED][1080P]",
+        );
         assert!(madoka_movie.contains(&("01".to_string(), "B-TITLE".to_string())));
         assert!(madoka_movie.contains(&("Beginnings".to_string(), "B-TITLE".to_string())));
         assert!(lapis.contains(&("꞉".to_string(), "B-TITLE".to_string())));
         assert!(lapis.contains(&("LiGHTs".to_string(), "B-TITLE".to_string())));
+        let rezero =
+            labels_for("TVアニメ『Re：ゼロから始める異世界生活』第10話「鬼がかったやり方」予告");
         assert!(!rezero.contains(&("TV".to_string(), "B-TITLE".to_string())));
         assert!(!rezero.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
         assert!(rezero.contains(&("Re".to_string(), "B-TITLE".to_string())));
         assert!(!shark.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
         assert!(shark.contains(&("おでかけ子ザメ".to_string(), "B-TITLE".to_string())));
+        let creditless =
+            labels_for("[ANK-Raws] デート・ア・ライブⅡ Creditless ED (Bdrip 1920x1080 HEVC FLAC)");
         assert!(creditless.contains(&("Creditless".to_string(), "B-SPECIAL".to_string())));
         assert!(creditless.contains(&("ED".to_string(), "B-SPECIAL".to_string())));
         assert!(no_number.contains(&("081".to_string(), "B-EPISODE".to_string())));
         assert!(!no_number.contains(&("1".to_string(), "B-EPISODE".to_string())));
+        let bilingual = labels_for(
+            "辉夜大小姐想让我告白~天才们的恋爱头脑战~.S2-01.中日双语.云光字幕组.[1080p]",
+        );
         assert!(bilingual.contains(&("中日".to_string(), "B-SOURCE".to_string())));
         assert!(!bilingual.contains(&("中日".to_string(), "B-TITLE".to_string())));
         assert!(one_room.contains(&("Second".to_string(), "B-SEASON".to_string())));
         assert!(one_room.contains(&("Season".to_string(), "B-SEASON".to_string())));
+        let jade =
+            labels_for("[GM-Team][国漫][诛仙 第2季][Jade Dynasty Ⅱ][2024][12][AVC][GB][1080P]");
         assert!(jade.contains(&("Jade".to_string(), "B-TITLE".to_string())));
         assert!(jade.contains(&("Dynasty".to_string(), "B-TITLE".to_string())));
         assert!(jade.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
         assert!(fox.contains(&("Fox".to_string(), "B-TITLE".to_string())));
         assert!(fox.contains(&("Ⅷ".to_string(), "B-SEASON".to_string())));
+        let kage =
+            labels_for("[LKSUB][Kage no Jitsuryokusha ni Naritakute! 2nd Season][03][GB][720P]");
         assert!(kage.contains(&("2nd".to_string(), "B-SEASON".to_string())));
         assert!(kage.contains(&(" ".to_string(), "B-SEASON".to_string())));
         assert!(kage.contains(&("Season".to_string(), "B-SEASON".to_string())));
         assert!(date_live_special.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
         assert!(date_live_special.contains(&("CM01".to_string(), "B-SPECIAL".to_string())));
+        let lupin_part = labels_for("[SnowDream][Part 5_Lupin Sansei Part 5][01][BIG5][720P]");
         assert!(lupin_part.contains(&("Lupin".to_string(), "B-TITLE".to_string())));
         assert!(lupin_part.contains(&("Sansei".to_string(), "B-TITLE".to_string())));
         assert!(!lupin_part.contains(&("Part".to_string(), "B-TITLE".to_string())));
         assert!(lupin_part.contains(&("5".to_string(), "B-SEASON".to_string())));
         assert!(!lupin_part.contains(&("5".to_string(), "B-SPECIAL".to_string())));
+        let roman_leaf = dmhy_record(
+            "Ⅰ 001 魯邦燃起了鬥志",
+            "tpl_test",
+            &suggested_roles("TEXT SEP EPISODE SEP TEXT"),
+        )
+        .unwrap();
         assert!(roman_leaf
             .tokens
             .iter()
         assert!(ajin_movie.contains(&("Ajin".to_string(), "B-TITLE".to_string())));
         assert!(ajin_movie.contains(&("01".to_string(), "B-SPECIAL".to_string())));
+        let eien = labels_for(
+            "[Nekomoe kissaten&LoliHouse] Eien no 831 [WebRip 1080p HEVC-10bit AAC ASSx2]",
+        );
         assert!(eien.contains(&("Eien".to_string(), "B-TITLE".to_string())));
         assert!(eien.contains(&("831".to_string(), "B-TITLE".to_string())));
+        let ep_only =
+            dmhy_record("Ep.25", "tpl_test", &suggested_roles("TEXT SEP EPISODE")).unwrap();
         assert!(audit_warnings(&ep_only).contains(&"no_title".to_string()));
     }
 }

tools/virtual_dataset_generator/src/bin/case_combo_generator.rs CHANGED Viewed

@@ -51,6 +51,22 @@ struct CharRow {
     source: Option<String>,
 }
 fn main() -> Result<()> {
     let args = Args::parse();
     let target_re = Regex::new(
@@ -215,7 +231,7 @@ fn failure_filenames(report_paths: &[PathBuf]) -> Result<HashSet<String>> {
 fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
     let entities = extract_entities_from_labels(&row.tokens, &row.labels);
-    let title = first_value(&entities, "TITLE");
     let season = first_value(&entities, "SEASON");
     let episode = first_value(&entities, "EPISODE");
     let special = first_value(&entities, "SPECIAL");
@@ -223,17 +239,17 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
     let source = first_value(&entities, "SOURCE");
     let mut specs: Vec<(String, Vec<(String, String)>, &'static str)> = Vec::new();
-    if let Some(title) = title.clone() {
         specs.push((
             title.clone(),
-            vec![(title.clone(), "TITLE".to_string())],
             "combo_title",
         ));
         if let Some(season) = season.clone() {
             specs.push((
                 format!("{title} {season}"),
                 vec![
-                    (title.clone(), "TITLE".to_string()),
                     (season.clone(), "SEASON".to_string()),
                 ],
                 "combo_title_season",
@@ -242,7 +258,7 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
                 specs.push((
                     format!("{title} {season} {episode}"),
                     vec![
-                        (title.clone(), "TITLE".to_string()),
                         (season.clone(), "SEASON".to_string()),
                         (episode.clone(), "EPISODE".to_string()),
                     ],
@@ -252,7 +268,7 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
                     specs.push((
                         format!("{title} {season} {episode} [{resolution}][{source}]"),
                         vec![
-                            (title.clone(), "TITLE".to_string()),
                             (season.clone(), "SEASON".to_string()),
                             (episode.clone(), "EPISODE".to_string()),
                             (resolution.clone(), "RESOLUTION".to_string()),
@@ -294,11 +310,11 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
             "combo_special_only",
         ));
     }
-    if let (Some(title), Some(special)) = (title.clone(), special.clone()) {
         specs.push((
             format!("{title} - {special}"),
             vec![
-                (title.clone(), "TITLE".to_string()),
                 (special.clone(), "SPECIAL".to_string()),
             ],
             "combo_title_special",
@@ -307,7 +323,7 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
             specs.push((
                 format!("{title} - {special} [{episode}]"),
                 vec![
-                    (title.clone(), "TITLE".to_string()),
                     (special.clone(), "SPECIAL".to_string()),
                     (episode, "EPISODE".to_string()),
                 ],
@@ -318,7 +334,7 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
             specs.push((
                 format!("{title} - {special} [{resolution}][{source}]"),
                 vec![
-                    (title, "TITLE".to_string()),
                     (special, "SPECIAL".to_string()),
                     (resolution.clone(), "RESOLUTION".to_string()),
                     (source, "SOURCE".to_string()),
@@ -327,13 +343,13 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
             ));
         }
     }
-    if let (Some(title), Some(resolution), Some(source)) =
         (title, resolution.clone(), source.clone())
     {
         specs.push((
             format!("{title} [{resolution}][{source}]"),
             vec![
-                (title.clone(), "TITLE".to_string()),
                 (resolution.clone(), "RESOLUTION".to_string()),
                 (source, "SOURCE".to_string()),
             ],
@@ -362,55 +378,105 @@ fn extract_entities_from_labels(
     let mut active_entity: Option<String> = None;
     let mut active_tokens: Vec<String> = Vec::new();
     for (token, label) in tokens.iter().zip(labels.iter()) {
         if let Some(rest) = label.strip_prefix("B-") {
-            if let Some(entity) = active_entity.take() {
-                if !active_tokens.is_empty() {
-                    entities
-                        .entry(entity)
-                        .or_default()
-                        .push(active_tokens.join(""));
-                }
-            }
-            active_entity = Some(rest.to_string());
             active_tokens = vec![token.clone()];
         } else if let Some(rest) = label.strip_prefix("I-") {
-            if active_entity.as_deref() == Some(rest) {
                 active_tokens.push(token.clone());
             } else {
-                if let Some(entity) = active_entity.take() {
-                    if !active_tokens.is_empty() {
-                        entities
-                            .entry(entity)
-                            .or_default()
-                            .push(active_tokens.join(""));
-                    }
-                }
-                active_entity = Some(rest.to_string());
                 active_tokens = vec![token.clone()];
             }
         } else {
-            if let Some(entity) = active_entity.take() {
-                if !active_tokens.is_empty() {
-                    entities
-                        .entry(entity)
-                        .or_default()
-                        .push(active_tokens.join(""));
-                }
-            }
-            active_tokens.clear();
         }
     }
-    if let Some(entity) = active_entity.take() {
-        if !active_tokens.is_empty() {
-            entities
-                .entry(entity)
-                .or_default()
-                .push(active_tokens.join(""));
         }
     }
-    entities
 }
 fn first_value(entities: &HashMap<String, Vec<String>>, name: &str) -> Option<String> {
@@ -428,6 +494,8 @@ fn char_item(filename: &str, spans: &[(String, String)], source: &str) -> Option
         if text.is_empty() {
             continue;
         }
         if let Some(start) = find_substring(filename, text, cursor) {
             let end = start + text.chars().count();
             if start < labels.len() {
@@ -535,7 +603,7 @@ mod tests {
         let row = make_row(
             "One Piece Season 21 1110 [1080p][WEB-DL].mkv",
             &[
-                ("One Piece".to_string(), "TITLE".to_string()),
                 ("Season 21".to_string(), "SEASON".to_string()),
                 ("1110".to_string(), "EPISODE".to_string()),
                 ("1080p".to_string(), "RESOLUTION".to_string()),
@@ -555,8 +623,15 @@ mod tests {
         assert_eq!(
             &combo.labels[0..9],
             &[
-                "B-TITLE", "I-TITLE", "I-TITLE", "I-TITLE", "I-TITLE", "I-TITLE", "I-TITLE",
-                "I-TITLE", "I-TITLE"
             ]
         );
         assert_eq!(
@@ -591,5 +666,9 @@ mod tests {
         assert_eq!(combo.labels[31], "O");
         assert_eq!(combo.labels[32], "O");
         assert_eq!(combo.labels[39], "O");
     }
 }

     source: Option<String>,
 }
+const FILE_TITLE_ENTITIES: [&str; 5] = [
+    "TITLE_CHS",
+    "TITLE_CHT",
+    "TITLE_JPN",
+    "TITLE_LATIN",
+    "TITLE_MIXED",
+];
+const PATH_TITLE_ENTITIES: [&str; 5] = [
+    "PATH_TITLE_CHS",
+    "PATH_TITLE_CHT",
+    "PATH_TITLE_JPN",
+    "PATH_TITLE_LATIN",
+    "PATH_TITLE_MIXED",
+];
 fn main() -> Result<()> {
     let args = Args::parse();
     let target_re = Regex::new(
 fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
     let entities = extract_entities_from_labels(&row.tokens, &row.labels);
+    let title = first_title_value(&entities);
     let season = first_value(&entities, "SEASON");
     let episode = first_value(&entities, "EPISODE");
     let special = first_value(&entities, "SPECIAL");
     let source = first_value(&entities, "SOURCE");
     let mut specs: Vec<(String, Vec<(String, String)>, &'static str)> = Vec::new();
+    if let Some((title, title_entity)) = title.clone() {
         specs.push((
             title.clone(),
+            vec![(title.clone(), title_entity.clone())],
             "combo_title",
         ));
         if let Some(season) = season.clone() {
             specs.push((
                 format!("{title} {season}"),
                 vec![
+                    (title.clone(), title_entity.clone()),
                     (season.clone(), "SEASON".to_string()),
                 ],
                 "combo_title_season",
                 specs.push((
                     format!("{title} {season} {episode}"),
                     vec![
+                        (title.clone(), title_entity.clone()),
                         (season.clone(), "SEASON".to_string()),
                         (episode.clone(), "EPISODE".to_string()),
                     ],
                     specs.push((
                         format!("{title} {season} {episode} [{resolution}][{source}]"),
                         vec![
+                            (title.clone(), title_entity.clone()),
                             (season.clone(), "SEASON".to_string()),
                             (episode.clone(), "EPISODE".to_string()),
                             (resolution.clone(), "RESOLUTION".to_string()),
             "combo_special_only",
         ));
     }
+    if let (Some((title, title_entity)), Some(special)) = (title.clone(), special.clone()) {
         specs.push((
             format!("{title} - {special}"),
             vec![
+                (title.clone(), title_entity.clone()),
                 (special.clone(), "SPECIAL".to_string()),
             ],
             "combo_title_special",
             specs.push((
                 format!("{title} - {special} [{episode}]"),
                 vec![
+                    (title.clone(), title_entity.clone()),
                     (special.clone(), "SPECIAL".to_string()),
                     (episode, "EPISODE".to_string()),
                 ],
             specs.push((
                 format!("{title} - {special} [{resolution}][{source}]"),
                 vec![
+                    (title, title_entity),
                     (special, "SPECIAL".to_string()),
                     (resolution.clone(), "RESOLUTION".to_string()),
                     (source, "SOURCE".to_string()),
             ));
         }
     }
+    if let (Some((title, title_entity)), Some(resolution), Some(source)) =
         (title, resolution.clone(), source.clone())
     {
         specs.push((
             format!("{title} [{resolution}][{source}]"),
             vec![
+                (title.clone(), title_entity),
                 (resolution.clone(), "RESOLUTION".to_string()),
                 (source, "SOURCE".to_string()),
             ],
     let mut active_entity: Option<String> = None;
     let mut active_tokens: Vec<String> = Vec::new();
+    let flush = |entities: &mut HashMap<String, Vec<String>>,
+                 active_entity: &mut Option<String>,
+                 active_tokens: &mut Vec<String>| {
+        if let Some(entity) = active_entity.take() {
+            if !active_tokens.is_empty() {
+                push_entity_value(entities, &entity, active_tokens.join(""));
+            }
+        }
+        active_tokens.clear();
+    };
     for (token, label) in tokens.iter().zip(labels.iter()) {
         if let Some(rest) = label.strip_prefix("B-") {
+            flush(&mut entities, &mut active_entity, &mut active_tokens);
+            active_entity = canonical_entity(rest);
             active_tokens = vec![token.clone()];
         } else if let Some(rest) = label.strip_prefix("I-") {
+            let entity = canonical_entity(rest);
+            if active_entity == entity {
                 active_tokens.push(token.clone());
             } else {
+                flush(&mut entities, &mut active_entity, &mut active_tokens);
+                active_entity = entity;
                 active_tokens = vec![token.clone()];
             }
         } else {
+            flush(&mut entities, &mut active_entity, &mut active_tokens);
         }
     }
+    flush(&mut entities, &mut active_entity, &mut active_tokens);
+    entities
+}
+fn canonical_entity(entity: &str) -> Option<String> {
+    match entity {
+        "TITLE" | "TITLE_MIXED" => Some("TITLE_MIXED".to_string()),
+        "PATH_TITLE" | "PATH_TITLE_MIXED" => Some("PATH_TITLE_MIXED".to_string()),
+        "TITLE_CHS" | "TITLE_CHT" | "TITLE_JPN" | "TITLE_LATIN" | "PATH_TITLE_CHS"
+        | "PATH_TITLE_CHT" | "PATH_TITLE_JPN" | "PATH_TITLE_LATIN" | "SEASON" | "PATH_SEASON"
+        | "EPISODE" | "SPECIAL" | "GROUP" | "RESOLUTION" | "SOURCE" | "TAG" => {
+            Some(entity.to_string())
         }
+        _ => None,
+    }
+}
+fn path_title_to_file_title(entity: &str) -> Option<String> {
+    entity
+        .strip_prefix("PATH_TITLE_")
+        .map(|suffix| format!("TITLE_{suffix}"))
+}
+fn file_title_to_path_title(entity: &str) -> Option<String> {
+    entity
+        .strip_prefix("TITLE_")
+        .map(|suffix| format!("PATH_TITLE_{suffix}"))
+}
+fn push_entity_value(entities: &mut HashMap<String, Vec<String>>, entity: &str, value: String) {
+    let value = value.trim();
+    if value.is_empty() {
+        return;
+    }
+    push_unique(entities, entity, value);
+    if let Some(file_title) = path_title_to_file_title(entity) {
+        push_unique(entities, &file_title, value);
+    }
+    if let Some(path_title) = file_title_to_path_title(entity) {
+        push_unique(entities, &path_title, value);
+    }
+    match entity {
+        "PATH_SEASON" => push_unique(entities, "SEASON", value),
+        "SEASON" => push_unique(entities, "PATH_SEASON", value),
+        _ => {}
+    }
+}
+fn push_unique(entities: &mut HashMap<String, Vec<String>>, entity: &str, value: &str) {
+    let values = entities.entry(entity.to_string()).or_default();
+    if !values.iter().any(|existing| existing == value) {
+        values.push(value.to_string());
     }
+}
+fn first_title_value(entities: &HashMap<String, Vec<String>>) -> Option<(String, String)> {
+    for entity in FILE_TITLE_ENTITIES {
+        if let Some(value) = first_value(entities, entity) {
+            return Some((value, entity.to_string()));
+        }
+    }
+    for entity in PATH_TITLE_ENTITIES {
+        if let Some(value) = first_value(entities, entity) {
+            let file_entity =
+                path_title_to_file_title(entity).unwrap_or_else(|| "TITLE_MIXED".to_string());
+            return Some((value, file_entity));
+        }
+    }
+    None
 }
 fn first_value(entities: &HashMap<String, Vec<String>>, name: &str) -> Option<String> {
         if text.is_empty() {
             continue;
         }
+        let entity = canonical_entity(entity)
+            .and_then(|value| path_title_to_file_title(&value).or(Some(value)))?;
         if let Some(start) = find_substring(filename, text, cursor) {
             let end = start + text.chars().count();
             if start < labels.len() {
         let row = make_row(
             "One Piece Season 21 1110 [1080p][WEB-DL].mkv",
             &[
+                ("One Piece".to_string(), "TITLE_LATIN".to_string()),
                 ("Season 21".to_string(), "SEASON".to_string()),
                 ("1110".to_string(), "EPISODE".to_string()),
                 ("1080p".to_string(), "RESOLUTION".to_string()),
         assert_eq!(
             &combo.labels[0..9],
             &[
+                "B-TITLE_LATIN",
+                "I-TITLE_LATIN",
+                "I-TITLE_LATIN",
+                "I-TITLE_LATIN",
+                "I-TITLE_LATIN",
+                "I-TITLE_LATIN",
+                "I-TITLE_LATIN",
+                "I-TITLE_LATIN",
+                "I-TITLE_LATIN"
             ]
         );
         assert_eq!(
         assert_eq!(combo.labels[31], "O");
         assert_eq!(combo.labels[32], "O");
         assert_eq!(combo.labels[39], "O");
+        assert!(!combo
+            .labels
+            .iter()
+            .any(|label| label == "B-TITLE" || label == "I-TITLE"));
     }
 }

tools/virtual_dataset_generator/src/main.rs CHANGED Viewed

@@ -11,18 +11,93 @@ use std::collections::{HashMap, HashSet};
 use std::fs::{self, File};
 use std::io::{BufRead, BufReader, BufWriter, Write};
 use std::path::{Path, PathBuf};
 use std::time::Instant;
-const ENTITIES: [Entity; 7] = [
     Entity::Group,
-    Entity::Title,
     Entity::Season,
     Entity::Episode,
     Entity::Special,
     Entity::Resolution,
     Entity::Source,
 ];
 #[derive(Parser, Debug)]
 #[command(
     about = "Generate pre-encoded AniFileBERT virtual BIO permutation shards",
@@ -131,36 +206,53 @@ impl PathStyle {
 #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd, Serialize)]
 enum Entity {
     Group,
-    Title,
     Season,
     Episode,
     Special,
     Resolution,
     Source,
 }
 impl Entity {
     fn index(self) -> usize {
-        match self {
-            Entity::Group => 0,
-            Entity::Title => 1,
-            Entity::Season => 2,
-            Entity::Episode => 3,
-            Entity::Special => 4,
-            Entity::Resolution => 5,
-            Entity::Source => 6,
-        }
     }
     fn from_name(name: &str) -> Option<Self> {
         match name {
             "GROUP" => Some(Entity::Group),
-            "TITLE" => Some(Entity::Title),
             "SEASON" => Some(Entity::Season),
             "EPISODE" => Some(Entity::Episode),
             "SPECIAL" => Some(Entity::Special),
             "RESOLUTION" => Some(Entity::Resolution),
             "SOURCE" => Some(Entity::Source),
             _ => None,
         }
     }
@@ -168,24 +260,104 @@ impl Entity {
     fn b_label(self) -> &'static str {
         match self {
             Entity::Group => "B-GROUP",
-            Entity::Title => "B-TITLE",
             Entity::Season => "B-SEASON",
             Entity::Episode => "B-EPISODE",
             Entity::Special => "B-SPECIAL",
             Entity::Resolution => "B-RESOLUTION",
             Entity::Source => "B-SOURCE",
         }
     }
     fn i_label(self) -> &'static str {
         match self {
             Entity::Group => "I-GROUP",
-            Entity::Title => "I-TITLE",
             Entity::Season => "I-SEASON",
             Entity::Episode => "I-EPISODE",
             Entity::Special => "I-SPECIAL",
             Entity::Resolution => "I-RESOLUTION",
             Entity::Source => "I-SOURCE",
         }
     }
 }
@@ -627,18 +799,40 @@ fn load_samples(path: &Path, limit_rows: usize) -> Result<Vec<SourceSample>> {
             );
         }
         let filename = row.filename.clone().unwrap_or_else(|| row.tokens.join(""));
-        let fields = extract_fields(&row.tokens, &row.labels);
         samples.push(SourceSample {
             row_index: idx,
             filename,
             tokens: row.tokens,
-            labels: row.labels,
             fields,
         });
     }
     Ok(samples)
 }
 fn extract_fields(tokens: &[String], labels: &[String]) -> Vec<Vec<String>> {
     let mut fields: Vec<Vec<String>> = (0..ENTITIES.len()).map(|_| Vec::new()).collect();
     let mut seen: Vec<HashSet<String>> = (0..ENTITIES.len()).map(|_| HashSet::new()).collect();
@@ -651,9 +845,7 @@ fn extract_fields(tokens: &[String], labels: &[String]) -> Vec<Vec<String>> {
                  seen: &mut Vec<HashSet<String>>| {
         if let Some(entity) = entity {
             let value = text.trim().to_string();
-            if !value.is_empty() && seen[entity.index()].insert(value.clone()) {
-                fields[entity.index()].push(value);
-            }
         }
         text.clear();
     };
@@ -680,14 +872,73 @@ fn extract_fields(tokens: &[String], labels: &[String]) -> Vec<Vec<String>> {
     fields
 }
 fn count_variants(sample: &SourceSample, cfg: &GenConfig) -> u128 {
     let mut count = if cfg.include_original { 1 } else { 0 };
     count += count_path_variants(sample, cfg) as u128;
-    let available = ENTITIES
-        .iter()
-        .copied()
-        .filter(|entity| !sample.fields[entity.index()].is_empty())
-        .collect::<Vec<_>>();
     let n = available.len();
     if n == 0 || !cfg.include_bio_variants {
         return count;
@@ -728,7 +979,10 @@ fn count_path_variants(sample: &SourceSample, cfg: &GenConfig) -> usize {
     if cfg.path_samples_per_source == 0 || cfg.path_styles.is_empty() {
         return 0;
     }
-    if sample.fields[Entity::Title.index()].is_empty() {
         return 0;
     }
     if sample.fields[Entity::Episode.index()].is_empty()
@@ -776,11 +1030,7 @@ fn generate_for_sample(
         return Ok(());
     }
-    let available = ENTITIES
-        .iter()
-        .copied()
-        .filter(|entity| !sample.fields[entity.index()].is_empty())
-        .collect::<Vec<_>>();
     let n = available.len();
     for mask in 1usize..(1usize << n) {
         let mut selected = available
@@ -807,11 +1057,7 @@ fn generate_sampled_variants(
     let mut rng = StdRng::seed_from_u64(
         cfg.seed ^ ((sample.row_index as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15)),
     );
-    let available = ENTITIES
-        .iter()
-        .copied()
-        .filter(|entity| !sample.fields[entity.index()].is_empty())
-        .collect::<Vec<_>>();
     if available.is_empty() {
         return Ok(());
     }
@@ -823,15 +1069,15 @@ fn generate_sampled_variants(
     let mut attempts = 0usize;
     let mut templates: Vec<Vec<PartChoice>> = Vec::new();
-    if let Some(title) = sample.fields[Entity::Title.index()].first() {
         templates.push(vec![PartChoice {
-            entity: Entity::Title,
             value: title.clone(),
         }]);
         if let Some(season) = sample.fields[Entity::Season.index()].first() {
             templates.push(vec![
                 PartChoice {
-                    entity: Entity::Title,
                     value: title.clone(),
                 },
                 PartChoice {
@@ -853,13 +1099,13 @@ fn generate_sampled_variants(
             value: special.clone(),
         }]);
     }
-    if let (Some(title), Some(special)) = (
-        sample.fields[Entity::Title.index()].first(),
         sample.fields[Entity::Special.index()].first(),
     ) {
         templates.push(vec![
             PartChoice {
-                entity: Entity::Title,
                 value: title.clone(),
             },
             PartChoice {
@@ -902,15 +1148,12 @@ fn generate_sampled_variants(
             .copied()
             .collect::<Vec<_>>();
         chosen.shuffle(&mut rng);
-        if !chosen
-            .iter()
-            .any(|entity| matches!(entity, Entity::Title | Entity::Episode | Entity::Special))
-        {
-            if let Some(fallback) = available
-                .iter()
-                .copied()
-                .find(|entity| matches!(entity, Entity::Title | Entity::Episode | Entity::Special))
-            {
                 if !chosen.contains(&fallback) {
                     chosen.push(fallback);
                 }
@@ -952,15 +1195,12 @@ fn generate_sampled_variants(
             .copied()
             .collect::<Vec<_>>();
         chosen.shuffle(&mut rng);
-        if !chosen
-            .iter()
-            .any(|entity| matches!(entity, Entity::Title | Entity::Episode | Entity::Special))
-        {
-            if let Some(fallback) = available
-                .iter()
-                .copied()
-                .find(|entity| matches!(entity, Entity::Title | Entity::Episode | Entity::Special))
-            {
                 if !chosen.contains(&fallback) {
                     chosen.push(fallback);
                 }
@@ -1125,12 +1365,12 @@ fn build_path_context_pieces(
     cfg: &GenConfig,
     rng: &mut StdRng,
 ) -> Option<Vec<LabeledPiece>> {
-    let title = choose_field(sample, Entity::Title, rng)?;
     let style = *cfg.path_styles.choose(rng)?;
     let sep = style.separator();
     let mut components = path_prefix_components(style, rng);
-    components.push(vec![entity_piece(title.clone(), Entity::Title)]);
     let season_component = choose_path_season_component(sample, rng);
     if let Some(season) = season_component {
@@ -1164,7 +1404,9 @@ fn build_path_context_pieces(
             components.push(meta_file_component(sample, rng));
         }
         3 => components.push(compact_file_component(endpoint, sample, rng)),
-        4 => components.push(grouped_release_file_component(&title, endpoint, sample, rng)),
         _ => {
             components.push(vec![endpoint]);
             if rng.gen_bool(0.55) {
@@ -1236,17 +1478,19 @@ fn choose_path_season_component(
     sample: &SourceSample,
     rng: &mut StdRng,
 ) -> Option<Vec<LabeledPiece>> {
-    let season = if let Some(source_season) = choose_field(sample, Entity::Season, rng) {
         random_season_path_text(&source_season, rng)
     } else {
-        let synthetic = ["Season 1", "Season 01", "S01", "第1季"];
         synthetic
             .choose(rng)
             .copied()
             .unwrap_or("Season 1")
             .to_string()
     };
-    Some(vec![entity_piece(season, Entity::Season)])
 }
 fn path_file_component(
@@ -1335,6 +1579,14 @@ fn append_path_meta(pieces: &mut Vec<LabeledPiece>, sample: &SourceSample, rng:
             }
         }
     }
 }
 fn random_episode_path_text(value: &str, rng: &mut StdRng) -> String {
@@ -1365,6 +1617,7 @@ fn random_special_path_text(value: &str, rng: &mut StdRng) -> String {
 fn random_season_path_text(value: &str, rng: &mut StdRng) -> String {
     let mut variants = vec![value.trim().to_string()];
     if let Some(number) = first_ascii_number(value) {
         variants.push(format!("Season {number}"));
         variants.push(format!("Season {number:02}"));
         variants.push(format!("S{number:02}"));
@@ -1783,24 +2036,55 @@ fn token_id(vocab: &Vocab, token: &str) -> u16 {
 }
 fn label_id(label: &str) -> Option<i16> {
-    Some(match label {
-        "O" => 0,
-        "B-TITLE" => 1,
-        "I-TITLE" => 2,
-        "B-SEASON" => 3,
-        "I-SEASON" => 4,
-        "B-EPISODE" => 5,
-        "I-EPISODE" => 6,
-        "B-SPECIAL" => 7,
-        "I-SPECIAL" => 8,
-        "B-GROUP" => 9,
-        "I-GROUP" => 10,
-        "B-RESOLUTION" => 11,
-        "I-RESOLUTION" => 12,
-        "B-SOURCE" => 13,
-        "I-SOURCE" => 14,
-        _ => return None,
-    })
 }
 fn built_in_specials() -> Vec<String> {
@@ -1904,7 +2188,8 @@ mod tests {
     fn sample_without_season() -> SourceSample {
         let mut fields = vec![Vec::new(); ENTITIES.len()];
-        fields[Entity::Title.index()] = vec!["Example Show".to_string()];
         fields[Entity::Episode.index()] = vec!["1".to_string()];
         fields[Entity::Resolution.index()] = vec!["1080P".to_string()];
         fields[Entity::Source.index()] = vec!["WEB-DL".to_string()];
@@ -1936,10 +2221,7 @@ mod tests {
                 assert!(
                     non_empty_components >= 2,
                     "expected at least two noise directories for {style:?}: {}",
-                    render_labeled_pieces(&join_path_components(
-                        &components,
-                        style.separator()
-                    ))
                 );
                 assert!(components
                     .iter()
@@ -1949,6 +2231,57 @@ mod tests {
         }
     }
     #[test]
     fn path_context_synthesizes_season_between_title_and_episode() {
         let sample = sample_without_season();
@@ -1960,7 +2293,10 @@ mod tests {
         let text = render_labeled_pieces(&pieces);
         assert!(text.contains("Example Show"));
         assert!(
-            text.contains("Season") || text.contains("S01") || text.contains("第1季"),
             "missing synthetic season directory in {text}"
         );
@@ -1970,8 +2306,8 @@ mod tests {
         for piece in &pieces {
             match piece.entity {
                 None if !seen_title => {}
-                Some(Entity::Title) => seen_title = true,
-                Some(Entity::Season) if seen_title => seen_season_after_title = true,
                 Some(Entity::Episode) if seen_season_after_title => {
                     seen_episode_after_season = true
                 }
@@ -1983,6 +2319,49 @@ mod tests {
         assert!(seen_episode_after_season);
     }
     #[test]
     fn grouped_path_file_labels_group_but_not_duplicate_title() {
         let sample = sample_with_group();
@@ -1994,8 +2373,12 @@ mod tests {
         assert!(text.contains("[Erai-raws]"));
         assert!(text.contains("Example Show"));
         assert!(text.contains("01"));
-        assert!(pieces.iter().any(|piece| piece.entity == Some(Entity::Group)));
-        assert!(pieces.iter().any(|piece| piece.entity == Some(Entity::Episode)));
         assert!(pieces
             .iter()
             .any(|piece| piece.text == "Example Show" && piece.entity.is_none()));

 use std::fs::{self, File};
 use std::io::{BufRead, BufReader, BufWriter, Write};
 use std::path::{Path, PathBuf};
+use std::sync::OnceLock;
 use std::time::Instant;
+const FILE_TITLE_ENTITIES: [Entity; 5] = [
+    Entity::TitleChs,
+    Entity::TitleCht,
+    Entity::TitleJpn,
+    Entity::TitleLatin,
+    Entity::TitleMixed,
+];
+const PATH_TITLE_ENTITIES: [Entity; 5] = [
+    Entity::PathTitleChs,
+    Entity::PathTitleCht,
+    Entity::PathTitleJpn,
+    Entity::PathTitleLatin,
+    Entity::PathTitleMixed,
+];
+const ENTITIES: [Entity; 18] = [
     Entity::Group,
+    Entity::TitleChs,
+    Entity::TitleCht,
+    Entity::TitleJpn,
+    Entity::TitleLatin,
+    Entity::TitleMixed,
+    Entity::PathTitleChs,
+    Entity::PathTitleCht,
+    Entity::PathTitleJpn,
+    Entity::PathTitleLatin,
+    Entity::PathTitleMixed,
+    Entity::PathSeason,
     Entity::Season,
     Entity::Episode,
     Entity::Special,
     Entity::Resolution,
     Entity::Source,
+    Entity::Tag,
+];
+const FALLBACK_LABELS: [&str; 37] = [
+    "O",
+    "B-TITLE_CHS",
+    "I-TITLE_CHS",
+    "B-TITLE_CHT",
+    "I-TITLE_CHT",
+    "B-TITLE_JPN",
+    "I-TITLE_JPN",
+    "B-TITLE_LATIN",
+    "I-TITLE_LATIN",
+    "B-TITLE_MIXED",
+    "I-TITLE_MIXED",
+    "B-PATH_TITLE_CHS",
+    "I-PATH_TITLE_CHS",
+    "B-PATH_TITLE_CHT",
+    "I-PATH_TITLE_CHT",
+    "B-PATH_TITLE_JPN",
+    "I-PATH_TITLE_JPN",
+    "B-PATH_TITLE_LATIN",
+    "I-PATH_TITLE_LATIN",
+    "B-PATH_TITLE_MIXED",
+    "I-PATH_TITLE_MIXED",
+    "B-PATH_SEASON",
+    "I-PATH_SEASON",
+    "B-SEASON",
+    "I-SEASON",
+    "B-EPISODE",
+    "I-EPISODE",
+    "B-SPECIAL",
+    "I-SPECIAL",
+    "B-GROUP",
+    "I-GROUP",
+    "B-RESOLUTION",
+    "I-RESOLUTION",
+    "B-SOURCE",
+    "I-SOURCE",
+    "B-TAG",
+    "I-TAG",
 ];
+static LABEL_IDS: OnceLock<HashMap<String, i16>> = OnceLock::new();
+#[derive(Debug, Deserialize)]
+struct LabelSchema {
+    labels: Vec<String>,
+}
 #[derive(Parser, Debug)]
 #[command(
     about = "Generate pre-encoded AniFileBERT virtual BIO permutation shards",
 #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd, Serialize)]
 enum Entity {
     Group,
+    TitleChs,
+    TitleCht,
+    TitleJpn,
+    TitleLatin,
+    TitleMixed,
+    PathTitleChs,
+    PathTitleCht,
+    PathTitleJpn,
+    PathTitleLatin,
+    PathTitleMixed,
+    PathSeason,
     Season,
     Episode,
     Special,
     Resolution,
     Source,
+    Tag,
 }
 impl Entity {
     fn index(self) -> usize {
+        ENTITIES
+            .iter()
+            .position(|entity| *entity == self)
+            .expect("entity missing from ENTITIES")
     }
     fn from_name(name: &str) -> Option<Self> {
         match name {
             "GROUP" => Some(Entity::Group),
+            "TITLE" | "TITLE_MIXED" => Some(Entity::TitleMixed),
+            "TITLE_CHS" => Some(Entity::TitleChs),
+            "TITLE_CHT" => Some(Entity::TitleCht),
+            "TITLE_JPN" => Some(Entity::TitleJpn),
+            "TITLE_LATIN" => Some(Entity::TitleLatin),
+            "PATH_TITLE" | "PATH_TITLE_MIXED" => Some(Entity::PathTitleMixed),
+            "PATH_TITLE_CHS" => Some(Entity::PathTitleChs),
+            "PATH_TITLE_CHT" => Some(Entity::PathTitleCht),
+            "PATH_TITLE_JPN" => Some(Entity::PathTitleJpn),
+            "PATH_TITLE_LATIN" => Some(Entity::PathTitleLatin),
+            "PATH_SEASON" => Some(Entity::PathSeason),
             "SEASON" => Some(Entity::Season),
             "EPISODE" => Some(Entity::Episode),
             "SPECIAL" => Some(Entity::Special),
             "RESOLUTION" => Some(Entity::Resolution),
             "SOURCE" => Some(Entity::Source),
+            "TAG" => Some(Entity::Tag),
             _ => None,
         }
     }
     fn b_label(self) -> &'static str {
         match self {
             Entity::Group => "B-GROUP",
+            Entity::TitleChs => "B-TITLE_CHS",
+            Entity::TitleCht => "B-TITLE_CHT",
+            Entity::TitleJpn => "B-TITLE_JPN",
+            Entity::TitleLatin => "B-TITLE_LATIN",
+            Entity::TitleMixed => "B-TITLE_MIXED",
+            Entity::PathTitleChs => "B-PATH_TITLE_CHS",
+            Entity::PathTitleCht => "B-PATH_TITLE_CHT",
+            Entity::PathTitleJpn => "B-PATH_TITLE_JPN",
+            Entity::PathTitleLatin => "B-PATH_TITLE_LATIN",
+            Entity::PathTitleMixed => "B-PATH_TITLE_MIXED",
+            Entity::PathSeason => "B-PATH_SEASON",
             Entity::Season => "B-SEASON",
             Entity::Episode => "B-EPISODE",
             Entity::Special => "B-SPECIAL",
             Entity::Resolution => "B-RESOLUTION",
             Entity::Source => "B-SOURCE",
+            Entity::Tag => "B-TAG",
         }
     }
     fn i_label(self) -> &'static str {
         match self {
             Entity::Group => "I-GROUP",
+            Entity::TitleChs => "I-TITLE_CHS",
+            Entity::TitleCht => "I-TITLE_CHT",
+            Entity::TitleJpn => "I-TITLE_JPN",
+            Entity::TitleLatin => "I-TITLE_LATIN",
+            Entity::TitleMixed => "I-TITLE_MIXED",
+            Entity::PathTitleChs => "I-PATH_TITLE_CHS",
+            Entity::PathTitleCht => "I-PATH_TITLE_CHT",
+            Entity::PathTitleJpn => "I-PATH_TITLE_JPN",
+            Entity::PathTitleLatin => "I-PATH_TITLE_LATIN",
+            Entity::PathTitleMixed => "I-PATH_TITLE_MIXED",
+            Entity::PathSeason => "I-PATH_SEASON",
             Entity::Season => "I-SEASON",
             Entity::Episode => "I-EPISODE",
             Entity::Special => "I-SPECIAL",
             Entity::Resolution => "I-RESOLUTION",
             Entity::Source => "I-SOURCE",
+            Entity::Tag => "I-TAG",
+        }
+    }
+    fn is_file_title(self) -> bool {
+        matches!(
+            self,
+            Entity::TitleChs
+                | Entity::TitleCht
+                | Entity::TitleJpn
+                | Entity::TitleLatin
+                | Entity::TitleMixed
+        )
+    }
+    fn is_path_title(self) -> bool {
+        matches!(
+            self,
+            Entity::PathTitleChs
+                | Entity::PathTitleCht
+                | Entity::PathTitleJpn
+                | Entity::PathTitleLatin
+                | Entity::PathTitleMixed
+        )
+    }
+    fn is_ordinary_variant_entity(self) -> bool {
+        !self.is_path_title() && self != Entity::PathSeason
+    }
+    fn as_path_title(self) -> Option<Self> {
+        match self {
+            Entity::TitleChs => Some(Entity::PathTitleChs),
+            Entity::TitleCht => Some(Entity::PathTitleCht),
+            Entity::TitleJpn => Some(Entity::PathTitleJpn),
+            Entity::TitleLatin => Some(Entity::PathTitleLatin),
+            Entity::TitleMixed => Some(Entity::PathTitleMixed),
+            Entity::PathTitleChs
+            | Entity::PathTitleCht
+            | Entity::PathTitleJpn
+            | Entity::PathTitleLatin
+            | Entity::PathTitleMixed => Some(self),
+            _ => None,
+        }
+    }
+    fn as_file_title(self) -> Option<Self> {
+        match self {
+            Entity::PathTitleChs => Some(Entity::TitleChs),
+            Entity::PathTitleCht => Some(Entity::TitleCht),
+            Entity::PathTitleJpn => Some(Entity::TitleJpn),
+            Entity::PathTitleLatin => Some(Entity::TitleLatin),
+            Entity::PathTitleMixed => Some(Entity::TitleMixed),
+            Entity::TitleChs
+            | Entity::TitleCht
+            | Entity::TitleJpn
+            | Entity::TitleLatin
+            | Entity::TitleMixed => Some(self),
+            _ => None,
         }
     }
 }
             );
         }
         let filename = row.filename.clone().unwrap_or_else(|| row.tokens.join(""));
+        let labels = row
+            .labels
+            .iter()
+            .map(|label| canonical_bio_label(label))
+            .collect::<Vec<_>>();
+        let fields = extract_fields(&row.tokens, &labels);
         samples.push(SourceSample {
             row_index: idx,
             filename,
             tokens: row.tokens,
+            labels,
             fields,
         });
     }
     Ok(samples)
 }
+fn canonical_bio_label(label: &str) -> String {
+    if label == "O" {
+        return "O".to_string();
+    }
+    let Some((prefix, entity_name)) = label.split_once('-') else {
+        return label.to_string();
+    };
+    let Some(entity) = Entity::from_name(entity_name) else {
+        return label.to_string();
+    };
+    match prefix {
+        "B" => entity.b_label().to_string(),
+        "I" => entity.i_label().to_string(),
+        _ => label.to_string(),
+    }
+}
 fn extract_fields(tokens: &[String], labels: &[String]) -> Vec<Vec<String>> {
     let mut fields: Vec<Vec<String>> = (0..ENTITIES.len()).map(|_| Vec::new()).collect();
     let mut seen: Vec<HashSet<String>> = (0..ENTITIES.len()).map(|_| HashSet::new()).collect();
                  seen: &mut Vec<HashSet<String>>| {
         if let Some(entity) = entity {
             let value = text.trim().to_string();
+            push_extracted_field(fields, seen, entity, value);
         }
         text.clear();
     };
     fields
 }
+fn push_extracted_field(
+    fields: &mut [Vec<String>],
+    seen: &mut [HashSet<String>],
+    entity: Entity,
+    value: String,
+) {
+    fn add(fields: &mut [Vec<String>], seen: &mut [HashSet<String>], entity: Entity, value: &str) {
+        if !value.is_empty() && seen[entity.index()].insert(value.to_string()) {
+            fields[entity.index()].push(value.to_string());
+        }
+    }
+    let value = value.trim();
+    if value.is_empty() {
+        return;
+    }
+    add(fields, seen, entity, value);
+    if let Some(path_title) = entity.as_path_title() {
+        add(fields, seen, path_title, value);
+    }
+    if let Some(file_title) = entity.as_file_title() {
+        add(fields, seen, file_title, value);
+    }
+    match entity {
+        Entity::Season => add(fields, seen, Entity::PathSeason, value),
+        Entity::PathSeason => add(fields, seen, Entity::Season, value),
+        _ => {}
+    }
+}
+fn ordinary_available_entities(sample: &SourceSample) -> Vec<Entity> {
+    ENTITIES
+        .iter()
+        .copied()
+        .filter(|entity| {
+            entity.is_ordinary_variant_entity() && !sample.fields[entity.index()].is_empty()
+        })
+        .collect()
+}
+fn first_file_title_field(sample: &SourceSample) -> Option<(Entity, String)> {
+    FILE_TITLE_ENTITIES.iter().copied().find_map(|entity| {
+        sample.fields[entity.index()]
+            .iter()
+            .find(|value| !value.trim().is_empty())
+            .map(|value| (entity, value.trim().to_string()))
+    })
+}
+fn choose_path_title_field(sample: &SourceSample, rng: &mut StdRng) -> Option<(Entity, String)> {
+    let mut candidates = Vec::new();
+    for entity in PATH_TITLE_ENTITIES {
+        for value in &sample.fields[entity.index()] {
+            let value = value.trim();
+            if !value.is_empty() {
+                candidates.push((entity, value.to_string()));
+            }
+        }
+    }
+    candidates.choose(rng).cloned()
+}
 fn count_variants(sample: &SourceSample, cfg: &GenConfig) -> u128 {
     let mut count = if cfg.include_original { 1 } else { 0 };
     count += count_path_variants(sample, cfg) as u128;
+    let available = ordinary_available_entities(sample);
     let n = available.len();
     if n == 0 || !cfg.include_bio_variants {
         return count;
     if cfg.path_samples_per_source == 0 || cfg.path_styles.is_empty() {
         return 0;
     }
+    if !PATH_TITLE_ENTITIES
+        .iter()
+        .any(|entity| !sample.fields[entity.index()].is_empty())
+    {
         return 0;
     }
     if sample.fields[Entity::Episode.index()].is_empty()
         return Ok(());
     }
+    let available = ordinary_available_entities(sample);
     let n = available.len();
     for mask in 1usize..(1usize << n) {
         let mut selected = available
     let mut rng = StdRng::seed_from_u64(
         cfg.seed ^ ((sample.row_index as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15)),
     );
+    let available = ordinary_available_entities(sample);
     if available.is_empty() {
         return Ok(());
     }
     let mut attempts = 0usize;
     let mut templates: Vec<Vec<PartChoice>> = Vec::new();
+    if let Some((title_entity, title)) = first_file_title_field(sample) {
         templates.push(vec![PartChoice {
+            entity: title_entity,
             value: title.clone(),
         }]);
         if let Some(season) = sample.fields[Entity::Season.index()].first() {
             templates.push(vec![
                 PartChoice {
+                    entity: title_entity,
                     value: title.clone(),
                 },
                 PartChoice {
             value: special.clone(),
         }]);
     }
+    if let (Some((title_entity, title)), Some(special)) = (
+        first_file_title_field(sample),
         sample.fields[Entity::Special.index()].first(),
     ) {
         templates.push(vec![
             PartChoice {
+                entity: title_entity,
                 value: title.clone(),
             },
             PartChoice {
             .copied()
             .collect::<Vec<_>>();
         chosen.shuffle(&mut rng);
+        if !chosen.iter().any(|entity| {
+            entity.is_file_title() || matches!(entity, Entity::Episode | Entity::Special)
+        }) {
+            if let Some(fallback) = available.iter().copied().find(|entity| {
+                entity.is_file_title() || matches!(entity, Entity::Episode | Entity::Special)
+            }) {
                 if !chosen.contains(&fallback) {
                     chosen.push(fallback);
                 }
             .copied()
             .collect::<Vec<_>>();
         chosen.shuffle(&mut rng);
+        if !chosen.iter().any(|entity| {
+            entity.is_file_title() || matches!(entity, Entity::Episode | Entity::Special)
+        }) {
+            if let Some(fallback) = available.iter().copied().find(|entity| {
+                entity.is_file_title() || matches!(entity, Entity::Episode | Entity::Special)
+            }) {
                 if !chosen.contains(&fallback) {
                     chosen.push(fallback);
                 }
     cfg: &GenConfig,
     rng: &mut StdRng,
 ) -> Option<Vec<LabeledPiece>> {
+    let (title_entity, title) = choose_path_title_field(sample, rng)?;
     let style = *cfg.path_styles.choose(rng)?;
     let sep = style.separator();
     let mut components = path_prefix_components(style, rng);
+    components.push(vec![entity_piece(title.clone(), title_entity)]);
     let season_component = choose_path_season_component(sample, rng);
     if let Some(season) = season_component {
             components.push(meta_file_component(sample, rng));
         }
         3 => components.push(compact_file_component(endpoint, sample, rng)),
+        4 => components.push(grouped_release_file_component(
+            &title, endpoint, sample, rng,
+        )),
         _ => {
             components.push(vec![endpoint]);
             if rng.gen_bool(0.55) {
     sample: &SourceSample,
     rng: &mut StdRng,
 ) -> Option<Vec<LabeledPiece>> {
+    let season = if let Some(source_season) = choose_field(sample, Entity::PathSeason, rng)
+        .or_else(|| choose_field(sample, Entity::Season, rng))
+    {
         random_season_path_text(&source_season, rng)
     } else {
+        let synthetic = ["01", "Season 1", "Season 01", "S01", "第1季"];
         synthetic
             .choose(rng)
             .copied()
             .unwrap_or("Season 1")
             .to_string()
     };
+    Some(vec![entity_piece(season, Entity::PathSeason)])
 }
 fn path_file_component(
             }
         }
     }
+    if let Some(tag) = choose_field(sample, Entity::Tag, rng) {
+        if rng.gen_bool(0.55) {
+            pieces.push(o_piece("[".to_string()));
+            pieces.push(entity_piece(tag, Entity::Tag));
+            pieces.push(o_piece("]".to_string()));
+        }
+    }
 }
 fn random_episode_path_text(value: &str, rng: &mut StdRng) -> String {
 fn random_season_path_text(value: &str, rng: &mut StdRng) -> String {
     let mut variants = vec![value.trim().to_string()];
     if let Some(number) = first_ascii_number(value) {
+        variants.push(format!("{number:02}"));
         variants.push(format!("Season {number}"));
         variants.push(format!("Season {number:02}"));
         variants.push(format!("S{number:02}"));
 }
 fn label_id(label: &str) -> Option<i16> {
+    label_ids().get(label).copied()
+}
+fn label_ids() -> &'static HashMap<String, i16> {
+    LABEL_IDS.get_or_init(load_label_ids)
+}
+fn load_label_ids() -> HashMap<String, i16> {
+    let labels = read_schema_labels().unwrap_or_else(|| {
+        FALLBACK_LABELS
+            .iter()
+            .map(|label| (*label).to_string())
+            .collect()
+    });
+    labels
+        .into_iter()
+        .enumerate()
+        .map(|(idx, label)| (label, idx as i16))
+        .collect()
+}
+fn read_schema_labels() -> Option<Vec<String>> {
+    for path in label_schema_candidates() {
+        let Ok(text) = fs::read_to_string(path) else {
+            continue;
+        };
+        let Ok(schema) = serde_json::from_str::<LabelSchema>(&text) else {
+            continue;
+        };
+        if schema.labels.is_empty() || schema.labels.iter().any(|label| label.trim().is_empty()) {
+            continue;
+        }
+        return Some(schema.labels);
+    }
+    None
+}
+fn label_schema_candidates() -> Vec<PathBuf> {
+    let mut candidates = Vec::new();
+    if let Ok(current_dir) = std::env::current_dir() {
+        candidates.push(current_dir.join("label_schema.json"));
+    }
+    candidates.push(
+        Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("..")
+            .join("..")
+            .join("label_schema.json"),
+    );
+    candidates
 }
 fn built_in_specials() -> Vec<String> {
     fn sample_without_season() -> SourceSample {
         let mut fields = vec![Vec::new(); ENTITIES.len()];
+        fields[Entity::TitleLatin.index()] = vec!["Example Show".to_string()];
+        fields[Entity::PathTitleLatin.index()] = vec!["Example Show".to_string()];
         fields[Entity::Episode.index()] = vec!["1".to_string()];
         fields[Entity::Resolution.index()] = vec!["1080P".to_string()];
         fields[Entity::Source.index()] = vec!["WEB-DL".to_string()];
                 assert!(
                     non_empty_components >= 2,
                     "expected at least two noise directories for {style:?}: {}",
+                    render_labeled_pieces(&join_path_components(&components, style.separator()))
                 );
                 assert!(components
                     .iter()
         }
     }
+    #[test]
+    fn fixed_label_schema_ids_match_v2_order() {
+        assert_eq!(label_id("O"), Some(0));
+        assert_eq!(label_id("B-TITLE_CHS"), Some(1));
+        assert_eq!(label_id("I-TITLE_MIXED"), Some(10));
+        assert_eq!(label_id("B-PATH_TITLE_CHS"), Some(11));
+        assert_eq!(label_id("I-PATH_TITLE_MIXED"), Some(20));
+        assert_eq!(label_id("B-PATH_SEASON"), Some(21));
+        assert_eq!(label_id("B-SEASON"), Some(23));
+        assert_eq!(label_id("B-EPISODE"), Some(25));
+        assert_eq!(label_id("B-GROUP"), Some(29));
+        assert_eq!(label_id("B-SOURCE"), Some(33));
+        assert_eq!(label_id("B-TAG"), Some(35));
+        assert_eq!(label_id("I-TAG"), Some(36));
+        assert_eq!(label_id("B-TITLE"), None);
+    }
+    #[test]
+    fn legacy_source_title_labels_canonicalize_to_mixed_schema() {
+        assert_eq!(canonical_bio_label("B-TITLE"), "B-TITLE_MIXED");
+        assert_eq!(canonical_bio_label("I-TITLE"), "I-TITLE_MIXED");
+        assert_eq!(canonical_bio_label("B-PATH_TITLE"), "B-PATH_TITLE_MIXED");
+        assert_eq!(canonical_bio_label("B-SEASON"), "B-SEASON");
+    }
+    #[test]
+    fn generated_entities_do_not_emit_legacy_title_labels() {
+        for entity in ENTITIES {
+            assert_ne!(entity.b_label(), "B-TITLE");
+            assert_ne!(entity.i_label(), "I-TITLE");
+        }
+    }
+    #[test]
+    fn extraction_preserves_file_and_path_title_candidates() {
+        let tokens = ["A", "/", "僕", "ら"]
+            .iter()
+            .map(|value| value.to_string())
+            .collect::<Vec<_>>();
+        let labels = ["B-TITLE_LATIN", "O", "B-PATH_TITLE_JPN", "I-PATH_TITLE_JPN"]
+            .iter()
+            .map(|value| value.to_string())
+            .collect::<Vec<_>>();
+        let fields = extract_fields(&tokens, &labels);
+        assert_eq!(fields[Entity::TitleLatin.index()], vec!["A"]);
+        assert_eq!(fields[Entity::PathTitleLatin.index()], vec!["A"]);
+        assert_eq!(fields[Entity::PathTitleJpn.index()], vec!["僕ら"]);
+        assert_eq!(fields[Entity::TitleJpn.index()], vec!["僕ら"]);
+    }
     #[test]
     fn path_context_synthesizes_season_between_title_and_episode() {
         let sample = sample_without_season();
         let text = render_labeled_pieces(&pieces);
         assert!(text.contains("Example Show"));
         assert!(
+            text.contains("Season")
+                || text.contains("S01")
+                || text.contains("第1季")
+                || text.contains("01"),
             "missing synthetic season directory in {text}"
         );
         for piece in &pieces {
             match piece.entity {
                 None if !seen_title => {}
+                Some(Entity::PathTitleLatin) => seen_title = true,
+                Some(Entity::PathSeason) if seen_title => seen_season_after_title = true,
                 Some(Entity::Episode) if seen_season_after_title => {
                     seen_episode_after_season = true
                 }
         assert!(seen_episode_after_season);
     }
+    #[test]
+    fn path_context_can_label_bare_numeric_path_season() {
+        let mut sample = sample_without_season();
+        sample.fields[Entity::Episode.index()] = vec!["3".to_string()];
+        let mut cfg = test_config();
+        cfg.path_styles = vec![PathStyle::Unix];
+        let mut found = None;
+        for seed in 0..2048 {
+            let mut rng = StdRng::seed_from_u64(seed);
+            let pieces = build_path_context_pieces(&sample, &cfg, &mut rng)
+                .expect("expected path context pieces");
+            let text = render_labeled_pieces(&pieces);
+            if text.contains("Example Show/01/03.mkv") {
+                found = Some(pieces);
+                break;
+            }
+        }
+        let pieces = found.expect("expected a Title/01/03.mkv-style path context");
+        assert!(pieces
+            .iter()
+            .any(|piece| piece.text == "01" && piece.entity == Some(Entity::PathSeason)));
+        assert!(pieces
+            .iter()
+            .any(|piece| piece.text == "03" && piece.entity == Some(Entity::Episode)));
+    }
+    #[test]
+    fn path_season_variants_include_common_directory_forms() {
+        let mut variants = HashSet::new();
+        for seed in 0..128 {
+            let mut rng = StdRng::seed_from_u64(seed);
+            variants.insert(random_season_path_text("S01", &mut rng));
+        }
+        assert!(variants.contains("S01"));
+        assert!(variants.contains("01"));
+        assert!(variants.contains("Season 1"));
+        assert!(variants.contains("Season 01"));
+    }
     #[test]
     fn grouped_path_file_labels_group_but_not_duplicate_title() {
         let sample = sample_with_group();
         assert!(text.contains("[Erai-raws]"));
         assert!(text.contains("Example Show"));
         assert!(text.contains("01"));
+        assert!(pieces
+            .iter()
+            .any(|piece| piece.entity == Some(Entity::Group)));
+        assert!(pieces
+            .iter()
+            .any(|piece| piece.entity == Some(Entity::Episode)));
         assert!(pieces
             .iter()
             .any(|piece| piece.text == "Example Show" && piece.entity.is_none()));