Add DMHY prefix graph annotation workflow

Browse files

Files changed (8) hide show

datasets/AnimeName +1 -1
tools/annotate_dmhy_prefix_dag.py +349 -0
tools/annotate_dmhy_prefix_graph.py +715 -0
tools/convert_annotated_dmhy_dataset.py +302 -0
tools/dmhy_prefix_grouper/Cargo.lock +347 -0
tools/dmhy_prefix_grouper/Cargo.toml +12 -0
tools/dmhy_prefix_grouper/src/main.rs +1070 -0
tools/test_annotated_dmhy_workflow.py +445 -0

datasets/AnimeName CHANGED Viewed

	@@ -1 +1 @@
1	- Subproject commit ~~2ea069cd2c6f4c8b085bdfaddc5659781623cf45~~


1	+ Subproject commit ab3fbcad1a4bf889090d050248130c7d763c457e

tools/annotate_dmhy_prefix_dag.py ADDED Viewed

	@@ -0,0 +1,349 @@

+"""Build annotation units from a DMHY prefix DAG.
+The DAG producer keeps repeated suffix structure shared across many raw
+prefixes. This tool turns those shared nodes into compact, traceable units for
+LLM or human review without calling any remote service.
+"""
+from __future__ import annotations
+import argparse
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Iterable
+from tools.annotate_dmhy_prefix_graph import (
+    heuristic_patch,
+    string_list,
+)
+# Default value of --dag: the prefix DAG JSON this tool reads.
+DEFAULT_DAG = Path("datasets/AnimeName/dmhy_prefix_dag.json")
+# Default value of --output: the annotation-unit JSONL this tool writes.
+DEFAULT_OUTPUT = Path("datasets/AnimeName/dmhy_prefix_dag.annotation_units.jsonl")
+@dataclass(frozen=True)
+class Args:
+    """Immutable command-line options for the DAG annotation-unit builder."""
+    # Input dmhy_prefix_dag.json.
+    dag: Path
+    # Output annotation unit JSONL.
+    output: Path
+    # Non-root nodes reaching at least this many terminals are selected.
+    min_reachable_terminals: int
+    # Nodes with at least this many incoming DAG edges are selected.
+    min_incoming_count: int
+    # Maximum number of units to write; None means no cap.
+    limit: int | None
+    # Maximum examples retained per example field of a unit.
+    example_count: int
+def parse_args() -> Args:
+    """Parse the process command line into an :class:`Args` value.
+    ``min_reachable_terminals``, ``min_incoming_count`` and ``example_count``
+    are clamped to at least 1; ``limit`` is passed through unchanged.
+    """
+    cli = argparse.ArgumentParser(description="Emit DAG-aware DMHY prefix annotation units as JSONL")
+    cli.add_argument("--dag", default=DEFAULT_DAG, type=Path, help="Input dmhy_prefix_dag.json")
+    cli.add_argument("--output", default=DEFAULT_OUTPUT, type=Path, help="Output annotation unit JSONL")
+    # The remaining options are all plain integers: (flag, default, help text).
+    integer_options = (
+        ("--min-reachable-terminals", 2, "Select non-root nodes reaching at least this many terminals"),
+        ("--min-incoming-count", 2, "Select nodes with at least this many incoming DAG edges"),
+        ("--limit", None, "Maximum units to write"),
+        ("--example-count", 5, "Maximum examples retained per example field"),
+    )
+    for flag, default, help_text in integer_options:
+        cli.add_argument(flag, type=int, default=default, help=help_text)
+    parsed = cli.parse_args()
+    def at_least_one(number: int) -> int:
+        return max(1, number)
+    return Args(
+        dag=parsed.dag,
+        output=parsed.output,
+        min_reachable_terminals=at_least_one(parsed.min_reachable_terminals),
+        min_incoming_count=at_least_one(parsed.min_incoming_count),
+        limit=parsed.limit,
+        example_count=at_least_one(parsed.example_count),
+    )
+def load_dag(path: Path) -> dict[str, Any]:
+    """Read and minimally validate the prefix DAG JSON stored at *path*.
+    Returns the decoded top-level object, which is guaranteed to contain
+    list-valued ``nodes`` and ``terminals`` entries.
+    Raises SystemExit with a one-line message when the file is missing, cannot
+    be read or decoded as UTF-8, is not valid JSON, or fails the schema checks.
+    """
+    # Read first and handle failures: an exists() pre-check would still let
+    # directories, permission errors and bad encodings escape as tracebacks.
+    try:
+        text = path.read_text(encoding="utf-8")
+    except FileNotFoundError:
+        raise SystemExit(f"DAG not found: {path}") from None
+    except (OSError, UnicodeDecodeError) as exc:
+        raise SystemExit(f"cannot read DAG {path}: {exc}") from exc
+    try:
+        dag = json.loads(text)
+    except json.JSONDecodeError as exc:
+        raise SystemExit(f"invalid DAG JSON in {path}: {exc}") from exc
+    if not isinstance(dag, dict):
+        raise SystemExit(f"invalid DAG schema in {path}: root must be an object")
+    if not isinstance(dag.get("nodes"), list):
+        raise SystemExit(f"invalid DAG schema in {path}: missing nodes list")
+    if not isinstance(dag.get("terminals"), list):
+        raise SystemExit(f"invalid DAG schema in {path}: missing terminals list")
+    return dag
+def node_id(node: dict[str, Any], fallback: int) -> int:
+    """Return the integer id of *node*, using *fallback* when it has no ``id`` key.
+    Raises SystemExit when the chosen value cannot be converted with ``int``.
+    """
+    raw = node["id"] if "id" in node else fallback
+    try:
+        parsed = int(raw)
+    except (TypeError, ValueError):
+        raise SystemExit(f"invalid node id: {raw!r}") from None
+    return parsed
+def terminal_id(terminal: dict[str, Any], fallback: int) -> str:
+    """Return a string id for *terminal*.
+    The first non-null value among the ``terminal_id`` and ``id`` keys is used.
+    When both are missing or null, *fallback* is used instead, so terminals
+    carrying ``"terminal_id": null`` do not all collapse onto the id ``"None"``.
+    """
+    for key in ("terminal_id", "id"):
+        value = terminal.get(key)
+        if value is not None:
+            return str(value)
+    return str(fallback)
+def int_field(row: dict[str, Any], key: str, default: int = 0) -> int:
+    """Return ``row[key]`` as an int, or *default* when it is unusable.
+    Missing, falsy (``None``, ``0``, ``""``) and non-numeric values all yield
+    *default*. Non-finite floats are covered too: ``json.loads`` accepts
+    ``Infinity``, and ``int(float("inf"))`` raises OverflowError.
+    """
+    try:
+        return int(row.get(key, default) or default)
+    except (TypeError, ValueError, OverflowError):
+        return default
+def build_indexes(dag: dict[str, Any]) -> tuple[dict[int, dict[str, Any]], dict[int, list[dict[str, Any]]]]:
+    """Index the DAG's nodes by id and group its terminals by owning node id.
+    Non-dict entries are ignored, as are terminals whose ``node_id`` is not
+    int-convertible. Each kept terminal is a shallow copy that also carries
+    ``_terminal_id`` and ``_terminal_index`` (its position in ``dag["terminals"]``).
+    """
+    node_index: dict[int, dict[str, Any]] = {}
+    for position, entry in enumerate(dag["nodes"]):
+        if isinstance(entry, dict):
+            node_index[node_id(entry, position)] = entry
+    grouped: dict[int, list[dict[str, Any]]] = {}
+    for position, entry in enumerate(dag["terminals"]):
+        if not isinstance(entry, dict):
+            continue
+        try:
+            owner = int(entry.get("node_id"))
+        except (TypeError, ValueError):
+            continue
+        # Copy so the bookkeeping keys never leak back into the loaded DAG.
+        annotated = {
+            **entry,
+            "_terminal_id": terminal_id(entry, position),
+            "_terminal_index": position,
+        }
+        grouped.setdefault(owner, []).append(annotated)
+    return node_index, grouped
+def reachable_terminals(
+    start_node_id: int,
+    nodes: dict[int, dict[str, Any]],
+    terminals_by_node: dict[int, list[dict[str, Any]]],
+) -> list[dict[str, Any]]:
+    """Collect the de-duplicated terminals reachable from *start_node_id*.
+    A node contributes its own terminals first, then those of each child in
+    edge order. Edges that are not dicts, or whose ``target`` is not
+    int-convertible, are skipped.
+    The walk uses an explicit stack instead of recursion, so long chains in the
+    DAG cannot exhaust Python's recursion limit.
+    Raises SystemExit when a cycle is found.
+    """
+    def child_targets(current_id: int) -> list[int]:
+        targets: list[int] = []
+        for edge in nodes.get(current_id, {}).get("children") or []:
+            if not isinstance(edge, dict):
+                continue
+            try:
+                targets.append(int(edge.get("target")))
+            except (TypeError, ValueError):
+                continue
+        return targets
+    memo: dict[int, list[dict[str, Any]]] = {}
+    on_path: set[int] = {start_node_id}
+    # Each frame is [node id, its child targets, index of the next child to examine].
+    stack: list[list[Any]] = [[start_node_id, child_targets(start_node_id), 0]]
+    while stack:
+        frame = stack[-1]
+        current_id, targets, cursor = frame
+        if cursor < len(targets):
+            frame[2] = cursor + 1
+            target = targets[cursor]
+            if target in memo:
+                continue
+            if target in on_path:
+                raise SystemExit(f"cycle detected while traversing DAG at node {target}")
+            on_path.add(target)
+            stack.append([target, child_targets(target), 0])
+            continue
+        # Every child is resolved: merge own terminals with theirs, in edge order.
+        stack.pop()
+        on_path.discard(current_id)
+        found = list(terminals_by_node.get(current_id, []))
+        for target in targets:
+            found.extend(memo[target])
+        memo[current_id] = dedupe_terminals(found)
+    return memo[start_node_id]
+def dedupe_terminals(terminals: Iterable[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Return *terminals* with repeated ids removed, keeping first occurrences.
+    The id is ``_terminal_id`` when set, otherwise ``terminal_id``. Terminals
+    with no id (missing, null or empty string) are dropped.
+    """
+    seen: set[str] = set()
+    result: list[dict[str, Any]] = []
+    for terminal in terminals:
+        tid = ""
+        for key in ("_terminal_id", "terminal_id"):
+            raw = terminal.get(key)
+            # Test for None/"" explicitly: a truthiness test would discard a valid id of 0.
+            if raw is not None and raw != "":
+                tid = str(raw)
+                break
+        if not tid or tid in seen:
+            continue
+        seen.add(tid)
+        result.append(terminal)
+    return result
+def limited_unique(values: Iterable[str], limit: int) -> list[str]:
+    """Return up to *limit* distinct, non-blank strings from *values*, in order.
+    A *limit* of zero or less yields an empty list.
+    """
+    result: list[str] = []
+    # Check up front: the in-loop test only runs after an append, so without
+    # this guard a limit of 0 would still return one element.
+    if limit <= 0:
+        return result
+    seen: set[str] = set()
+    for value in values:
+        if not value or not value.strip() or value in seen:
+            continue
+        seen.add(value)
+        result.append(value)
+        if len(result) >= limit:
+            break
+    return result
+def edge_labels(node: dict[str, Any], limit: int) -> list[str]:
+    """Return up to *limit* distinct labels of *node*'s outgoing edges.
+    Edges that are not dicts or have no ``label`` are ignored.
+    """
+    outgoing = node.get("children") or []
+    labels = [
+        str(edge["label"])
+        for edge in outgoing
+        if isinstance(edge, dict) and edge.get("label") is not None
+    ]
+    return limited_unique(labels, limit)
+def aggregate_examples(terminals: list[dict[str, Any]], key: str, limit: int) -> list[str]:
+    """Gather up to *limit* distinct example strings for *key* across *terminals*.
+    ``prefix`` is read as a single scalar per terminal; every other key is read
+    as a list of strings.
+    """
+    if key == "prefix":
+        collected = [
+            str(terminal["prefix"])
+            for terminal in terminals
+            if terminal.get("prefix") is not None
+        ]
+    else:
+        collected = []
+        for terminal in terminals:
+            collected += string_list(terminal.get(key))
+    return limited_unique(collected, limit)
+def aggregate_needs_review(terminals: list[dict[str, Any]]) -> bool:
+    """Return True when any terminal in *terminals* needs LLM review.
+    A non-null ``annotations.needs_llm_review`` stored on the terminal is used
+    as-is; otherwise the heuristic patch is computed and its flag is used.
+    """
+    for position, terminal in enumerate(terminals):
+        stored = terminal.get("annotations")
+        flag = stored.get("needs_llm_review") if isinstance(stored, dict) else None
+        if flag is None:
+            flag = heuristic_patch(terminal, position)["needs_llm_review"]
+        if flag:
+            return True
+    return False
+def annotation_template() -> dict[str, Any]:
+    """Return a fresh, empty annotation record for a reviewer to fill in."""
+    template: dict[str, Any] = dict(
+        episode_title_suffixes=[],
+        media_suffixes=[],
+        title_candidates=[],
+        notes=None,
+    )
+    return template
+def recommended_prompt(kind: str, terminal_count: int) -> str:
+    """Return the review instruction text stored on a unit of the given *kind*.
+    Only ``shared_suffix`` units mention *terminal_count*; every other kind gets
+    the generic terminal-cluster wording.
+    """
+    if kind != "shared_suffix":
+        return "Review this terminal cluster and mark episode-title text, media metadata, and title candidates."
+    opening = "Review the shared DAG suffix examples and mark episode-title text, media metadata, "
+    closing = f"and possible title candidates for {terminal_count} linked terminals."
+    return opening + closing
+def make_unit(
+    node: dict[str, Any],
+    node_id_value: int,
+    terminals: list[dict[str, Any]],
+    kind: str,
+    example_count: int,
+) -> dict[str, Any]:
+    """Build one annotation unit for *node* covering *terminals*.
+    ``reachable_weight`` comes from the node when positive; otherwise it is the
+    sum of each terminal's ``weight`` (falling back to ``count``, then 1).
+    Every terminal must already carry ``_terminal_id``.
+    """
+    ids = [str(terminal["_terminal_id"]) for terminal in terminals]
+    weight = int_field(node, "reachable_weight")
+    if weight <= 0:
+        weight = 0
+        for terminal in terminals:
+            weight += int_field(terminal, "weight", int_field(terminal, "count", 1))
+    terminal_total = len(terminals)
+    unit: dict[str, Any] = {}
+    unit["unit_id"] = f"dag-node-{node_id_value}"
+    unit["node_id"] = node_id_value
+    unit["kind"] = kind
+    unit["incoming_count"] = int_field(node, "incoming_count")
+    unit["reachable_terminals"] = terminal_total
+    unit["reachable_weight"] = weight
+    unit["terminal_ids"] = ids
+    for field, source_key in (
+        ("prefix_examples", "prefix"),
+        ("value_examples", "value_examples"),
+        ("suffix_examples", "suffix_examples"),
+    ):
+        unit[field] = aggregate_examples(terminals, source_key, example_count)
+    unit["common_edge_labels"] = edge_labels(node, example_count)
+    unit["needs_llm_review"] = aggregate_needs_review(terminals)
+    unit["recommended_prompt"] = recommended_prompt(kind, terminal_total)
+    unit["annotations"] = annotation_template()
+    return unit
+def selected_units(dag: dict[str, Any], args: Args) -> list[dict[str, Any]]:
+    """Choose and order the annotation units to emit for *dag*.
+    A node becomes a ``shared_suffix`` unit when it reaches at least one
+    terminal and either has ``incoming_count >= args.min_incoming_count`` or is
+    not the root and reaches ``args.min_reachable_terminals`` terminals.
+    Terminals covered by no such unit are grouped by owning node into
+    ``terminal_cluster`` units. Shared-suffix units sort first; within a group
+    the order is descending weight, terminal count and incoming count, then
+    ascending node id. ``args.limit`` truncates the final list.
+    Raises SystemExit when ``dag["root"]`` is not int-convertible.
+    """
+    nodes, terminals_by_node = build_indexes(dag)
+    raw_root = dag.get("root", 0) or 0
+    # Report a malformed root the same way malformed node ids are reported,
+    # rather than letting int() raise a bare ValueError/TypeError.
+    try:
+        root = int(raw_root)
+    except (TypeError, ValueError):
+        raise SystemExit(f"invalid DAG root: {raw_root!r}") from None
+    candidates: list[tuple[tuple[int, int, int, int, int], dict[str, Any]]] = []
+    def add_candidate(priority: int, unit: dict[str, Any], current_id: int) -> None:
+        sort_key = (
+            priority,
+            -unit["reachable_weight"],
+            -unit["reachable_terminals"],
+            -unit["incoming_count"],
+            current_id,
+        )
+        candidates.append((sort_key, unit))
+    for current_id, node in nodes.items():
+        terminals = reachable_terminals(current_id, nodes, terminals_by_node)
+        if not terminals:
+            continue
+        by_shared_incoming = int_field(node, "incoming_count") >= args.min_incoming_count
+        by_reachable = current_id != root and len(terminals) >= args.min_reachable_terminals
+        if by_shared_incoming or by_reachable:
+            add_candidate(0, make_unit(node, current_id, terminals, "shared_suffix", args.example_count), current_id)
+    # Named "covered_id" so the module-level terminal_id() helper is not shadowed.
+    covered_terminal_ids = {
+        covered_id
+        for _sort_key, unit in candidates
+        for covered_id in unit["terminal_ids"]
+    }
+    for current_id, terminals in terminals_by_node.items():
+        uncovered = [terminal for terminal in terminals if terminal["_terminal_id"] not in covered_terminal_ids]
+        if not uncovered:
+            continue
+        node = nodes.get(current_id, {"id": current_id})
+        add_candidate(1, make_unit(node, current_id, uncovered, "terminal_cluster", args.example_count), current_id)
+    candidates.sort(key=lambda item: item[0])
+    units = [unit for _sort_key, unit in candidates]
+    if args.limit is not None:
+        units = units[: max(0, args.limit)]
+    return units
+def write_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> int:
+    """Write *rows* to *path* as compact UTF-8 JSON Lines and return the row count.
+    Missing parent directories are created; an existing file is overwritten.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    written = 0
+    with path.open("w", encoding="utf-8", newline="\n") as sink:
+        for written, row in enumerate(rows, start=1):
+            encoded = json.dumps(row, ensure_ascii=False, separators=(",", ":"))
+            sink.write(encoded + "\n")
+    return written
+def main() -> None:
+    """Run the tool: load the DAG, write the unit JSONL, print a JSON summary."""
+    args = parse_args()
+    written = write_jsonl(args.output, selected_units(load_dag(args.dag), args))
+    summary = dict(
+        dag=str(args.dag),
+        output=str(args.output),
+        annotation_units=written,
+        min_reachable_terminals=args.min_reachable_terminals,
+        min_incoming_count=args.min_incoming_count,
+        example_count=args.example_count,
+    )
+    print(json.dumps(summary, ensure_ascii=False, indent=2))
+if __name__ == "__main__":
+    main()

tools/annotate_dmhy_prefix_graph.py ADDED Viewed

	@@ -0,0 +1,715 @@

+"""Annotate DMHY prefix graph terminals and emit weak-label dataset rows.
+The graph producer intentionally leaves terminal.annotations empty. This tool
+adds a deterministic suffix-format layer without depending on network access:
+- classify suffix examples into episode-title text vs media/hash metadata
+- optionally ask an OpenAI-compatible Responses API for a second opinion
+- write dmhy_weak-compatible JSONL records: filename, tokens, labels
+- optionally write graph annotation patch JSONL and/or a merged graph JSON
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import re
+import sys
+import urllib.error
+import urllib.request
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Iterable
+from anifilebert.tokenizer import AnimeTokenizer
+from tools.dmhy_dataset import weak_label_filename
+# Defaults for the --graph, --source-list, --output and --patch-output options.
+DEFAULT_GRAPH = Path("datasets/AnimeName/dmhy_prefix_graph.json")
+DEFAULT_SOURCE_LIST = Path("datasets/AnimeName/dmhy_list.jsonl")
+DEFAULT_OUTPUT = Path("datasets/AnimeName/dmhy_weak.generated.jsonl")
+DEFAULT_PATCH_OUTPUT = Path("datasets/AnimeName/dmhy_prefix_graph.annotations.jsonl")
+# Default for --model (only used together with --llm).
+DEFAULT_MODEL = "gpt-5.4-mini"
+# Values written to a patch's "source" field: heuristic-only vs. LLM-merged.
+SOURCE = "heuristic-v1"
+LLM_SOURCE = "responses-v1"
+# A string made up solely of 8 or more hexadecimal digits.
+TRAILING_HASH_RE = re.compile(r"^[A-Fa-f0-9]{8,}$")
+# Resolution-like tokens: 3-4 digits + "p", one digit + "k", or WIDTHxHEIGHT.
+RESOLUTION_RE = re.compile(r"(?i)(?:\b\d{3,4}p\b|\b\dk\b|\b\d{3,4}[xX×]\d{3,4}\b)")
+# Case-insensitive whole-word media vocabulary (sources, codecs, bit depths,
+# language/subtitle markers, streaming-service abbreviations).
+MEDIA_WORD_RE = re.compile(
+    r"(?i)\b(?:"
+    r"web[-_. ]?dl|web[-_. ]?rip|bdrip|blu[-_. ]?ray|bdmv|bd|dvd[-_. ]?rip|dvd|"
+    r"hdtv|tvrip|remux|x26[45]|h\.?26[45]|hevc|avc|av1|aac\d*(?:\.\d+)?|"
+    r"flac|mp3|dts|opus|10[-_. ]?bit|8[-_. ]?bit|hi10p|ma10p|yuv\d+p?\d*|"
+    r"chs|cht|gb|big5|jpn?|eng|m?subs?|assx?\d*|srtx?\d*|vfr|cfr|"
+    r"nf|netflix|amzn|baha|cr|abema|dsnp|hulu"
+    r")\b"
+)
+# Chinese-language subtitle/language markers, matched anywhere in the text.
+LANG_CJK_RE = re.compile(r"(?:字幕|简体|繁体|简中|繁中|日语|英语|双语|内封|外挂)")
+# Text between an opening and a closing quote character (CJK or ASCII); group 1 is the content.
+QUOTED_RE = re.compile(r"[「『\"“](.+?)[」』\"”]")
+# One bracketed segment, brackets included: [...], (...), 【...】 or 《...》.
+BRACKET_SEGMENT_RE = re.compile(r"(\[[^\]]+\]|\([^)]+\)|【[^】]+】|《[^》]+》)")
+# Within one path component: an episode marker (S01E02, 1x02, EP 3, ACT.4,
+# 第N話/话/集/回), then separators, then the "title" group, which runs up to
+# the next path separator or opening bracket.
+PATH_EPISODE_TITLE_RE = re.compile(
+    r"(?i)(?:^|[/\\])[^/\\]*?(?:"
+    r"S\d{1,2}E\d{1,4}|\d{1,2}x\d{1,4}|EP?\.?\s*\d{1,4}|ACT\.?\d{1,4}|第\s*\d{1,4}\s*[話话集回]"
+    r")\s*[-_ ]+(?P<title>[^/\\\[\(【《]+)"
+)
+@dataclass
+class Args:
+    """Parsed command-line options for the prefix-graph annotation tool."""
+    # Input dmhy_prefix_graph.json.
+    graph: Path
+    # Input dmhy_list.jsonl; each line is a JSON object with a "value" field.
+    source_list: Path
+    # Output dataset JSONL (dmhy_weak.jsonl-compatible records).
+    output: Path
+    # Optional JSONL of terminal annotation patches; None disables it.
+    patch_output: Path | None
+    # Optional full graph JSON with terminal.annotations merged in.
+    merge_output: Path | None
+    # Maximum number of selected terminals to process; None means no cap.
+    limit: int | None
+    # Only terminals with weight >= this value are processed; None disables the filter.
+    min_weight: int | None
+    # Restrict processing to terminals flagged as needing LLM review.
+    only_needs_review: bool
+    # Opt in to Responses API annotation.
+    llm: bool
+    # OpenAI-compatible API base URL; used only with --llm.
+    base_url: str
+    # API key; falls back to the ANIFILEBERT_LLM_API_KEY environment variable.
+    api_key: str | None
+    # Responses API model name.
+    model: str
+    # Maximum number of LLM requests; None means no cap.
+    max_requests: int | None
+    # HTTP timeout in seconds per LLM request.
+    http_timeout: int
+    # Keep I-* labels instead of normalizing generated labels to B/O only.
+    preserve_i_labels: bool
+    # Use terminal.value_examples only instead of the full source list.
+    examples_only: bool
+def parse_args() -> Args:
+    """Parse the process command line into an :class:`Args` value.
+    ``--patch-output`` is taken as a string so that an empty (or all-blank)
+    value can disable patch output; any other value becomes a ``Path``.
+    The LLM base URL and API key default to the ``ANIFILEBERT_LLM_BASE_URL``
+    and ``ANIFILEBERT_LLM_API_KEY`` environment variables.
+    """
+    parser = argparse.ArgumentParser(
+        description="Annotate DMHY prefix graph terminals and write dmhy_weak-compatible rows"
+    )
+    parser.add_argument("--graph", type=Path, default=DEFAULT_GRAPH, help="Input dmhy_prefix_graph.json")
+    parser.add_argument(
+        "--source-list",
+        type=Path,
+        default=DEFAULT_SOURCE_LIST,
+        help="Input dmhy_list.jsonl with full raw values; each line must contain a JSON object with value",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=DEFAULT_OUTPUT,
+        help="Output dataset JSONL records compatible with dmhy_weak.jsonl",
+    )
+    parser.add_argument(
+        "--patch-output",
+        default=str(DEFAULT_PATCH_OUTPUT),
+        help="Optional JSONL terminal annotation patches; use empty string to disable",
+    )
+    parser.add_argument("--merge-output", type=Path, default=None, help="Optional full graph JSON with terminal.annotations merged")
+    parser.add_argument("--limit", type=int, default=None, help="Maximum selected terminals to process")
+    parser.add_argument("--min-weight", type=int, default=None, help="Only process terminals with weight >= this value")
+    parser.add_argument("--only-needs-review", action="store_true", help="Only process terminals with ambiguous suffix examples")
+    parser.add_argument("--llm", action="store_true", help="Opt in to Responses API annotation")
+    # NOTE(review): the fallback base URL is a hard-coded private-network
+    # address; confirm it is meant to ship as the default.
+    parser.add_argument(
+        "--base-url",
+        default=os.environ.get("ANIFILEBERT_LLM_BASE_URL", "http://10.137.32.209:8317/v1"),
+        help="OpenAI-compatible API base URL; used only with --llm",
+    )
+    parser.add_argument(
+        "--api-key",
+        default=os.environ.get("ANIFILEBERT_LLM_API_KEY"),
+        help="API key; falls back to ANIFILEBERT_LLM_API_KEY",
+    )
+    parser.add_argument("--model", default=DEFAULT_MODEL, help="Responses API model")
+    parser.add_argument("--max-requests", type=int, default=None, help="Maximum LLM requests; omitted means no cap")
+    parser.add_argument("--http-timeout", type=int, default=120, help="HTTP timeout in seconds per LLM request")
+    parser.add_argument(
+        "--preserve-i-labels",
+        action="store_true",
+        help="Keep I-* labels from weak labeling instead of normalizing generated token labels to B/O only",
+    )
+    parser.add_argument(
+        "--examples-only",
+        action="store_true",
+        help="Use terminal.value_examples only; preserves the old small-sample behavior",
+    )
+    ns = parser.parse_args()
+    # A non-empty stripped string always yields a Path with a non-empty str(),
+    # so this single check is sufficient to detect "disabled".
+    patch_output_arg = str(ns.patch_output).strip()
+    patch_output = Path(patch_output_arg) if patch_output_arg else None
+    return Args(
+        graph=ns.graph,
+        source_list=ns.source_list,
+        output=ns.output,
+        patch_output=patch_output,
+        merge_output=ns.merge_output,
+        limit=ns.limit,
+        min_weight=ns.min_weight,
+        only_needs_review=ns.only_needs_review,
+        llm=ns.llm,
+        base_url=ns.base_url,
+        api_key=ns.api_key,
+        model=ns.model,
+        max_requests=ns.max_requests,
+        http_timeout=ns.http_timeout,
+        preserve_i_labels=ns.preserve_i_labels,
+        examples_only=ns.examples_only,
+    )
+def load_graph(path: Path) -> dict[str, Any]:
+    """Read and minimally validate the prefix graph JSON stored at *path*.
+    Returns the decoded top-level object, which is guaranteed to contain a
+    non-empty ``terminals`` list.
+    Raises SystemExit with a one-line message when the file is missing, cannot
+    be read or decoded as UTF-8, is not valid JSON, or fails the schema checks.
+    """
+    # Read first and handle failures: an exists() pre-check would still let
+    # directories, permission errors and bad encodings escape as tracebacks.
+    try:
+        text = path.read_text(encoding="utf-8")
+    except FileNotFoundError:
+        raise SystemExit(f"graph not found: {path}") from None
+    except (OSError, UnicodeDecodeError) as exc:
+        raise SystemExit(f"cannot read graph {path}: {exc}") from exc
+    try:
+        graph = json.loads(text)
+    except json.JSONDecodeError as exc:
+        raise SystemExit(f"invalid graph JSON in {path}: {exc}") from exc
+    if not isinstance(graph, dict):
+        raise SystemExit(f"invalid graph schema in {path}: root must be an object")
+    terminals = graph.get("terminals")
+    if not isinstance(terminals, list):
+        raise SystemExit(f"invalid graph schema in {path}: missing terminals list")
+    if not terminals:
+        raise SystemExit(f"graph has no terminals: {path}")
+    return graph
+def terminal_id(terminal: dict[str, Any], index: int) -> str:
+    """Return a string id for *terminal*.
+    The first non-null value among ``terminal_id``, ``id`` and ``node_id`` is
+    used; when none is set, *index* is used.
+    """
+    stored = (terminal.get(key) for key in ("terminal_id", "id", "node_id"))
+    chosen = next((value for value in stored if value is not None), index)
+    return str(chosen)
+def string_list(value: Any) -> list[str]:
+    """Return the non-blank items of *value* as strings, or ``[]`` if it is not a list.
+    ``None`` items (JSON ``null``) are skipped rather than turned into the
+    literal text ``"None"``.
+    """
+    if not isinstance(value, list):
+        return []
+    texts = (str(item) for item in value if item is not None)
+    return [text for text in texts if text.strip()]
+def unique_keep_order(values: Iterable[str]) -> list[str]:
+    """Return the whitespace-normalized, non-empty values without repeats, in first-seen order."""
+    # A dict keeps insertion order, so its keys serve as the ordered set.
+    ordered: dict[str, None] = {}
+    for raw in values:
+        text = normalize_space(raw)
+        if text:
+            ordered.setdefault(text, None)
+    return list(ordered)
+def normalize_space(value: str) -> str:
+    """Collapse every whitespace run in *value* to one space and trim both ends."""
+    collapsed = re.sub(r"\s+", " ", value)
+    return collapsed.strip()
+def clean_candidate(value: str) -> str:
+    """Normalize a candidate fragment for classification.
+    Whitespace is collapsed, surrounding separator characters and then
+    surrounding bracket/quote characters are stripped, and underscores become
+    spaces before a final whitespace normalization.
+    """
+    text = normalize_space(value)
+    for edge_chars in ("-_ .~/\\|", "[]()【】《》「」『』\"“”"):
+        text = text.strip(edge_chars)
+    spaced = text.replace("_", " ")
+    return normalize_space(spaced)
+def is_media_fragment(value: str) -> bool:
+    """Return True when *value* looks like media/hash metadata rather than title text.
+    After cleaning, a fragment counts as media when it is a bare hex hash, when
+    it contains a resolution token, a media keyword or a CJK language marker,
+    or when it is a short hex hash optionally followed by ``rev``.
+    """
+    text = clean_candidate(value)
+    if not text:
+        return False
+    if TRAILING_HASH_RE.fullmatch(text):
+        return True
+    keyword_patterns = (RESOLUTION_RE, MEDIA_WORD_RE, LANG_CJK_RE)
+    if any(pattern.search(text) for pattern in keyword_patterns):
+        return True
+    short_enough = len(text) <= 16
+    return short_enough and re.fullmatch(r"[A-Fa-f0-9]{8,}(?:\s*rev)?", text) is not None
+def split_suffix_fragments(suffix: str) -> tuple[list[str], list[str]]:
+    """Split one suffix example into episode-title fragments and media fragments.
+    Returns ``(episode_titles, media)``, each de-duplicated in first-seen order.
+    Four passes run in order: quoted text, bracketed segments, episode-marker
+    titles, and finally the path pieces of whatever the bracket pass left over.
+    """
+    episode_titles: list[str] = []
+    media: list[str] = []
+    # Pass 1: quoted text that does not look like media is an episode title.
+    for match in QUOTED_RE.finditer(suffix):
+        candidate = clean_candidate(match.group(1))
+        if candidate and not is_media_fragment(candidate):
+            episode_titles.append(candidate)
+    # Pass 2: bracketed segments that look like media are recorded (brackets
+    # kept) and blanked out of the remainder so pass 4 does not see them again.
+    remainder = suffix
+    for segment in BRACKET_SEGMENT_RE.findall(suffix):
+        cleaned = clean_candidate(segment)
+        if is_media_fragment(cleaned):
+            media.append(segment.strip())
+            remainder = remainder.replace(segment, " ", 1)
+    # Pass 3: text following an episode marker within a path component.
+    for match in PATH_EPISODE_TITLE_RE.finditer(suffix):
+        candidate = clean_candidate(match.group("title"))
+        if candidate and not is_media_fragment(candidate):
+            episode_titles.append(candidate)
+    # Pass 4: classify each remaining path piece as media or plain title text.
+    for piece in re.split(r"[/\\]", remainder):
+        cleaned = clean_candidate(piece)
+        if not cleaned:
+            continue
+        if is_media_fragment(cleaned):
+            media.append(cleaned)
+        elif QUOTED_RE.search(piece):
+            # Quoted pieces were already handled by pass 1.
+            continue
+        elif looks_like_plain_episode_title(cleaned):
+            episode_titles.append(cleaned)
+    return unique_keep_order(episode_titles), unique_keep_order(media)
+def looks_like_plain_episode_title(value: str) -> bool:
+    """Return True when *value* plausibly is free-form episode-title text.
+    Rejected: fewer than 3 characters, media fragments, bare
+    "part/ova/special/season/stage/act N" markers, and strings made only of
+    digits and separators. Accepted text must contain at least one Latin
+    letter, kana or CJK ideograph.
+    """
+    too_short = len(value) < 3
+    if too_short or is_media_fragment(value):
+        return False
+    rejected_shapes = (
+        r"(?i)(?:part|ova|special|season|stage|act)\s*\d+",
+        r"[\d\s._-]+",
+    )
+    if any(re.fullmatch(shape, value) for shape in rejected_shapes):
+        return False
+    return re.search(r"[A-Za-z\u3040-\u30ff\u3400-\u9fff]", value) is not None
+def heuristic_patch(terminal: dict[str, Any], index: int) -> dict[str, Any]:
+    """Build the deterministic annotation patch for one graph terminal.
+    Suffix examples are split into episode-title and media fragments. When no
+    title is found there, episode-marker titles are pulled from the value
+    examples instead. *index* is only used as the id fallback.
+    """
+    suffixes = string_list(terminal.get("suffix_examples"))
+    values = string_list(terminal.get("value_examples"))
+    raw_titles: list[str] = []
+    raw_media: list[str] = []
+    for suffix in suffixes:
+        found_titles, found_media = split_suffix_fragments(suffix)
+        raw_titles += found_titles
+        raw_media += found_media
+    if not raw_titles:
+        # Fallback: look for "<episode marker> - <title>" in the full values.
+        hits = (
+            clean_candidate(match.group("title"))
+            for value in values
+            for match in PATH_EPISODE_TITLE_RE.finditer(value)
+        )
+        raw_titles = [hit for hit in hits if hit and not is_media_fragment(hit)]
+    titles = unique_keep_order(raw_titles)
+    media = unique_keep_order(raw_media)
+    candidates = unique_keep_order(clean_candidate(title) for title in titles)
+    review = needs_llm_review(terminal, titles, media)
+    return {
+        "terminal_id": terminal_id(terminal, index),
+        "needs_llm_review": review,
+        "episode_title_suffixes": titles,
+        "media_suffixes": media,
+        "title_candidates": candidates,
+        "llm_label": None,
+        "notes": summarize_notes(suffixes, titles, media, review),
+        "source": SOURCE,
+    }
+def needs_llm_review(
+    terminal: dict[str, Any],
+    episode_titles: list[str],
+    media_suffixes: list[str],
+) -> bool:
+    """Decide whether the heuristic result for *terminal* is ambiguous.
+    Terminals without suffix examples never need review. Otherwise review is
+    requested when both titles and media were found, when nothing was
+    classified, or when any suffix example contains a path separator.
+    """
+    examples = string_list(terminal.get("suffix_examples"))
+    if not examples:
+        return False
+    has_titles = bool(episode_titles)
+    has_media = bool(media_suffixes)
+    # Both kinds present, or neither: the suffix layer is ambiguous either way.
+    if has_titles == has_media:
+        return True
+    joined = " ".join(examples)
+    return "/" in joined or "\\" in joined
+def summarize_notes(
+    suffix_examples: list[str],
+    episode_titles: list[str],
+    media_suffixes: list[str],
+    needs_review: bool,
+) -> str:
+    """Return a ``"; "``-joined count summary, tagged when review is needed."""
+    counts = (
+        ("suffix_examples", suffix_examples),
+        ("episode_title_suffixes", episode_titles),
+        ("media_suffixes", media_suffixes),
+    )
+    parts = [f"{label}={len(items)}" for label, items in counts]
+    if needs_review:
+        parts.append("ambiguous_suffix_layer")
+    return "; ".join(parts)
+def selected_terminals(graph: dict[str, Any], args: Args) -> list[tuple[int, dict[str, Any], dict[str, Any]]]:
+    """Return ``(index, terminal, heuristic_patch)`` for each terminal to process.
+    Terminals are filtered by ``args.min_weight`` and ``args.only_needs_review``
+    and capped at ``args.limit``. A limit of zero or less selects nothing. A
+    weight that cannot be parsed as an int is treated as 0.
+    """
+    selected: list[tuple[int, dict[str, Any], dict[str, Any]]] = []
+    # Check the cap before the loop: the in-loop test only runs after an
+    # append, so a limit of 0 would otherwise still select one terminal.
+    if args.limit is not None and args.limit <= 0:
+        return selected
+    for index, terminal in enumerate(graph["terminals"]):
+        if not isinstance(terminal, dict):
+            continue
+        try:
+            weight = int(terminal.get("weight") or terminal.get("count") or 0)
+        except (TypeError, ValueError, OverflowError):
+            # A malformed weight should not abort the whole run.
+            weight = 0
+        if args.min_weight is not None and weight < args.min_weight:
+            continue
+        patch = heuristic_patch(terminal, index)
+        if args.only_needs_review and not patch["needs_llm_review"]:
+            continue
+        selected.append((index, terminal, patch))
+        if args.limit is not None and len(selected) >= args.limit:
+            break
+    return selected
+def responses_url(base_url: str) -> str:
+    """Return the ``/responses`` endpoint URL under *base_url*, ignoring trailing slashes."""
+    trimmed = base_url.rstrip("/")
+    return f"{trimmed}/responses"
+def extract_response_text(data: dict[str, Any]) -> str:
+    """Return the text carried by a Responses API payload.
+    A non-blank ``output_text`` is returned unchanged. Otherwise every string
+    ``text`` found under ``output[*].content[*]`` is joined with newlines and
+    stripped; malformed entries are skipped.
+    """
+    direct = data.get("output_text")
+    if isinstance(direct, str) and direct.strip():
+        return direct
+    items = (item for item in data.get("output") or [] if isinstance(item, dict))
+    texts = [
+        part["text"]
+        for item in items
+        for part in item.get("content") or []
+        if isinstance(part, dict) and isinstance(part.get("text"), str)
+    ]
+    return "\n".join(texts).strip()
+def call_llm(terminal: dict[str, Any], patch: dict[str, Any], args: Args) -> dict[str, Any] | None:
+    """Ask the Responses API to annotate *terminal* and merge its answer into *patch*.
+    Returns a copy of *patch* whose list fields, ``llm_label`` and ``notes`` are
+    overridden by whatever the model supplied, with ``source`` set to
+    ``LLM_SOURCE``. *patch* itself is not modified.
+    Raises RuntimeError when no API key is configured, when the HTTP request
+    fails or times out, or when the reply is not a JSON object.
+    """
+    if not args.api_key:
+        raise RuntimeError("--llm requires --api-key or ANIFILEBERT_LLM_API_KEY")
+    instructions = (
+        "You annotate anime filename suffix examples. Return strict JSON only with keys "
+        "episode_title_suffixes, media_suffixes, title_candidates, llm_label, notes. "
+        "Classify quoted human episode titles separately from media tags such as resolution, "
+        "codec, source, language, subtitle markers, hashes, and release metadata."
+    )
+    payload = {
+        "model": args.model,
+        "instructions": instructions,
+        "input": json.dumps(
+            {
+                "terminal_id": patch["terminal_id"],
+                "prefix": terminal.get("prefix"),
+                "digit_skeleton": terminal.get("digit_skeleton"),
+                "suffix_examples": string_list(terminal.get("suffix_examples")),
+                "value_examples": string_list(terminal.get("value_examples")),
+                "heuristic_patch": patch,
+            },
+            ensure_ascii=False,
+        ),
+    }
+    request = urllib.request.Request(
+        responses_url(args.base_url),
+        data=json.dumps(payload, ensure_ascii=False).encode("utf-8"),
+        headers={
+            "Authorization": f"Bearer {args.api_key}",
+            "Content-Type": "application/json",
+        },
+        method="POST",
+    )
+    try:
+        with urllib.request.urlopen(request, timeout=args.http_timeout) as response:
+            # Undecodable bytes are replaced so a bad body surfaces below as a
+            # "non-JSON annotation" RuntimeError instead of UnicodeDecodeError.
+            raw = response.read().decode("utf-8", errors="replace")
+    except urllib.error.HTTPError as exc:
+        body = exc.read().decode("utf-8", errors="replace")
+        raise RuntimeError(f"Responses API HTTP {exc.code}: {body[:500]}") from exc
+    except OSError as exc:
+        # URLError is an OSError subclass. A timeout while reading the body is
+        # raised as a bare TimeoutError/OSError, so catch the base class.
+        raise RuntimeError(f"Responses API request failed: {exc}") from exc
+    try:
+        data = json.loads(raw)
+        if not isinstance(data, dict):
+            # A JSON array or scalar has no .get(); report it like any other bad reply.
+            raise TypeError("response body is not a JSON object")
+        text = extract_response_text(data)
+        annotation = json.loads(strip_json_fence(text))
+    except (json.JSONDecodeError, TypeError) as exc:
+        raise RuntimeError(f"Responses API returned non-JSON annotation: {raw[:500]}") from exc
+    if not isinstance(annotation, dict):
+        raise RuntimeError("Responses API annotation must be a JSON object")
+    merged = dict(patch)
+    for key in ("episode_title_suffixes", "media_suffixes", "title_candidates"):
+        if key not in annotation:
+            continue
+        value = annotation[key]
+        if isinstance(value, str):
+            # Iterating a bare string would split it into single characters.
+            value = [value]
+        elif not isinstance(value, list):
+            value = []
+        merged[key] = unique_keep_order(str(item) for item in value if item is not None)
+    if "llm_label" in annotation:
+        merged["llm_label"] = annotation["llm_label"]
+    # A null "notes" keeps the heuristic notes instead of becoming the text "None".
+    if annotation.get("notes") is not None:
+        merged["notes"] = str(annotation["notes"])
+    merged["source"] = LLM_SOURCE
+    return merged
+def strip_json_fence(text: str) -> str:
+    """Remove a surrounding Markdown code fence (optionally tagged ``json``) from *text*."""
+    body = text.strip()
+    # The opening fence is removed first, then the closing one.
+    for fence in (r"^```(?:json)?\s*", r"\s*```$"):
+        body = re.sub(fence, "", body)
+    return body.strip()
+# Characters that may separate a terminal prefix from the rest of a filename.
+PREFIX_BOUNDARY_CHARS = set(" \t\r\n-_.~/\\|:：[]()【】《》「」『』\"'")
+def prefix_boundary_ok(value: str, prefix: str) -> bool:
+    """Return True when *prefix* starts *value* and ends on a token boundary.
+    The boundary holds when the prefix is the whole value, when the character
+    right after it is a boundary character, or when the prefix itself ends
+    with one.
+    """
+    if not (prefix and value.startswith(prefix)):
+        return False
+    prefix_length = len(prefix)
+    if len(value) == prefix_length:
+        return True
+    following = value[prefix_length]
+    return following in PREFIX_BOUNDARY_CHARS or prefix[-1] in PREFIX_BOUNDARY_CHARS
+class PrefixTrieNode:
+    """One character-trie node: child nodes plus the ordinals of terminals whose prefix ends here."""
+    __slots__ = ("children", "terminal_ordinals")
+    def __init__(self) -> None:
+        # Ordinals of selected terminals whose prefix ends at this node.
+        self.terminal_ordinals: list[int] = []
+        # Next character -> child node.
+        self.children: dict[str, PrefixTrieNode] = {}
+def build_prefix_trie(selected: list[tuple[int, dict[str, Any], dict[str, Any]]]) -> PrefixTrieNode:
+    """Build a character trie over the prefixes of the *selected* terminals.
+    Each terminal's position in *selected* is recorded on the node where its
+    prefix ends; terminals with an empty or missing prefix are left out.
+    """
+    root = PrefixTrieNode()
+    for ordinal, (_index, terminal, _patch) in enumerate(selected):
+        prefix = str(terminal.get("prefix") or "")
+        if prefix:
+            cursor = root
+            for char in prefix:
+                if char not in cursor.children:
+                    cursor.children[char] = PrefixTrieNode()
+                cursor = cursor.children[char]
+            cursor.terminal_ordinals.append(ordinal)
+    return root
+def matching_terminal_ordinal(value: str, trie: PrefixTrieNode, selected: list[tuple[int, dict[str, Any], dict[str, Any]]]) -> int | None:
+    """Return the ordinal of the longest boundary-respecting prefix of *value*.
+    The trie is walked one character at a time. Every terminal ending along the
+    way whose prefix passes ``prefix_boundary_ok`` replaces the previous match,
+    so the deepest (and, at equal depth, the last-registered) one wins.
+    Returns None when no prefix matches.
+    """
+    cursor = trie
+    winner: int | None = None
+    for char in value:
+        cursor = cursor.children.get(char)
+        if cursor is None:
+            break
+        accepted = [
+            ordinal
+            for ordinal in cursor.terminal_ordinals
+            if prefix_boundary_ok(value, str(selected[ordinal][1].get("prefix") or ""))
+        ]
+        if accepted:
+            winner = accepted[-1]
+    return winner
+def source_list_matches(
+    source_list: Path,
+    selected: list[tuple[int, dict[str, Any], dict[str, Any]]],
+) -> dict[int, list[tuple[int, str]]]:
+    """Assign each raw value in *source_list* to its best-matching selected terminal.
+    Returns a mapping from terminal ordinal to ``(line_number, value)`` pairs;
+    every ordinal is present, possibly with an empty list. Blank lines,
+    non-object rows and rows without a non-blank string ``value`` are skipped.
+    Raises SystemExit when the file is missing or a line is not valid JSON.
+    """
+    if not source_list.exists():
+        raise SystemExit(f"source list not found: {source_list}")
+    trie = build_prefix_trie(selected)
+    matches: dict[int, list[tuple[int, str]]] = {}
+    for ordinal in range(len(selected)):
+        matches[ordinal] = []
+    with source_list.open("r", encoding="utf-8") as handle:
+        for line_number, line in enumerate(handle, start=1):
+            if not line.strip():
+                continue
+            try:
+                row = json.loads(line)
+            except json.JSONDecodeError as exc:
+                raise SystemExit(f"invalid JSON in {source_list}:{line_number}: {exc}") from exc
+            value = row.get("value") if isinstance(row, dict) else None
+            if not isinstance(value, str) or not value.strip():
+                continue
+            hit = matching_terminal_ordinal(value, trie, selected)
+            if hit is not None:
+                matches[hit].append((line_number, value))
+    return matches
+def dataset_records(
+    terminal: dict[str, Any],
+    index: int,
+    patch: dict[str, Any],
+    tokenizer: AnimeTokenizer,
+    *,
+    filenames: Iterable[tuple[int, str]] | None = None,
+    preserve_i_labels: bool = False,
+) -> list[dict[str, Any]]:
+    records: list[dict[str, Any]] = []
+    seen: set[str] = set()
+    if filenames is None:
+        filenames = enumerate(string_list(terminal.get("value_examples")))
+    for source_index, filename in filenames:
+        if filename in seen:
+            continue
+        seen.add(filename)
+        sample = weak_label_filename(filename, tokenizer)
+        if sample is None:
+            continue
+        tokens, labels = normalize_generated_tokens(
+            sample["tokens"],
+            sample["labels"],
+            preserve_i_labels=preserve_i_labels,
+        )
+        records.append(
+            {
+                "file_id": f"prefix-graph:{patch['terminal_id']}:{source_index}",
+                "filename": filename,
+                "tokens": tokens,
+                "labels": labels,
+                "terminal_id": patch["terminal_id"],
+                "terminal_index": index,
+                "source": patch["source"],
+                "needs_llm_review": patch["needs_llm_review"],
+                "episode_title_suffixes": patch["episode_title_suffixes"],
+                "media_suffixes": patch["media_suffixes"],
+                "title_candidates": patch["title_candidates"],
+                "annotations": {
+                    "terminal_id": patch["terminal_id"],
+                    "terminal_index": index,
+                    "source": patch["source"],
+                    "needs_llm_review": patch["needs_llm_review"],
+                    "episode_title_suffixes": patch["episode_title_suffixes"],
+                    "media_suffixes": patch["media_suffixes"],
+                    "title_candidates": patch["title_candidates"],
+                    "llm_label": patch["llm_label"],
+                    "notes": patch["notes"],
+                },
+            }
+        )
+    return records
+def is_standalone_separator(token: str) -> bool:
+    return len(token) == 1 and (token.isspace() or not token.isalnum())
+def split_generated_token(token: str) -> list[str]:
+    pieces: list[str] = []
+    current: list[str] = []
+    for char in token:
+        if char.isspace() or not char.isalnum():
+            if current:
+                pieces.append("".join(current))
+                current.clear()
+            pieces.append(char)
+        else:
+            current.append(char)
+    if current:
+        pieces.append("".join(current))
+    return pieces
+def b_only_label(label: str) -> str:
+    if label.startswith(("B-", "I-")):
+        return "B-" + label.split("-", 1)[1]
+    return "O" if label == "O" else str(label)
+def normalize_generated_tokens(
+    tokens: list[str],
+    labels: list[str],
+    *,
+    preserve_i_labels: bool = False,
+) -> tuple[list[str], list[str]]:
+    normalized_tokens: list[str] = []
+    normalized_labels: list[str] = []
+    for token, label in zip(tokens, labels):
+        source_label = str(label)
+        entity_label = source_label if preserve_i_labels else b_only_label(source_label)
+        for piece in split_generated_token(str(token)):
+            normalized_tokens.append(piece)
+            if entity_label == "O" or is_standalone_separator(piece):
+                normalized_labels.append("O")
+            else:
+                normalized_labels.append(entity_label)
+    return normalized_tokens, normalized_labels
+def write_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> int:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    count = 0
+    with path.open("w", encoding="utf-8", newline="\n") as handle:
+        for row in rows:
+            handle.write(json.dumps(row, ensure_ascii=False, separators=(",", ":")) + "\n")
+            count += 1
+    return count
+def merge_annotations(graph: dict[str, Any], patches_by_id: dict[str, dict[str, Any]]) -> dict[str, Any]:
+    merged = json.loads(json.dumps(graph, ensure_ascii=False))
+    for index, terminal in enumerate(merged.get("terminals") or []):
+        if not isinstance(terminal, dict):
+            continue
+        patch = patches_by_id.get(terminal_id(terminal, index))
+        if patch is None:
+            continue
+        terminal["annotations"] = {
+            "episode_title_suffixes": patch["episode_title_suffixes"],
+            "media_suffixes": patch["media_suffixes"],
+            "title_candidates": patch["title_candidates"],
+            "needs_llm_review": patch["needs_llm_review"],
+            "llm_label": patch["llm_label"],
+            "notes": patch["notes"],
+            "source": patch["source"],
+            "annotated_at": datetime.now(timezone.utc).isoformat(),
+        }
+    return merged
+def main() -> None:
+    args = parse_args()
+    graph = load_graph(args.graph)
+    selected = selected_terminals(graph, args)
+    if not selected:
+        raise SystemExit("no terminals selected; adjust --limit/--min-weight/--only-needs-review")
+    tokenizer = AnimeTokenizer()
+    llm_requests = 0
+    patches: list[dict[str, Any]] = []
+    records: list[dict[str, Any]] = []
+    source_matches = None if args.examples_only else source_list_matches(args.source_list, selected)
+    for ordinal, (index, terminal, patch) in enumerate(selected):
+        if args.llm and patch["needs_llm_review"]:
+            if args.max_requests is None or llm_requests < args.max_requests:
+                try:
+                    llm_patch = call_llm(terminal, patch, args)
+                    if llm_patch is not None:
+                        patch = llm_patch
+                    llm_requests += 1
+                except RuntimeError as exc:
+                    print(f"warning: terminal {patch['terminal_id']}: {exc}; using heuristic patch", file=sys.stderr)
+                    patch["notes"] = f"{patch['notes']}; llm_error={exc}"
+        patches.append(patch)
+        records.extend(
+            dataset_records(
+                terminal,
+                index,
+                patch,
+                tokenizer,
+                filenames=None if args.examples_only else source_matches.get(ordinal, []),
+                preserve_i_labels=args.preserve_i_labels,
+            )
+        )
+    record_count = write_jsonl(args.output, records)
+    patch_count = 0
+    if args.patch_output is not None:
+        patch_count = write_jsonl(args.patch_output, patches)
+    if args.merge_output is not None:
+        args.merge_output.parent.mkdir(parents=True, exist_ok=True)
+        merged = merge_annotations(graph, {patch["terminal_id"]: patch for patch in patches})
+        args.merge_output.write_text(json.dumps(merged, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
+    summary = {
+        "graph": str(args.graph),
+        "source_list": None if args.examples_only else str(args.source_list),
+        "output": str(args.output),
+        "patch_output": str(args.patch_output) if args.patch_output is not None else None,
+        "merge_output": str(args.merge_output) if args.merge_output is not None else None,
+        "selected_terminals": len(selected),
+        "examples_only": args.examples_only,
+        "dataset_records": record_count,
+        "patches": patch_count,
+        "llm_enabled": args.llm,
+        "llm_requests": llm_requests,
+    }
+    print(json.dumps(summary, ensure_ascii=False, indent=2))
+if __name__ == "__main__":
+    main()

tools/convert_annotated_dmhy_dataset.py ADDED Viewed

	@@ -0,0 +1,302 @@

+"""Convert annotated DMHY graph JSONL into the character-tokenized dataset.
+The annotated graph workflow is expected to produce records compatible with
+``dmhy_weak.jsonl``: each row has ``filename``, ``tokens``, and ``labels``.
+This wrapper validates that contract, then reuses ``tools.convert_to_char_dataset``
+for the token-to-character projection and manifest statistics.
+"""
+from __future__ import annotations
+import argparse
+import json
+from collections import Counter
+from datetime import datetime, timezone
+from pathlib import Path
+from statistics import mean
+from typing import Iterable
+from tools.convert_to_char_dataset import (
+    build_vocab,
+    convert_record,
+    coverage,
+    percentile,
+)
+DEFAULT_INPUT = Path("datasets/AnimeName/dmhy_weak.generated.jsonl")
+DEFAULT_OUTPUT = Path("datasets/AnimeName/dmhy_weak.generated_char.jsonl")
+DEFAULT_VOCAB_OUTPUT = Path("datasets/AnimeName/vocab.generated.char.json")
+DEFAULT_MANIFEST_OUTPUT = Path(
+    "datasets/AnimeName/dmhy_weak.generated_char.manifest.json"
+)
+REQUIRED_FIELDS = ("filename", "tokens", "labels")
+def is_separator_or_space(char: str) -> bool:
+    return char.isspace() or not char.isalnum()
+def token_has_embedded_separator(token: str) -> bool:
+    return len(token) > 1 and any(is_separator_or_space(char) for char in token)
+def is_bioish_label(label: object) -> bool:
+    if not isinstance(label, str):
+        return False
+    if label == "O":
+        return True
+    prefix, sep, entity = label.partition("-")
+    return sep == "-" and prefix in {"B", "I"} and bool(entity)
+def validate_record(
+    record: object,
+    path: Path,
+    line_no: int,
+    *,
+    check_punctuation: bool = True,
+) -> dict:
+    if not isinstance(record, dict):
+        raise ValueError(f"{path}:{line_no}: record must be a JSON object")
+    missing = [field for field in REQUIRED_FIELDS if field not in record]
+    if missing:
+        raise ValueError(
+            f"{path}:{line_no}: missing required field(s): {', '.join(missing)}"
+        )
+    filename = record["filename"]
+    tokens = record["tokens"]
+    labels = record["labels"]
+    if not isinstance(filename, str) or not filename:
+        raise ValueError(f"{path}:{line_no}: filename must be a non-empty string")
+    if not isinstance(tokens, list):
+        raise ValueError(f"{path}:{line_no}: tokens must be a list")
+    if not isinstance(labels, list):
+        raise ValueError(f"{path}:{line_no}: labels must be a list")
+    if len(tokens) != len(labels):
+        raise ValueError(
+            f"{path}:{line_no}: token/label length mismatch: "
+            f"{len(tokens)} tokens, {len(labels)} labels"
+        )
+    for index, token in enumerate(tokens):
+        if not isinstance(token, str):
+            raise ValueError(f"{path}:{line_no}: tokens[{index}] must be a string")
+        if check_punctuation and token_has_embedded_separator(token):
+            raise ValueError(
+                f"{path}:{line_no}: tokens[{index}] contains punctuation, symbol, or "
+                f"whitespace that should be a standalone token: {token!r}"
+            )
+    for index, label in enumerate(labels):
+        if not is_bioish_label(label):
+            raise ValueError(
+                f"{path}:{line_no}: labels[{index}] is not BIO-ish: {label!r}"
+            )
+    return record
+def iter_validated_jsonl(path: Path, *, check_punctuation: bool = True) -> Iterable[dict]:
+    with path.open("r", encoding="utf-8") as handle:
+        for line_no, line in enumerate(handle, 1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                record = json.loads(line)
+            except json.JSONDecodeError as exc:
+                raise ValueError(f"{path}:{line_no}: invalid JSON") from exc
+            yield validate_record(
+                record,
+                path,
+                line_no,
+                check_punctuation=check_punctuation,
+            )
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Validate annotated DMHY graph JSONL and convert it to the "
+            "character-tokenized training format."
+        ),
+        epilog=(
+            "Equivalent projection logic is provided by "
+            "tools.convert_to_char_dataset.convert_record."
+        ),
+    )
+    parser.add_argument(
+        "--input",
+        default=str(DEFAULT_INPUT),
+        help=f"Input dmhy_weak-compatible JSONL (default: {DEFAULT_INPUT})",
+    )
+    parser.add_argument(
+        "--output",
+        default=str(DEFAULT_OUTPUT),
+        help=f"Output character-level JSONL (default: {DEFAULT_OUTPUT})",
+    )
+    parser.add_argument(
+        "--vocab-output",
+        default=str(DEFAULT_VOCAB_OUTPUT),
+        help=f"Output character vocab JSON (default: {DEFAULT_VOCAB_OUTPUT})",
+    )
+    parser.add_argument(
+        "--manifest-output",
+        default=str(DEFAULT_MANIFEST_OUTPUT),
+        help=(
+            "Output conversion manifest JSON "
+            f"(default: {DEFAULT_MANIFEST_OUTPUT})"
+        ),
+    )
+    parser.add_argument(
+        "--max-vocab-size",
+        type=int,
+        default=None,
+        help="Optional vocab cap including special tokens",
+    )
+    parser.add_argument("--limit", type=int, default=None, help="Convert only N rows")
+    parser.add_argument(
+        "--progress",
+        type=int,
+        default=50_000,
+        help="Print progress every N records",
+    )
+    parser.add_argument(
+        "--validate-only",
+        action="store_true",
+        help="Validate input records without writing converted outputs",
+    )
+    parser.add_argument(
+        "--allow-embedded-punctuation",
+        action="store_true",
+        help=(
+            "Skip the generated-workflow check that punctuation and whitespace "
+            "must be standalone tokens."
+        ),
+    )
+    return parser.parse_args()
+def main() -> None:
+    args = parse_args()
+    input_path = Path(args.input)
+    output_path = Path(args.output)
+    vocab_path = Path(args.vocab_output)
+    manifest_path = Path(args.manifest_output)
+    if not input_path.exists():
+        raise FileNotFoundError(f"input JSONL does not exist: {input_path}")
+    if not args.validate_only:
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        vocab_path.parent.mkdir(parents=True, exist_ok=True)
+        manifest_path.parent.mkdir(parents=True, exist_ok=True)
+    char_counter: Counter[str] = Counter()
+    label_counter: Counter[str] = Counter()
+    row_count = 0
+    source_token_count = 0
+    char_token_count = 0
+    lengths: list[int] = []
+    examples: list[dict] = []
+    output_handle = None
+    try:
+        if not args.validate_only:
+            output_handle = output_path.open("w", encoding="utf-8", newline="\n")
+        for record in iter_validated_jsonl(
+            input_path,
+            check_punctuation=not args.allow_embedded_punctuation,
+        ):
+            converted = convert_record(record)
+            if output_handle is not None:
+                output_handle.write(
+                    json.dumps(converted, ensure_ascii=False, separators=(",", ":"))
+                    + "\n"
+                )
+            row_count += 1
+            source_token_count += converted["source_token_count"]
+            char_len = converted["char_token_count"]
+            char_token_count += char_len
+            lengths.append(char_len)
+            char_counter.update(converted["tokens"])
+            label_counter.update(converted["labels"])
+            if len(examples) < 5:
+                examples.append(converted)
+            if args.limit is not None and row_count >= args.limit:
+                break
+            if args.progress and row_count % args.progress == 0:
+                print(f"converted {row_count:,} rows; unique chars={len(char_counter):,}")
+    finally:
+        if output_handle is not None:
+            output_handle.close()
+    vocab = build_vocab(char_counter, args.max_vocab_size)
+    manifest = {
+        "created_at": datetime.now(timezone.utc).isoformat(),
+        "input": str(input_path),
+        "output": None if args.validate_only else str(output_path),
+        "vocab_output": None if args.validate_only else str(vocab_path),
+        "manifest_output": None if args.validate_only else str(manifest_path),
+        "tokenizer_variant": "char",
+        "source_workflow": "annotated_dmhy_graph",
+        "validation": {
+            "required_fields": list(REQUIRED_FIELDS),
+            "label_contract": "O or B-*/I-* with a non-empty entity name; B/O-only is accepted",
+            "punctuation_standalone": not args.allow_embedded_punctuation,
+        },
+        "projection": {
+            "B-X": "first char keeps B-X; remaining chars become I-X",
+            "I-X": "all chars keep I-X",
+            "O": "all chars keep O",
+        },
+        "row_count": row_count,
+        "source_token_count": source_token_count,
+        "char_token_count": char_token_count,
+        "unique_char_count": len(char_counter),
+        "vocab_size": len(vocab),
+        "max_vocab_size": args.max_vocab_size,
+        "vocab_coverage": coverage(char_counter, vocab),
+        "label_counts": dict(label_counter),
+        "char_length": {
+            "min": min(lengths) if lengths else 0,
+            "mean": mean(lengths) if lengths else 0,
+            "p50": percentile(lengths, 50),
+            "p90": percentile(lengths, 90),
+            "p95": percentile(lengths, 95),
+            "p99": percentile(lengths, 99),
+            "max": max(lengths) if lengths else 0,
+        },
+        "examples": examples,
+    }
+    if not args.validate_only:
+        vocab_path.write_text(
+            json.dumps(vocab, ensure_ascii=False, indent=2) + "\n",
+            encoding="utf-8",
+        )
+        manifest_path.write_text(
+            json.dumps(manifest, ensure_ascii=False, indent=2) + "\n",
+            encoding="utf-8",
+        )
+    print(
+        json.dumps(
+            {key: value for key, value in manifest.items() if key != "examples"},
+            ensure_ascii=False,
+            indent=2,
+        )
+    )
+if __name__ == "__main__":
+    main()

tools/dmhy_prefix_grouper/Cargo.lock ADDED Viewed

	@@ -0,0 +1,347 @@

+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+[[package]]
+name = "aho-corasick"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
+dependencies = [
+ "memchr",
+]
+[[package]]
+name = "anstream"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+[[package]]
+name = "anstyle"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
+[[package]]
+name = "anstyle-parse"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
+dependencies = [
+ "utf8parse",
+]
+[[package]]
+name = "anstyle-query"
+version = "1.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
+dependencies = [
+ "windows-sys",
+]
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
+dependencies = [
+ "anstyle",
+ "once_cell_polyfill",
+ "windows-sys",
+]
+[[package]]
+name = "anyhow"
+version = "1.0.102"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
+[[package]]
+name = "clap"
+version = "4.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
+dependencies = [
+ "clap_builder",
+ "clap_derive",
+]
+[[package]]
+name = "clap_builder"
+version = "4.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+]
+[[package]]
+name = "clap_derive"
+version = "4.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+[[package]]
+name = "clap_lex"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
+[[package]]
+name = "colorchoice"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+[[package]]
+name = "dmhy_prefix_grouper"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap",
+ "rayon",
+ "regex",
+ "serde",
+ "serde_json",
+]
+[[package]]
+name = "either"
+version = "1.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
+[[package]]
+name = "itoa"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
+[[package]]
+name = "memchr"
+version = "2.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
+[[package]]
+name = "once_cell_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+[[package]]
+name = "quote"
+version = "1.0.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
+dependencies = [
+ "proc-macro2",
+]
+[[package]]
+name = "rayon"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+[[package]]
+name = "rayon-core"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+[[package]]
+name = "regex"
+version = "1.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+[[package]]
+name = "regex-automata"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+[[package]]
+name = "regex-syntax"
+version = "0.8.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+[[package]]
+name = "serde_json"
+version = "1.0.150"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
+dependencies = [
+ "itoa",
+ "memchr",
+ "serde",
+ "serde_core",
+ "zmij",
+]
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+[[package]]
+name = "syn"
+version = "2.0.117"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+[[package]]
+name = "unicode-ident"
+version = "1.0.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+[[package]]
+name = "windows-link"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
+[[package]]
+name = "windows-sys"
+version = "0.61.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
+dependencies = [
+ "windows-link",
+]
+[[package]]
+name = "zmij"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"

tools/dmhy_prefix_grouper/Cargo.toml ADDED Viewed

	@@ -0,0 +1,12 @@

+[package]
+name = "dmhy_prefix_grouper"
+version = "0.1.0"
+edition = "2021"
+[dependencies]
+anyhow = "1.0"
+clap = { version = "4.5", features = ["derive"] }
+rayon = "1.10"
+regex = "1.11"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"

tools/dmhy_prefix_grouper/src/main.rs ADDED Viewed

	@@ -0,0 +1,1070 @@

+use std::collections::{BTreeMap, BTreeSet};
+use std::fs::{self, File};
+use std::io::{BufRead, BufReader, BufWriter};
+use std::path::{Path, PathBuf};
+use std::sync::LazyLock;
+use anyhow::{Context, Result};
+use clap::Parser;
+use rayon::prelude::*;
+use regex::Regex;
+use serde::{Deserialize, Serialize};
+// Version tags for the two output formats. Presumably written into the
+// serialized metadata (TODO confirm; the use sites are outside this excerpt).
+const RADIX_VERSION: &str = "prefix-radix-v1";
+const DAG_VERSION: &str = "prefix-dag-v1";
+/// Episode-marker regexes, compiled on first access.
+/// `unwrap` panics on first use if any pattern fails to compile.
+static EPISODE_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
+    vec![
+        // Season/episode form such as `S01E12` (case-insensitive).
+        Regex::new(r"(?i)\bS\d{1,2}E\d{1,4}\b").unwrap(),
+        // `E`/`EP` with an optional dot and spaces, then 1-4 digits.
+        Regex::new(r"(?i)\bEP?\.?\s*\d{1,4}\b").unwrap(),
+        // `第` + 1-4 digits + one of the counter characters 話/话/集/回.
+        Regex::new(r"第\s*\d{1,4}\s*[話话集回]").unwrap(),
+        // 1-4 digits followed by a counter character, without the leading `第`.
+        Regex::new(r"\b\d{1,4}\s*[話话集回]").unwrap(),
+        // 1-4 digits enclosed by `[`/`(` and `]`/`)`.
+        Regex::new(r"[\[(]\s*\d{1,4}\s*[\])]").unwrap(),
+    ]
+});
+/// Matches one run of one or more digits; compiled on first access.
+static DIGITS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\d+").unwrap());
+// Command-line options. Plain `//` comments are used on purpose: with clap's
+// derive API, `///` doc comments would be picked up as help text.
+#[derive(Parser)]
+#[command(about = "Build a deterministic DMHY episode-prefix trie graph")]
+struct Args {
+    // Default paths use `/` so they resolve on every platform. A `\` is an
+    // ordinary filename character on Unix, while Windows accepts `/` as a
+    // separator.
+    #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
+    input: PathBuf,
+    #[arg(long, default_value = "datasets/AnimeName/dmhy_prefix_graph.json")]
+    output: PathBuf,
+    // Defaults to 2.
+    #[arg(long, default_value_t = 2)]
+    min_count: usize,
+    // Defaults to 5.
+    #[arg(long, default_value_t = 5)]
+    example_count: usize,
+    // Optional; absent unless `--from-prefix-graph` is passed.
+    #[arg(long)]
+    from_prefix_graph: Option<PathBuf>,
+    // Optional; absent unless `--dag-output` is passed.
+    #[arg(long)]
+    dag_output: Option<PathBuf>,
+}
+/// One deserialized input row. Only `value` is required; every other field
+/// falls back to a default when the key is absent.
+#[derive(Clone, Deserialize)]
+struct InputRecord {
+    value: String,
+    #[serde(default)]
+    uses_path: bool,
+    #[serde(default)]
+    has_trailing_hash: bool,
+    // Read from the `has_digits` key; the underscore-prefixed field name
+    // suggests the value is not used afterwards (TODO confirm).
+    #[serde(default, rename = "has_digits")]
+    _has_digits: bool,
+    // Read from the `digit_skeleton` key; likewise presumably unused.
+    #[serde(default)]
+    #[serde(rename = "digit_skeleton")]
+    _digit_skeleton: String,
+    // Falls back to `default_count()` (defined elsewhere) when absent.
+    #[serde(default = "default_count")]
+    count: usize,
+}
+/// One observation: a `value` with its `prefix`, `digit_skeleton` and
+/// `suffix` parts, plus the flags and `count` that also appear on
+/// `InputRecord`. How the parts are derived is not shown in this excerpt.
+#[derive(Clone)]
+struct PatternObservation {
+    // Presumably the position of the originating input row (TODO confirm).
+    source_index: usize,
+    prefix: String,
+    digit_skeleton: String,
+    suffix: String,
+    value: String,
+    uses_path: bool,
+    has_trailing_hash: bool,
+    count: usize,
+}
+/// Accumulator for one group; `Default` starts it with empty strings, zero
+/// counters and empty example lists. Presumably keyed by
+/// (`prefix`, `digit_skeleton`) (TODO confirm against the grouping code).
+#[derive(Default)]
+struct GroupBuilder {
+    prefix: String,
+    digit_skeleton: String,
+    count: usize,
+    weight: usize,
+    uses_path_count: usize,
+    has_trailing_hash_count: usize,
+    suffix_examples: Vec<String>,
+    value_examples: Vec<String>,
+}
+/// In-memory trie node. `parent` and the values of `children_by_label` are
+/// `usize` handles, presumably node ids (TODO confirm). Children are kept in a
+/// `BTreeMap`, so they iterate in edge-label order.
+struct TrieNode {
+    id: usize,
+    edge_label: String,
+    depth: usize,
+    // `None` presumably marks the root (TODO confirm).
+    parent: Option<usize>,
+    children_by_label: BTreeMap<String, usize>,
+    subtree_patterns: usize,
+    subtree_weight: usize,
+}
+/// Top-level serialized graph document: metadata, nodes and terminals.
+#[derive(Serialize)]
+struct GraphOutput {
+    meta: Meta,
+    nodes: Vec<OutputNode>,
+    terminals: Vec<OutputTerminal>,
+}
+/// Metadata section of `GraphOutput`: paths, counters, settings and tokenizer
+/// info. `min_count` and `example_count` share names with the `Args` options
+/// and presumably echo them (TODO confirm).
+#[derive(Serialize)]
+struct Meta {
+    input: String,
+    output: String,
+    input_records: usize,
+    observations: usize,
+    no_episode_prefix: usize,
+    groups: usize,
+    nodes: usize,
+    max_depth: usize,
+    grouped_weight: usize,
+    min_count: usize,
+    example_count: usize,
+    tokenizer: TokenizerMeta,
+}
+/// Tokenizer description embedded in `Meta`; both fields hold static strings.
+#[derive(Serialize)]
+struct TokenizerMeta {
+    version: &'static str,
+    notes: Vec<&'static str>,
+}
+/// Serialized form of a trie node. Unlike `TrieNode` it has no parent link and
+/// lists children as plain `usize` values (presumably node ids; TODO confirm).
+#[derive(Serialize)]
+struct OutputNode {
+    id: usize,
+    edge_label: String,
+    depth: usize,
+    children: Vec<usize>,
+    subtree_patterns: usize,
+    subtree_weight: usize,
+}
+/// Serialized terminal: the fields of `GroupBuilder` plus the `node_id` it is
+/// attached to and an `annotations` section.
+#[derive(Serialize)]
+struct OutputTerminal {
+    node_id: usize,
+    prefix: String,
+    digit_skeleton: String,
+    count: usize,
+    weight: usize,
+    uses_path_count: usize,
+    has_trailing_hash_count: usize,
+    suffix_examples: Vec<String>,
+    value_examples: Vec<String>,
+    annotations: TerminalAnnotations,
+}
+#[derive(Default, Serialize)]
+struct TerminalAnnotations { // Annotation slots written with every terminal; this file only ever emits the Default value.
+    episode_title_suffixes: Vec<String>, // NOTE(review): presumably filled in by a later annotation step - confirm
+    media_suffixes: Vec<String>,
+    title_candidates: Vec<String>,
+    needs_llm_review: bool, // false by Default
+    llm_label: Option<String>, // None by Default
+    notes: Option<String>, // None by Default
+}
+#[derive(Deserialize)]
+struct SourceGraph { // Prefix graph read back from JSON by build_dag as the input to DAG minimization.
+    #[allow(dead_code)]
+    meta: serde_json::Value, // parsed but not inspected
+    nodes: Vec<SourceNode>, // validate_source_graph requires nodes[i].id == i; node 0 is used as the root
+    terminals: Vec<SourceTerminal>,
+}
+#[derive(Deserialize)]
+struct SourceNode { // Node of the source prefix graph.
+    id: usize, // must equal the node's index in SourceGraph::nodes
+    edge_label: String, // label of the edge leading into this node
+    #[allow(dead_code)]
+    depth: usize, // parsed but not used during minimization
+    #[serde(default)]
+    children: Vec<usize>, // child node ids; empty when the key is absent
+}
+#[derive(Clone, Deserialize, Serialize)]
+struct SourceTerminal { // Terminal entry of the source prefix graph.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    terminal_id: Option<String>, // optional; minimize_source_graph falls back to "t{index}" when absent
+    node_id: usize, // id of the source node this terminal is attached to
+    prefix: String,
+    digit_skeleton: String,
+    count: usize,
+    weight: usize, // summed into DagNode::reachable_weight
+    uses_path_count: usize,
+    has_trailing_hash_count: usize,
+    suffix_examples: Vec<String>,
+    value_examples: Vec<String>,
+    #[serde(default = "default_annotations_value")]
+    annotations: serde_json::Value, // kept as raw JSON; an empty object when the key is absent
+}
+#[derive(Serialize)]
+struct DagOutput { // Document that build_dag serializes to the DAG output path.
+    meta: DagMeta, // run summary; main prints it to stdout
+    root: usize, // always 0 in minimize_source_graph
+    nodes: Vec<DagNode>, // nodes[i].id == i
+    terminals: Vec<DagTerminal>, // same order as the source terminals
+}
+#[derive(Serialize)]
+struct DagMeta { // Summary of one minimize_source_graph run.
+    version: &'static str, // set to DAG_VERSION
+    input: String, // display form of the source graph path
+    output: String, // display form of the DAG output path
+    source_nodes: usize, // node count of the source graph
+    source_terminals: usize, // terminal count of the source graph
+    dag_nodes: usize, // node count after merging
+    dag_edges: usize, // total number of child edges over all DAG nodes
+    root: usize, // always 0
+    max_depth: usize, // value returned by dag_max_depth
+    merged_nodes: usize, // source_nodes minus dag_nodes, saturating at 0
+    preserves_digits: bool, // always written as true
+    merge_strategy: &'static str, // fixed description string
+    notes: Vec<&'static str>, // fixed human-readable notes
+}
+#[derive(Clone, Serialize)]
+struct DagNode { // Node of the minimized DAG.
+    id: usize, // index of this node in DagOutput::nodes
+    terminal: bool, // true when a source terminal is attached to a source node merged into this one
+    children: Vec<DagEdge>, // outgoing edges, sorted by (label, target) before renumbering
+    incoming_count: usize, // number of DAG edges that target this node
+    reachable_terminals: usize, // distinct source terminals whose path to the source root maps through this node
+    reachable_weight: usize, // sum of weight over those terminals
+}
+#[derive(Clone, Serialize)]
+struct DagEdge { // Labelled edge between two DAG nodes.
+    label: String, // edge_label of the source child node
+    target: usize, // node id; a temporary id until minimize_source_graph remaps it
+}
+#[derive(Serialize)]
+struct DagTerminal { // Source terminal re-attached to its DAG node (built in minimize_source_graph).
+    terminal_id: String, // the source terminal_id, or "t{index}" when the source had none
+    node_id: usize, // DAG node id that the terminal's source node was merged into
+    prefix: String, // the fields below are cloned from the source terminal
+    digit_skeleton: String,
+    count: usize,
+    weight: usize,
+    uses_path_count: usize,
+    has_trailing_hash_count: usize,
+    suffix_examples: Vec<String>,
+    value_examples: Vec<String>,
+    annotations: serde_json::Value,
+}
+#[derive(Clone, Eq, Ord, PartialEq, PartialOrd)]
+struct NodeSignature { // BTreeMap key in minimize_source_graph: nodes with equal signatures are merged.
+    terminal: bool, // whether a terminal is attached to the source node
+    children: Vec<(String, usize)>, // sorted (edge label, temporary DAG id of the child) pairs
+}
+#[derive(Clone)]
+struct TempDagNode { // Merged node before renumber_dag assigns final ids.
+    terminal: bool,
+    children: Vec<DagEdge>, // edge targets are temporary ids at this stage
+}
+/// Entry point: runs the DAG builder when either `--from-prefix-graph` or
+/// `--dag-output` was given, otherwise builds the prefix graph, and prints the
+/// resulting `meta` section as pretty JSON on stdout.
+fn main() -> Result<()> {
+    let args = Args::parse();
+    let dag_mode = args.from_prefix_graph.is_some() || args.dag_output.is_some();
+    let meta_json = if dag_mode {
+        serde_json::to_string_pretty(&build_dag_from_args(&args)?.meta)?
+    } else {
+        serde_json::to_string_pretty(&build_graph(&args)?.meta)?
+    };
+    println!("{}", meta_json);
+    Ok(())
+}
+/// Builds the prefix radix trie from the JSONL input, writes it as pretty JSON
+/// to `args.output`, and returns the in-memory graph.
+///
+/// Records without a detectable episode prefix are only counted
+/// (`meta.no_episode_prefix`); groups with fewer than `args.min_count`
+/// observations are dropped by `aggregate_observations`.
+///
+/// Fails when the output directory or file cannot be created, the input cannot
+/// be read or parsed, or the output cannot be written or flushed.
+fn build_graph(args: &Args) -> Result<GraphOutput> {
+    ensure_output_parent(&args.output)?;
+    let records = read_records(&args.input)?;
+    let input_records = records.len();
+    let mut observations = records
+        .par_iter()
+        .enumerate()
+        .filter_map(|(source_index, record)| build_observation(source_index, record))
+        .collect::<Vec<_>>();
+    // Order by prefix, then by input position, so example selection is deterministic.
+    observations.sort_by(|left, right| {
+        left.prefix
+            .cmp(&right.prefix)
+            .then(left.source_index.cmp(&right.source_index))
+    });
+    let no_episode_prefix = input_records.saturating_sub(observations.len());
+    let groups = aggregate_observations(&observations, args.min_count, args.example_count);
+    let (nodes, terminals, max_depth, grouped_weight) = build_trie(&groups);
+    let graph = GraphOutput {
+        meta: Meta {
+            input: display_path(&args.input),
+            output: display_path(&args.output),
+            input_records,
+            observations: observations.len(),
+            no_episode_prefix,
+            groups: terminals.len(),
+            nodes: nodes.len(),
+            max_depth,
+            grouped_weight,
+            min_count: args.min_count,
+            example_count: args.example_count,
+            tokenizer: TokenizerMeta {
+                version: RADIX_VERSION,
+                notes: vec![
+                    "Episode prefixes are detected with the legacy regex/boundary rules.",
+                    "Graph insertion preserves original prefix digits; digit_skeleton is secondary metadata only.",
+                    "Graph nodes form a compressed radix trie over complete original prefix strings.",
+                    "Edge labels split only at actual branch points; punctuation and separators do not force levels.",
+                ],
+            },
+        },
+        nodes,
+        terminals,
+    };
+    let output = File::create(&args.output)
+        .with_context(|| format!("failed to create {}", args.output.display()))?;
+    let mut writer = BufWriter::new(output);
+    serde_json::to_writer_pretty(&mut writer, &graph)
+        .with_context(|| format!("failed to write {}", args.output.display()))?;
+    // Flush explicitly: an error raised while BufWriter flushes on drop is
+    // discarded, which would let a truncated file be reported as success.
+    std::io::Write::flush(&mut writer)
+        .with_context(|| format!("failed to write {}", args.output.display()))?;
+    Ok(graph)
+}
+/// Resolves the DAG input and output paths from the CLI arguments and runs
+/// `build_dag`.
+///
+/// The input is `--from-prefix-graph` when given, otherwise `args.output` (the
+/// prefix graph path). The output is `--dag-output` when given, otherwise
+/// `datasets/AnimeName/dmhy_prefix_dag.json`.
+fn build_dag_from_args(args: &Args) -> Result<DagOutput> {
+    let input = args
+        .from_prefix_graph
+        .as_ref()
+        .unwrap_or(&args.output)
+        .to_path_buf();
+    let output = args.dag_output.clone().unwrap_or_else(|| {
+        // Join the components instead of hard-coding `\`: on non-Windows
+        // systems a backslash is an ordinary filename character, so the old
+        // literal named a single file in the working directory.
+        Path::new("datasets")
+            .join("AnimeName")
+            .join("dmhy_prefix_dag.json")
+    });
+    build_dag(&input, &output)
+}
+/// Reads a prefix graph from `input_path`, validates it, minimizes it into a
+/// DAG, writes the DAG as pretty JSON to `output_path`, and returns it.
+///
+/// Fails when the output directory or file cannot be created, the input cannot
+/// be opened or parsed, validation rejects the graph, or the output cannot be
+/// written or flushed.
+fn build_dag(input_path: &Path, output_path: &Path) -> Result<DagOutput> {
+    ensure_output_parent(output_path)?;
+    let input = File::open(input_path)
+        .with_context(|| format!("failed to open {}", input_path.display()))?;
+    let reader = BufReader::new(input);
+    let source: SourceGraph = serde_json::from_reader(reader)
+        .with_context(|| format!("failed to parse {}", input_path.display()))?;
+    validate_source_graph(&source)?;
+    let dag = minimize_source_graph(&source, input_path, output_path)?;
+    let output = File::create(output_path)
+        .with_context(|| format!("failed to create {}", output_path.display()))?;
+    let mut writer = BufWriter::new(output);
+    serde_json::to_writer_pretty(&mut writer, &dag)
+        .with_context(|| format!("failed to write {}", output_path.display()))?;
+    // Flush explicitly: an error raised while BufWriter flushes on drop is
+    // discarded, which would let a truncated file be reported as success.
+    std::io::Write::flush(&mut writer)
+        .with_context(|| format!("failed to write {}", output_path.display()))?;
+    Ok(dag)
+}
+/// Checks that `source` is a well-formed tree rooted at node 0 before it is
+/// minimized.
+///
+/// Rejected inputs:
+/// - an empty node list (node 0 is used as the root everywhere else);
+/// - a node whose `id` differs from its index;
+/// - a child id that is out of range, is the root, or is listed as a child
+///   more than once (cycles and shared children would make the later
+///   traversals loop or visit nodes repeatedly);
+/// - nodes that cannot be reached from the root (a terminal on such a node
+///   has no DAG node to map to);
+/// - a terminal whose `node_id` is out of range.
+fn validate_source_graph(source: &SourceGraph) -> Result<()> {
+    let node_count = source.nodes.len();
+    anyhow::ensure!(
+        node_count > 0,
+        "source graph has no nodes; node 0 must be the root"
+    );
+    let mut has_parent = vec![false; node_count];
+    for (index, node) in source.nodes.iter().enumerate() {
+        anyhow::ensure!(
+            node.id == index,
+            "source node id {} appears at index {}",
+            node.id,
+            index
+        );
+        for &child_id in &node.children {
+            anyhow::ensure!(
+                child_id < node_count,
+                "source node {} references missing child {}",
+                node.id,
+                child_id
+            );
+            anyhow::ensure!(
+                child_id != 0,
+                "source node {} lists the root as a child",
+                node.id
+            );
+            anyhow::ensure!(
+                !has_parent[child_id],
+                "source node {} is referenced as a child more than once",
+                child_id
+            );
+            has_parent[child_id] = true;
+        }
+    }
+    // Every node is a child at most once and the root never is, so this walk
+    // pushes each node at most once and therefore terminates.
+    let mut reachable = 0usize;
+    let mut stack = vec![0usize];
+    while let Some(node_id) = stack.pop() {
+        reachable += 1;
+        stack.extend(source.nodes[node_id].children.iter().copied());
+    }
+    anyhow::ensure!(
+        reachable == node_count,
+        "source graph has {} node(s) unreachable from the root",
+        node_count - reachable
+    );
+    for terminal in &source.terminals {
+        anyhow::ensure!(
+            terminal.node_id < node_count,
+            "terminal prefix {:?} references missing node {}",
+            terminal.prefix,
+            terminal.node_id
+        );
+    }
+    Ok(())
+}
+fn minimize_source_graph( // Merges source nodes with equal (terminal flag, labelled children) signatures into a DAG rooted at id 0.
+    source: &SourceGraph,
+    input_path: &Path, // only recorded in the output meta
+    output_path: &Path, // only recorded in the output meta
+) -> Result<DagOutput> {
+    let terminal_source_nodes = source
+        .terminals
+        .iter()
+        .map(|terminal| terminal.node_id)
+        .collect::<BTreeSet<_>>();
+    let postorder = source_postorder(source); // children are listed before their parents
+    let mut source_to_temp_dag = vec![usize::MAX; source.nodes.len()]; // usize::MAX marks "not canonicalized yet"
+    let mut signatures = BTreeMap::<NodeSignature, usize>::new(); // signature -> temporary DAG id
+    let mut temp_nodes = Vec::<TempDagNode>::new();
+    for source_id in postorder {
+        let source_node = &source.nodes[source_id];
+        let mut children = source_node
+            .children
+            .iter()
+            .map(|&child_id| {
+                let child = &source.nodes[child_id];
+                let target = source_to_temp_dag[child_id];
+                anyhow::ensure!(
+                    target != usize::MAX,
+                    "source child {} was not canonicalized before parent {}",
+                    child_id,
+                    source_id
+                );
+                Ok((child.edge_label.clone(), target))
+            })
+            .collect::<Result<Vec<_>>>()?;
+        children.sort(); // canonical order so equal child sets give equal signatures
+        let signature = NodeSignature {
+            terminal: terminal_source_nodes.contains(&source_id),
+            children,
+        };
+        let temp_id = if let Some(&existing) = signatures.get(&signature) {
+            existing // an equivalent node already exists: reuse it
+        } else {
+            let temp_id = temp_nodes.len();
+            temp_nodes.push(TempDagNode {
+                terminal: signature.terminal,
+                children: signature
+                    .children
+                    .iter()
+                    .map(|(label, target)| DagEdge {
+                        label: label.clone(),
+                        target: *target,
+                    })
+                    .collect(),
+            });
+            signatures.insert(signature, temp_id);
+            temp_id
+        };
+        source_to_temp_dag[source_id] = temp_id;
+    }
+    let root_temp_id = source_to_temp_dag[0]; // source node 0 is the root
+    let (temp_to_dag, mut dag_nodes) = renumber_dag(root_temp_id, &temp_nodes);
+    for node in &mut dag_nodes {
+        for edge in &mut node.children {
+            edge.target = temp_to_dag[edge.target]; // renumber_dag leaves temporary ids in the edges; remap them here
+        }
+    }
+    let terminals = source
+        .terminals
+        .iter()
+        .enumerate()
+        .map(|(index, terminal)| {
+            let temp_id = source_to_temp_dag[terminal.node_id];
+            DagTerminal {
+                terminal_id: terminal
+                    .terminal_id
+                    .clone()
+                    .unwrap_or_else(|| format!("t{}", index)), // synthesize an id from the terminal's position
+                node_id: temp_to_dag[temp_id],
+                prefix: terminal.prefix.clone(),
+                digit_skeleton: terminal.digit_skeleton.clone(),
+                count: terminal.count,
+                weight: terminal.weight,
+                uses_path_count: terminal.uses_path_count,
+                has_trailing_hash_count: terminal.has_trailing_hash_count,
+                suffix_examples: terminal.suffix_examples.clone(),
+                value_examples: terminal.value_examples.clone(),
+                annotations: terminal.annotations.clone(),
+            }
+        })
+        .collect::<Vec<_>>();
+    fill_dag_counts(&source, &source_to_temp_dag, &temp_to_dag, &mut dag_nodes);
+    let dag_edges = dag_nodes.iter().map(|node| node.children.len()).sum();
+    let max_depth = dag_max_depth(&dag_nodes);
+    Ok(DagOutput {
+        meta: DagMeta {
+            version: DAG_VERSION,
+            input: display_path(input_path),
+            output: display_path(output_path),
+            source_nodes: source.nodes.len(),
+            source_terminals: source.terminals.len(),
+            dag_nodes: dag_nodes.len(),
+            dag_edges,
+            root: 0,
+            max_depth,
+            merged_nodes: source.nodes.len().saturating_sub(dag_nodes.len()),
+            preserves_digits: true,
+            merge_strategy:
+                "bottom-up suffix equivalence over radix trie nodes; edge labels remain raw substrings",
+            notes: vec![
+                "Terminal prefixes and digit_skeleton values are copied from the source graph; digits are not normalized for merging.",
+                "Node reachable_terminals and reachable_weight count terminal instances mapped onto the shared DAG suffix, so shared nodes report aggregate suffix usage rather than one caller-specific path.",
+            ],
+        },
+        root: 0,
+        nodes: dag_nodes,
+        terminals,
+    })
+}
+/// Returns the ids of all nodes reachable from node 0 in post-order (every
+/// node appears after all of its children), visiting children in their listed
+/// order. Uses an explicit stack instead of recursion.
+fn source_postorder(source: &SourceGraph) -> Vec<usize> {
+    let mut visit_order = Vec::with_capacity(source.nodes.len());
+    // Each frame is (node id, whether its children were already scheduled).
+    let mut pending: Vec<(usize, bool)> = vec![(0, false)];
+    while let Some(frame) = pending.pop() {
+        match frame {
+            (finished, true) => visit_order.push(finished),
+            (entered, false) => {
+                pending.push((entered, true));
+                // Reverse so the first child ends up on top of the stack.
+                pending.extend(
+                    source.nodes[entered]
+                        .children
+                        .iter()
+                        .rev()
+                        .map(|&child| (child, false)),
+                );
+            }
+        }
+    }
+    visit_order
+}
+/// Assigns final DAG ids in depth-first pre-order starting at `root_temp_id`,
+/// so the root receives id 0.
+///
+/// Returns the temporary-id -> DAG-id table (`usize::MAX` for temporary nodes
+/// that were never reached) and the DAG nodes with zeroed counters. Edge
+/// targets in the returned nodes still hold temporary ids.
+fn renumber_dag(root_temp_id: usize, temp_nodes: &[TempDagNode]) -> (Vec<usize>, Vec<DagNode>) {
+    const UNASSIGNED: usize = usize::MAX;
+    let mut temp_to_dag = vec![UNASSIGNED; temp_nodes.len()];
+    let mut dag_nodes: Vec<DagNode> = Vec::new();
+    let mut worklist = vec![root_temp_id];
+    while let Some(temp_id) = worklist.pop() {
+        if temp_to_dag[temp_id] == UNASSIGNED {
+            let temp_node = &temp_nodes[temp_id];
+            let dag_id = dag_nodes.len();
+            temp_to_dag[temp_id] = dag_id;
+            dag_nodes.push(DagNode {
+                id: dag_id,
+                terminal: temp_node.terminal,
+                children: temp_node.children.clone(),
+                incoming_count: 0,
+                reachable_terminals: 0,
+                reachable_weight: 0,
+            });
+            // Reverse so the first child is numbered next.
+            worklist.extend(temp_node.children.iter().rev().map(|edge| edge.target));
+        }
+    }
+    (temp_to_dag, dag_nodes)
+}
+/// Fills `incoming_count`, `reachable_terminals` and `reachable_weight` on
+/// every DAG node.
+///
+/// `incoming_count` is the number of DAG edges targeting the node. For the
+/// terminal counters, each source terminal is walked up its source-parent
+/// chain and recorded on the DAG node of every source node on that chain.
+fn fill_dag_counts(
+    source: &SourceGraph,
+    source_to_temp_dag: &[usize],
+    temp_to_dag: &[usize],
+    nodes: &mut [DagNode],
+) {
+    // Collect the targets first so `nodes` is not borrowed while it is mutated.
+    let edge_targets: Vec<usize> = nodes
+        .iter()
+        .flat_map(|node| node.children.iter().map(|edge| edge.target))
+        .collect();
+    for target in edge_targets {
+        nodes[target].incoming_count += 1;
+    }
+    let parents = source_parent_map(source);
+    let mut terminals_per_node = vec![BTreeSet::<usize>::new(); nodes.len()];
+    for (terminal_index, terminal) in source.terminals.iter().enumerate() {
+        let mut cursor = Some(terminal.node_id);
+        while let Some(source_id) = cursor {
+            let dag_id = temp_to_dag[source_to_temp_dag[source_id]];
+            terminals_per_node[dag_id].insert(terminal_index);
+            cursor = parents[source_id];
+        }
+    }
+    for (node, terminal_ids) in nodes.iter_mut().zip(terminals_per_node) {
+        node.reachable_terminals = terminal_ids.len();
+        node.reachable_weight = terminal_ids
+            .iter()
+            .map(|&terminal_index| source.terminals[terminal_index].weight)
+            .sum();
+    }
+}
+/// Builds a child-id -> parent-id table for the source graph. Entries stay
+/// `None` for nodes that are nobody's child; if a node is listed by several
+/// parents, the last one in node order wins.
+fn source_parent_map(source: &SourceGraph) -> Vec<Option<usize>> {
+    let mut parent_of: Vec<Option<usize>> = vec![None; source.nodes.len()];
+    let links = source
+        .nodes
+        .iter()
+        .flat_map(|parent| parent.children.iter().map(move |&child| (child, parent.id)));
+    for (child, parent_id) in links {
+        parent_of[child] = Some(parent_id);
+    }
+    parent_of
+}
+/// Returns the number of edges on the longest path from node 0, found by
+/// walking every path with an explicit stack.
+fn dag_max_depth(nodes: &[DagNode]) -> usize {
+    let mut deepest: usize = 0;
+    let mut frontier: Vec<(usize, usize)> = vec![(0, 0)];
+    while let Some((node_id, depth)) = frontier.pop() {
+        if depth > deepest {
+            deepest = depth;
+        }
+        frontier.extend(
+            nodes[node_id]
+                .children
+                .iter()
+                .map(|edge| (edge.target, depth + 1)),
+        );
+    }
+    deepest
+}
+/// Creates the parent directory of `output` (and any missing ancestors).
+/// Does nothing when the path has no parent component or an empty one.
+fn ensure_output_parent(output: &Path) -> Result<()> {
+    match output.parent() {
+        Some(parent) if !parent.as_os_str().is_empty() => fs::create_dir_all(parent)
+            .with_context(|| format!("failed to create {}", parent.display())),
+        _ => Ok(()),
+    }
+}
+/// Reads `input_path` as JSON Lines and parses every non-blank line into an
+/// `InputRecord`. Read and parse errors carry the 1-based line number and the
+/// file path as context.
+fn read_records(input_path: &Path) -> Result<Vec<InputRecord>> {
+    let file = File::open(input_path)
+        .with_context(|| format!("failed to open {}", input_path.display()))?;
+    let mut records = Vec::new();
+    for (line_number, line) in BufReader::new(file).lines().enumerate() {
+        let raw = line.with_context(|| {
+            format!(
+                "failed to read line {} from {}",
+                line_number + 1,
+                input_path.display()
+            )
+        })?;
+        let trimmed = raw.trim();
+        if !trimmed.is_empty() {
+            let record: InputRecord = serde_json::from_str(trimmed).with_context(|| {
+                format!(
+                    "failed to parse line {} from {}",
+                    line_number + 1,
+                    input_path.display()
+                )
+            })?;
+            records.push(record);
+        }
+    }
+    Ok(records)
+}
+/// Turns one input record into a `PatternObservation`, or `None` when no
+/// episode prefix is found or the prefix is empty after trimming.
+fn build_observation(source_index: usize, record: &InputRecord) -> Option<PatternObservation> {
+    find_episode_prefix(&record.value)
+        .filter(|(prefix, _)| !prefix.is_empty())
+        .map(|(prefix, suffix)| PatternObservation {
+            source_index,
+            // Computed before `prefix` is moved into the struct.
+            digit_skeleton: digit_skeleton(&prefix),
+            prefix,
+            suffix,
+            value: record.value.clone(),
+            uses_path: record.uses_path,
+            has_trailing_hash: record.has_trailing_hash,
+            count: record.count,
+        })
+}
+/// Groups observations by exact prefix and returns the groups, ordered by
+/// prefix, that contain at least `min_count` observations.
+///
+/// Per group: `count` is the number of observations, `weight` the sum of their
+/// `count` fields, the two flag counters are weighted the same way, and at
+/// most `example_count` suffix / value examples are kept in encounter order.
+fn aggregate_observations(
+    observations: &[PatternObservation],
+    min_count: usize,
+    example_count: usize,
+) -> Vec<GroupBuilder> {
+    let mut by_prefix: BTreeMap<String, GroupBuilder> = BTreeMap::new();
+    for seen in observations {
+        let entry = by_prefix
+            .entry(seen.prefix.clone())
+            .or_insert_with(|| GroupBuilder {
+                prefix: seen.prefix.clone(),
+                digit_skeleton: seen.digit_skeleton.clone(),
+                ..GroupBuilder::default()
+            });
+        entry.count += 1;
+        entry.weight += seen.count;
+        if seen.uses_path {
+            entry.uses_path_count += seen.count;
+        }
+        if seen.has_trailing_hash {
+            entry.has_trailing_hash_count += seen.count;
+        }
+        let wants_suffix = !seen.suffix.is_empty() && entry.suffix_examples.len() < example_count;
+        if wants_suffix {
+            entry.suffix_examples.push(seen.suffix.clone());
+        }
+        if entry.value_examples.len() < example_count {
+            entry.value_examples.push(seen.value.clone());
+        }
+    }
+    by_prefix.retain(|_, group| group.count >= min_count);
+    by_prefix.into_values().collect()
+}
+/// Inserts every group's prefix into a radix trie and converts the result to
+/// its serializable form.
+///
+/// Returns `(nodes, terminals, max_depth, grouped_weight)`: the trie nodes in
+/// id order, one terminal per group (same order as `groups`), the largest node
+/// depth, and the sum of all group weights.
+fn build_trie(groups: &[GroupBuilder]) -> (Vec<OutputNode>, Vec<OutputTerminal>, usize, usize) {
+    let mut trie = vec![TrieNode::root()];
+    let mut terminal_ids: Vec<usize> = Vec::with_capacity(groups.len());
+    for group in groups {
+        let (terminal_id, touched) = insert_prefix(&mut trie, &group.prefix);
+        terminal_ids.push(terminal_id);
+        // Credit the group to every node on its insertion path.
+        for node_id in touched {
+            let node = &mut trie[node_id];
+            node.subtree_patterns += group.count;
+            node.subtree_weight += group.weight;
+        }
+    }
+    // Splits leave stale depths behind; recompute them all from the root.
+    assign_depths(&mut trie);
+    let terminals: Vec<OutputTerminal> = groups
+        .iter()
+        .zip(terminal_ids)
+        .map(|(group, node_id)| OutputTerminal {
+            node_id,
+            prefix: group.prefix.clone(),
+            digit_skeleton: group.digit_skeleton.clone(),
+            count: group.count,
+            weight: group.weight,
+            uses_path_count: group.uses_path_count,
+            has_trailing_hash_count: group.has_trailing_hash_count,
+            suffix_examples: group.suffix_examples.clone(),
+            value_examples: group.value_examples.clone(),
+            annotations: TerminalAnnotations::default(),
+        })
+        .collect();
+    let max_depth = trie.iter().map(|node| node.depth).max().unwrap_or(0);
+    let grouped_weight = groups.iter().map(|group| group.weight).sum();
+    let mut output_nodes = Vec::with_capacity(trie.len());
+    for node in trie {
+        output_nodes.push(OutputNode {
+            id: node.id,
+            edge_label: node.edge_label,
+            depth: node.depth,
+            children: node.children_by_label.into_values().collect(),
+            subtree_patterns: node.subtree_patterns,
+            subtree_weight: node.subtree_weight,
+        });
+    }
+    (output_nodes, terminals, max_depth, grouped_weight)
+}
+impl TrieNode {
+    /// Creates the trie root: id 0, depth 0, empty edge label, no parent, no
+    /// children, and zeroed subtree counters.
+    fn root() -> Self {
+        TrieNode {
+            id: 0,
+            parent: None,
+            depth: 0,
+            edge_label: String::new(),
+            children_by_label: BTreeMap::new(),
+            subtree_weight: 0,
+            subtree_patterns: 0,
+        }
+    }
+}
+fn insert_prefix(nodes: &mut Vec<TrieNode>, prefix: &str) -> (usize, Vec<usize>) { // Inserts prefix into the radix trie; returns (node where it ends, ids on the path from the root).
+    let mut current = 0; // start at the root
+    let mut remaining = prefix; // part of the prefix not yet matched
+    let mut path = vec![current];
+    loop {
+        if remaining.is_empty() {
+            return (current, path); // prefix ends exactly at an existing node
+        }
+        let matching_child =
+            nodes[current]
+                .children_by_label
+                .iter()
+                .find_map(|(label, &child_id)| {
+                    let common_len = common_prefix_len(remaining, label);
+                    (common_len > 0).then(|| (label.clone(), child_id, common_len)) // first child sharing at least one leading char
+                });
+        let Some((child_label, child_id, common_len)) = matching_child else {
+            let child_id = push_node(nodes, current, remaining.to_owned()); // no overlap: hang the whole remainder off current
+            nodes[current]
+                .children_by_label
+                .insert(remaining.to_owned(), child_id);
+            path.push(child_id);
+            return (child_id, path);
+        };
+        if common_len == child_label.len() {
+            current = child_id; // the whole edge label matched: descend and keep matching
+            remaining = &remaining[common_len..];
+            path.push(current);
+            continue;
+        }
+        let shared_label = child_label[..common_len].to_owned(); // partial match: split the edge at common_len
+        let old_suffix = child_label[common_len..].to_owned();
+        let new_suffix = remaining[common_len..].to_owned();
+        let split_id = push_node(nodes, current, shared_label.clone());
+        nodes[current].children_by_label.remove(&child_label);
+        nodes[current]
+            .children_by_label
+            .insert(shared_label, split_id);
+        nodes[child_id].edge_label = old_suffix.clone(); // the old child keeps only the unmatched tail of its label
+        nodes[child_id].parent = Some(split_id);
+        nodes[split_id].subtree_patterns = nodes[child_id].subtree_patterns; // the split node inherits the totals of the subtree it now covers
+        nodes[split_id].subtree_weight = nodes[child_id].subtree_weight;
+        nodes[split_id]
+            .children_by_label
+            .insert(old_suffix, child_id);
+        path.push(split_id);
+        if new_suffix.is_empty() {
+            return (split_id, path); // the prefix ends exactly at the split point
+        }
+        let new_child_id = push_node(nodes, split_id, new_suffix.clone());
+        nodes[split_id]
+            .children_by_label
+            .insert(new_suffix, new_child_id);
+        path.push(new_child_id);
+        return (new_child_id, path);
+    }
+}
+/// Appends a childless node with the given edge label under `parent_id` and
+/// returns its id. The caller is responsible for registering the new node in
+/// the parent's `children_by_label` map.
+fn push_node(nodes: &mut Vec<TrieNode>, parent_id: usize, edge_label: String) -> usize {
+    let new_id = nodes.len();
+    let depth = nodes[parent_id].depth + 1;
+    let node = TrieNode {
+        id: new_id,
+        edge_label,
+        depth,
+        parent: Some(parent_id),
+        children_by_label: BTreeMap::new(),
+        subtree_patterns: 0,
+        subtree_weight: 0,
+    };
+    nodes.push(node);
+    new_id
+}
+/// Returns the length in bytes of the longest common prefix of `left` and
+/// `right`, compared char by char so the result is always a char boundary in
+/// both strings.
+fn common_prefix_len(left: &str, right: &str) -> usize {
+    left.chars()
+        .zip(right.chars())
+        .take_while(|(left_ch, right_ch)| left_ch == right_ch)
+        .map(|(shared, _)| shared.len_utf8())
+        .sum()
+}
+/// Recomputes `depth` for every node reachable from the root (node 0) by
+/// walking the trie with an explicit stack; the root gets depth 0.
+fn assign_depths(nodes: &mut [TrieNode]) {
+    let mut pending: Vec<(usize, usize)> = vec![(0, 0)];
+    while let Some((node_id, depth)) = pending.pop() {
+        let node = &mut nodes[node_id];
+        node.depth = depth;
+        pending.extend(
+            node.children_by_label
+                .values()
+                .map(|&child_id| (child_id, depth + 1)),
+        );
+    }
+}
+/// Splits `value` at the furthest episode end found by any `EPISODE_PATTERNS`
+/// match or by `find_delimited_number_episode_end`.
+///
+/// Returns `(prefix, suffix)` where the prefix has trailing whitespace removed
+/// and the suffix is trimmed on both sides, or `None` when nothing matches.
+fn find_episode_prefix(value: &str) -> Option<(String, String)> {
+    let delimited_end = find_delimited_number_episode_end(value);
+    let split_at = EPISODE_PATTERNS
+        .iter()
+        .filter_map(|pattern| pattern.find(value))
+        .map(|matched| matched.end())
+        .chain(delimited_end)
+        .max()?;
+    let (head, tail) = value.split_at(split_at);
+    Some((head.trim_end().to_owned(), tail.trim().to_owned()))
+}
+/// Returns the byte offset just past the first run of 1 to 4 ASCII digits that
+/// has an accepted boundary on both sides, or `None` when there is no such
+/// run. End of input counts as a right boundary.
+fn find_delimited_number_episode_end(value: &str) -> Option<usize> {
+    // (start offset, digit count) of the digit run currently being scanned.
+    let mut run: Option<(usize, usize)> = None;
+    // A trailing NUL sentinel closes a digit run that reaches the end of input.
+    let end_marker = std::iter::once((value.len(), '\0'));
+    for (index, ch) in value.char_indices().chain(end_marker) {
+        if ch.is_ascii_digit() {
+            run = Some(match run {
+                Some((start, len)) => (start, len + 1),
+                None => (index, 1),
+            });
+            continue;
+        }
+        // A non-digit ends the current run, if any; `take` also resets it.
+        if let Some((start, len)) = run.take() {
+            let is_candidate = (1..=4).contains(&len)
+                && has_episode_left_boundary(value, start)
+                && has_episode_right_boundary(ch);
+            if is_candidate {
+                return Some(index);
+            }
+        }
+    }
+    None
+}
+/// Returns true when the digit run starting at byte offset `digits_start` sits
+/// at the beginning of `value` or directly after whitespace, `.`, `_` or `-`.
+fn has_episode_left_boundary(value: &str, digits_start: usize) -> bool {
+    match value[..digits_start].chars().next_back() {
+        // Nothing precedes the digits: start of input.
+        None => true,
+        Some(previous) => previous.is_whitespace() || matches!(previous, '.' | '_' | '-'),
+    }
+}
+/// Returns true when `ch` may directly follow an episode number: the NUL
+/// end-of-input sentinel, whitespace, or one of `. _ - ] ) 【 】 [`.
+fn has_episode_right_boundary(ch: char) -> bool {
+    match ch {
+        '\0' | '.' | '_' | '-' | ']' | ')' | '【' | '】' | '[' => true,
+        other => other.is_whitespace(),
+    }
+}
+/// Returns `text` with every match of the `DIGITS` pattern replaced by the
+/// literal `<NUM>`.
+fn digit_skeleton(text: &str) -> String {
+    let replaced = DIGITS.replace_all(text, "<NUM>");
+    String::from(replaced)
+}
+/// Renders `path` with `Path::display` as an owned string for the meta output.
+fn display_path(path: &Path) -> String {
+    format!("{}", path.display())
+}
+/// Serde default for `InputRecord::count`: a record without the key weighs 1.
+fn default_count() -> usize {
+    const SINGLE_OCCURRENCE: usize = 1;
+    SINGLE_OCCURRENCE
+}
+/// Serde default for `SourceTerminal::annotations`: an empty JSON object.
+fn default_annotations_value() -> serde_json::Value {
+    serde_json::Value::Object(serde_json::Map::new())
+}
+#[cfg(test)]
+mod tests { // Unit tests for DAG minimization.
+    use super::*;
+    #[test]
+    fn dag_merges_equal_suffix_nodes_without_normalizing_digits() { // "A01"+"X" and "B02"+"X" must share their suffix node and their parent node while the edge labels keep the raw digits
+        let source = SourceGraph {
+            meta: serde_json::json!({}),
+            nodes: vec![
+                SourceNode {
+                    id: 0, // root with two branches
+                    edge_label: String::new(),
+                    depth: 0,
+                    children: vec![1, 3],
+                },
+                SourceNode {
+                    id: 1,
+                    edge_label: "A01".to_owned(),
+                    depth: 1,
+                    children: vec![2],
+                },
+                SourceNode {
+                    id: 2,
+                    edge_label: "X".to_owned(),
+                    depth: 2,
+                    children: vec![],
+                },
+                SourceNode {
+                    id: 3,
+                    edge_label: "B02".to_owned(),
+                    depth: 1,
+                    children: vec![4],
+                },
+                SourceNode {
+                    id: 4,
+                    edge_label: "X".to_owned(),
+                    depth: 2,
+                    children: vec![],
+                },
+            ],
+            terminals: vec![
+                SourceTerminal {
+                    terminal_id: None,
+                    node_id: 2,
+                    prefix: "A01X".to_owned(),
+                    digit_skeleton: "A<NUM>X".to_owned(),
+                    count: 1,
+                    weight: 1,
+                    uses_path_count: 0,
+                    has_trailing_hash_count: 0,
+                    suffix_examples: vec![],
+                    value_examples: vec!["A01X".to_owned()],
+                    annotations: serde_json::json!({}),
+                },
+                SourceTerminal {
+                    terminal_id: None,
+                    node_id: 4,
+                    prefix: "B02X".to_owned(),
+                    digit_skeleton: "B<NUM>X".to_owned(),
+                    count: 1,
+                    weight: 1,
+                    uses_path_count: 0,
+                    has_trailing_hash_count: 0,
+                    suffix_examples: vec![],
+                    value_examples: vec!["B02X".to_owned()],
+                    annotations: serde_json::json!({}),
+                },
+            ],
+        };
+        let dag = minimize_source_graph(
+            &source,
+            Path::new("dmhy_prefix_graph.json"),
+            Path::new("dmhy_prefix_dag.json"),
+        )
+        .unwrap();
+        assert!(dag.meta.preserves_digits);
+        assert!(dag.meta.merged_nodes >= 2); // both leaves merge, and so do their two parents
+        assert_eq!(dag.terminals[0].prefix, "A01X"); // terminal prefixes keep their raw digits
+        assert_eq!(dag.terminals[1].prefix, "B02X");
+        assert_eq!(dag.terminals[0].node_id, dag.terminals[1].node_id); // both terminals land on the shared leaf
+        let shared_parent_targets = dag.nodes[0]
+            .children
+            .iter()
+            .filter(|edge| edge.label == "A01" || edge.label == "B02")
+            .map(|edge| edge.target)
+            .collect::<BTreeSet<_>>();
+        assert_eq!(shared_parent_targets.len(), 1); // the two root edges point at one merged node
+        let shared_parent = shared_parent_targets.into_iter().next().unwrap();
+        assert_eq!(dag.nodes[shared_parent].children.len(), 1);
+        assert_eq!(dag.nodes[shared_parent].children[0].label, "X");
+        assert_eq!(dag.nodes[shared_parent].reachable_terminals, 2); // counters aggregate over both source paths
+        assert_eq!(dag.nodes[shared_parent].reachable_weight, 2);
+    }
+}

tools/test_annotated_dmhy_workflow.py ADDED Viewed

	@@ -0,0 +1,445 @@

+"""Smoke tests for annotated DMHY graph dataset helpers."""
+from __future__ import annotations
+import tempfile
+import json
+import subprocess
+import sys
+import unittest
+from pathlib import Path
+from tools.annotate_dmhy_prefix_graph import normalize_generated_tokens
+from tools.convert_annotated_dmhy_dataset import (
+    iter_validated_jsonl,
+    validate_record,
+)
+from tools.convert_to_char_dataset import convert_record
+class AnnotatedDmhyWorkflowTests(unittest.TestCase):
+    def test_generated_tokens_split_punctuation_and_use_b_only_labels(self) -> None:
+        tokens, labels = normalize_generated_tokens(
+            ["[ANi]", " ", "Title-Name", "07"],
+            ["B-GROUP", "O", "I-TITLE", "B-EPISODE"],
+        )
+        self.assertEqual(tokens, ["[", "ANi", "]", " ", "Title", "-", "Name", "07"])
+        self.assertEqual(
+            labels,
+            ["O", "B-GROUP", "O", "O", "B-TITLE", "O", "B-TITLE", "B-EPISODE"],
+        )
+        self.assertTrue(all(label == "O" or label.startswith("B-") for label in labels))
+    def test_preserve_i_labels_keeps_i_on_non_separator_pieces(self) -> None:
+        tokens, labels = normalize_generated_tokens(
+            ["Title-Name"],
+            ["I-TITLE"],
+            preserve_i_labels=True,
+        )
+        self.assertEqual(tokens, ["Title", "-", "Name"])
+        self.assertEqual(labels, ["I-TITLE", "O", "I-TITLE"])
+    def test_validation_rejects_embedded_punctuation(self) -> None:
+        record = {
+            "filename": "Title-Name 07",
+            "tokens": ["Title-Name", "07"],
+            "labels": ["B-TITLE", "B-EPISODE"],
+        }
+        with self.assertRaisesRegex(ValueError, "contains punctuation"):
+            validate_record(record, Path("sample.jsonl"), 1)
+    def test_validation_rejects_embedded_symbol_separator(self) -> None:
+        record = {
+            "filename": "Title 1920×1080 07",
+            "tokens": ["Title", "1920×1080", "07"],
+            "labels": ["B-TITLE", "B-RESOLUTION", "B-EPISODE"],
+        }
+        with self.assertRaisesRegex(ValueError, "contains punctuation"):
+            validate_record(record, Path("sample.jsonl"), 1)
+    def test_b_only_input_converts_to_char_i_labels(self) -> None:
+        record = {
+            "filename": "Title-Name 07",
+            "tokens": ["Title", "-", "Name", " ", "07"],
+            "labels": ["B-TITLE", "O", "B-TITLE", "O", "B-EPISODE"],
+        }
+        validate_record(record, Path("sample.jsonl"), 1)
+        converted = convert_record(record)
+        self.assertIn("I-TITLE", converted["labels"])
+        self.assertEqual(converted["tokens"][:5], ["T", "i", "t", "l", "e"])
+    def test_iter_validated_jsonl_accepts_generated_shape(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path = Path(tmpdir) / "records.jsonl"
+            path.write_text(
+                '{"filename":"A 01","tokens":["A"," ","01"],"labels":["B-TITLE","O","B-EPISODE"]}\n',
+                encoding="utf-8",
+            )
+            rows = list(iter_validated_jsonl(path))
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0]["filename"], "A 01")
+    def test_cli_smoke_annotate_then_convert_with_temp_files(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmp = Path(tmpdir)
+            graph_path = tmp / "graph.json"
+            dataset_path = tmp / "dmhy_weak.generated.jsonl"
+            char_path = tmp / "dmhy_weak.generated_char.jsonl"
+            vocab_path = tmp / "vocab.generated.char.json"
+            manifest_path = tmp / "manifest.json"
+            graph_path.write_text(
+                json.dumps(
+                    {
+                        "terminals": [
+                            {
+                                "terminal_id": "t0",
+                                "weight": 1,
+                                "value_examples": [
+                                    "[ANi] Test Show - 01 [1080P][WEB-DL].mkv"
+                                ],
+                                "suffix_examples": [" [1080P][WEB-DL]"],
+                            }
+                        ]
+                    },
+                    ensure_ascii=False,
+                ),
+                encoding="utf-8",
+            )
+            annotate = subprocess.run(
+                [
+                    sys.executable,
+                    "-m",
+                    "tools.annotate_dmhy_prefix_graph",
+                    "--graph",
+                    str(graph_path),
+                    "--output",
+                    str(dataset_path),
+                    "--patch-output",
+                    "",
+                    "--examples-only",
+                ],
+                check=False,
+                capture_output=True,
+                text=True,
+            )
+            self.assertEqual(annotate.returncode, 0, annotate.stderr)
+            rows = [
+                json.loads(line)
+                for line in dataset_path.read_text(encoding="utf-8").splitlines()
+                if line.strip()
+            ]
+            self.assertEqual(len(rows), 1)
+            self.assertIn("annotations", rows[0])
+            self.assertEqual(rows[0]["tokens"][0], "[")
+            self.assertEqual(rows[0]["labels"][0], "O")
+            convert = subprocess.run(
+                [
+                    sys.executable,
+                    "-m",
+                    "tools.convert_annotated_dmhy_dataset",
+                    "--input",
+                    str(dataset_path),
+                    "--output",
+                    str(char_path),
+                    "--vocab-output",
+                    str(vocab_path),
+                    "--manifest-output",
+                    str(manifest_path),
+                    "--progress",
+                    "0",
+                ],
+                check=False,
+                capture_output=True,
+                text=True,
+            )
+            self.assertEqual(convert.returncode, 0, convert.stderr)
+            self.assertTrue(char_path.exists())
+            self.assertTrue(vocab_path.exists())
+            self.assertTrue(manifest_path.exists())
+    def test_cli_source_list_mode_expands_beyond_value_examples(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmp = Path(tmpdir)
+            graph_path = tmp / "graph.json"
+            source_path = tmp / "dmhy_list.jsonl"
+            dataset_path = tmp / "dmhy_weak.generated.jsonl"
+            graph_path.write_text(
+                json.dumps(
+                    {
+                        "terminals": [
+                            {
+                                "terminal_id": "t0",
+                                "prefix": "[ANi] Full Show - ",
+                                "weight": 10,
+                                "value_examples": [
+                                    "[ANi] Full Show - 01 [1080P][WEB-DL].mkv"
+                                ],
+                                "suffix_examples": ["01 [1080P][WEB-DL]"],
+                            },
+                            {
+                                "terminal_id": "t1",
+                                "prefix": "[ANi] Other Show - ",
+                                "weight": 10,
+                                "value_examples": [
+                                    "[ANi] Other Show - 01 [1080P][WEB-DL].mkv"
+                                ],
+                                "suffix_examples": ["01 [1080P][WEB-DL]"],
+                            },
+                        ]
+                    },
+                    ensure_ascii=False,
+                ),
+                encoding="utf-8",
+            )
+            source_path.write_text(
+                "\n".join(
+                    json.dumps({"value": value}, ensure_ascii=False)
+                    for value in [
+                        "[ANi] Full Show - 01 [1080P][WEB-DL].mkv",
+                        "[ANi] Full Show - 02 [1080P][WEB-DL].mkv",
+                        "[ANi] Full Show - 03 [1080P][WEB-DL].mkv",
+                        "[ANi] Other Show - 01 [1080P][WEB-DL].mkv",
+                    ]
+                )
+                + "\n",
+                encoding="utf-8",
+            )
+            annotate = subprocess.run(
+                [
+                    sys.executable,
+                    "-m",
+                    "tools.annotate_dmhy_prefix_graph",
+                    "--graph",
+                    str(graph_path),
+                    "--source-list",
+                    str(source_path),
+                    "--output",
+                    str(dataset_path),
+                    "--patch-output",
+                    "",
+                    "--limit",
+                    "1",
+                ],
+                check=False,
+                capture_output=True,
+                text=True,
+            )
+            self.assertEqual(annotate.returncode, 0, annotate.stderr)
+            rows = [
+                json.loads(line)
+                for line in dataset_path.read_text(encoding="utf-8").splitlines()
+                if line.strip()
+            ]
+            self.assertEqual(len(rows), 3)
+            self.assertEqual([row["filename"] for row in rows], [
+                "[ANi] Full Show - 01 [1080P][WEB-DL].mkv",
+                "[ANi] Full Show - 02 [1080P][WEB-DL].mkv",
+                "[ANi] Full Show - 03 [1080P][WEB-DL].mkv",
+            ])
+            self.assertTrue(all(row["terminal_id"] == "t0" for row in rows))
+    def test_cli_examples_only_uses_terminal_value_examples(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmp = Path(tmpdir)
+            graph_path = tmp / "graph.json"
+            source_path = tmp / "dmhy_list.jsonl"
+            dataset_path = tmp / "dmhy_weak.generated.jsonl"
+            graph_path.write_text(
+                json.dumps(
+                    {
+                        "terminals": [
+                            {
+                                "terminal_id": "t0",
+                                "prefix": "[ANi] Example Show - ",
+                                "weight": 10,
+                                "value_examples": [
+                                    "[ANi] Example Show - 01 [1080P][WEB-DL].mkv"
+                                ],
+                                "suffix_examples": ["01 [1080P][WEB-DL]"],
+                            }
+                        ]
+                    },
+                    ensure_ascii=False,
+                ),
+                encoding="utf-8",
+            )
+            source_path.write_text(
+                "\n".join(
+                    json.dumps({"value": value}, ensure_ascii=False)
+                    for value in [
+                        "[ANi] Example Show - 01 [1080P][WEB-DL].mkv",
+                        "[ANi] Example Show - 02 [1080P][WEB-DL].mkv",
+                    ]
+                )
+                + "\n",
+                encoding="utf-8",
+            )
+            annotate = subprocess.run(
+                [
+                    sys.executable,
+                    "-m",
+                    "tools.annotate_dmhy_prefix_graph",
+                    "--graph",
+                    str(graph_path),
+                    "--source-list",
+                    str(source_path),
+                    "--output",
+                    str(dataset_path),
+                    "--patch-output",
+                    "",
+                    "--examples-only",
+                ],
+                check=False,
+                capture_output=True,
+                text=True,
+            )
+            self.assertEqual(annotate.returncode, 0, annotate.stderr)
+            rows = [
+                json.loads(line)
+                for line in dataset_path.read_text(encoding="utf-8").splitlines()
+                if line.strip()
+            ]
+            self.assertEqual(len(rows), 1)
+            self.assertEqual(rows[0]["filename"], "[ANi] Example Show - 01 [1080P][WEB-DL].mkv")
+    def test_cli_dag_annotation_units_include_shared_node_terminals(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmp = Path(tmpdir)
+            dag_path = tmp / "dmhy_prefix_dag.json"
+            output_path = tmp / "dmhy_prefix_dag.annotation_units.jsonl"
+            dag_path.write_text(
+                json.dumps(
+                    {
+                        "meta": {"version": "prefix-dag-v1"},
+                        "root": 0,
+                        "nodes": [
+                            {
+                                "id": 0,
+                                "terminal": False,
+                                "children": [
+                                    {"label": "A", "target": 1},
+                                    {"label": "B", "target": 2},
+                                ],
+                                "incoming_count": 0,
+                                "reachable_terminals": 2,
+                                "reachable_weight": 20,
+                            },
+                            {
+                                "id": 1,
+                                "terminal": False,
+                                "children": [{"label": " shared", "target": 3}],
+                                "incoming_count": 1,
+                                "reachable_terminals": 1,
+                                "reachable_weight": 10,
+                            },
+                            {
+                                "id": 2,
+                                "terminal": False,
+                                "children": [{"label": " shared", "target": 3}],
+                                "incoming_count": 1,
+                                "reachable_terminals": 1,
+                                "reachable_weight": 10,
+                            },
+                            {
+                                "id": 3,
+                                "terminal": False,
+                                "children": [
+                                    {"label": " 01", "target": 4},
+                                    {"label": " 02", "target": 5},
+                                ],
+                                "incoming_count": 2,
+                                "reachable_terminals": 2,
+                                "reachable_weight": 20,
+                            },
+                            {
+                                "id": 4,
+                                "terminal": True,
+                                "children": [],
+                                "incoming_count": 1,
+                                "reachable_terminals": 1,
+                                "reachable_weight": 10,
+                            },
+                            {
+                                "id": 5,
+                                "terminal": True,
+                                "children": [],
+                                "incoming_count": 1,
+                                "reachable_terminals": 1,
+                                "reachable_weight": 10,
+                            },
+                        ],
+                        "terminals": [
+                            {
+                                "terminal_id": "t0",
+                                "node_id": 4,
+                                "prefix": "Show A shared 01",
+                                "digit_skeleton": "Show A shared <NUM>",
+                                "count": 10,
+                                "weight": 10,
+                                "suffix_examples": [" [1080P][WEB-DL]"],
+                                "value_examples": ["Show A shared 01 [1080P][WEB-DL].mkv"],
+                                "annotations": {},
+                            },
+                            {
+                                "terminal_id": "t1",
+                                "node_id": 5,
+                                "prefix": "Show B shared 02",
+                                "digit_skeleton": "Show B shared <NUM>",
+                                "count": 10,
+                                "weight": 10,
+                                "suffix_examples": [" [1080P][WEB-DL]"],
+                                "value_examples": ["Show B shared 02 [1080P][WEB-DL].mkv"],
+                                "annotations": {},
+                            },
+                        ],
+                    },
+                    ensure_ascii=False,
+                ),
+                encoding="utf-8",
+            )
+            annotate = subprocess.run(
+                [
+                    sys.executable,
+                    "-m",
+                    "tools.annotate_dmhy_prefix_dag",
+                    "--dag",
+                    str(dag_path),
+                    "--output",
+                    str(output_path),
+                    "--min-reachable-terminals",
+                    "2",
+                    "--min-incoming-count",
+                    "2",
+                    "--limit",
+                    "1",
+                ],
+                check=False,
+                capture_output=True,
+                text=True,
+            )
+            self.assertEqual(annotate.returncode, 0, annotate.stderr)
+            rows = [
+                json.loads(line)
+                for line in output_path.read_text(encoding="utf-8").splitlines()
+                if line.strip()
+            ]
+            self.assertEqual(len(rows), 1)
+            self.assertEqual(rows[0]["unit_id"], "dag-node-3")
+            self.assertEqual(rows[0]["kind"], "shared_suffix")
+            self.assertEqual(rows[0]["terminal_ids"], ["t0", "t1"])
+            self.assertEqual(
+                rows[0]["prefix_examples"],
+                ["Show A shared 01", "Show B shared 02"],
+            )
+            self.assertEqual(rows[0]["common_edge_labels"], [" 01", " 02"])
+            self.assertIn("annotations", rows[0])
+# Run the whole test suite when this file is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()