ModerRAS commited on
Commit
33bb11c
·
1 Parent(s): 4248c69

Add DMHY prefix graph annotation workflow

Browse files
datasets/AnimeName CHANGED
@@ -1 +1 @@
1
- Subproject commit 2ea069cd2c6f4c8b085bdfaddc5659781623cf45
 
1
+ Subproject commit ab3fbcad1a4bf889090d050248130c7d763c457e
tools/annotate_dmhy_prefix_dag.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Build annotation units from a DMHY prefix DAG.
2
+
3
+ The DAG producer keeps repeated suffix structure shared across many raw
4
+ prefixes. This tool turns those shared nodes into compact, traceable units for
5
+ LLM or human review without calling any remote service.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import json
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+ from typing import Any, Iterable
15
+
16
+ from tools.annotate_dmhy_prefix_graph import (
17
+ heuristic_patch,
18
+ string_list,
19
+ )
20
+
21
+
22
+ DEFAULT_DAG = Path("datasets/AnimeName/dmhy_prefix_dag.json")
23
+ DEFAULT_OUTPUT = Path("datasets/AnimeName/dmhy_prefix_dag.annotation_units.jsonl")
24
+
25
+
26
+ @dataclass(frozen=True)
27
+ class Args:
28
+ dag: Path
29
+ output: Path
30
+ min_reachable_terminals: int
31
+ min_incoming_count: int
32
+ limit: int | None
33
+ example_count: int
34
+
35
+
36
+ def parse_args() -> Args:
37
+ parser = argparse.ArgumentParser(
38
+ description="Emit DAG-aware DMHY prefix annotation units as JSONL"
39
+ )
40
+ parser.add_argument("--dag", type=Path, default=DEFAULT_DAG, help="Input dmhy_prefix_dag.json")
41
+ parser.add_argument(
42
+ "--output",
43
+ type=Path,
44
+ default=DEFAULT_OUTPUT,
45
+ help="Output annotation unit JSONL",
46
+ )
47
+ parser.add_argument(
48
+ "--min-reachable-terminals",
49
+ type=int,
50
+ default=2,
51
+ help="Select non-root nodes reaching at least this many terminals",
52
+ )
53
+ parser.add_argument(
54
+ "--min-incoming-count",
55
+ type=int,
56
+ default=2,
57
+ help="Select nodes with at least this many incoming DAG edges",
58
+ )
59
+ parser.add_argument("--limit", type=int, default=None, help="Maximum units to write")
60
+ parser.add_argument(
61
+ "--example-count",
62
+ type=int,
63
+ default=5,
64
+ help="Maximum examples retained per example field",
65
+ )
66
+ ns = parser.parse_args()
67
+ return Args(
68
+ dag=ns.dag,
69
+ output=ns.output,
70
+ min_reachable_terminals=max(1, ns.min_reachable_terminals),
71
+ min_incoming_count=max(1, ns.min_incoming_count),
72
+ limit=ns.limit,
73
+ example_count=max(1, ns.example_count),
74
+ )
75
+
76
+
77
+ def load_dag(path: Path) -> dict[str, Any]:
78
+ if not path.exists():
79
+ raise SystemExit(f"DAG not found: {path}")
80
+ try:
81
+ dag = json.loads(path.read_text(encoding="utf-8"))
82
+ except json.JSONDecodeError as exc:
83
+ raise SystemExit(f"invalid DAG JSON in {path}: {exc}") from exc
84
+ if not isinstance(dag, dict):
85
+ raise SystemExit(f"invalid DAG schema in {path}: root must be an object")
86
+ if not isinstance(dag.get("nodes"), list):
87
+ raise SystemExit(f"invalid DAG schema in {path}: missing nodes list")
88
+ if not isinstance(dag.get("terminals"), list):
89
+ raise SystemExit(f"invalid DAG schema in {path}: missing terminals list")
90
+ return dag
91
+
92
+
93
+ def node_id(node: dict[str, Any], fallback: int) -> int:
94
+ value = node.get("id", fallback)
95
+ try:
96
+ return int(value)
97
+ except (TypeError, ValueError):
98
+ raise SystemExit(f"invalid node id: {value!r}") from None
99
+
100
+
101
+ def terminal_id(terminal: dict[str, Any], fallback: int) -> str:
102
+ value = terminal.get("terminal_id", terminal.get("id", fallback))
103
+ return str(value)
104
+
105
+
106
+ def int_field(row: dict[str, Any], key: str, default: int = 0) -> int:
107
+ try:
108
+ return int(row.get(key, default) or default)
109
+ except (TypeError, ValueError):
110
+ return default
111
+
112
+
113
+ def build_indexes(dag: dict[str, Any]) -> tuple[dict[int, dict[str, Any]], dict[int, list[dict[str, Any]]]]:
114
+ nodes: dict[int, dict[str, Any]] = {}
115
+ for fallback, node in enumerate(dag["nodes"]):
116
+ if not isinstance(node, dict):
117
+ continue
118
+ nodes[node_id(node, fallback)] = node
119
+
120
+ terminals_by_node: dict[int, list[dict[str, Any]]] = {}
121
+ for fallback, terminal in enumerate(dag["terminals"]):
122
+ if not isinstance(terminal, dict):
123
+ continue
124
+ try:
125
+ terminal_node_id = int(terminal.get("node_id"))
126
+ except (TypeError, ValueError):
127
+ continue
128
+ terminal = dict(terminal)
129
+ terminal["_terminal_id"] = terminal_id(terminal, fallback)
130
+ terminal["_terminal_index"] = fallback
131
+ terminals_by_node.setdefault(terminal_node_id, []).append(terminal)
132
+ return nodes, terminals_by_node
133
+
134
+
135
+ def reachable_terminals(
136
+ start_node_id: int,
137
+ nodes: dict[int, dict[str, Any]],
138
+ terminals_by_node: dict[int, list[dict[str, Any]]],
139
+ ) -> list[dict[str, Any]]:
140
+ memo: dict[int, list[dict[str, Any]]] = {}
141
+ visiting: set[int] = set()
142
+
143
+ def visit(current_id: int) -> list[dict[str, Any]]:
144
+ if current_id in memo:
145
+ return memo[current_id]
146
+ if current_id in visiting:
147
+ raise SystemExit(f"cycle detected while traversing DAG at node {current_id}")
148
+ visiting.add(current_id)
149
+ found = list(terminals_by_node.get(current_id, []))
150
+ node = nodes.get(current_id, {})
151
+ for edge in node.get("children") or []:
152
+ if not isinstance(edge, dict):
153
+ continue
154
+ try:
155
+ target = int(edge.get("target"))
156
+ except (TypeError, ValueError):
157
+ continue
158
+ found.extend(visit(target))
159
+ visiting.remove(current_id)
160
+ deduped = dedupe_terminals(found)
161
+ memo[current_id] = deduped
162
+ return deduped
163
+
164
+ return visit(start_node_id)
165
+
166
+
167
+ def dedupe_terminals(terminals: Iterable[dict[str, Any]]) -> list[dict[str, Any]]:
168
+ seen: set[str] = set()
169
+ result: list[dict[str, Any]] = []
170
+ for terminal in terminals:
171
+ tid = str(terminal.get("_terminal_id") or terminal.get("terminal_id") or "")
172
+ if not tid or tid in seen:
173
+ continue
174
+ seen.add(tid)
175
+ result.append(terminal)
176
+ return result
177
+
178
+
179
+ def limited_unique(values: Iterable[str], limit: int) -> list[str]:
180
+ seen: set[str] = set()
181
+ result: list[str] = []
182
+ for value in values:
183
+ if not value or not value.strip() or value in seen:
184
+ continue
185
+ seen.add(value)
186
+ result.append(value)
187
+ if len(result) >= limit:
188
+ break
189
+ return result
190
+
191
+
192
+ def edge_labels(node: dict[str, Any], limit: int) -> list[str]:
193
+ labels = []
194
+ for edge in node.get("children") or []:
195
+ if isinstance(edge, dict) and edge.get("label") is not None:
196
+ labels.append(str(edge["label"]))
197
+ return limited_unique(labels, limit)
198
+
199
+
200
+ def aggregate_examples(terminals: list[dict[str, Any]], key: str, limit: int) -> list[str]:
201
+ values: list[str] = []
202
+ for terminal in terminals:
203
+ if key == "prefix":
204
+ value = terminal.get("prefix")
205
+ if value is not None:
206
+ values.append(str(value))
207
+ else:
208
+ values.extend(string_list(terminal.get(key)))
209
+ return limited_unique(values, limit)
210
+
211
+
212
+ def aggregate_needs_review(terminals: list[dict[str, Any]]) -> bool:
213
+ for index, terminal in enumerate(terminals):
214
+ annotations = terminal.get("annotations")
215
+ if isinstance(annotations, dict) and annotations.get("needs_llm_review") is not None:
216
+ if bool(annotations["needs_llm_review"]):
217
+ return True
218
+ continue
219
+ if heuristic_patch(terminal, index)["needs_llm_review"]:
220
+ return True
221
+ return False
222
+
223
+
224
+ def annotation_template() -> dict[str, Any]:
225
+ return {
226
+ "episode_title_suffixes": [],
227
+ "media_suffixes": [],
228
+ "title_candidates": [],
229
+ "notes": None,
230
+ }
231
+
232
+
233
+ def recommended_prompt(kind: str, terminal_count: int) -> str:
234
+ if kind == "shared_suffix":
235
+ return (
236
+ "Review the shared DAG suffix examples and mark episode-title text, media metadata, "
237
+ f"and possible title candidates for {terminal_count} linked terminals."
238
+ )
239
+ return "Review this terminal cluster and mark episode-title text, media metadata, and title candidates."
240
+
241
+
242
+ def make_unit(
243
+ node: dict[str, Any],
244
+ node_id_value: int,
245
+ terminals: list[dict[str, Any]],
246
+ kind: str,
247
+ example_count: int,
248
+ ) -> dict[str, Any]:
249
+ terminal_ids = [str(terminal["_terminal_id"]) for terminal in terminals]
250
+ reachable_weight = int_field(node, "reachable_weight")
251
+ if reachable_weight <= 0:
252
+ reachable_weight = sum(int_field(terminal, "weight", int_field(terminal, "count", 1)) for terminal in terminals)
253
+ return {
254
+ "unit_id": f"dag-node-{node_id_value}",
255
+ "node_id": node_id_value,
256
+ "kind": kind,
257
+ "incoming_count": int_field(node, "incoming_count"),
258
+ "reachable_terminals": len(terminals),
259
+ "reachable_weight": reachable_weight,
260
+ "terminal_ids": terminal_ids,
261
+ "prefix_examples": aggregate_examples(terminals, "prefix", example_count),
262
+ "value_examples": aggregate_examples(terminals, "value_examples", example_count),
263
+ "suffix_examples": aggregate_examples(terminals, "suffix_examples", example_count),
264
+ "common_edge_labels": edge_labels(node, example_count),
265
+ "needs_llm_review": aggregate_needs_review(terminals),
266
+ "recommended_prompt": recommended_prompt(kind, len(terminals)),
267
+ "annotations": annotation_template(),
268
+ }
269
+
270
+
271
+ def selected_units(dag: dict[str, Any], args: Args) -> list[dict[str, Any]]:
272
+ nodes, terminals_by_node = build_indexes(dag)
273
+ root = int(dag.get("root", 0) or 0)
274
+ candidates: list[tuple[tuple[int, int, int, int, int], dict[str, Any]]] = []
275
+
276
+ for current_id, node in nodes.items():
277
+ terminals = reachable_terminals(current_id, nodes, terminals_by_node)
278
+ if not terminals:
279
+ continue
280
+ incoming_count = int_field(node, "incoming_count")
281
+ reachable_count = len(terminals)
282
+ by_shared_incoming = incoming_count >= args.min_incoming_count
283
+ by_reachable = current_id != root and reachable_count >= args.min_reachable_terminals
284
+ if by_shared_incoming or by_reachable:
285
+ unit = make_unit(node, current_id, terminals, "shared_suffix", args.example_count)
286
+ sort_key = (
287
+ 0,
288
+ -unit["reachable_weight"],
289
+ -unit["reachable_terminals"],
290
+ -unit["incoming_count"],
291
+ current_id,
292
+ )
293
+ candidates.append((sort_key, unit))
294
+
295
+ covered_terminal_ids = {
296
+ terminal_id
297
+ for _sort_key, unit in candidates
298
+ for terminal_id in unit["terminal_ids"]
299
+ }
300
+ for current_id, terminals in terminals_by_node.items():
301
+ uncovered = [terminal for terminal in terminals if terminal["_terminal_id"] not in covered_terminal_ids]
302
+ if not uncovered:
303
+ continue
304
+ node = nodes.get(current_id, {"id": current_id})
305
+ unit = make_unit(node, current_id, uncovered, "terminal_cluster", args.example_count)
306
+ sort_key = (
307
+ 1,
308
+ -unit["reachable_weight"],
309
+ -unit["reachable_terminals"],
310
+ -unit["incoming_count"],
311
+ current_id,
312
+ )
313
+ candidates.append((sort_key, unit))
314
+
315
+ candidates.sort(key=lambda item: item[0])
316
+ units = [unit for _sort_key, unit in candidates]
317
+ if args.limit is not None:
318
+ units = units[: max(0, args.limit)]
319
+ return units
320
+
321
+
322
+ def write_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> int:
323
+ path.parent.mkdir(parents=True, exist_ok=True)
324
+ count = 0
325
+ with path.open("w", encoding="utf-8", newline="\n") as handle:
326
+ for row in rows:
327
+ handle.write(json.dumps(row, ensure_ascii=False, separators=(",", ":")) + "\n")
328
+ count += 1
329
+ return count
330
+
331
+
332
+ def main() -> None:
333
+ args = parse_args()
334
+ dag = load_dag(args.dag)
335
+ units = selected_units(dag, args)
336
+ count = write_jsonl(args.output, units)
337
+ summary = {
338
+ "dag": str(args.dag),
339
+ "output": str(args.output),
340
+ "annotation_units": count,
341
+ "min_reachable_terminals": args.min_reachable_terminals,
342
+ "min_incoming_count": args.min_incoming_count,
343
+ "example_count": args.example_count,
344
+ }
345
+ print(json.dumps(summary, ensure_ascii=False, indent=2))
346
+
347
+
348
+ if __name__ == "__main__":
349
+ main()
tools/annotate_dmhy_prefix_graph.py ADDED
@@ -0,0 +1,715 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Annotate DMHY prefix graph terminals and emit weak-label dataset rows.
2
+
3
+ The graph producer intentionally leaves terminal.annotations empty. This tool
4
+ adds a deterministic suffix-format layer without depending on network access:
5
+
6
+ - classify suffix examples into episode-title text vs media/hash metadata
7
+ - optionally ask an OpenAI-compatible Responses API for a second opinion
8
+ - write dmhy_weak-compatible JSONL records: filename, tokens, labels
9
+ - optionally write graph annotation patch JSONL and/or a merged graph JSON
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import json
16
+ import os
17
+ import re
18
+ import sys
19
+ import urllib.error
20
+ import urllib.request
21
+ from dataclasses import dataclass
22
+ from datetime import datetime, timezone
23
+ from pathlib import Path
24
+ from typing import Any, Iterable
25
+
26
+ from anifilebert.tokenizer import AnimeTokenizer
27
+ from tools.dmhy_dataset import weak_label_filename
28
+
29
+
30
+ DEFAULT_GRAPH = Path("datasets/AnimeName/dmhy_prefix_graph.json")
31
+ DEFAULT_SOURCE_LIST = Path("datasets/AnimeName/dmhy_list.jsonl")
32
+ DEFAULT_OUTPUT = Path("datasets/AnimeName/dmhy_weak.generated.jsonl")
33
+ DEFAULT_PATCH_OUTPUT = Path("datasets/AnimeName/dmhy_prefix_graph.annotations.jsonl")
34
+ DEFAULT_MODEL = "gpt-5.4-mini"
35
+
36
+ SOURCE = "heuristic-v1"
37
+ LLM_SOURCE = "responses-v1"
38
+
39
+ TRAILING_HASH_RE = re.compile(r"^[A-Fa-f0-9]{8,}$")
40
+ RESOLUTION_RE = re.compile(r"(?i)(?:\b\d{3,4}p\b|\b\dk\b|\b\d{3,4}[xX×]\d{3,4}\b)")
41
+ MEDIA_WORD_RE = re.compile(
42
+ r"(?i)\b(?:"
43
+ r"web[-_. ]?dl|web[-_. ]?rip|bdrip|blu[-_. ]?ray|bdmv|bd|dvd[-_. ]?rip|dvd|"
44
+ r"hdtv|tvrip|remux|x26[45]|h\.?26[45]|hevc|avc|av1|aac\d*(?:\.\d+)?|"
45
+ r"flac|mp3|dts|opus|10[-_. ]?bit|8[-_. ]?bit|hi10p|ma10p|yuv\d+p?\d*|"
46
+ r"chs|cht|gb|big5|jpn?|eng|m?subs?|assx?\d*|srtx?\d*|vfr|cfr|"
47
+ r"nf|netflix|amzn|baha|cr|abema|dsnp|hulu"
48
+ r")\b"
49
+ )
50
+ LANG_CJK_RE = re.compile(r"(?:字幕|简体|繁体|简中|繁中|日语|英语|双语|内封|外挂)")
51
+ QUOTED_RE = re.compile(r"[「『\"“](.+?)[」』\"”]")
52
+ BRACKET_SEGMENT_RE = re.compile(r"(\[[^\]]+\]|\([^)]+\)|【[^】]+】|《[^》]+》)")
53
+ PATH_EPISODE_TITLE_RE = re.compile(
54
+ r"(?i)(?:^|[/\\])[^/\\]*?(?:"
55
+ r"S\d{1,2}E\d{1,4}|\d{1,2}x\d{1,4}|EP?\.?\s*\d{1,4}|ACT\.?\d{1,4}|第\s*\d{1,4}\s*[話话集回]"
56
+ r")\s*[-_ ]+(?P<title>[^/\\\[\(【《]+)"
57
+ )
58
+
59
+
60
+ @dataclass
61
+ class Args:
62
+ graph: Path
63
+ source_list: Path
64
+ output: Path
65
+ patch_output: Path | None
66
+ merge_output: Path | None
67
+ limit: int | None
68
+ min_weight: int | None
69
+ only_needs_review: bool
70
+ llm: bool
71
+ base_url: str
72
+ api_key: str | None
73
+ model: str
74
+ max_requests: int | None
75
+ http_timeout: int
76
+ preserve_i_labels: bool
77
+ examples_only: bool
78
+
79
+
80
+ def parse_args() -> Args:
81
+ parser = argparse.ArgumentParser(
82
+ description="Annotate DMHY prefix graph terminals and write dmhy_weak-compatible rows"
83
+ )
84
+ parser.add_argument("--graph", type=Path, default=DEFAULT_GRAPH, help="Input dmhy_prefix_graph.json")
85
+ parser.add_argument(
86
+ "--source-list",
87
+ type=Path,
88
+ default=DEFAULT_SOURCE_LIST,
89
+ help="Input dmhy_list.jsonl with full raw values; each line must contain a JSON object with value",
90
+ )
91
+ parser.add_argument(
92
+ "--output",
93
+ type=Path,
94
+ default=DEFAULT_OUTPUT,
95
+ help="Output dataset JSONL records compatible with dmhy_weak.jsonl",
96
+ )
97
+ parser.add_argument(
98
+ "--patch-output",
99
+ default=str(DEFAULT_PATCH_OUTPUT),
100
+ help="Optional JSONL terminal annotation patches; use empty string to disable",
101
+ )
102
+ parser.add_argument("--merge-output", type=Path, default=None, help="Optional full graph JSON with terminal.annotations merged")
103
+ parser.add_argument("--limit", type=int, default=None, help="Maximum selected terminals to process")
104
+ parser.add_argument("--min-weight", type=int, default=None, help="Only process terminals with weight >= this value")
105
+ parser.add_argument("--only-needs-review", action="store_true", help="Only process terminals with ambiguous suffix examples")
106
+ parser.add_argument("--llm", action="store_true", help="Opt in to Responses API annotation")
107
+ parser.add_argument(
108
+ "--base-url",
109
+ default=os.environ.get("ANIFILEBERT_LLM_BASE_URL", "http://10.137.32.209:8317/v1"),
110
+ help="OpenAI-compatible API base URL; used only with --llm",
111
+ )
112
+ parser.add_argument(
113
+ "--api-key",
114
+ default=os.environ.get("ANIFILEBERT_LLM_API_KEY"),
115
+ help="API key; falls back to ANIFILEBERT_LLM_API_KEY",
116
+ )
117
+ parser.add_argument("--model", default=DEFAULT_MODEL, help="Responses API model")
118
+ parser.add_argument("--max-requests", type=int, default=None, help="Maximum LLM requests; omitted means no cap")
119
+ parser.add_argument("--http-timeout", type=int, default=120, help="HTTP timeout in seconds per LLM request")
120
+ parser.add_argument(
121
+ "--preserve-i-labels",
122
+ action="store_true",
123
+ help="Keep I-* labels from weak labeling instead of normalizing generated token labels to B/O only",
124
+ )
125
+ parser.add_argument(
126
+ "--examples-only",
127
+ action="store_true",
128
+ help="Use terminal.value_examples only; preserves the old small-sample behavior",
129
+ )
130
+ ns = parser.parse_args()
131
+ patch_output_arg = str(ns.patch_output).strip()
132
+ patch_output = Path(patch_output_arg) if patch_output_arg else None
133
+ if patch_output is not None and str(patch_output).strip() == "":
134
+ patch_output = None
135
+ return Args(
136
+ graph=ns.graph,
137
+ source_list=ns.source_list,
138
+ output=ns.output,
139
+ patch_output=patch_output,
140
+ merge_output=ns.merge_output,
141
+ limit=ns.limit,
142
+ min_weight=ns.min_weight,
143
+ only_needs_review=ns.only_needs_review,
144
+ llm=ns.llm,
145
+ base_url=ns.base_url,
146
+ api_key=ns.api_key,
147
+ model=ns.model,
148
+ max_requests=ns.max_requests,
149
+ http_timeout=ns.http_timeout,
150
+ preserve_i_labels=ns.preserve_i_labels,
151
+ examples_only=ns.examples_only,
152
+ )
153
+
154
+
155
+ def load_graph(path: Path) -> dict[str, Any]:
156
+ if not path.exists():
157
+ raise SystemExit(f"graph not found: {path}")
158
+ try:
159
+ graph = json.loads(path.read_text(encoding="utf-8"))
160
+ except json.JSONDecodeError as exc:
161
+ raise SystemExit(f"invalid graph JSON in {path}: {exc}") from exc
162
+ if not isinstance(graph, dict):
163
+ raise SystemExit(f"invalid graph schema in {path}: root must be an object")
164
+ terminals = graph.get("terminals")
165
+ if not isinstance(terminals, list):
166
+ raise SystemExit(f"invalid graph schema in {path}: missing terminals list")
167
+ if not terminals:
168
+ raise SystemExit(f"graph has no terminals: {path}")
169
+ return graph
170
+
171
+
172
+ def terminal_id(terminal: dict[str, Any], index: int) -> str:
173
+ for key in ("terminal_id", "id", "node_id"):
174
+ value = terminal.get(key)
175
+ if value is not None:
176
+ return str(value)
177
+ return str(index)
178
+
179
+
180
+ def string_list(value: Any) -> list[str]:
181
+ if not isinstance(value, list):
182
+ return []
183
+ return [str(item) for item in value if str(item).strip()]
184
+
185
+
186
+ def unique_keep_order(values: Iterable[str]) -> list[str]:
187
+ seen: set[str] = set()
188
+ result: list[str] = []
189
+ for value in values:
190
+ cleaned = normalize_space(value)
191
+ if not cleaned or cleaned in seen:
192
+ continue
193
+ seen.add(cleaned)
194
+ result.append(cleaned)
195
+ return result
196
+
197
+
198
+ def normalize_space(value: str) -> str:
199
+ return re.sub(r"\s+", " ", value).strip()
200
+
201
+
202
+ def clean_candidate(value: str) -> str:
203
+ value = normalize_space(value)
204
+ value = value.strip("-_ .~/\\|")
205
+ value = value.strip("[]()【】《》「」『』\"“”")
206
+ return normalize_space(value.replace("_", " "))
207
+
208
+
209
+ def is_media_fragment(value: str) -> bool:
210
+ text = clean_candidate(value)
211
+ if not text:
212
+ return False
213
+ if TRAILING_HASH_RE.fullmatch(text):
214
+ return True
215
+ if RESOLUTION_RE.search(text) or MEDIA_WORD_RE.search(text) or LANG_CJK_RE.search(text):
216
+ return True
217
+ if len(text) <= 16 and re.fullmatch(r"[A-Fa-f0-9]{8,}(?:\s*rev)?", text):
218
+ return True
219
+ return False
220
+
221
+
222
+ def split_suffix_fragments(suffix: str) -> tuple[list[str], list[str]]:
223
+ episode_titles: list[str] = []
224
+ media: list[str] = []
225
+
226
+ for match in QUOTED_RE.finditer(suffix):
227
+ candidate = clean_candidate(match.group(1))
228
+ if candidate and not is_media_fragment(candidate):
229
+ episode_titles.append(candidate)
230
+
231
+ remainder = suffix
232
+ for segment in BRACKET_SEGMENT_RE.findall(suffix):
233
+ cleaned = clean_candidate(segment)
234
+ if is_media_fragment(cleaned):
235
+ media.append(segment.strip())
236
+ remainder = remainder.replace(segment, " ", 1)
237
+
238
+ for match in PATH_EPISODE_TITLE_RE.finditer(suffix):
239
+ candidate = clean_candidate(match.group("title"))
240
+ if candidate and not is_media_fragment(candidate):
241
+ episode_titles.append(candidate)
242
+
243
+ for piece in re.split(r"[/\\]", remainder):
244
+ cleaned = clean_candidate(piece)
245
+ if not cleaned:
246
+ continue
247
+ if is_media_fragment(cleaned):
248
+ media.append(cleaned)
249
+ elif QUOTED_RE.search(piece):
250
+ continue
251
+ elif looks_like_plain_episode_title(cleaned):
252
+ episode_titles.append(cleaned)
253
+
254
+ return unique_keep_order(episode_titles), unique_keep_order(media)
255
+
256
+
257
+ def looks_like_plain_episode_title(value: str) -> bool:
258
+ if len(value) < 3 or is_media_fragment(value):
259
+ return False
260
+ if re.fullmatch(r"(?i)(?:part|ova|special|season|stage|act)\s*\d+", value):
261
+ return False
262
+ if re.fullmatch(r"[\d\s._-]+", value):
263
+ return False
264
+ return bool(re.search(r"[A-Za-z\u3040-\u30ff\u3400-\u9fff]", value))
265
+
266
+
267
+ def heuristic_patch(terminal: dict[str, Any], index: int) -> dict[str, Any]:
268
+ suffix_examples = string_list(terminal.get("suffix_examples"))
269
+ value_examples = string_list(terminal.get("value_examples"))
270
+ episode_titles: list[str] = []
271
+ media_suffixes: list[str] = []
272
+
273
+ for suffix in suffix_examples:
274
+ title_bits, media_bits = split_suffix_fragments(suffix)
275
+ episode_titles.extend(title_bits)
276
+ media_suffixes.extend(media_bits)
277
+
278
+ if not episode_titles:
279
+ for value in value_examples:
280
+ for match in PATH_EPISODE_TITLE_RE.finditer(value):
281
+ candidate = clean_candidate(match.group("title"))
282
+ if candidate and not is_media_fragment(candidate):
283
+ episode_titles.append(candidate)
284
+
285
+ episode_titles = unique_keep_order(episode_titles)
286
+ media_suffixes = unique_keep_order(media_suffixes)
287
+ title_candidates = unique_keep_order(clean_candidate(item) for item in episode_titles)
288
+ needs_review = needs_llm_review(terminal, episode_titles, media_suffixes)
289
+ notes = summarize_notes(suffix_examples, episode_titles, media_suffixes, needs_review)
290
+ return {
291
+ "terminal_id": terminal_id(terminal, index),
292
+ "needs_llm_review": needs_review,
293
+ "episode_title_suffixes": episode_titles,
294
+ "media_suffixes": media_suffixes,
295
+ "title_candidates": title_candidates,
296
+ "llm_label": None,
297
+ "notes": notes,
298
+ "source": SOURCE,
299
+ }
300
+
301
+
302
+ def needs_llm_review(
303
+ terminal: dict[str, Any],
304
+ episode_titles: list[str],
305
+ media_suffixes: list[str],
306
+ ) -> bool:
307
+ suffix_examples = string_list(terminal.get("suffix_examples"))
308
+ if not suffix_examples:
309
+ return False
310
+ classified = len(episode_titles) + len(media_suffixes)
311
+ if episode_titles and media_suffixes:
312
+ return True
313
+ if classified == 0:
314
+ return True
315
+ suffix_text = " ".join(suffix_examples)
316
+ if "/" in suffix_text or "\\" in suffix_text:
317
+ return True
318
+ return False
319
+
320
+
321
+ def summarize_notes(
322
+ suffix_examples: list[str],
323
+ episode_titles: list[str],
324
+ media_suffixes: list[str],
325
+ needs_review: bool,
326
+ ) -> str:
327
+ parts = [
328
+ f"suffix_examples={len(suffix_examples)}",
329
+ f"episode_title_suffixes={len(episode_titles)}",
330
+ f"media_suffixes={len(media_suffixes)}",
331
+ ]
332
+ if needs_review:
333
+ parts.append("ambiguous_suffix_layer")
334
+ return "; ".join(parts)
335
+
336
+
337
+ def selected_terminals(graph: dict[str, Any], args: Args) -> list[tuple[int, dict[str, Any], dict[str, Any]]]:
338
+ selected: list[tuple[int, dict[str, Any], dict[str, Any]]] = []
339
+ for index, terminal in enumerate(graph["terminals"]):
340
+ if not isinstance(terminal, dict):
341
+ continue
342
+ weight = int(terminal.get("weight") or terminal.get("count") or 0)
343
+ if args.min_weight is not None and weight < args.min_weight:
344
+ continue
345
+ patch = heuristic_patch(terminal, index)
346
+ if args.only_needs_review and not patch["needs_llm_review"]:
347
+ continue
348
+ selected.append((index, terminal, patch))
349
+ if args.limit is not None and len(selected) >= args.limit:
350
+ break
351
+ return selected
352
+
353
+
354
+ def responses_url(base_url: str) -> str:
355
+ return base_url.rstrip("/") + "/responses"
356
+
357
+
358
+ def extract_response_text(data: dict[str, Any]) -> str:
359
+ output_text = data.get("output_text")
360
+ if isinstance(output_text, str) and output_text.strip():
361
+ return output_text
362
+ chunks: list[str] = []
363
+ for item in data.get("output") or []:
364
+ if not isinstance(item, dict):
365
+ continue
366
+ for content in item.get("content") or []:
367
+ if not isinstance(content, dict):
368
+ continue
369
+ text = content.get("text")
370
+ if isinstance(text, str):
371
+ chunks.append(text)
372
+ return "\n".join(chunks).strip()
373
+
374
+
375
+ def call_llm(terminal: dict[str, Any], patch: dict[str, Any], args: Args) -> dict[str, Any] | None:
376
+ if not args.api_key:
377
+ raise RuntimeError("--llm requires --api-key or ANIFILEBERT_LLM_API_KEY")
378
+
379
+ instructions = (
380
+ "You annotate anime filename suffix examples. Return strict JSON only with keys "
381
+ "episode_title_suffixes, media_suffixes, title_candidates, llm_label, notes. "
382
+ "Classify quoted human episode titles separately from media tags such as resolution, "
383
+ "codec, source, language, subtitle markers, hashes, and release metadata."
384
+ )
385
+ payload = {
386
+ "model": args.model,
387
+ "instructions": instructions,
388
+ "input": json.dumps(
389
+ {
390
+ "terminal_id": patch["terminal_id"],
391
+ "prefix": terminal.get("prefix"),
392
+ "digit_skeleton": terminal.get("digit_skeleton"),
393
+ "suffix_examples": string_list(terminal.get("suffix_examples")),
394
+ "value_examples": string_list(terminal.get("value_examples")),
395
+ "heuristic_patch": patch,
396
+ },
397
+ ensure_ascii=False,
398
+ ),
399
+ }
400
+ request = urllib.request.Request(
401
+ responses_url(args.base_url),
402
+ data=json.dumps(payload, ensure_ascii=False).encode("utf-8"),
403
+ headers={
404
+ "Authorization": f"Bearer {args.api_key}",
405
+ "Content-Type": "application/json",
406
+ },
407
+ method="POST",
408
+ )
409
+ try:
410
+ with urllib.request.urlopen(request, timeout=args.http_timeout) as response:
411
+ raw = response.read().decode("utf-8")
412
+ except urllib.error.HTTPError as exc:
413
+ body = exc.read().decode("utf-8", errors="replace")
414
+ raise RuntimeError(f"Responses API HTTP {exc.code}: {body[:500]}") from exc
415
+ except urllib.error.URLError as exc:
416
+ raise RuntimeError(f"Responses API request failed: {exc}") from exc
417
+
418
+ try:
419
+ data = json.loads(raw)
420
+ text = extract_response_text(data)
421
+ annotation = json.loads(strip_json_fence(text))
422
+ except (json.JSONDecodeError, TypeError) as exc:
423
+ raise RuntimeError(f"Responses API returned non-JSON annotation: {raw[:500]}") from exc
424
+
425
+ if not isinstance(annotation, dict):
426
+ raise RuntimeError("Responses API annotation must be a JSON object")
427
+ merged = dict(patch)
428
+ for key in ("episode_title_suffixes", "media_suffixes", "title_candidates"):
429
+ if key in annotation:
430
+ merged[key] = unique_keep_order(str(item) for item in annotation.get(key) or [])
431
+ if "llm_label" in annotation:
432
+ merged["llm_label"] = annotation["llm_label"]
433
+ if "notes" in annotation:
434
+ merged["notes"] = str(annotation["notes"])
435
+ merged["source"] = LLM_SOURCE
436
+ return merged
437
+
438
+
439
+ def strip_json_fence(text: str) -> str:
440
+ text = text.strip()
441
+ text = re.sub(r"^```(?:json)?\s*", "", text)
442
+ text = re.sub(r"\s*```$", "", text)
443
+ return text.strip()
444
+
445
+
446
+ PREFIX_BOUNDARY_CHARS = set(" \t\r\n-_.~/\\|::[]()【】《》「」『』\"'")
447
+
448
+
449
+ def prefix_boundary_ok(value: str, prefix: str) -> bool:
450
+ if not prefix or not value.startswith(prefix):
451
+ return False
452
+ if len(value) == len(prefix):
453
+ return True
454
+ next_char = value[len(prefix)]
455
+ last_char = prefix[-1]
456
+ return next_char in PREFIX_BOUNDARY_CHARS or last_char in PREFIX_BOUNDARY_CHARS
457
+
458
+
459
+ class PrefixTrieNode:
460
+ __slots__ = ("children", "terminal_ordinals")
461
+
462
+ def __init__(self) -> None:
463
+ self.children: dict[str, PrefixTrieNode] = {}
464
+ self.terminal_ordinals: list[int] = []
465
+
466
+
467
+ def build_prefix_trie(selected: list[tuple[int, dict[str, Any], dict[str, Any]]]) -> PrefixTrieNode:
468
+ root = PrefixTrieNode()
469
+ for ordinal, (_index, terminal, _patch) in enumerate(selected):
470
+ prefix = str(terminal.get("prefix") or "")
471
+ if not prefix:
472
+ continue
473
+ node = root
474
+ for char in prefix:
475
+ node = node.children.setdefault(char, PrefixTrieNode())
476
+ node.terminal_ordinals.append(ordinal)
477
+ return root
478
+
479
+
480
+ def matching_terminal_ordinal(value: str, trie: PrefixTrieNode, selected: list[tuple[int, dict[str, Any], dict[str, Any]]]) -> int | None:
481
+ node = trie
482
+ best: int | None = None
483
+ for char in value:
484
+ node = node.children.get(char)
485
+ if node is None:
486
+ break
487
+ for ordinal in node.terminal_ordinals:
488
+ prefix = str(selected[ordinal][1].get("prefix") or "")
489
+ if prefix_boundary_ok(value, prefix):
490
+ best = ordinal
491
+ return best
492
+
493
+
494
+ def source_list_matches(
495
+ source_list: Path,
496
+ selected: list[tuple[int, dict[str, Any], dict[str, Any]]],
497
+ ) -> dict[int, list[tuple[int, str]]]:
498
+ if not source_list.exists():
499
+ raise SystemExit(f"source list not found: {source_list}")
500
+
501
+ trie = build_prefix_trie(selected)
502
+ matches: dict[int, list[tuple[int, str]]] = {ordinal: [] for ordinal in range(len(selected))}
503
+ with source_list.open("r", encoding="utf-8") as handle:
504
+ for line_number, line in enumerate(handle, start=1):
505
+ if not line.strip():
506
+ continue
507
+ try:
508
+ row = json.loads(line)
509
+ except json.JSONDecodeError as exc:
510
+ raise SystemExit(f"invalid JSON in {source_list}:{line_number}: {exc}") from exc
511
+ if not isinstance(row, dict):
512
+ continue
513
+ value = row.get("value")
514
+ if not isinstance(value, str) or not value.strip():
515
+ continue
516
+ ordinal = matching_terminal_ordinal(value, trie, selected)
517
+ if ordinal is not None:
518
+ matches[ordinal].append((line_number, value))
519
+ return matches
520
+
521
+
522
+ def dataset_records(
523
+ terminal: dict[str, Any],
524
+ index: int,
525
+ patch: dict[str, Any],
526
+ tokenizer: AnimeTokenizer,
527
+ *,
528
+ filenames: Iterable[tuple[int, str]] | None = None,
529
+ preserve_i_labels: bool = False,
530
+ ) -> list[dict[str, Any]]:
531
+ records: list[dict[str, Any]] = []
532
+ seen: set[str] = set()
533
+ if filenames is None:
534
+ filenames = enumerate(string_list(terminal.get("value_examples")))
535
+ for source_index, filename in filenames:
536
+ if filename in seen:
537
+ continue
538
+ seen.add(filename)
539
+ sample = weak_label_filename(filename, tokenizer)
540
+ if sample is None:
541
+ continue
542
+ tokens, labels = normalize_generated_tokens(
543
+ sample["tokens"],
544
+ sample["labels"],
545
+ preserve_i_labels=preserve_i_labels,
546
+ )
547
+ records.append(
548
+ {
549
+ "file_id": f"prefix-graph:{patch['terminal_id']}:{source_index}",
550
+ "filename": filename,
551
+ "tokens": tokens,
552
+ "labels": labels,
553
+ "terminal_id": patch["terminal_id"],
554
+ "terminal_index": index,
555
+ "source": patch["source"],
556
+ "needs_llm_review": patch["needs_llm_review"],
557
+ "episode_title_suffixes": patch["episode_title_suffixes"],
558
+ "media_suffixes": patch["media_suffixes"],
559
+ "title_candidates": patch["title_candidates"],
560
+ "annotations": {
561
+ "terminal_id": patch["terminal_id"],
562
+ "terminal_index": index,
563
+ "source": patch["source"],
564
+ "needs_llm_review": patch["needs_llm_review"],
565
+ "episode_title_suffixes": patch["episode_title_suffixes"],
566
+ "media_suffixes": patch["media_suffixes"],
567
+ "title_candidates": patch["title_candidates"],
568
+ "llm_label": patch["llm_label"],
569
+ "notes": patch["notes"],
570
+ },
571
+ }
572
+ )
573
+ return records
574
+
575
+
576
+ def is_standalone_separator(token: str) -> bool:
577
+ return len(token) == 1 and (token.isspace() or not token.isalnum())
578
+
579
+
580
+ def split_generated_token(token: str) -> list[str]:
581
+ pieces: list[str] = []
582
+ current: list[str] = []
583
+ for char in token:
584
+ if char.isspace() or not char.isalnum():
585
+ if current:
586
+ pieces.append("".join(current))
587
+ current.clear()
588
+ pieces.append(char)
589
+ else:
590
+ current.append(char)
591
+ if current:
592
+ pieces.append("".join(current))
593
+ return pieces
594
+
595
+
596
+ def b_only_label(label: str) -> str:
597
+ if label.startswith(("B-", "I-")):
598
+ return "B-" + label.split("-", 1)[1]
599
+ return "O" if label == "O" else str(label)
600
+
601
+
602
+ def normalize_generated_tokens(
603
+ tokens: list[str],
604
+ labels: list[str],
605
+ *,
606
+ preserve_i_labels: bool = False,
607
+ ) -> tuple[list[str], list[str]]:
608
+ normalized_tokens: list[str] = []
609
+ normalized_labels: list[str] = []
610
+ for token, label in zip(tokens, labels):
611
+ source_label = str(label)
612
+ entity_label = source_label if preserve_i_labels else b_only_label(source_label)
613
+ for piece in split_generated_token(str(token)):
614
+ normalized_tokens.append(piece)
615
+ if entity_label == "O" or is_standalone_separator(piece):
616
+ normalized_labels.append("O")
617
+ else:
618
+ normalized_labels.append(entity_label)
619
+ return normalized_tokens, normalized_labels
620
+
621
+
622
+ def write_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> int:
623
+ path.parent.mkdir(parents=True, exist_ok=True)
624
+ count = 0
625
+ with path.open("w", encoding="utf-8", newline="\n") as handle:
626
+ for row in rows:
627
+ handle.write(json.dumps(row, ensure_ascii=False, separators=(",", ":")) + "\n")
628
+ count += 1
629
+ return count
630
+
631
+
632
+ def merge_annotations(graph: dict[str, Any], patches_by_id: dict[str, dict[str, Any]]) -> dict[str, Any]:
633
+ merged = json.loads(json.dumps(graph, ensure_ascii=False))
634
+ for index, terminal in enumerate(merged.get("terminals") or []):
635
+ if not isinstance(terminal, dict):
636
+ continue
637
+ patch = patches_by_id.get(terminal_id(terminal, index))
638
+ if patch is None:
639
+ continue
640
+ terminal["annotations"] = {
641
+ "episode_title_suffixes": patch["episode_title_suffixes"],
642
+ "media_suffixes": patch["media_suffixes"],
643
+ "title_candidates": patch["title_candidates"],
644
+ "needs_llm_review": patch["needs_llm_review"],
645
+ "llm_label": patch["llm_label"],
646
+ "notes": patch["notes"],
647
+ "source": patch["source"],
648
+ "annotated_at": datetime.now(timezone.utc).isoformat(),
649
+ }
650
+ return merged
651
+
652
+
653
+ def main() -> None:
654
+ args = parse_args()
655
+ graph = load_graph(args.graph)
656
+ selected = selected_terminals(graph, args)
657
+ if not selected:
658
+ raise SystemExit("no terminals selected; adjust --limit/--min-weight/--only-needs-review")
659
+
660
+ tokenizer = AnimeTokenizer()
661
+ llm_requests = 0
662
+ patches: list[dict[str, Any]] = []
663
+ records: list[dict[str, Any]] = []
664
+ source_matches = None if args.examples_only else source_list_matches(args.source_list, selected)
665
+
666
+ for ordinal, (index, terminal, patch) in enumerate(selected):
667
+ if args.llm and patch["needs_llm_review"]:
668
+ if args.max_requests is None or llm_requests < args.max_requests:
669
+ try:
670
+ llm_patch = call_llm(terminal, patch, args)
671
+ if llm_patch is not None:
672
+ patch = llm_patch
673
+ llm_requests += 1
674
+ except RuntimeError as exc:
675
+ print(f"warning: terminal {patch['terminal_id']}: {exc}; using heuristic patch", file=sys.stderr)
676
+ patch["notes"] = f"{patch['notes']}; llm_error={exc}"
677
+ patches.append(patch)
678
+ records.extend(
679
+ dataset_records(
680
+ terminal,
681
+ index,
682
+ patch,
683
+ tokenizer,
684
+ filenames=None if args.examples_only else source_matches.get(ordinal, []),
685
+ preserve_i_labels=args.preserve_i_labels,
686
+ )
687
+ )
688
+
689
+ record_count = write_jsonl(args.output, records)
690
+ patch_count = 0
691
+ if args.patch_output is not None:
692
+ patch_count = write_jsonl(args.patch_output, patches)
693
+ if args.merge_output is not None:
694
+ args.merge_output.parent.mkdir(parents=True, exist_ok=True)
695
+ merged = merge_annotations(graph, {patch["terminal_id"]: patch for patch in patches})
696
+ args.merge_output.write_text(json.dumps(merged, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
697
+
698
+ summary = {
699
+ "graph": str(args.graph),
700
+ "source_list": None if args.examples_only else str(args.source_list),
701
+ "output": str(args.output),
702
+ "patch_output": str(args.patch_output) if args.patch_output is not None else None,
703
+ "merge_output": str(args.merge_output) if args.merge_output is not None else None,
704
+ "selected_terminals": len(selected),
705
+ "examples_only": args.examples_only,
706
+ "dataset_records": record_count,
707
+ "patches": patch_count,
708
+ "llm_enabled": args.llm,
709
+ "llm_requests": llm_requests,
710
+ }
711
+ print(json.dumps(summary, ensure_ascii=False, indent=2))
712
+
713
+
714
+ if __name__ == "__main__":
715
+ main()
tools/convert_annotated_dmhy_dataset.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Convert annotated DMHY graph JSONL into the character-tokenized dataset.
2
+
3
+ The annotated graph workflow is expected to produce records compatible with
4
+ ``dmhy_weak.jsonl``: each row has ``filename``, ``tokens``, and ``labels``.
5
+ This wrapper validates that contract, then reuses ``tools.convert_to_char_dataset``
6
+ for the token-to-character projection and manifest statistics.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import json
13
+ from collections import Counter
14
+ from datetime import datetime, timezone
15
+ from pathlib import Path
16
+ from statistics import mean
17
+ from typing import Iterable
18
+
19
+ from tools.convert_to_char_dataset import (
20
+ build_vocab,
21
+ convert_record,
22
+ coverage,
23
+ percentile,
24
+ )
25
+
26
+
27
+ DEFAULT_INPUT = Path("datasets/AnimeName/dmhy_weak.generated.jsonl")
28
+ DEFAULT_OUTPUT = Path("datasets/AnimeName/dmhy_weak.generated_char.jsonl")
29
+ DEFAULT_VOCAB_OUTPUT = Path("datasets/AnimeName/vocab.generated.char.json")
30
+ DEFAULT_MANIFEST_OUTPUT = Path(
31
+ "datasets/AnimeName/dmhy_weak.generated_char.manifest.json"
32
+ )
33
+ REQUIRED_FIELDS = ("filename", "tokens", "labels")
34
+
35
+
36
+ def is_separator_or_space(char: str) -> bool:
37
+ return char.isspace() or not char.isalnum()
38
+
39
+
40
+ def token_has_embedded_separator(token: str) -> bool:
41
+ return len(token) > 1 and any(is_separator_or_space(char) for char in token)
42
+
43
+
44
+ def is_bioish_label(label: object) -> bool:
45
+ if not isinstance(label, str):
46
+ return False
47
+ if label == "O":
48
+ return True
49
+ prefix, sep, entity = label.partition("-")
50
+ return sep == "-" and prefix in {"B", "I"} and bool(entity)
51
+
52
+
53
+ def validate_record(
54
+ record: object,
55
+ path: Path,
56
+ line_no: int,
57
+ *,
58
+ check_punctuation: bool = True,
59
+ ) -> dict:
60
+ if not isinstance(record, dict):
61
+ raise ValueError(f"{path}:{line_no}: record must be a JSON object")
62
+
63
+ missing = [field for field in REQUIRED_FIELDS if field not in record]
64
+ if missing:
65
+ raise ValueError(
66
+ f"{path}:{line_no}: missing required field(s): {', '.join(missing)}"
67
+ )
68
+
69
+ filename = record["filename"]
70
+ tokens = record["tokens"]
71
+ labels = record["labels"]
72
+
73
+ if not isinstance(filename, str) or not filename:
74
+ raise ValueError(f"{path}:{line_no}: filename must be a non-empty string")
75
+ if not isinstance(tokens, list):
76
+ raise ValueError(f"{path}:{line_no}: tokens must be a list")
77
+ if not isinstance(labels, list):
78
+ raise ValueError(f"{path}:{line_no}: labels must be a list")
79
+ if len(tokens) != len(labels):
80
+ raise ValueError(
81
+ f"{path}:{line_no}: token/label length mismatch: "
82
+ f"{len(tokens)} tokens, {len(labels)} labels"
83
+ )
84
+
85
+ for index, token in enumerate(tokens):
86
+ if not isinstance(token, str):
87
+ raise ValueError(f"{path}:{line_no}: tokens[{index}] must be a string")
88
+ if check_punctuation and token_has_embedded_separator(token):
89
+ raise ValueError(
90
+ f"{path}:{line_no}: tokens[{index}] contains punctuation, symbol, or "
91
+ f"whitespace that should be a standalone token: {token!r}"
92
+ )
93
+
94
+ for index, label in enumerate(labels):
95
+ if not is_bioish_label(label):
96
+ raise ValueError(
97
+ f"{path}:{line_no}: labels[{index}] is not BIO-ish: {label!r}"
98
+ )
99
+
100
+ return record
101
+
102
+
103
+ def iter_validated_jsonl(path: Path, *, check_punctuation: bool = True) -> Iterable[dict]:
104
+ with path.open("r", encoding="utf-8") as handle:
105
+ for line_no, line in enumerate(handle, 1):
106
+ line = line.strip()
107
+ if not line:
108
+ continue
109
+ try:
110
+ record = json.loads(line)
111
+ except json.JSONDecodeError as exc:
112
+ raise ValueError(f"{path}:{line_no}: invalid JSON") from exc
113
+ yield validate_record(
114
+ record,
115
+ path,
116
+ line_no,
117
+ check_punctuation=check_punctuation,
118
+ )
119
+
120
+
121
+ def parse_args() -> argparse.Namespace:
122
+ parser = argparse.ArgumentParser(
123
+ description=(
124
+ "Validate annotated DMHY graph JSONL and convert it to the "
125
+ "character-tokenized training format."
126
+ ),
127
+ epilog=(
128
+ "Equivalent projection logic is provided by "
129
+ "tools.convert_to_char_dataset.convert_record."
130
+ ),
131
+ )
132
+ parser.add_argument(
133
+ "--input",
134
+ default=str(DEFAULT_INPUT),
135
+ help=f"Input dmhy_weak-compatible JSONL (default: {DEFAULT_INPUT})",
136
+ )
137
+ parser.add_argument(
138
+ "--output",
139
+ default=str(DEFAULT_OUTPUT),
140
+ help=f"Output character-level JSONL (default: {DEFAULT_OUTPUT})",
141
+ )
142
+ parser.add_argument(
143
+ "--vocab-output",
144
+ default=str(DEFAULT_VOCAB_OUTPUT),
145
+ help=f"Output character vocab JSON (default: {DEFAULT_VOCAB_OUTPUT})",
146
+ )
147
+ parser.add_argument(
148
+ "--manifest-output",
149
+ default=str(DEFAULT_MANIFEST_OUTPUT),
150
+ help=(
151
+ "Output conversion manifest JSON "
152
+ f"(default: {DEFAULT_MANIFEST_OUTPUT})"
153
+ ),
154
+ )
155
+ parser.add_argument(
156
+ "--max-vocab-size",
157
+ type=int,
158
+ default=None,
159
+ help="Optional vocab cap including special tokens",
160
+ )
161
+ parser.add_argument("--limit", type=int, default=None, help="Convert only N rows")
162
+ parser.add_argument(
163
+ "--progress",
164
+ type=int,
165
+ default=50_000,
166
+ help="Print progress every N records",
167
+ )
168
+ parser.add_argument(
169
+ "--validate-only",
170
+ action="store_true",
171
+ help="Validate input records without writing converted outputs",
172
+ )
173
+ parser.add_argument(
174
+ "--allow-embedded-punctuation",
175
+ action="store_true",
176
+ help=(
177
+ "Skip the generated-workflow check that punctuation and whitespace "
178
+ "must be standalone tokens."
179
+ ),
180
+ )
181
+ return parser.parse_args()
182
+
183
+
184
+ def main() -> None:
185
+ args = parse_args()
186
+ input_path = Path(args.input)
187
+ output_path = Path(args.output)
188
+ vocab_path = Path(args.vocab_output)
189
+ manifest_path = Path(args.manifest_output)
190
+
191
+ if not input_path.exists():
192
+ raise FileNotFoundError(f"input JSONL does not exist: {input_path}")
193
+
194
+ if not args.validate_only:
195
+ output_path.parent.mkdir(parents=True, exist_ok=True)
196
+ vocab_path.parent.mkdir(parents=True, exist_ok=True)
197
+ manifest_path.parent.mkdir(parents=True, exist_ok=True)
198
+
199
+ char_counter: Counter[str] = Counter()
200
+ label_counter: Counter[str] = Counter()
201
+ row_count = 0
202
+ source_token_count = 0
203
+ char_token_count = 0
204
+ lengths: list[int] = []
205
+ examples: list[dict] = []
206
+
207
+ output_handle = None
208
+ try:
209
+ if not args.validate_only:
210
+ output_handle = output_path.open("w", encoding="utf-8", newline="\n")
211
+
212
+ for record in iter_validated_jsonl(
213
+ input_path,
214
+ check_punctuation=not args.allow_embedded_punctuation,
215
+ ):
216
+ converted = convert_record(record)
217
+
218
+ if output_handle is not None:
219
+ output_handle.write(
220
+ json.dumps(converted, ensure_ascii=False, separators=(",", ":"))
221
+ + "\n"
222
+ )
223
+
224
+ row_count += 1
225
+ source_token_count += converted["source_token_count"]
226
+ char_len = converted["char_token_count"]
227
+ char_token_count += char_len
228
+ lengths.append(char_len)
229
+ char_counter.update(converted["tokens"])
230
+ label_counter.update(converted["labels"])
231
+ if len(examples) < 5:
232
+ examples.append(converted)
233
+
234
+ if args.limit is not None and row_count >= args.limit:
235
+ break
236
+ if args.progress and row_count % args.progress == 0:
237
+ print(f"converted {row_count:,} rows; unique chars={len(char_counter):,}")
238
+ finally:
239
+ if output_handle is not None:
240
+ output_handle.close()
241
+
242
+ vocab = build_vocab(char_counter, args.max_vocab_size)
243
+
244
+ manifest = {
245
+ "created_at": datetime.now(timezone.utc).isoformat(),
246
+ "input": str(input_path),
247
+ "output": None if args.validate_only else str(output_path),
248
+ "vocab_output": None if args.validate_only else str(vocab_path),
249
+ "manifest_output": None if args.validate_only else str(manifest_path),
250
+ "tokenizer_variant": "char",
251
+ "source_workflow": "annotated_dmhy_graph",
252
+ "validation": {
253
+ "required_fields": list(REQUIRED_FIELDS),
254
+ "label_contract": "O or B-*/I-* with a non-empty entity name; B/O-only is accepted",
255
+ "punctuation_standalone": not args.allow_embedded_punctuation,
256
+ },
257
+ "projection": {
258
+ "B-X": "first char keeps B-X; remaining chars become I-X",
259
+ "I-X": "all chars keep I-X",
260
+ "O": "all chars keep O",
261
+ },
262
+ "row_count": row_count,
263
+ "source_token_count": source_token_count,
264
+ "char_token_count": char_token_count,
265
+ "unique_char_count": len(char_counter),
266
+ "vocab_size": len(vocab),
267
+ "max_vocab_size": args.max_vocab_size,
268
+ "vocab_coverage": coverage(char_counter, vocab),
269
+ "label_counts": dict(label_counter),
270
+ "char_length": {
271
+ "min": min(lengths) if lengths else 0,
272
+ "mean": mean(lengths) if lengths else 0,
273
+ "p50": percentile(lengths, 50),
274
+ "p90": percentile(lengths, 90),
275
+ "p95": percentile(lengths, 95),
276
+ "p99": percentile(lengths, 99),
277
+ "max": max(lengths) if lengths else 0,
278
+ },
279
+ "examples": examples,
280
+ }
281
+
282
+ if not args.validate_only:
283
+ vocab_path.write_text(
284
+ json.dumps(vocab, ensure_ascii=False, indent=2) + "\n",
285
+ encoding="utf-8",
286
+ )
287
+ manifest_path.write_text(
288
+ json.dumps(manifest, ensure_ascii=False, indent=2) + "\n",
289
+ encoding="utf-8",
290
+ )
291
+
292
+ print(
293
+ json.dumps(
294
+ {key: value for key, value in manifest.items() if key != "examples"},
295
+ ensure_ascii=False,
296
+ indent=2,
297
+ )
298
+ )
299
+
300
+
301
+ if __name__ == "__main__":
302
+ main()
tools/dmhy_prefix_grouper/Cargo.lock ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is automatically @generated by Cargo.
2
+ # It is not intended for manual editing.
3
+ version = 4
4
+
5
+ [[package]]
6
+ name = "aho-corasick"
7
+ version = "1.1.4"
8
+ source = "registry+https://github.com/rust-lang/crates.io-index"
9
+ checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
10
+ dependencies = [
11
+ "memchr",
12
+ ]
13
+
14
+ [[package]]
15
+ name = "anstream"
16
+ version = "1.0.0"
17
+ source = "registry+https://github.com/rust-lang/crates.io-index"
18
+ checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
19
+ dependencies = [
20
+ "anstyle",
21
+ "anstyle-parse",
22
+ "anstyle-query",
23
+ "anstyle-wincon",
24
+ "colorchoice",
25
+ "is_terminal_polyfill",
26
+ "utf8parse",
27
+ ]
28
+
29
+ [[package]]
30
+ name = "anstyle"
31
+ version = "1.0.14"
32
+ source = "registry+https://github.com/rust-lang/crates.io-index"
33
+ checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
34
+
35
+ [[package]]
36
+ name = "anstyle-parse"
37
+ version = "1.0.0"
38
+ source = "registry+https://github.com/rust-lang/crates.io-index"
39
+ checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
40
+ dependencies = [
41
+ "utf8parse",
42
+ ]
43
+
44
+ [[package]]
45
+ name = "anstyle-query"
46
+ version = "1.1.5"
47
+ source = "registry+https://github.com/rust-lang/crates.io-index"
48
+ checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
49
+ dependencies = [
50
+ "windows-sys",
51
+ ]
52
+
53
+ [[package]]
54
+ name = "anstyle-wincon"
55
+ version = "3.0.11"
56
+ source = "registry+https://github.com/rust-lang/crates.io-index"
57
+ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
58
+ dependencies = [
59
+ "anstyle",
60
+ "once_cell_polyfill",
61
+ "windows-sys",
62
+ ]
63
+
64
+ [[package]]
65
+ name = "anyhow"
66
+ version = "1.0.102"
67
+ source = "registry+https://github.com/rust-lang/crates.io-index"
68
+ checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
69
+
70
+ [[package]]
71
+ name = "clap"
72
+ version = "4.6.1"
73
+ source = "registry+https://github.com/rust-lang/crates.io-index"
74
+ checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
75
+ dependencies = [
76
+ "clap_builder",
77
+ "clap_derive",
78
+ ]
79
+
80
+ [[package]]
81
+ name = "clap_builder"
82
+ version = "4.6.0"
83
+ source = "registry+https://github.com/rust-lang/crates.io-index"
84
+ checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
85
+ dependencies = [
86
+ "anstream",
87
+ "anstyle",
88
+ "clap_lex",
89
+ "strsim",
90
+ ]
91
+
92
+ [[package]]
93
+ name = "clap_derive"
94
+ version = "4.6.1"
95
+ source = "registry+https://github.com/rust-lang/crates.io-index"
96
+ checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9"
97
+ dependencies = [
98
+ "heck",
99
+ "proc-macro2",
100
+ "quote",
101
+ "syn",
102
+ ]
103
+
104
+ [[package]]
105
+ name = "clap_lex"
106
+ version = "1.1.0"
107
+ source = "registry+https://github.com/rust-lang/crates.io-index"
108
+ checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
109
+
110
+ [[package]]
111
+ name = "colorchoice"
112
+ version = "1.0.5"
113
+ source = "registry+https://github.com/rust-lang/crates.io-index"
114
+ checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
115
+
116
+ [[package]]
117
+ name = "crossbeam-deque"
118
+ version = "0.8.6"
119
+ source = "registry+https://github.com/rust-lang/crates.io-index"
120
+ checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
121
+ dependencies = [
122
+ "crossbeam-epoch",
123
+ "crossbeam-utils",
124
+ ]
125
+
126
+ [[package]]
127
+ name = "crossbeam-epoch"
128
+ version = "0.9.18"
129
+ source = "registry+https://github.com/rust-lang/crates.io-index"
130
+ checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
131
+ dependencies = [
132
+ "crossbeam-utils",
133
+ ]
134
+
135
+ [[package]]
136
+ name = "crossbeam-utils"
137
+ version = "0.8.21"
138
+ source = "registry+https://github.com/rust-lang/crates.io-index"
139
+ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
140
+
141
+ [[package]]
142
+ name = "dmhy_prefix_grouper"
143
+ version = "0.1.0"
144
+ dependencies = [
145
+ "anyhow",
146
+ "clap",
147
+ "rayon",
148
+ "regex",
149
+ "serde",
150
+ "serde_json",
151
+ ]
152
+
153
+ [[package]]
154
+ name = "either"
155
+ version = "1.16.0"
156
+ source = "registry+https://github.com/rust-lang/crates.io-index"
157
+ checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
158
+
159
+ [[package]]
160
+ name = "heck"
161
+ version = "0.5.0"
162
+ source = "registry+https://github.com/rust-lang/crates.io-index"
163
+ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
164
+
165
+ [[package]]
166
+ name = "is_terminal_polyfill"
167
+ version = "1.70.2"
168
+ source = "registry+https://github.com/rust-lang/crates.io-index"
169
+ checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
170
+
171
+ [[package]]
172
+ name = "itoa"
173
+ version = "1.0.18"
174
+ source = "registry+https://github.com/rust-lang/crates.io-index"
175
+ checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
176
+
177
+ [[package]]
178
+ name = "memchr"
179
+ version = "2.8.1"
180
+ source = "registry+https://github.com/rust-lang/crates.io-index"
181
+ checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
182
+
183
+ [[package]]
184
+ name = "once_cell_polyfill"
185
+ version = "1.70.2"
186
+ source = "registry+https://github.com/rust-lang/crates.io-index"
187
+ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
188
+
189
+ [[package]]
190
+ name = "proc-macro2"
191
+ version = "1.0.106"
192
+ source = "registry+https://github.com/rust-lang/crates.io-index"
193
+ checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
194
+ dependencies = [
195
+ "unicode-ident",
196
+ ]
197
+
198
+ [[package]]
199
+ name = "quote"
200
+ version = "1.0.45"
201
+ source = "registry+https://github.com/rust-lang/crates.io-index"
202
+ checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
203
+ dependencies = [
204
+ "proc-macro2",
205
+ ]
206
+
207
+ [[package]]
208
+ name = "rayon"
209
+ version = "1.12.0"
210
+ source = "registry+https://github.com/rust-lang/crates.io-index"
211
+ checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
212
+ dependencies = [
213
+ "either",
214
+ "rayon-core",
215
+ ]
216
+
217
+ [[package]]
218
+ name = "rayon-core"
219
+ version = "1.13.0"
220
+ source = "registry+https://github.com/rust-lang/crates.io-index"
221
+ checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
222
+ dependencies = [
223
+ "crossbeam-deque",
224
+ "crossbeam-utils",
225
+ ]
226
+
227
+ [[package]]
228
+ name = "regex"
229
+ version = "1.12.3"
230
+ source = "registry+https://github.com/rust-lang/crates.io-index"
231
+ checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
232
+ dependencies = [
233
+ "aho-corasick",
234
+ "memchr",
235
+ "regex-automata",
236
+ "regex-syntax",
237
+ ]
238
+
239
+ [[package]]
240
+ name = "regex-automata"
241
+ version = "0.4.14"
242
+ source = "registry+https://github.com/rust-lang/crates.io-index"
243
+ checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
244
+ dependencies = [
245
+ "aho-corasick",
246
+ "memchr",
247
+ "regex-syntax",
248
+ ]
249
+
250
+ [[package]]
251
+ name = "regex-syntax"
252
+ version = "0.8.10"
253
+ source = "registry+https://github.com/rust-lang/crates.io-index"
254
+ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
255
+
256
+ [[package]]
257
+ name = "serde"
258
+ version = "1.0.228"
259
+ source = "registry+https://github.com/rust-lang/crates.io-index"
260
+ checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
261
+ dependencies = [
262
+ "serde_core",
263
+ "serde_derive",
264
+ ]
265
+
266
+ [[package]]
267
+ name = "serde_core"
268
+ version = "1.0.228"
269
+ source = "registry+https://github.com/rust-lang/crates.io-index"
270
+ checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
271
+ dependencies = [
272
+ "serde_derive",
273
+ ]
274
+
275
+ [[package]]
276
+ name = "serde_derive"
277
+ version = "1.0.228"
278
+ source = "registry+https://github.com/rust-lang/crates.io-index"
279
+ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
280
+ dependencies = [
281
+ "proc-macro2",
282
+ "quote",
283
+ "syn",
284
+ ]
285
+
286
+ [[package]]
287
+ name = "serde_json"
288
+ version = "1.0.150"
289
+ source = "registry+https://github.com/rust-lang/crates.io-index"
290
+ checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
291
+ dependencies = [
292
+ "itoa",
293
+ "memchr",
294
+ "serde",
295
+ "serde_core",
296
+ "zmij",
297
+ ]
298
+
299
+ [[package]]
300
+ name = "strsim"
301
+ version = "0.11.1"
302
+ source = "registry+https://github.com/rust-lang/crates.io-index"
303
+ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
304
+
305
+ [[package]]
306
+ name = "syn"
307
+ version = "2.0.117"
308
+ source = "registry+https://github.com/rust-lang/crates.io-index"
309
+ checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
310
+ dependencies = [
311
+ "proc-macro2",
312
+ "quote",
313
+ "unicode-ident",
314
+ ]
315
+
316
+ [[package]]
317
+ name = "unicode-ident"
318
+ version = "1.0.24"
319
+ source = "registry+https://github.com/rust-lang/crates.io-index"
320
+ checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
321
+
322
+ [[package]]
323
+ name = "utf8parse"
324
+ version = "0.2.2"
325
+ source = "registry+https://github.com/rust-lang/crates.io-index"
326
+ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
327
+
328
+ [[package]]
329
+ name = "windows-link"
330
+ version = "0.2.1"
331
+ source = "registry+https://github.com/rust-lang/crates.io-index"
332
+ checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
333
+
334
+ [[package]]
335
+ name = "windows-sys"
336
+ version = "0.61.2"
337
+ source = "registry+https://github.com/rust-lang/crates.io-index"
338
+ checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
339
+ dependencies = [
340
+ "windows-link",
341
+ ]
342
+
343
+ [[package]]
344
+ name = "zmij"
345
+ version = "1.0.21"
346
+ source = "registry+https://github.com/rust-lang/crates.io-index"
347
+ checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
tools/dmhy_prefix_grouper/Cargo.toml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [package]
2
+ name = "dmhy_prefix_grouper"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+
6
+ [dependencies]
7
+ anyhow = "1.0"
8
+ clap = { version = "4.5", features = ["derive"] }
9
+ rayon = "1.10"
10
+ regex = "1.11"
11
+ serde = { version = "1.0", features = ["derive"] }
12
+ serde_json = "1.0"
tools/dmhy_prefix_grouper/src/main.rs ADDED
@@ -0,0 +1,1070 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use std::collections::{BTreeMap, BTreeSet};
2
+ use std::fs::{self, File};
3
+ use std::io::{BufRead, BufReader, BufWriter};
4
+ use std::path::{Path, PathBuf};
5
+ use std::sync::LazyLock;
6
+
7
+ use anyhow::{Context, Result};
8
+ use clap::Parser;
9
+ use rayon::prelude::*;
10
+ use regex::Regex;
11
+ use serde::{Deserialize, Serialize};
12
+
13
+ const RADIX_VERSION: &str = "prefix-radix-v1";
14
+ const DAG_VERSION: &str = "prefix-dag-v1";
15
+
16
+ static EPISODE_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
17
+ vec![
18
+ Regex::new(r"(?i)\bS\d{1,2}E\d{1,4}\b").unwrap(),
19
+ Regex::new(r"(?i)\bEP?\.?\s*\d{1,4}\b").unwrap(),
20
+ Regex::new(r"第\s*\d{1,4}\s*[話话集回]").unwrap(),
21
+ Regex::new(r"\b\d{1,4}\s*[話话集回]").unwrap(),
22
+ Regex::new(r"[\[(]\s*\d{1,4}\s*[\])]").unwrap(),
23
+ ]
24
+ });
25
+
26
+ static DIGITS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\d+").unwrap());
27
+
28
+ #[derive(Parser)]
29
+ #[command(about = "Build a deterministic DMHY episode-prefix trie graph")]
30
+ struct Args {
31
+ #[arg(long, default_value = "datasets\\AnimeName\\dmhy_list.jsonl")]
32
+ input: PathBuf,
33
+
34
+ #[arg(long, default_value = "datasets\\AnimeName\\dmhy_prefix_graph.json")]
35
+ output: PathBuf,
36
+
37
+ #[arg(long, default_value_t = 2)]
38
+ min_count: usize,
39
+
40
+ #[arg(long, default_value_t = 5)]
41
+ example_count: usize,
42
+
43
+ #[arg(long)]
44
+ from_prefix_graph: Option<PathBuf>,
45
+
46
+ #[arg(long)]
47
+ dag_output: Option<PathBuf>,
48
+ }
49
+
50
+ #[derive(Clone, Deserialize)]
51
+ struct InputRecord {
52
+ value: String,
53
+ #[serde(default)]
54
+ uses_path: bool,
55
+ #[serde(default)]
56
+ has_trailing_hash: bool,
57
+ #[serde(default, rename = "has_digits")]
58
+ _has_digits: bool,
59
+ #[serde(default)]
60
+ #[serde(rename = "digit_skeleton")]
61
+ _digit_skeleton: String,
62
+ #[serde(default = "default_count")]
63
+ count: usize,
64
+ }
65
+
66
+ #[derive(Clone)]
67
+ struct PatternObservation {
68
+ source_index: usize,
69
+ prefix: String,
70
+ digit_skeleton: String,
71
+ suffix: String,
72
+ value: String,
73
+ uses_path: bool,
74
+ has_trailing_hash: bool,
75
+ count: usize,
76
+ }
77
+
78
+ #[derive(Default)]
79
+ struct GroupBuilder {
80
+ prefix: String,
81
+ digit_skeleton: String,
82
+ count: usize,
83
+ weight: usize,
84
+ uses_path_count: usize,
85
+ has_trailing_hash_count: usize,
86
+ suffix_examples: Vec<String>,
87
+ value_examples: Vec<String>,
88
+ }
89
+
90
+ struct TrieNode {
91
+ id: usize,
92
+ edge_label: String,
93
+ depth: usize,
94
+ parent: Option<usize>,
95
+ children_by_label: BTreeMap<String, usize>,
96
+ subtree_patterns: usize,
97
+ subtree_weight: usize,
98
+ }
99
+
100
+ #[derive(Serialize)]
101
+ struct GraphOutput {
102
+ meta: Meta,
103
+ nodes: Vec<OutputNode>,
104
+ terminals: Vec<OutputTerminal>,
105
+ }
106
+
107
+ #[derive(Serialize)]
108
+ struct Meta {
109
+ input: String,
110
+ output: String,
111
+ input_records: usize,
112
+ observations: usize,
113
+ no_episode_prefix: usize,
114
+ groups: usize,
115
+ nodes: usize,
116
+ max_depth: usize,
117
+ grouped_weight: usize,
118
+ min_count: usize,
119
+ example_count: usize,
120
+ tokenizer: TokenizerMeta,
121
+ }
122
+
123
+ #[derive(Serialize)]
124
+ struct TokenizerMeta {
125
+ version: &'static str,
126
+ notes: Vec<&'static str>,
127
+ }
128
+
129
+ #[derive(Serialize)]
130
+ struct OutputNode {
131
+ id: usize,
132
+ edge_label: String,
133
+ depth: usize,
134
+ children: Vec<usize>,
135
+ subtree_patterns: usize,
136
+ subtree_weight: usize,
137
+ }
138
+
139
+ #[derive(Serialize)]
140
+ struct OutputTerminal {
141
+ node_id: usize,
142
+ prefix: String,
143
+ digit_skeleton: String,
144
+ count: usize,
145
+ weight: usize,
146
+ uses_path_count: usize,
147
+ has_trailing_hash_count: usize,
148
+ suffix_examples: Vec<String>,
149
+ value_examples: Vec<String>,
150
+ annotations: TerminalAnnotations,
151
+ }
152
+
153
+ #[derive(Default, Serialize)]
154
+ struct TerminalAnnotations {
155
+ episode_title_suffixes: Vec<String>,
156
+ media_suffixes: Vec<String>,
157
+ title_candidates: Vec<String>,
158
+ needs_llm_review: bool,
159
+ llm_label: Option<String>,
160
+ notes: Option<String>,
161
+ }
162
+
163
+ #[derive(Deserialize)]
164
+ struct SourceGraph {
165
+ #[allow(dead_code)]
166
+ meta: serde_json::Value,
167
+ nodes: Vec<SourceNode>,
168
+ terminals: Vec<SourceTerminal>,
169
+ }
170
+
171
+ #[derive(Deserialize)]
172
+ struct SourceNode {
173
+ id: usize,
174
+ edge_label: String,
175
+ #[allow(dead_code)]
176
+ depth: usize,
177
+ #[serde(default)]
178
+ children: Vec<usize>,
179
+ }
180
+
181
+ #[derive(Clone, Deserialize, Serialize)]
182
+ struct SourceTerminal {
183
+ #[serde(default, skip_serializing_if = "Option::is_none")]
184
+ terminal_id: Option<String>,
185
+ node_id: usize,
186
+ prefix: String,
187
+ digit_skeleton: String,
188
+ count: usize,
189
+ weight: usize,
190
+ uses_path_count: usize,
191
+ has_trailing_hash_count: usize,
192
+ suffix_examples: Vec<String>,
193
+ value_examples: Vec<String>,
194
+ #[serde(default = "default_annotations_value")]
195
+ annotations: serde_json::Value,
196
+ }
197
+
198
+ #[derive(Serialize)]
199
+ struct DagOutput {
200
+ meta: DagMeta,
201
+ root: usize,
202
+ nodes: Vec<DagNode>,
203
+ terminals: Vec<DagTerminal>,
204
+ }
205
+
206
+ #[derive(Serialize)]
207
+ struct DagMeta {
208
+ version: &'static str,
209
+ input: String,
210
+ output: String,
211
+ source_nodes: usize,
212
+ source_terminals: usize,
213
+ dag_nodes: usize,
214
+ dag_edges: usize,
215
+ root: usize,
216
+ max_depth: usize,
217
+ merged_nodes: usize,
218
+ preserves_digits: bool,
219
+ merge_strategy: &'static str,
220
+ notes: Vec<&'static str>,
221
+ }
222
+
223
+ #[derive(Clone, Serialize)]
224
+ struct DagNode {
225
+ id: usize,
226
+ terminal: bool,
227
+ children: Vec<DagEdge>,
228
+ incoming_count: usize,
229
+ reachable_terminals: usize,
230
+ reachable_weight: usize,
231
+ }
232
+
233
+ #[derive(Clone, Serialize)]
234
+ struct DagEdge {
235
+ label: String,
236
+ target: usize,
237
+ }
238
+
239
+ #[derive(Serialize)]
240
+ struct DagTerminal {
241
+ terminal_id: String,
242
+ node_id: usize,
243
+ prefix: String,
244
+ digit_skeleton: String,
245
+ count: usize,
246
+ weight: usize,
247
+ uses_path_count: usize,
248
+ has_trailing_hash_count: usize,
249
+ suffix_examples: Vec<String>,
250
+ value_examples: Vec<String>,
251
+ annotations: serde_json::Value,
252
+ }
253
+
254
+ #[derive(Clone, Eq, Ord, PartialEq, PartialOrd)]
255
+ struct NodeSignature {
256
+ terminal: bool,
257
+ children: Vec<(String, usize)>,
258
+ }
259
+
260
+ #[derive(Clone)]
261
+ struct TempDagNode {
262
+ terminal: bool,
263
+ children: Vec<DagEdge>,
264
+ }
265
+
266
+ fn main() -> Result<()> {
267
+ let args = Args::parse();
268
+ if args.from_prefix_graph.is_some() || args.dag_output.is_some() {
269
+ let dag = build_dag_from_args(&args)?;
270
+ println!("{}", serde_json::to_string_pretty(&dag.meta)?);
271
+ } else {
272
+ let graph = build_graph(&args)?;
273
+ println!("{}", serde_json::to_string_pretty(&graph.meta)?);
274
+ }
275
+ Ok(())
276
+ }
277
+
278
+ fn build_graph(args: &Args) -> Result<GraphOutput> {
279
+ ensure_output_parent(&args.output)?;
280
+
281
+ let records = read_records(&args.input)?;
282
+ let input_records = records.len();
283
+
284
+ let mut observations = records
285
+ .par_iter()
286
+ .enumerate()
287
+ .filter_map(|(source_index, record)| build_observation(source_index, record))
288
+ .collect::<Vec<_>>();
289
+
290
+ observations.sort_by(|left, right| {
291
+ left.prefix
292
+ .cmp(&right.prefix)
293
+ .then(left.source_index.cmp(&right.source_index))
294
+ });
295
+
296
+ let no_episode_prefix = input_records.saturating_sub(observations.len());
297
+ let groups = aggregate_observations(&observations, args.min_count, args.example_count);
298
+ let (nodes, terminals, max_depth, grouped_weight) = build_trie(&groups);
299
+
300
+ let graph = GraphOutput {
301
+ meta: Meta {
302
+ input: display_path(&args.input),
303
+ output: display_path(&args.output),
304
+ input_records,
305
+ observations: observations.len(),
306
+ no_episode_prefix,
307
+ groups: terminals.len(),
308
+ nodes: nodes.len(),
309
+ max_depth,
310
+ grouped_weight,
311
+ min_count: args.min_count,
312
+ example_count: args.example_count,
313
+ tokenizer: TokenizerMeta {
314
+ version: RADIX_VERSION,
315
+ notes: vec![
316
+ "Episode prefixes are detected with the legacy regex/boundary rules.",
317
+ "Graph insertion preserves original prefix digits; digit_skeleton is secondary metadata only.",
318
+ "Graph nodes form a compressed radix trie over complete original prefix strings.",
319
+ "Edge labels split only at actual branch points; punctuation and separators do not force levels.",
320
+ ],
321
+ },
322
+ },
323
+ nodes,
324
+ terminals,
325
+ };
326
+
327
+ let output = File::create(&args.output)
328
+ .with_context(|| format!("failed to create {}", args.output.display()))?;
329
+ let writer = BufWriter::new(output);
330
+ serde_json::to_writer_pretty(writer, &graph)
331
+ .with_context(|| format!("failed to write {}", args.output.display()))?;
332
+
333
+ Ok(graph)
334
+ }
335
+
336
+ fn build_dag_from_args(args: &Args) -> Result<DagOutput> {
337
+ let input = args
338
+ .from_prefix_graph
339
+ .as_ref()
340
+ .unwrap_or(&args.output)
341
+ .to_path_buf();
342
+ let output = args
343
+ .dag_output
344
+ .as_ref()
345
+ .cloned()
346
+ .unwrap_or_else(|| PathBuf::from("datasets\\AnimeName\\dmhy_prefix_dag.json"));
347
+
348
+ build_dag(&input, &output)
349
+ }
350
+
351
+ fn build_dag(input_path: &Path, output_path: &Path) -> Result<DagOutput> {
352
+ ensure_output_parent(output_path)?;
353
+
354
+ let input = File::open(input_path)
355
+ .with_context(|| format!("failed to open {}", input_path.display()))?;
356
+ let reader = BufReader::new(input);
357
+ let source: SourceGraph = serde_json::from_reader(reader)
358
+ .with_context(|| format!("failed to parse {}", input_path.display()))?;
359
+
360
+ validate_source_graph(&source)?;
361
+ let dag = minimize_source_graph(&source, input_path, output_path)?;
362
+
363
+ let output = File::create(output_path)
364
+ .with_context(|| format!("failed to create {}", output_path.display()))?;
365
+ let writer = BufWriter::new(output);
366
+ serde_json::to_writer_pretty(writer, &dag)
367
+ .with_context(|| format!("failed to write {}", output_path.display()))?;
368
+
369
+ Ok(dag)
370
+ }
371
+
372
+ fn validate_source_graph(source: &SourceGraph) -> Result<()> {
373
+ for (index, node) in source.nodes.iter().enumerate() {
374
+ anyhow::ensure!(
375
+ node.id == index,
376
+ "source node id {} appears at index {}",
377
+ node.id,
378
+ index
379
+ );
380
+ for &child_id in &node.children {
381
+ anyhow::ensure!(
382
+ child_id < source.nodes.len(),
383
+ "source node {} references missing child {}",
384
+ node.id,
385
+ child_id
386
+ );
387
+ }
388
+ }
389
+
390
+ for terminal in &source.terminals {
391
+ anyhow::ensure!(
392
+ terminal.node_id < source.nodes.len(),
393
+ "terminal prefix {:?} references missing node {}",
394
+ terminal.prefix,
395
+ terminal.node_id
396
+ );
397
+ }
398
+
399
+ Ok(())
400
+ }
401
+
402
+ fn minimize_source_graph(
403
+ source: &SourceGraph,
404
+ input_path: &Path,
405
+ output_path: &Path,
406
+ ) -> Result<DagOutput> {
407
+ let terminal_source_nodes = source
408
+ .terminals
409
+ .iter()
410
+ .map(|terminal| terminal.node_id)
411
+ .collect::<BTreeSet<_>>();
412
+ let postorder = source_postorder(source);
413
+ let mut source_to_temp_dag = vec![usize::MAX; source.nodes.len()];
414
+ let mut signatures = BTreeMap::<NodeSignature, usize>::new();
415
+ let mut temp_nodes = Vec::<TempDagNode>::new();
416
+
417
+ for source_id in postorder {
418
+ let source_node = &source.nodes[source_id];
419
+ let mut children = source_node
420
+ .children
421
+ .iter()
422
+ .map(|&child_id| {
423
+ let child = &source.nodes[child_id];
424
+ let target = source_to_temp_dag[child_id];
425
+ anyhow::ensure!(
426
+ target != usize::MAX,
427
+ "source child {} was not canonicalized before parent {}",
428
+ child_id,
429
+ source_id
430
+ );
431
+ Ok((child.edge_label.clone(), target))
432
+ })
433
+ .collect::<Result<Vec<_>>>()?;
434
+ children.sort();
435
+
436
+ let signature = NodeSignature {
437
+ terminal: terminal_source_nodes.contains(&source_id),
438
+ children,
439
+ };
440
+
441
+ let temp_id = if let Some(&existing) = signatures.get(&signature) {
442
+ existing
443
+ } else {
444
+ let temp_id = temp_nodes.len();
445
+ temp_nodes.push(TempDagNode {
446
+ terminal: signature.terminal,
447
+ children: signature
448
+ .children
449
+ .iter()
450
+ .map(|(label, target)| DagEdge {
451
+ label: label.clone(),
452
+ target: *target,
453
+ })
454
+ .collect(),
455
+ });
456
+ signatures.insert(signature, temp_id);
457
+ temp_id
458
+ };
459
+
460
+ source_to_temp_dag[source_id] = temp_id;
461
+ }
462
+
463
+ let root_temp_id = source_to_temp_dag[0];
464
+ let (temp_to_dag, mut dag_nodes) = renumber_dag(root_temp_id, &temp_nodes);
465
+ for node in &mut dag_nodes {
466
+ for edge in &mut node.children {
467
+ edge.target = temp_to_dag[edge.target];
468
+ }
469
+ }
470
+
471
+ let terminals = source
472
+ .terminals
473
+ .iter()
474
+ .enumerate()
475
+ .map(|(index, terminal)| {
476
+ let temp_id = source_to_temp_dag[terminal.node_id];
477
+ DagTerminal {
478
+ terminal_id: terminal
479
+ .terminal_id
480
+ .clone()
481
+ .unwrap_or_else(|| format!("t{}", index)),
482
+ node_id: temp_to_dag[temp_id],
483
+ prefix: terminal.prefix.clone(),
484
+ digit_skeleton: terminal.digit_skeleton.clone(),
485
+ count: terminal.count,
486
+ weight: terminal.weight,
487
+ uses_path_count: terminal.uses_path_count,
488
+ has_trailing_hash_count: terminal.has_trailing_hash_count,
489
+ suffix_examples: terminal.suffix_examples.clone(),
490
+ value_examples: terminal.value_examples.clone(),
491
+ annotations: terminal.annotations.clone(),
492
+ }
493
+ })
494
+ .collect::<Vec<_>>();
495
+
496
+ fill_dag_counts(&source, &source_to_temp_dag, &temp_to_dag, &mut dag_nodes);
497
+ let dag_edges = dag_nodes.iter().map(|node| node.children.len()).sum();
498
+ let max_depth = dag_max_depth(&dag_nodes);
499
+
500
+ Ok(DagOutput {
501
+ meta: DagMeta {
502
+ version: DAG_VERSION,
503
+ input: display_path(input_path),
504
+ output: display_path(output_path),
505
+ source_nodes: source.nodes.len(),
506
+ source_terminals: source.terminals.len(),
507
+ dag_nodes: dag_nodes.len(),
508
+ dag_edges,
509
+ root: 0,
510
+ max_depth,
511
+ merged_nodes: source.nodes.len().saturating_sub(dag_nodes.len()),
512
+ preserves_digits: true,
513
+ merge_strategy:
514
+ "bottom-up suffix equivalence over radix trie nodes; edge labels remain raw substrings",
515
+ notes: vec![
516
+ "Terminal prefixes and digit_skeleton values are copied from the source graph; digits are not normalized for merging.",
517
+ "Node reachable_terminals and reachable_weight count terminal instances mapped onto the shared DAG suffix, so shared nodes report aggregate suffix usage rather than one caller-specific path.",
518
+ ],
519
+ },
520
+ root: 0,
521
+ nodes: dag_nodes,
522
+ terminals,
523
+ })
524
+ }
525
+
526
+ fn source_postorder(source: &SourceGraph) -> Vec<usize> {
527
+ let mut order = Vec::with_capacity(source.nodes.len());
528
+ let mut stack = vec![(0, false)];
529
+ while let Some((node_id, expanded)) = stack.pop() {
530
+ if expanded {
531
+ order.push(node_id);
532
+ } else {
533
+ stack.push((node_id, true));
534
+ for &child_id in source.nodes[node_id].children.iter().rev() {
535
+ stack.push((child_id, false));
536
+ }
537
+ }
538
+ }
539
+ order
540
+ }
541
+
542
+ fn renumber_dag(root_temp_id: usize, temp_nodes: &[TempDagNode]) -> (Vec<usize>, Vec<DagNode>) {
543
+ let mut temp_to_dag = vec![usize::MAX; temp_nodes.len()];
544
+ let mut dag_nodes = Vec::<DagNode>::new();
545
+ let mut stack = vec![root_temp_id];
546
+
547
+ while let Some(temp_id) = stack.pop() {
548
+ if temp_to_dag[temp_id] != usize::MAX {
549
+ continue;
550
+ }
551
+
552
+ let dag_id = dag_nodes.len();
553
+ temp_to_dag[temp_id] = dag_id;
554
+ dag_nodes.push(DagNode {
555
+ id: dag_id,
556
+ terminal: temp_nodes[temp_id].terminal,
557
+ children: temp_nodes[temp_id].children.clone(),
558
+ incoming_count: 0,
559
+ reachable_terminals: 0,
560
+ reachable_weight: 0,
561
+ });
562
+
563
+ for edge in temp_nodes[temp_id].children.iter().rev() {
564
+ stack.push(edge.target);
565
+ }
566
+ }
567
+
568
+ (temp_to_dag, dag_nodes)
569
+ }
570
+
571
+ fn fill_dag_counts(
572
+ source: &SourceGraph,
573
+ source_to_temp_dag: &[usize],
574
+ temp_to_dag: &[usize],
575
+ nodes: &mut [DagNode],
576
+ ) {
577
+ let mut terminal_ids_by_node = vec![BTreeSet::<usize>::new(); nodes.len()];
578
+
579
+ let all_edges = nodes
580
+ .iter()
581
+ .flat_map(|node| node.children.iter().map(|edge| edge.target))
582
+ .collect::<Vec<_>>();
583
+ for target in all_edges {
584
+ nodes[target].incoming_count += 1;
585
+ }
586
+
587
+ let source_parents = source_parent_map(source);
588
+ for (terminal_index, terminal) in source.terminals.iter().enumerate() {
589
+ let mut source_node_id = Some(terminal.node_id);
590
+ while let Some(node_id) = source_node_id {
591
+ let temp_id = source_to_temp_dag[node_id];
592
+ let dag_id = temp_to_dag[temp_id];
593
+ terminal_ids_by_node[dag_id].insert(terminal_index);
594
+ source_node_id = source_parents[node_id];
595
+ }
596
+ }
597
+
598
+ for (node_id, terminal_ids) in terminal_ids_by_node.into_iter().enumerate() {
599
+ nodes[node_id].reachable_terminals = terminal_ids.len();
600
+ nodes[node_id].reachable_weight = terminal_ids
601
+ .into_iter()
602
+ .map(|terminal_id| source.terminals[terminal_id].weight)
603
+ .sum();
604
+ }
605
+ }
606
+
607
+ fn source_parent_map(source: &SourceGraph) -> Vec<Option<usize>> {
608
+ let mut parents = vec![None; source.nodes.len()];
609
+ for node in &source.nodes {
610
+ for &child_id in &node.children {
611
+ parents[child_id] = Some(node.id);
612
+ }
613
+ }
614
+ parents
615
+ }
616
+
617
+ fn dag_max_depth(nodes: &[DagNode]) -> usize {
618
+ let mut max_depth = 0;
619
+ let mut stack = vec![(0, 0)];
620
+ while let Some((node_id, depth)) = stack.pop() {
621
+ max_depth = max_depth.max(depth);
622
+ for edge in &nodes[node_id].children {
623
+ stack.push((edge.target, depth + 1));
624
+ }
625
+ }
626
+ max_depth
627
+ }
628
+
629
+ fn ensure_output_parent(output: &Path) -> Result<()> {
630
+ if let Some(parent) = output.parent() {
631
+ if !parent.as_os_str().is_empty() {
632
+ fs::create_dir_all(parent)
633
+ .with_context(|| format!("failed to create {}", parent.display()))?;
634
+ }
635
+ }
636
+ Ok(())
637
+ }
638
+
639
+ fn read_records(input_path: &Path) -> Result<Vec<InputRecord>> {
640
+ let input = File::open(input_path)
641
+ .with_context(|| format!("failed to open {}", input_path.display()))?;
642
+ let reader = BufReader::new(input);
643
+ let mut records = Vec::new();
644
+
645
+ for (line_number, line) in reader.lines().enumerate() {
646
+ let line = line.with_context(|| {
647
+ format!(
648
+ "failed to read line {} from {}",
649
+ line_number + 1,
650
+ input_path.display()
651
+ )
652
+ })?;
653
+ let line = line.trim();
654
+ if line.is_empty() {
655
+ continue;
656
+ }
657
+
658
+ let record = serde_json::from_str(line).with_context(|| {
659
+ format!(
660
+ "failed to parse line {} from {}",
661
+ line_number + 1,
662
+ input_path.display()
663
+ )
664
+ })?;
665
+ records.push(record);
666
+ }
667
+
668
+ Ok(records)
669
+ }
670
+
671
+ fn build_observation(source_index: usize, record: &InputRecord) -> Option<PatternObservation> {
672
+ let (prefix, suffix) = find_episode_prefix(&record.value)?;
673
+ if prefix.is_empty() {
674
+ return None;
675
+ }
676
+ let digit_skeleton = digit_skeleton(&prefix);
677
+
678
+ Some(PatternObservation {
679
+ source_index,
680
+ prefix,
681
+ digit_skeleton,
682
+ suffix,
683
+ value: record.value.clone(),
684
+ uses_path: record.uses_path,
685
+ has_trailing_hash: record.has_trailing_hash,
686
+ count: record.count,
687
+ })
688
+ }
689
+
690
+ fn aggregate_observations(
691
+ observations: &[PatternObservation],
692
+ min_count: usize,
693
+ example_count: usize,
694
+ ) -> Vec<GroupBuilder> {
695
+ let mut groups = BTreeMap::<String, GroupBuilder>::new();
696
+
697
+ for observation in observations {
698
+ let group = groups
699
+ .entry(observation.prefix.clone())
700
+ .or_insert_with(|| GroupBuilder {
701
+ prefix: observation.prefix.clone(),
702
+ digit_skeleton: observation.digit_skeleton.clone(),
703
+ ..GroupBuilder::default()
704
+ });
705
+
706
+ group.count += 1;
707
+ group.weight += observation.count;
708
+ group.uses_path_count += observation.count * usize::from(observation.uses_path);
709
+ group.has_trailing_hash_count +=
710
+ observation.count * usize::from(observation.has_trailing_hash);
711
+ if !observation.suffix.is_empty() && group.suffix_examples.len() < example_count {
712
+ group.suffix_examples.push(observation.suffix.clone());
713
+ }
714
+ if group.value_examples.len() < example_count {
715
+ group.value_examples.push(observation.value.clone());
716
+ }
717
+ }
718
+
719
+ groups
720
+ .into_values()
721
+ .filter(|group| group.count >= min_count)
722
+ .collect()
723
+ }
724
+
725
+ fn build_trie(groups: &[GroupBuilder]) -> (Vec<OutputNode>, Vec<OutputTerminal>, usize, usize) {
726
+ let mut nodes = vec![TrieNode::root()];
727
+ let mut terminal_node_ids = Vec::with_capacity(groups.len());
728
+
729
+ for group in groups {
730
+ let (terminal_node_id, path) = insert_prefix(&mut nodes, &group.prefix);
731
+ terminal_node_ids.push(terminal_node_id);
732
+
733
+ for id in path {
734
+ nodes[id].subtree_patterns += group.count;
735
+ nodes[id].subtree_weight += group.weight;
736
+ }
737
+ }
738
+
739
+ assign_depths(&mut nodes);
740
+
741
+ let terminals = groups
742
+ .iter()
743
+ .zip(terminal_node_ids)
744
+ .map(|(group, node_id)| OutputTerminal {
745
+ node_id,
746
+ prefix: group.prefix.clone(),
747
+ digit_skeleton: group.digit_skeleton.clone(),
748
+ count: group.count,
749
+ weight: group.weight,
750
+ uses_path_count: group.uses_path_count,
751
+ has_trailing_hash_count: group.has_trailing_hash_count,
752
+ suffix_examples: group.suffix_examples.clone(),
753
+ value_examples: group.value_examples.clone(),
754
+ annotations: TerminalAnnotations::default(),
755
+ })
756
+ .collect::<Vec<_>>();
757
+
758
+ let max_depth = nodes.iter().map(|node| node.depth).max().unwrap_or(0);
759
+ let grouped_weight = groups.iter().map(|group| group.weight).sum();
760
+ let output_nodes = nodes
761
+ .into_iter()
762
+ .map(|node| OutputNode {
763
+ id: node.id,
764
+ edge_label: node.edge_label,
765
+ depth: node.depth,
766
+ children: node.children_by_label.into_values().collect(),
767
+ subtree_patterns: node.subtree_patterns,
768
+ subtree_weight: node.subtree_weight,
769
+ })
770
+ .collect();
771
+
772
+ (output_nodes, terminals, max_depth, grouped_weight)
773
+ }
774
+
775
+ impl TrieNode {
776
+ fn root() -> Self {
777
+ Self {
778
+ id: 0,
779
+ edge_label: String::new(),
780
+ depth: 0,
781
+ parent: None,
782
+ children_by_label: BTreeMap::new(),
783
+ subtree_patterns: 0,
784
+ subtree_weight: 0,
785
+ }
786
+ }
787
+ }
788
+
789
+ fn insert_prefix(nodes: &mut Vec<TrieNode>, prefix: &str) -> (usize, Vec<usize>) {
790
+ let mut current = 0;
791
+ let mut remaining = prefix;
792
+ let mut path = vec![current];
793
+
794
+ loop {
795
+ if remaining.is_empty() {
796
+ return (current, path);
797
+ }
798
+
799
+ let matching_child =
800
+ nodes[current]
801
+ .children_by_label
802
+ .iter()
803
+ .find_map(|(label, &child_id)| {
804
+ let common_len = common_prefix_len(remaining, label);
805
+ (common_len > 0).then(|| (label.clone(), child_id, common_len))
806
+ });
807
+
808
+ let Some((child_label, child_id, common_len)) = matching_child else {
809
+ let child_id = push_node(nodes, current, remaining.to_owned());
810
+ nodes[current]
811
+ .children_by_label
812
+ .insert(remaining.to_owned(), child_id);
813
+ path.push(child_id);
814
+ return (child_id, path);
815
+ };
816
+
817
+ if common_len == child_label.len() {
818
+ current = child_id;
819
+ remaining = &remaining[common_len..];
820
+ path.push(current);
821
+ continue;
822
+ }
823
+
824
+ let shared_label = child_label[..common_len].to_owned();
825
+ let old_suffix = child_label[common_len..].to_owned();
826
+ let new_suffix = remaining[common_len..].to_owned();
827
+
828
+ let split_id = push_node(nodes, current, shared_label.clone());
829
+ nodes[current].children_by_label.remove(&child_label);
830
+ nodes[current]
831
+ .children_by_label
832
+ .insert(shared_label, split_id);
833
+
834
+ nodes[child_id].edge_label = old_suffix.clone();
835
+ nodes[child_id].parent = Some(split_id);
836
+ nodes[split_id].subtree_patterns = nodes[child_id].subtree_patterns;
837
+ nodes[split_id].subtree_weight = nodes[child_id].subtree_weight;
838
+ nodes[split_id]
839
+ .children_by_label
840
+ .insert(old_suffix, child_id);
841
+
842
+ path.push(split_id);
843
+ if new_suffix.is_empty() {
844
+ return (split_id, path);
845
+ }
846
+
847
+ let new_child_id = push_node(nodes, split_id, new_suffix.clone());
848
+ nodes[split_id]
849
+ .children_by_label
850
+ .insert(new_suffix, new_child_id);
851
+ path.push(new_child_id);
852
+ return (new_child_id, path);
853
+ }
854
+ }
855
+
856
+ fn push_node(nodes: &mut Vec<TrieNode>, parent_id: usize, edge_label: String) -> usize {
857
+ let child_id = nodes.len();
858
+ nodes.push(TrieNode {
859
+ id: child_id,
860
+ edge_label,
861
+ depth: nodes[parent_id].depth + 1,
862
+ parent: Some(parent_id),
863
+ children_by_label: BTreeMap::new(),
864
+ subtree_patterns: 0,
865
+ subtree_weight: 0,
866
+ });
867
+ child_id
868
+ }
869
+
870
+ fn common_prefix_len(left: &str, right: &str) -> usize {
871
+ let mut len = 0;
872
+ for ((left_index, left_ch), (_, right_ch)) in left.char_indices().zip(right.char_indices()) {
873
+ if left_ch != right_ch {
874
+ break;
875
+ }
876
+ len = left_index + left_ch.len_utf8();
877
+ }
878
+ len
879
+ }
880
+
881
+ fn assign_depths(nodes: &mut [TrieNode]) {
882
+ let mut stack = vec![(0, 0)];
883
+ while let Some((node_id, depth)) = stack.pop() {
884
+ nodes[node_id].depth = depth;
885
+ let child_ids = nodes[node_id]
886
+ .children_by_label
887
+ .values()
888
+ .copied()
889
+ .collect::<Vec<_>>();
890
+ for child_id in child_ids {
891
+ stack.push((child_id, depth + 1));
892
+ }
893
+ }
894
+ }
895
+
896
+ fn find_episode_prefix(value: &str) -> Option<(String, String)> {
897
+ let best_end = EPISODE_PATTERNS
898
+ .iter()
899
+ .filter_map(|pattern| pattern.find(value).map(|matched| matched.end()))
900
+ .chain(find_delimited_number_episode_end(value))
901
+ .max()?;
902
+
903
+ let prefix = value[..best_end].trim_end().to_owned();
904
+ let suffix = value[best_end..].trim().to_owned();
905
+ Some((prefix, suffix))
906
+ }
907
+
908
+ fn find_delimited_number_episode_end(value: &str) -> Option<usize> {
909
+ let mut digits_start = None;
910
+ let mut digit_count = 0;
911
+
912
+ for (index, ch) in value
913
+ .char_indices()
914
+ .chain(std::iter::once((value.len(), '\0')))
915
+ {
916
+ if ch.is_ascii_digit() {
917
+ if digits_start.is_none() {
918
+ digits_start = Some(index);
919
+ }
920
+ digit_count += 1;
921
+ continue;
922
+ }
923
+
924
+ if let Some(start) = digits_start {
925
+ if (1..=4).contains(&digit_count)
926
+ && has_episode_left_boundary(value, start)
927
+ && has_episode_right_boundary(ch)
928
+ {
929
+ return Some(index);
930
+ }
931
+ }
932
+
933
+ digits_start = None;
934
+ digit_count = 0;
935
+ }
936
+
937
+ None
938
+ }
939
+
940
+ fn has_episode_left_boundary(value: &str, digits_start: usize) -> bool {
941
+ if digits_start == 0 {
942
+ return true;
943
+ }
944
+
945
+ value[..digits_start]
946
+ .chars()
947
+ .next_back()
948
+ .is_some_and(|ch| ch.is_whitespace() || matches!(ch, '.' | '_' | '-'))
949
+ }
950
+
951
+ fn has_episode_right_boundary(ch: char) -> bool {
952
+ ch == '\0'
953
+ || ch.is_whitespace()
954
+ || matches!(ch, '.' | '_' | '-' | ']' | ')' | '【' | '】' | '[')
955
+ }
956
+
957
+ fn digit_skeleton(text: &str) -> String {
958
+ DIGITS.replace_all(text, "<NUM>").into_owned()
959
+ }
960
+
961
+ fn display_path(path: &Path) -> String {
962
+ path.display().to_string()
963
+ }
964
+
965
+ fn default_count() -> usize {
966
+ 1
967
+ }
968
+
969
+ fn default_annotations_value() -> serde_json::Value {
970
+ serde_json::json!({})
971
+ }
972
+
973
+ #[cfg(test)]
974
+ mod tests {
975
+ use super::*;
976
+
977
+ #[test]
978
+ fn dag_merges_equal_suffix_nodes_without_normalizing_digits() {
979
+ let source = SourceGraph {
980
+ meta: serde_json::json!({}),
981
+ nodes: vec![
982
+ SourceNode {
983
+ id: 0,
984
+ edge_label: String::new(),
985
+ depth: 0,
986
+ children: vec![1, 3],
987
+ },
988
+ SourceNode {
989
+ id: 1,
990
+ edge_label: "A01".to_owned(),
991
+ depth: 1,
992
+ children: vec![2],
993
+ },
994
+ SourceNode {
995
+ id: 2,
996
+ edge_label: "X".to_owned(),
997
+ depth: 2,
998
+ children: vec![],
999
+ },
1000
+ SourceNode {
1001
+ id: 3,
1002
+ edge_label: "B02".to_owned(),
1003
+ depth: 1,
1004
+ children: vec![4],
1005
+ },
1006
+ SourceNode {
1007
+ id: 4,
1008
+ edge_label: "X".to_owned(),
1009
+ depth: 2,
1010
+ children: vec![],
1011
+ },
1012
+ ],
1013
+ terminals: vec![
1014
+ SourceTerminal {
1015
+ terminal_id: None,
1016
+ node_id: 2,
1017
+ prefix: "A01X".to_owned(),
1018
+ digit_skeleton: "A<NUM>X".to_owned(),
1019
+ count: 1,
1020
+ weight: 1,
1021
+ uses_path_count: 0,
1022
+ has_trailing_hash_count: 0,
1023
+ suffix_examples: vec![],
1024
+ value_examples: vec!["A01X".to_owned()],
1025
+ annotations: serde_json::json!({}),
1026
+ },
1027
+ SourceTerminal {
1028
+ terminal_id: None,
1029
+ node_id: 4,
1030
+ prefix: "B02X".to_owned(),
1031
+ digit_skeleton: "B<NUM>X".to_owned(),
1032
+ count: 1,
1033
+ weight: 1,
1034
+ uses_path_count: 0,
1035
+ has_trailing_hash_count: 0,
1036
+ suffix_examples: vec![],
1037
+ value_examples: vec!["B02X".to_owned()],
1038
+ annotations: serde_json::json!({}),
1039
+ },
1040
+ ],
1041
+ };
1042
+
1043
+ let dag = minimize_source_graph(
1044
+ &source,
1045
+ Path::new("dmhy_prefix_graph.json"),
1046
+ Path::new("dmhy_prefix_dag.json"),
1047
+ )
1048
+ .unwrap();
1049
+
1050
+ assert!(dag.meta.preserves_digits);
1051
+ assert!(dag.meta.merged_nodes >= 2);
1052
+ assert_eq!(dag.terminals[0].prefix, "A01X");
1053
+ assert_eq!(dag.terminals[1].prefix, "B02X");
1054
+ assert_eq!(dag.terminals[0].node_id, dag.terminals[1].node_id);
1055
+
1056
+ let shared_parent_targets = dag.nodes[0]
1057
+ .children
1058
+ .iter()
1059
+ .filter(|edge| edge.label == "A01" || edge.label == "B02")
1060
+ .map(|edge| edge.target)
1061
+ .collect::<BTreeSet<_>>();
1062
+ assert_eq!(shared_parent_targets.len(), 1);
1063
+
1064
+ let shared_parent = shared_parent_targets.into_iter().next().unwrap();
1065
+ assert_eq!(dag.nodes[shared_parent].children.len(), 1);
1066
+ assert_eq!(dag.nodes[shared_parent].children[0].label, "X");
1067
+ assert_eq!(dag.nodes[shared_parent].reachable_terminals, 2);
1068
+ assert_eq!(dag.nodes[shared_parent].reachable_weight, 2);
1069
+ }
1070
+ }
tools/test_annotated_dmhy_workflow.py ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Smoke tests for annotated DMHY graph dataset helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import tempfile
6
+ import json
7
+ import subprocess
8
+ import sys
9
+ import unittest
10
+ from pathlib import Path
11
+
12
+ from tools.annotate_dmhy_prefix_graph import normalize_generated_tokens
13
+ from tools.convert_annotated_dmhy_dataset import (
14
+ iter_validated_jsonl,
15
+ validate_record,
16
+ )
17
+ from tools.convert_to_char_dataset import convert_record
18
+
19
+
20
+ class AnnotatedDmhyWorkflowTests(unittest.TestCase):
21
+ def test_generated_tokens_split_punctuation_and_use_b_only_labels(self) -> None:
22
+ tokens, labels = normalize_generated_tokens(
23
+ ["[ANi]", " ", "Title-Name", "07"],
24
+ ["B-GROUP", "O", "I-TITLE", "B-EPISODE"],
25
+ )
26
+ self.assertEqual(tokens, ["[", "ANi", "]", " ", "Title", "-", "Name", "07"])
27
+ self.assertEqual(
28
+ labels,
29
+ ["O", "B-GROUP", "O", "O", "B-TITLE", "O", "B-TITLE", "B-EPISODE"],
30
+ )
31
+ self.assertTrue(all(label == "O" or label.startswith("B-") for label in labels))
32
+
33
+ def test_preserve_i_labels_keeps_i_on_non_separator_pieces(self) -> None:
34
+ tokens, labels = normalize_generated_tokens(
35
+ ["Title-Name"],
36
+ ["I-TITLE"],
37
+ preserve_i_labels=True,
38
+ )
39
+ self.assertEqual(tokens, ["Title", "-", "Name"])
40
+ self.assertEqual(labels, ["I-TITLE", "O", "I-TITLE"])
41
+
42
+ def test_validation_rejects_embedded_punctuation(self) -> None:
43
+ record = {
44
+ "filename": "Title-Name 07",
45
+ "tokens": ["Title-Name", "07"],
46
+ "labels": ["B-TITLE", "B-EPISODE"],
47
+ }
48
+ with self.assertRaisesRegex(ValueError, "contains punctuation"):
49
+ validate_record(record, Path("sample.jsonl"), 1)
50
+
51
+ def test_validation_rejects_embedded_symbol_separator(self) -> None:
52
+ record = {
53
+ "filename": "Title 1920×1080 07",
54
+ "tokens": ["Title", "1920×1080", "07"],
55
+ "labels": ["B-TITLE", "B-RESOLUTION", "B-EPISODE"],
56
+ }
57
+ with self.assertRaisesRegex(ValueError, "contains punctuation"):
58
+ validate_record(record, Path("sample.jsonl"), 1)
59
+
60
+ def test_b_only_input_converts_to_char_i_labels(self) -> None:
61
+ record = {
62
+ "filename": "Title-Name 07",
63
+ "tokens": ["Title", "-", "Name", " ", "07"],
64
+ "labels": ["B-TITLE", "O", "B-TITLE", "O", "B-EPISODE"],
65
+ }
66
+ validate_record(record, Path("sample.jsonl"), 1)
67
+ converted = convert_record(record)
68
+ self.assertIn("I-TITLE", converted["labels"])
69
+ self.assertEqual(converted["tokens"][:5], ["T", "i", "t", "l", "e"])
70
+
71
+ def test_iter_validated_jsonl_accepts_generated_shape(self) -> None:
72
+ with tempfile.TemporaryDirectory() as tmpdir:
73
+ path = Path(tmpdir) / "records.jsonl"
74
+ path.write_text(
75
+ '{"filename":"A 01","tokens":["A"," ","01"],"labels":["B-TITLE","O","B-EPISODE"]}\n',
76
+ encoding="utf-8",
77
+ )
78
+ rows = list(iter_validated_jsonl(path))
79
+ self.assertEqual(len(rows), 1)
80
+ self.assertEqual(rows[0]["filename"], "A 01")
81
+
82
+ def test_cli_smoke_annotate_then_convert_with_temp_files(self) -> None:
83
+ with tempfile.TemporaryDirectory() as tmpdir:
84
+ tmp = Path(tmpdir)
85
+ graph_path = tmp / "graph.json"
86
+ dataset_path = tmp / "dmhy_weak.generated.jsonl"
87
+ char_path = tmp / "dmhy_weak.generated_char.jsonl"
88
+ vocab_path = tmp / "vocab.generated.char.json"
89
+ manifest_path = tmp / "manifest.json"
90
+ graph_path.write_text(
91
+ json.dumps(
92
+ {
93
+ "terminals": [
94
+ {
95
+ "terminal_id": "t0",
96
+ "weight": 1,
97
+ "value_examples": [
98
+ "[ANi] Test Show - 01 [1080P][WEB-DL].mkv"
99
+ ],
100
+ "suffix_examples": [" [1080P][WEB-DL]"],
101
+ }
102
+ ]
103
+ },
104
+ ensure_ascii=False,
105
+ ),
106
+ encoding="utf-8",
107
+ )
108
+
109
+ annotate = subprocess.run(
110
+ [
111
+ sys.executable,
112
+ "-m",
113
+ "tools.annotate_dmhy_prefix_graph",
114
+ "--graph",
115
+ str(graph_path),
116
+ "--output",
117
+ str(dataset_path),
118
+ "--patch-output",
119
+ "",
120
+ "--examples-only",
121
+ ],
122
+ check=False,
123
+ capture_output=True,
124
+ text=True,
125
+ )
126
+ self.assertEqual(annotate.returncode, 0, annotate.stderr)
127
+
128
+ rows = [
129
+ json.loads(line)
130
+ for line in dataset_path.read_text(encoding="utf-8").splitlines()
131
+ if line.strip()
132
+ ]
133
+ self.assertEqual(len(rows), 1)
134
+ self.assertIn("annotations", rows[0])
135
+ self.assertEqual(rows[0]["tokens"][0], "[")
136
+ self.assertEqual(rows[0]["labels"][0], "O")
137
+
138
+ convert = subprocess.run(
139
+ [
140
+ sys.executable,
141
+ "-m",
142
+ "tools.convert_annotated_dmhy_dataset",
143
+ "--input",
144
+ str(dataset_path),
145
+ "--output",
146
+ str(char_path),
147
+ "--vocab-output",
148
+ str(vocab_path),
149
+ "--manifest-output",
150
+ str(manifest_path),
151
+ "--progress",
152
+ "0",
153
+ ],
154
+ check=False,
155
+ capture_output=True,
156
+ text=True,
157
+ )
158
+ self.assertEqual(convert.returncode, 0, convert.stderr)
159
+ self.assertTrue(char_path.exists())
160
+ self.assertTrue(vocab_path.exists())
161
+ self.assertTrue(manifest_path.exists())
162
+
163
+ def test_cli_source_list_mode_expands_beyond_value_examples(self) -> None:
164
+ with tempfile.TemporaryDirectory() as tmpdir:
165
+ tmp = Path(tmpdir)
166
+ graph_path = tmp / "graph.json"
167
+ source_path = tmp / "dmhy_list.jsonl"
168
+ dataset_path = tmp / "dmhy_weak.generated.jsonl"
169
+ graph_path.write_text(
170
+ json.dumps(
171
+ {
172
+ "terminals": [
173
+ {
174
+ "terminal_id": "t0",
175
+ "prefix": "[ANi] Full Show - ",
176
+ "weight": 10,
177
+ "value_examples": [
178
+ "[ANi] Full Show - 01 [1080P][WEB-DL].mkv"
179
+ ],
180
+ "suffix_examples": ["01 [1080P][WEB-DL]"],
181
+ },
182
+ {
183
+ "terminal_id": "t1",
184
+ "prefix": "[ANi] Other Show - ",
185
+ "weight": 10,
186
+ "value_examples": [
187
+ "[ANi] Other Show - 01 [1080P][WEB-DL].mkv"
188
+ ],
189
+ "suffix_examples": ["01 [1080P][WEB-DL]"],
190
+ },
191
+ ]
192
+ },
193
+ ensure_ascii=False,
194
+ ),
195
+ encoding="utf-8",
196
+ )
197
+ source_path.write_text(
198
+ "\n".join(
199
+ json.dumps({"value": value}, ensure_ascii=False)
200
+ for value in [
201
+ "[ANi] Full Show - 01 [1080P][WEB-DL].mkv",
202
+ "[ANi] Full Show - 02 [1080P][WEB-DL].mkv",
203
+ "[ANi] Full Show - 03 [1080P][WEB-DL].mkv",
204
+ "[ANi] Other Show - 01 [1080P][WEB-DL].mkv",
205
+ ]
206
+ )
207
+ + "\n",
208
+ encoding="utf-8",
209
+ )
210
+
211
+ annotate = subprocess.run(
212
+ [
213
+ sys.executable,
214
+ "-m",
215
+ "tools.annotate_dmhy_prefix_graph",
216
+ "--graph",
217
+ str(graph_path),
218
+ "--source-list",
219
+ str(source_path),
220
+ "--output",
221
+ str(dataset_path),
222
+ "--patch-output",
223
+ "",
224
+ "--limit",
225
+ "1",
226
+ ],
227
+ check=False,
228
+ capture_output=True,
229
+ text=True,
230
+ )
231
+ self.assertEqual(annotate.returncode, 0, annotate.stderr)
232
+ rows = [
233
+ json.loads(line)
234
+ for line in dataset_path.read_text(encoding="utf-8").splitlines()
235
+ if line.strip()
236
+ ]
237
+ self.assertEqual(len(rows), 3)
238
+ self.assertEqual([row["filename"] for row in rows], [
239
+ "[ANi] Full Show - 01 [1080P][WEB-DL].mkv",
240
+ "[ANi] Full Show - 02 [1080P][WEB-DL].mkv",
241
+ "[ANi] Full Show - 03 [1080P][WEB-DL].mkv",
242
+ ])
243
+ self.assertTrue(all(row["terminal_id"] == "t0" for row in rows))
244
+
245
+ def test_cli_examples_only_uses_terminal_value_examples(self) -> None:
246
+ with tempfile.TemporaryDirectory() as tmpdir:
247
+ tmp = Path(tmpdir)
248
+ graph_path = tmp / "graph.json"
249
+ source_path = tmp / "dmhy_list.jsonl"
250
+ dataset_path = tmp / "dmhy_weak.generated.jsonl"
251
+ graph_path.write_text(
252
+ json.dumps(
253
+ {
254
+ "terminals": [
255
+ {
256
+ "terminal_id": "t0",
257
+ "prefix": "[ANi] Example Show - ",
258
+ "weight": 10,
259
+ "value_examples": [
260
+ "[ANi] Example Show - 01 [1080P][WEB-DL].mkv"
261
+ ],
262
+ "suffix_examples": ["01 [1080P][WEB-DL]"],
263
+ }
264
+ ]
265
+ },
266
+ ensure_ascii=False,
267
+ ),
268
+ encoding="utf-8",
269
+ )
270
+ source_path.write_text(
271
+ "\n".join(
272
+ json.dumps({"value": value}, ensure_ascii=False)
273
+ for value in [
274
+ "[ANi] Example Show - 01 [1080P][WEB-DL].mkv",
275
+ "[ANi] Example Show - 02 [1080P][WEB-DL].mkv",
276
+ ]
277
+ )
278
+ + "\n",
279
+ encoding="utf-8",
280
+ )
281
+
282
+ annotate = subprocess.run(
283
+ [
284
+ sys.executable,
285
+ "-m",
286
+ "tools.annotate_dmhy_prefix_graph",
287
+ "--graph",
288
+ str(graph_path),
289
+ "--source-list",
290
+ str(source_path),
291
+ "--output",
292
+ str(dataset_path),
293
+ "--patch-output",
294
+ "",
295
+ "--examples-only",
296
+ ],
297
+ check=False,
298
+ capture_output=True,
299
+ text=True,
300
+ )
301
+ self.assertEqual(annotate.returncode, 0, annotate.stderr)
302
+ rows = [
303
+ json.loads(line)
304
+ for line in dataset_path.read_text(encoding="utf-8").splitlines()
305
+ if line.strip()
306
+ ]
307
+ self.assertEqual(len(rows), 1)
308
+ self.assertEqual(rows[0]["filename"], "[ANi] Example Show - 01 [1080P][WEB-DL].mkv")
309
+
310
+ def test_cli_dag_annotation_units_include_shared_node_terminals(self) -> None:
311
+ with tempfile.TemporaryDirectory() as tmpdir:
312
+ tmp = Path(tmpdir)
313
+ dag_path = tmp / "dmhy_prefix_dag.json"
314
+ output_path = tmp / "dmhy_prefix_dag.annotation_units.jsonl"
315
+ dag_path.write_text(
316
+ json.dumps(
317
+ {
318
+ "meta": {"version": "prefix-dag-v1"},
319
+ "root": 0,
320
+ "nodes": [
321
+ {
322
+ "id": 0,
323
+ "terminal": False,
324
+ "children": [
325
+ {"label": "A", "target": 1},
326
+ {"label": "B", "target": 2},
327
+ ],
328
+ "incoming_count": 0,
329
+ "reachable_terminals": 2,
330
+ "reachable_weight": 20,
331
+ },
332
+ {
333
+ "id": 1,
334
+ "terminal": False,
335
+ "children": [{"label": " shared", "target": 3}],
336
+ "incoming_count": 1,
337
+ "reachable_terminals": 1,
338
+ "reachable_weight": 10,
339
+ },
340
+ {
341
+ "id": 2,
342
+ "terminal": False,
343
+ "children": [{"label": " shared", "target": 3}],
344
+ "incoming_count": 1,
345
+ "reachable_terminals": 1,
346
+ "reachable_weight": 10,
347
+ },
348
+ {
349
+ "id": 3,
350
+ "terminal": False,
351
+ "children": [
352
+ {"label": " 01", "target": 4},
353
+ {"label": " 02", "target": 5},
354
+ ],
355
+ "incoming_count": 2,
356
+ "reachable_terminals": 2,
357
+ "reachable_weight": 20,
358
+ },
359
+ {
360
+ "id": 4,
361
+ "terminal": True,
362
+ "children": [],
363
+ "incoming_count": 1,
364
+ "reachable_terminals": 1,
365
+ "reachable_weight": 10,
366
+ },
367
+ {
368
+ "id": 5,
369
+ "terminal": True,
370
+ "children": [],
371
+ "incoming_count": 1,
372
+ "reachable_terminals": 1,
373
+ "reachable_weight": 10,
374
+ },
375
+ ],
376
+ "terminals": [
377
+ {
378
+ "terminal_id": "t0",
379
+ "node_id": 4,
380
+ "prefix": "Show A shared 01",
381
+ "digit_skeleton": "Show A shared <NUM>",
382
+ "count": 10,
383
+ "weight": 10,
384
+ "suffix_examples": [" [1080P][WEB-DL]"],
385
+ "value_examples": ["Show A shared 01 [1080P][WEB-DL].mkv"],
386
+ "annotations": {},
387
+ },
388
+ {
389
+ "terminal_id": "t1",
390
+ "node_id": 5,
391
+ "prefix": "Show B shared 02",
392
+ "digit_skeleton": "Show B shared <NUM>",
393
+ "count": 10,
394
+ "weight": 10,
395
+ "suffix_examples": [" [1080P][WEB-DL]"],
396
+ "value_examples": ["Show B shared 02 [1080P][WEB-DL].mkv"],
397
+ "annotations": {},
398
+ },
399
+ ],
400
+ },
401
+ ensure_ascii=False,
402
+ ),
403
+ encoding="utf-8",
404
+ )
405
+
406
+ annotate = subprocess.run(
407
+ [
408
+ sys.executable,
409
+ "-m",
410
+ "tools.annotate_dmhy_prefix_dag",
411
+ "--dag",
412
+ str(dag_path),
413
+ "--output",
414
+ str(output_path),
415
+ "--min-reachable-terminals",
416
+ "2",
417
+ "--min-incoming-count",
418
+ "2",
419
+ "--limit",
420
+ "1",
421
+ ],
422
+ check=False,
423
+ capture_output=True,
424
+ text=True,
425
+ )
426
+ self.assertEqual(annotate.returncode, 0, annotate.stderr)
427
+ rows = [
428
+ json.loads(line)
429
+ for line in output_path.read_text(encoding="utf-8").splitlines()
430
+ if line.strip()
431
+ ]
432
+ self.assertEqual(len(rows), 1)
433
+ self.assertEqual(rows[0]["unit_id"], "dag-node-3")
434
+ self.assertEqual(rows[0]["kind"], "shared_suffix")
435
+ self.assertEqual(rows[0]["terminal_ids"], ["t0", "t1"])
436
+ self.assertEqual(
437
+ rows[0]["prefix_examples"],
438
+ ["Show A shared 01", "Show B shared 02"],
439
+ )
440
+ self.assertEqual(rows[0]["common_edge_labels"], [" 01", " 02"])
441
+ self.assertIn("annotations", rows[0])
442
+
443
+
444
+ if __name__ == "__main__":
445
+ unittest.main()