Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")# Load model directly from transformers import AutoTokenizer, AutoModelForTokenClassification tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT") model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT") - Notebooks
- Google Colab
- Kaggle
Add DMHY prefix graph annotation workflow
Browse files- datasets/AnimeName +1 -1
- tools/annotate_dmhy_prefix_dag.py +349 -0
- tools/annotate_dmhy_prefix_graph.py +715 -0
- tools/convert_annotated_dmhy_dataset.py +302 -0
- tools/dmhy_prefix_grouper/Cargo.lock +347 -0
- tools/dmhy_prefix_grouper/Cargo.toml +12 -0
- tools/dmhy_prefix_grouper/src/main.rs +1070 -0
- tools/test_annotated_dmhy_workflow.py +445 -0
datasets/AnimeName
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit ab3fbcad1a4bf889090d050248130c7d763c457e
|
tools/annotate_dmhy_prefix_dag.py
ADDED
|
@@ -0,0 +1,349 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Build annotation units from a DMHY prefix DAG.
|
| 2 |
+
|
| 3 |
+
The DAG producer keeps repeated suffix structure shared across many raw
|
| 4 |
+
prefixes. This tool turns those shared nodes into compact, traceable units for
|
| 5 |
+
LLM or human review without calling any remote service.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import argparse
|
| 11 |
+
import json
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Any, Iterable
|
| 15 |
+
|
| 16 |
+
from tools.annotate_dmhy_prefix_graph import (
|
| 17 |
+
heuristic_patch,
|
| 18 |
+
string_list,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
DEFAULT_DAG = Path("datasets/AnimeName/dmhy_prefix_dag.json")
|
| 23 |
+
DEFAULT_OUTPUT = Path("datasets/AnimeName/dmhy_prefix_dag.annotation_units.jsonl")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass(frozen=True)
|
| 27 |
+
class Args:
|
| 28 |
+
dag: Path
|
| 29 |
+
output: Path
|
| 30 |
+
min_reachable_terminals: int
|
| 31 |
+
min_incoming_count: int
|
| 32 |
+
limit: int | None
|
| 33 |
+
example_count: int
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def parse_args() -> Args:
|
| 37 |
+
parser = argparse.ArgumentParser(
|
| 38 |
+
description="Emit DAG-aware DMHY prefix annotation units as JSONL"
|
| 39 |
+
)
|
| 40 |
+
parser.add_argument("--dag", type=Path, default=DEFAULT_DAG, help="Input dmhy_prefix_dag.json")
|
| 41 |
+
parser.add_argument(
|
| 42 |
+
"--output",
|
| 43 |
+
type=Path,
|
| 44 |
+
default=DEFAULT_OUTPUT,
|
| 45 |
+
help="Output annotation unit JSONL",
|
| 46 |
+
)
|
| 47 |
+
parser.add_argument(
|
| 48 |
+
"--min-reachable-terminals",
|
| 49 |
+
type=int,
|
| 50 |
+
default=2,
|
| 51 |
+
help="Select non-root nodes reaching at least this many terminals",
|
| 52 |
+
)
|
| 53 |
+
parser.add_argument(
|
| 54 |
+
"--min-incoming-count",
|
| 55 |
+
type=int,
|
| 56 |
+
default=2,
|
| 57 |
+
help="Select nodes with at least this many incoming DAG edges",
|
| 58 |
+
)
|
| 59 |
+
parser.add_argument("--limit", type=int, default=None, help="Maximum units to write")
|
| 60 |
+
parser.add_argument(
|
| 61 |
+
"--example-count",
|
| 62 |
+
type=int,
|
| 63 |
+
default=5,
|
| 64 |
+
help="Maximum examples retained per example field",
|
| 65 |
+
)
|
| 66 |
+
ns = parser.parse_args()
|
| 67 |
+
return Args(
|
| 68 |
+
dag=ns.dag,
|
| 69 |
+
output=ns.output,
|
| 70 |
+
min_reachable_terminals=max(1, ns.min_reachable_terminals),
|
| 71 |
+
min_incoming_count=max(1, ns.min_incoming_count),
|
| 72 |
+
limit=ns.limit,
|
| 73 |
+
example_count=max(1, ns.example_count),
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def load_dag(path: Path) -> dict[str, Any]:
|
| 78 |
+
if not path.exists():
|
| 79 |
+
raise SystemExit(f"DAG not found: {path}")
|
| 80 |
+
try:
|
| 81 |
+
dag = json.loads(path.read_text(encoding="utf-8"))
|
| 82 |
+
except json.JSONDecodeError as exc:
|
| 83 |
+
raise SystemExit(f"invalid DAG JSON in {path}: {exc}") from exc
|
| 84 |
+
if not isinstance(dag, dict):
|
| 85 |
+
raise SystemExit(f"invalid DAG schema in {path}: root must be an object")
|
| 86 |
+
if not isinstance(dag.get("nodes"), list):
|
| 87 |
+
raise SystemExit(f"invalid DAG schema in {path}: missing nodes list")
|
| 88 |
+
if not isinstance(dag.get("terminals"), list):
|
| 89 |
+
raise SystemExit(f"invalid DAG schema in {path}: missing terminals list")
|
| 90 |
+
return dag
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def node_id(node: dict[str, Any], fallback: int) -> int:
|
| 94 |
+
value = node.get("id", fallback)
|
| 95 |
+
try:
|
| 96 |
+
return int(value)
|
| 97 |
+
except (TypeError, ValueError):
|
| 98 |
+
raise SystemExit(f"invalid node id: {value!r}") from None
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def terminal_id(terminal: dict[str, Any], fallback: int) -> str:
|
| 102 |
+
value = terminal.get("terminal_id", terminal.get("id", fallback))
|
| 103 |
+
return str(value)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def int_field(row: dict[str, Any], key: str, default: int = 0) -> int:
|
| 107 |
+
try:
|
| 108 |
+
return int(row.get(key, default) or default)
|
| 109 |
+
except (TypeError, ValueError):
|
| 110 |
+
return default
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def build_indexes(dag: dict[str, Any]) -> tuple[dict[int, dict[str, Any]], dict[int, list[dict[str, Any]]]]:
|
| 114 |
+
nodes: dict[int, dict[str, Any]] = {}
|
| 115 |
+
for fallback, node in enumerate(dag["nodes"]):
|
| 116 |
+
if not isinstance(node, dict):
|
| 117 |
+
continue
|
| 118 |
+
nodes[node_id(node, fallback)] = node
|
| 119 |
+
|
| 120 |
+
terminals_by_node: dict[int, list[dict[str, Any]]] = {}
|
| 121 |
+
for fallback, terminal in enumerate(dag["terminals"]):
|
| 122 |
+
if not isinstance(terminal, dict):
|
| 123 |
+
continue
|
| 124 |
+
try:
|
| 125 |
+
terminal_node_id = int(terminal.get("node_id"))
|
| 126 |
+
except (TypeError, ValueError):
|
| 127 |
+
continue
|
| 128 |
+
terminal = dict(terminal)
|
| 129 |
+
terminal["_terminal_id"] = terminal_id(terminal, fallback)
|
| 130 |
+
terminal["_terminal_index"] = fallback
|
| 131 |
+
terminals_by_node.setdefault(terminal_node_id, []).append(terminal)
|
| 132 |
+
return nodes, terminals_by_node
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def reachable_terminals(
|
| 136 |
+
start_node_id: int,
|
| 137 |
+
nodes: dict[int, dict[str, Any]],
|
| 138 |
+
terminals_by_node: dict[int, list[dict[str, Any]]],
|
| 139 |
+
) -> list[dict[str, Any]]:
|
| 140 |
+
memo: dict[int, list[dict[str, Any]]] = {}
|
| 141 |
+
visiting: set[int] = set()
|
| 142 |
+
|
| 143 |
+
def visit(current_id: int) -> list[dict[str, Any]]:
|
| 144 |
+
if current_id in memo:
|
| 145 |
+
return memo[current_id]
|
| 146 |
+
if current_id in visiting:
|
| 147 |
+
raise SystemExit(f"cycle detected while traversing DAG at node {current_id}")
|
| 148 |
+
visiting.add(current_id)
|
| 149 |
+
found = list(terminals_by_node.get(current_id, []))
|
| 150 |
+
node = nodes.get(current_id, {})
|
| 151 |
+
for edge in node.get("children") or []:
|
| 152 |
+
if not isinstance(edge, dict):
|
| 153 |
+
continue
|
| 154 |
+
try:
|
| 155 |
+
target = int(edge.get("target"))
|
| 156 |
+
except (TypeError, ValueError):
|
| 157 |
+
continue
|
| 158 |
+
found.extend(visit(target))
|
| 159 |
+
visiting.remove(current_id)
|
| 160 |
+
deduped = dedupe_terminals(found)
|
| 161 |
+
memo[current_id] = deduped
|
| 162 |
+
return deduped
|
| 163 |
+
|
| 164 |
+
return visit(start_node_id)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def dedupe_terminals(terminals: Iterable[dict[str, Any]]) -> list[dict[str, Any]]:
|
| 168 |
+
seen: set[str] = set()
|
| 169 |
+
result: list[dict[str, Any]] = []
|
| 170 |
+
for terminal in terminals:
|
| 171 |
+
tid = str(terminal.get("_terminal_id") or terminal.get("terminal_id") or "")
|
| 172 |
+
if not tid or tid in seen:
|
| 173 |
+
continue
|
| 174 |
+
seen.add(tid)
|
| 175 |
+
result.append(terminal)
|
| 176 |
+
return result
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def limited_unique(values: Iterable[str], limit: int) -> list[str]:
|
| 180 |
+
seen: set[str] = set()
|
| 181 |
+
result: list[str] = []
|
| 182 |
+
for value in values:
|
| 183 |
+
if not value or not value.strip() or value in seen:
|
| 184 |
+
continue
|
| 185 |
+
seen.add(value)
|
| 186 |
+
result.append(value)
|
| 187 |
+
if len(result) >= limit:
|
| 188 |
+
break
|
| 189 |
+
return result
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def edge_labels(node: dict[str, Any], limit: int) -> list[str]:
|
| 193 |
+
labels = []
|
| 194 |
+
for edge in node.get("children") or []:
|
| 195 |
+
if isinstance(edge, dict) and edge.get("label") is not None:
|
| 196 |
+
labels.append(str(edge["label"]))
|
| 197 |
+
return limited_unique(labels, limit)
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def aggregate_examples(terminals: list[dict[str, Any]], key: str, limit: int) -> list[str]:
|
| 201 |
+
values: list[str] = []
|
| 202 |
+
for terminal in terminals:
|
| 203 |
+
if key == "prefix":
|
| 204 |
+
value = terminal.get("prefix")
|
| 205 |
+
if value is not None:
|
| 206 |
+
values.append(str(value))
|
| 207 |
+
else:
|
| 208 |
+
values.extend(string_list(terminal.get(key)))
|
| 209 |
+
return limited_unique(values, limit)
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def aggregate_needs_review(terminals: list[dict[str, Any]]) -> bool:
|
| 213 |
+
for index, terminal in enumerate(terminals):
|
| 214 |
+
annotations = terminal.get("annotations")
|
| 215 |
+
if isinstance(annotations, dict) and annotations.get("needs_llm_review") is not None:
|
| 216 |
+
if bool(annotations["needs_llm_review"]):
|
| 217 |
+
return True
|
| 218 |
+
continue
|
| 219 |
+
if heuristic_patch(terminal, index)["needs_llm_review"]:
|
| 220 |
+
return True
|
| 221 |
+
return False
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def annotation_template() -> dict[str, Any]:
|
| 225 |
+
return {
|
| 226 |
+
"episode_title_suffixes": [],
|
| 227 |
+
"media_suffixes": [],
|
| 228 |
+
"title_candidates": [],
|
| 229 |
+
"notes": None,
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def recommended_prompt(kind: str, terminal_count: int) -> str:
|
| 234 |
+
if kind == "shared_suffix":
|
| 235 |
+
return (
|
| 236 |
+
"Review the shared DAG suffix examples and mark episode-title text, media metadata, "
|
| 237 |
+
f"and possible title candidates for {terminal_count} linked terminals."
|
| 238 |
+
)
|
| 239 |
+
return "Review this terminal cluster and mark episode-title text, media metadata, and title candidates."
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
def make_unit(
|
| 243 |
+
node: dict[str, Any],
|
| 244 |
+
node_id_value: int,
|
| 245 |
+
terminals: list[dict[str, Any]],
|
| 246 |
+
kind: str,
|
| 247 |
+
example_count: int,
|
| 248 |
+
) -> dict[str, Any]:
|
| 249 |
+
terminal_ids = [str(terminal["_terminal_id"]) for terminal in terminals]
|
| 250 |
+
reachable_weight = int_field(node, "reachable_weight")
|
| 251 |
+
if reachable_weight <= 0:
|
| 252 |
+
reachable_weight = sum(int_field(terminal, "weight", int_field(terminal, "count", 1)) for terminal in terminals)
|
| 253 |
+
return {
|
| 254 |
+
"unit_id": f"dag-node-{node_id_value}",
|
| 255 |
+
"node_id": node_id_value,
|
| 256 |
+
"kind": kind,
|
| 257 |
+
"incoming_count": int_field(node, "incoming_count"),
|
| 258 |
+
"reachable_terminals": len(terminals),
|
| 259 |
+
"reachable_weight": reachable_weight,
|
| 260 |
+
"terminal_ids": terminal_ids,
|
| 261 |
+
"prefix_examples": aggregate_examples(terminals, "prefix", example_count),
|
| 262 |
+
"value_examples": aggregate_examples(terminals, "value_examples", example_count),
|
| 263 |
+
"suffix_examples": aggregate_examples(terminals, "suffix_examples", example_count),
|
| 264 |
+
"common_edge_labels": edge_labels(node, example_count),
|
| 265 |
+
"needs_llm_review": aggregate_needs_review(terminals),
|
| 266 |
+
"recommended_prompt": recommended_prompt(kind, len(terminals)),
|
| 267 |
+
"annotations": annotation_template(),
|
| 268 |
+
}
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
def selected_units(dag: dict[str, Any], args: Args) -> list[dict[str, Any]]:
|
| 272 |
+
nodes, terminals_by_node = build_indexes(dag)
|
| 273 |
+
root = int(dag.get("root", 0) or 0)
|
| 274 |
+
candidates: list[tuple[tuple[int, int, int, int, int], dict[str, Any]]] = []
|
| 275 |
+
|
| 276 |
+
for current_id, node in nodes.items():
|
| 277 |
+
terminals = reachable_terminals(current_id, nodes, terminals_by_node)
|
| 278 |
+
if not terminals:
|
| 279 |
+
continue
|
| 280 |
+
incoming_count = int_field(node, "incoming_count")
|
| 281 |
+
reachable_count = len(terminals)
|
| 282 |
+
by_shared_incoming = incoming_count >= args.min_incoming_count
|
| 283 |
+
by_reachable = current_id != root and reachable_count >= args.min_reachable_terminals
|
| 284 |
+
if by_shared_incoming or by_reachable:
|
| 285 |
+
unit = make_unit(node, current_id, terminals, "shared_suffix", args.example_count)
|
| 286 |
+
sort_key = (
|
| 287 |
+
0,
|
| 288 |
+
-unit["reachable_weight"],
|
| 289 |
+
-unit["reachable_terminals"],
|
| 290 |
+
-unit["incoming_count"],
|
| 291 |
+
current_id,
|
| 292 |
+
)
|
| 293 |
+
candidates.append((sort_key, unit))
|
| 294 |
+
|
| 295 |
+
covered_terminal_ids = {
|
| 296 |
+
terminal_id
|
| 297 |
+
for _sort_key, unit in candidates
|
| 298 |
+
for terminal_id in unit["terminal_ids"]
|
| 299 |
+
}
|
| 300 |
+
for current_id, terminals in terminals_by_node.items():
|
| 301 |
+
uncovered = [terminal for terminal in terminals if terminal["_terminal_id"] not in covered_terminal_ids]
|
| 302 |
+
if not uncovered:
|
| 303 |
+
continue
|
| 304 |
+
node = nodes.get(current_id, {"id": current_id})
|
| 305 |
+
unit = make_unit(node, current_id, uncovered, "terminal_cluster", args.example_count)
|
| 306 |
+
sort_key = (
|
| 307 |
+
1,
|
| 308 |
+
-unit["reachable_weight"],
|
| 309 |
+
-unit["reachable_terminals"],
|
| 310 |
+
-unit["incoming_count"],
|
| 311 |
+
current_id,
|
| 312 |
+
)
|
| 313 |
+
candidates.append((sort_key, unit))
|
| 314 |
+
|
| 315 |
+
candidates.sort(key=lambda item: item[0])
|
| 316 |
+
units = [unit for _sort_key, unit in candidates]
|
| 317 |
+
if args.limit is not None:
|
| 318 |
+
units = units[: max(0, args.limit)]
|
| 319 |
+
return units
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
def write_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> int:
|
| 323 |
+
path.parent.mkdir(parents=True, exist_ok=True)
|
| 324 |
+
count = 0
|
| 325 |
+
with path.open("w", encoding="utf-8", newline="\n") as handle:
|
| 326 |
+
for row in rows:
|
| 327 |
+
handle.write(json.dumps(row, ensure_ascii=False, separators=(",", ":")) + "\n")
|
| 328 |
+
count += 1
|
| 329 |
+
return count
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
def main() -> None:
|
| 333 |
+
args = parse_args()
|
| 334 |
+
dag = load_dag(args.dag)
|
| 335 |
+
units = selected_units(dag, args)
|
| 336 |
+
count = write_jsonl(args.output, units)
|
| 337 |
+
summary = {
|
| 338 |
+
"dag": str(args.dag),
|
| 339 |
+
"output": str(args.output),
|
| 340 |
+
"annotation_units": count,
|
| 341 |
+
"min_reachable_terminals": args.min_reachable_terminals,
|
| 342 |
+
"min_incoming_count": args.min_incoming_count,
|
| 343 |
+
"example_count": args.example_count,
|
| 344 |
+
}
|
| 345 |
+
print(json.dumps(summary, ensure_ascii=False, indent=2))
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
if __name__ == "__main__":
|
| 349 |
+
main()
|
tools/annotate_dmhy_prefix_graph.py
ADDED
|
@@ -0,0 +1,715 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Annotate DMHY prefix graph terminals and emit weak-label dataset rows.
|
| 2 |
+
|
| 3 |
+
The graph producer intentionally leaves terminal.annotations empty. This tool
|
| 4 |
+
adds a deterministic suffix-format layer without depending on network access:
|
| 5 |
+
|
| 6 |
+
- classify suffix examples into episode-title text vs media/hash metadata
|
| 7 |
+
- optionally ask an OpenAI-compatible Responses API for a second opinion
|
| 8 |
+
- write dmhy_weak-compatible JSONL records: filename, tokens, labels
|
| 9 |
+
- optionally write graph annotation patch JSONL and/or a merged graph JSON
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import argparse
|
| 15 |
+
import json
|
| 16 |
+
import os
|
| 17 |
+
import re
|
| 18 |
+
import sys
|
| 19 |
+
import urllib.error
|
| 20 |
+
import urllib.request
|
| 21 |
+
from dataclasses import dataclass
|
| 22 |
+
from datetime import datetime, timezone
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
from typing import Any, Iterable
|
| 25 |
+
|
| 26 |
+
from anifilebert.tokenizer import AnimeTokenizer
|
| 27 |
+
from tools.dmhy_dataset import weak_label_filename
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
DEFAULT_GRAPH = Path("datasets/AnimeName/dmhy_prefix_graph.json")
|
| 31 |
+
DEFAULT_SOURCE_LIST = Path("datasets/AnimeName/dmhy_list.jsonl")
|
| 32 |
+
DEFAULT_OUTPUT = Path("datasets/AnimeName/dmhy_weak.generated.jsonl")
|
| 33 |
+
DEFAULT_PATCH_OUTPUT = Path("datasets/AnimeName/dmhy_prefix_graph.annotations.jsonl")
|
| 34 |
+
DEFAULT_MODEL = "gpt-5.4-mini"
|
| 35 |
+
|
| 36 |
+
SOURCE = "heuristic-v1"
|
| 37 |
+
LLM_SOURCE = "responses-v1"
|
| 38 |
+
|
| 39 |
+
TRAILING_HASH_RE = re.compile(r"^[A-Fa-f0-9]{8,}$")
|
| 40 |
+
RESOLUTION_RE = re.compile(r"(?i)(?:\b\d{3,4}p\b|\b\dk\b|\b\d{3,4}[xX×]\d{3,4}\b)")
|
| 41 |
+
MEDIA_WORD_RE = re.compile(
|
| 42 |
+
r"(?i)\b(?:"
|
| 43 |
+
r"web[-_. ]?dl|web[-_. ]?rip|bdrip|blu[-_. ]?ray|bdmv|bd|dvd[-_. ]?rip|dvd|"
|
| 44 |
+
r"hdtv|tvrip|remux|x26[45]|h\.?26[45]|hevc|avc|av1|aac\d*(?:\.\d+)?|"
|
| 45 |
+
r"flac|mp3|dts|opus|10[-_. ]?bit|8[-_. ]?bit|hi10p|ma10p|yuv\d+p?\d*|"
|
| 46 |
+
r"chs|cht|gb|big5|jpn?|eng|m?subs?|assx?\d*|srtx?\d*|vfr|cfr|"
|
| 47 |
+
r"nf|netflix|amzn|baha|cr|abema|dsnp|hulu"
|
| 48 |
+
r")\b"
|
| 49 |
+
)
|
| 50 |
+
LANG_CJK_RE = re.compile(r"(?:字幕|简体|繁体|简中|繁中|日语|英语|双语|内封|外挂)")
|
| 51 |
+
QUOTED_RE = re.compile(r"[「『\"“](.+?)[」』\"”]")
|
| 52 |
+
BRACKET_SEGMENT_RE = re.compile(r"(\[[^\]]+\]|\([^)]+\)|【[^】]+】|《[^》]+》)")
|
| 53 |
+
PATH_EPISODE_TITLE_RE = re.compile(
|
| 54 |
+
r"(?i)(?:^|[/\\])[^/\\]*?(?:"
|
| 55 |
+
r"S\d{1,2}E\d{1,4}|\d{1,2}x\d{1,4}|EP?\.?\s*\d{1,4}|ACT\.?\d{1,4}|第\s*\d{1,4}\s*[話话集回]"
|
| 56 |
+
r")\s*[-_ ]+(?P<title>[^/\\\[\(【《]+)"
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@dataclass
|
| 61 |
+
class Args:
|
| 62 |
+
graph: Path
|
| 63 |
+
source_list: Path
|
| 64 |
+
output: Path
|
| 65 |
+
patch_output: Path | None
|
| 66 |
+
merge_output: Path | None
|
| 67 |
+
limit: int | None
|
| 68 |
+
min_weight: int | None
|
| 69 |
+
only_needs_review: bool
|
| 70 |
+
llm: bool
|
| 71 |
+
base_url: str
|
| 72 |
+
api_key: str | None
|
| 73 |
+
model: str
|
| 74 |
+
max_requests: int | None
|
| 75 |
+
http_timeout: int
|
| 76 |
+
preserve_i_labels: bool
|
| 77 |
+
examples_only: bool
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def parse_args() -> Args:
|
| 81 |
+
parser = argparse.ArgumentParser(
|
| 82 |
+
description="Annotate DMHY prefix graph terminals and write dmhy_weak-compatible rows"
|
| 83 |
+
)
|
| 84 |
+
parser.add_argument("--graph", type=Path, default=DEFAULT_GRAPH, help="Input dmhy_prefix_graph.json")
|
| 85 |
+
parser.add_argument(
|
| 86 |
+
"--source-list",
|
| 87 |
+
type=Path,
|
| 88 |
+
default=DEFAULT_SOURCE_LIST,
|
| 89 |
+
help="Input dmhy_list.jsonl with full raw values; each line must contain a JSON object with value",
|
| 90 |
+
)
|
| 91 |
+
parser.add_argument(
|
| 92 |
+
"--output",
|
| 93 |
+
type=Path,
|
| 94 |
+
default=DEFAULT_OUTPUT,
|
| 95 |
+
help="Output dataset JSONL records compatible with dmhy_weak.jsonl",
|
| 96 |
+
)
|
| 97 |
+
parser.add_argument(
|
| 98 |
+
"--patch-output",
|
| 99 |
+
default=str(DEFAULT_PATCH_OUTPUT),
|
| 100 |
+
help="Optional JSONL terminal annotation patches; use empty string to disable",
|
| 101 |
+
)
|
| 102 |
+
parser.add_argument("--merge-output", type=Path, default=None, help="Optional full graph JSON with terminal.annotations merged")
|
| 103 |
+
parser.add_argument("--limit", type=int, default=None, help="Maximum selected terminals to process")
|
| 104 |
+
parser.add_argument("--min-weight", type=int, default=None, help="Only process terminals with weight >= this value")
|
| 105 |
+
parser.add_argument("--only-needs-review", action="store_true", help="Only process terminals with ambiguous suffix examples")
|
| 106 |
+
parser.add_argument("--llm", action="store_true", help="Opt in to Responses API annotation")
|
| 107 |
+
parser.add_argument(
|
| 108 |
+
"--base-url",
|
| 109 |
+
default=os.environ.get("ANIFILEBERT_LLM_BASE_URL", "http://10.137.32.209:8317/v1"),
|
| 110 |
+
help="OpenAI-compatible API base URL; used only with --llm",
|
| 111 |
+
)
|
| 112 |
+
parser.add_argument(
|
| 113 |
+
"--api-key",
|
| 114 |
+
default=os.environ.get("ANIFILEBERT_LLM_API_KEY"),
|
| 115 |
+
help="API key; falls back to ANIFILEBERT_LLM_API_KEY",
|
| 116 |
+
)
|
| 117 |
+
parser.add_argument("--model", default=DEFAULT_MODEL, help="Responses API model")
|
| 118 |
+
parser.add_argument("--max-requests", type=int, default=None, help="Maximum LLM requests; omitted means no cap")
|
| 119 |
+
parser.add_argument("--http-timeout", type=int, default=120, help="HTTP timeout in seconds per LLM request")
|
| 120 |
+
parser.add_argument(
|
| 121 |
+
"--preserve-i-labels",
|
| 122 |
+
action="store_true",
|
| 123 |
+
help="Keep I-* labels from weak labeling instead of normalizing generated token labels to B/O only",
|
| 124 |
+
)
|
| 125 |
+
parser.add_argument(
|
| 126 |
+
"--examples-only",
|
| 127 |
+
action="store_true",
|
| 128 |
+
help="Use terminal.value_examples only; preserves the old small-sample behavior",
|
| 129 |
+
)
|
| 130 |
+
ns = parser.parse_args()
|
| 131 |
+
patch_output_arg = str(ns.patch_output).strip()
|
| 132 |
+
patch_output = Path(patch_output_arg) if patch_output_arg else None
|
| 133 |
+
if patch_output is not None and str(patch_output).strip() == "":
|
| 134 |
+
patch_output = None
|
| 135 |
+
return Args(
|
| 136 |
+
graph=ns.graph,
|
| 137 |
+
source_list=ns.source_list,
|
| 138 |
+
output=ns.output,
|
| 139 |
+
patch_output=patch_output,
|
| 140 |
+
merge_output=ns.merge_output,
|
| 141 |
+
limit=ns.limit,
|
| 142 |
+
min_weight=ns.min_weight,
|
| 143 |
+
only_needs_review=ns.only_needs_review,
|
| 144 |
+
llm=ns.llm,
|
| 145 |
+
base_url=ns.base_url,
|
| 146 |
+
api_key=ns.api_key,
|
| 147 |
+
model=ns.model,
|
| 148 |
+
max_requests=ns.max_requests,
|
| 149 |
+
http_timeout=ns.http_timeout,
|
| 150 |
+
preserve_i_labels=ns.preserve_i_labels,
|
| 151 |
+
examples_only=ns.examples_only,
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def load_graph(path: Path) -> dict[str, Any]:
|
| 156 |
+
if not path.exists():
|
| 157 |
+
raise SystemExit(f"graph not found: {path}")
|
| 158 |
+
try:
|
| 159 |
+
graph = json.loads(path.read_text(encoding="utf-8"))
|
| 160 |
+
except json.JSONDecodeError as exc:
|
| 161 |
+
raise SystemExit(f"invalid graph JSON in {path}: {exc}") from exc
|
| 162 |
+
if not isinstance(graph, dict):
|
| 163 |
+
raise SystemExit(f"invalid graph schema in {path}: root must be an object")
|
| 164 |
+
terminals = graph.get("terminals")
|
| 165 |
+
if not isinstance(terminals, list):
|
| 166 |
+
raise SystemExit(f"invalid graph schema in {path}: missing terminals list")
|
| 167 |
+
if not terminals:
|
| 168 |
+
raise SystemExit(f"graph has no terminals: {path}")
|
| 169 |
+
return graph
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def terminal_id(terminal: dict[str, Any], index: int) -> str:
|
| 173 |
+
for key in ("terminal_id", "id", "node_id"):
|
| 174 |
+
value = terminal.get(key)
|
| 175 |
+
if value is not None:
|
| 176 |
+
return str(value)
|
| 177 |
+
return str(index)
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def string_list(value: Any) -> list[str]:
|
| 181 |
+
if not isinstance(value, list):
|
| 182 |
+
return []
|
| 183 |
+
return [str(item) for item in value if str(item).strip()]
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def unique_keep_order(values: Iterable[str]) -> list[str]:
|
| 187 |
+
seen: set[str] = set()
|
| 188 |
+
result: list[str] = []
|
| 189 |
+
for value in values:
|
| 190 |
+
cleaned = normalize_space(value)
|
| 191 |
+
if not cleaned or cleaned in seen:
|
| 192 |
+
continue
|
| 193 |
+
seen.add(cleaned)
|
| 194 |
+
result.append(cleaned)
|
| 195 |
+
return result
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def normalize_space(value: str) -> str:
|
| 199 |
+
return re.sub(r"\s+", " ", value).strip()
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def clean_candidate(value: str) -> str:
|
| 203 |
+
value = normalize_space(value)
|
| 204 |
+
value = value.strip("-_ .~/\\|")
|
| 205 |
+
value = value.strip("[]()【】《》「」『』\"“”")
|
| 206 |
+
return normalize_space(value.replace("_", " "))
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def is_media_fragment(value: str) -> bool:
|
| 210 |
+
text = clean_candidate(value)
|
| 211 |
+
if not text:
|
| 212 |
+
return False
|
| 213 |
+
if TRAILING_HASH_RE.fullmatch(text):
|
| 214 |
+
return True
|
| 215 |
+
if RESOLUTION_RE.search(text) or MEDIA_WORD_RE.search(text) or LANG_CJK_RE.search(text):
|
| 216 |
+
return True
|
| 217 |
+
if len(text) <= 16 and re.fullmatch(r"[A-Fa-f0-9]{8,}(?:\s*rev)?", text):
|
| 218 |
+
return True
|
| 219 |
+
return False
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def split_suffix_fragments(suffix: str) -> tuple[list[str], list[str]]:
|
| 223 |
+
episode_titles: list[str] = []
|
| 224 |
+
media: list[str] = []
|
| 225 |
+
|
| 226 |
+
for match in QUOTED_RE.finditer(suffix):
|
| 227 |
+
candidate = clean_candidate(match.group(1))
|
| 228 |
+
if candidate and not is_media_fragment(candidate):
|
| 229 |
+
episode_titles.append(candidate)
|
| 230 |
+
|
| 231 |
+
remainder = suffix
|
| 232 |
+
for segment in BRACKET_SEGMENT_RE.findall(suffix):
|
| 233 |
+
cleaned = clean_candidate(segment)
|
| 234 |
+
if is_media_fragment(cleaned):
|
| 235 |
+
media.append(segment.strip())
|
| 236 |
+
remainder = remainder.replace(segment, " ", 1)
|
| 237 |
+
|
| 238 |
+
for match in PATH_EPISODE_TITLE_RE.finditer(suffix):
|
| 239 |
+
candidate = clean_candidate(match.group("title"))
|
| 240 |
+
if candidate and not is_media_fragment(candidate):
|
| 241 |
+
episode_titles.append(candidate)
|
| 242 |
+
|
| 243 |
+
for piece in re.split(r"[/\\]", remainder):
|
| 244 |
+
cleaned = clean_candidate(piece)
|
| 245 |
+
if not cleaned:
|
| 246 |
+
continue
|
| 247 |
+
if is_media_fragment(cleaned):
|
| 248 |
+
media.append(cleaned)
|
| 249 |
+
elif QUOTED_RE.search(piece):
|
| 250 |
+
continue
|
| 251 |
+
elif looks_like_plain_episode_title(cleaned):
|
| 252 |
+
episode_titles.append(cleaned)
|
| 253 |
+
|
| 254 |
+
return unique_keep_order(episode_titles), unique_keep_order(media)
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def looks_like_plain_episode_title(value: str) -> bool:
|
| 258 |
+
if len(value) < 3 or is_media_fragment(value):
|
| 259 |
+
return False
|
| 260 |
+
if re.fullmatch(r"(?i)(?:part|ova|special|season|stage|act)\s*\d+", value):
|
| 261 |
+
return False
|
| 262 |
+
if re.fullmatch(r"[\d\s._-]+", value):
|
| 263 |
+
return False
|
| 264 |
+
return bool(re.search(r"[A-Za-z\u3040-\u30ff\u3400-\u9fff]", value))
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def heuristic_patch(terminal: dict[str, Any], index: int) -> dict[str, Any]:
|
| 268 |
+
suffix_examples = string_list(terminal.get("suffix_examples"))
|
| 269 |
+
value_examples = string_list(terminal.get("value_examples"))
|
| 270 |
+
episode_titles: list[str] = []
|
| 271 |
+
media_suffixes: list[str] = []
|
| 272 |
+
|
| 273 |
+
for suffix in suffix_examples:
|
| 274 |
+
title_bits, media_bits = split_suffix_fragments(suffix)
|
| 275 |
+
episode_titles.extend(title_bits)
|
| 276 |
+
media_suffixes.extend(media_bits)
|
| 277 |
+
|
| 278 |
+
if not episode_titles:
|
| 279 |
+
for value in value_examples:
|
| 280 |
+
for match in PATH_EPISODE_TITLE_RE.finditer(value):
|
| 281 |
+
candidate = clean_candidate(match.group("title"))
|
| 282 |
+
if candidate and not is_media_fragment(candidate):
|
| 283 |
+
episode_titles.append(candidate)
|
| 284 |
+
|
| 285 |
+
episode_titles = unique_keep_order(episode_titles)
|
| 286 |
+
media_suffixes = unique_keep_order(media_suffixes)
|
| 287 |
+
title_candidates = unique_keep_order(clean_candidate(item) for item in episode_titles)
|
| 288 |
+
needs_review = needs_llm_review(terminal, episode_titles, media_suffixes)
|
| 289 |
+
notes = summarize_notes(suffix_examples, episode_titles, media_suffixes, needs_review)
|
| 290 |
+
return {
|
| 291 |
+
"terminal_id": terminal_id(terminal, index),
|
| 292 |
+
"needs_llm_review": needs_review,
|
| 293 |
+
"episode_title_suffixes": episode_titles,
|
| 294 |
+
"media_suffixes": media_suffixes,
|
| 295 |
+
"title_candidates": title_candidates,
|
| 296 |
+
"llm_label": None,
|
| 297 |
+
"notes": notes,
|
| 298 |
+
"source": SOURCE,
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
def needs_llm_review(
|
| 303 |
+
terminal: dict[str, Any],
|
| 304 |
+
episode_titles: list[str],
|
| 305 |
+
media_suffixes: list[str],
|
| 306 |
+
) -> bool:
|
| 307 |
+
suffix_examples = string_list(terminal.get("suffix_examples"))
|
| 308 |
+
if not suffix_examples:
|
| 309 |
+
return False
|
| 310 |
+
classified = len(episode_titles) + len(media_suffixes)
|
| 311 |
+
if episode_titles and media_suffixes:
|
| 312 |
+
return True
|
| 313 |
+
if classified == 0:
|
| 314 |
+
return True
|
| 315 |
+
suffix_text = " ".join(suffix_examples)
|
| 316 |
+
if "/" in suffix_text or "\\" in suffix_text:
|
| 317 |
+
return True
|
| 318 |
+
return False
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
def summarize_notes(
|
| 322 |
+
suffix_examples: list[str],
|
| 323 |
+
episode_titles: list[str],
|
| 324 |
+
media_suffixes: list[str],
|
| 325 |
+
needs_review: bool,
|
| 326 |
+
) -> str:
|
| 327 |
+
parts = [
|
| 328 |
+
f"suffix_examples={len(suffix_examples)}",
|
| 329 |
+
f"episode_title_suffixes={len(episode_titles)}",
|
| 330 |
+
f"media_suffixes={len(media_suffixes)}",
|
| 331 |
+
]
|
| 332 |
+
if needs_review:
|
| 333 |
+
parts.append("ambiguous_suffix_layer")
|
| 334 |
+
return "; ".join(parts)
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
def selected_terminals(graph: dict[str, Any], args: Args) -> list[tuple[int, dict[str, Any], dict[str, Any]]]:
|
| 338 |
+
selected: list[tuple[int, dict[str, Any], dict[str, Any]]] = []
|
| 339 |
+
for index, terminal in enumerate(graph["terminals"]):
|
| 340 |
+
if not isinstance(terminal, dict):
|
| 341 |
+
continue
|
| 342 |
+
weight = int(terminal.get("weight") or terminal.get("count") or 0)
|
| 343 |
+
if args.min_weight is not None and weight < args.min_weight:
|
| 344 |
+
continue
|
| 345 |
+
patch = heuristic_patch(terminal, index)
|
| 346 |
+
if args.only_needs_review and not patch["needs_llm_review"]:
|
| 347 |
+
continue
|
| 348 |
+
selected.append((index, terminal, patch))
|
| 349 |
+
if args.limit is not None and len(selected) >= args.limit:
|
| 350 |
+
break
|
| 351 |
+
return selected
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
def responses_url(base_url: str) -> str:
|
| 355 |
+
return base_url.rstrip("/") + "/responses"
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
def extract_response_text(data: dict[str, Any]) -> str:
|
| 359 |
+
output_text = data.get("output_text")
|
| 360 |
+
if isinstance(output_text, str) and output_text.strip():
|
| 361 |
+
return output_text
|
| 362 |
+
chunks: list[str] = []
|
| 363 |
+
for item in data.get("output") or []:
|
| 364 |
+
if not isinstance(item, dict):
|
| 365 |
+
continue
|
| 366 |
+
for content in item.get("content") or []:
|
| 367 |
+
if not isinstance(content, dict):
|
| 368 |
+
continue
|
| 369 |
+
text = content.get("text")
|
| 370 |
+
if isinstance(text, str):
|
| 371 |
+
chunks.append(text)
|
| 372 |
+
return "\n".join(chunks).strip()
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
def call_llm(terminal: dict[str, Any], patch: dict[str, Any], args: Args) -> dict[str, Any] | None:
|
| 376 |
+
if not args.api_key:
|
| 377 |
+
raise RuntimeError("--llm requires --api-key or ANIFILEBERT_LLM_API_KEY")
|
| 378 |
+
|
| 379 |
+
instructions = (
|
| 380 |
+
"You annotate anime filename suffix examples. Return strict JSON only with keys "
|
| 381 |
+
"episode_title_suffixes, media_suffixes, title_candidates, llm_label, notes. "
|
| 382 |
+
"Classify quoted human episode titles separately from media tags such as resolution, "
|
| 383 |
+
"codec, source, language, subtitle markers, hashes, and release metadata."
|
| 384 |
+
)
|
| 385 |
+
payload = {
|
| 386 |
+
"model": args.model,
|
| 387 |
+
"instructions": instructions,
|
| 388 |
+
"input": json.dumps(
|
| 389 |
+
{
|
| 390 |
+
"terminal_id": patch["terminal_id"],
|
| 391 |
+
"prefix": terminal.get("prefix"),
|
| 392 |
+
"digit_skeleton": terminal.get("digit_skeleton"),
|
| 393 |
+
"suffix_examples": string_list(terminal.get("suffix_examples")),
|
| 394 |
+
"value_examples": string_list(terminal.get("value_examples")),
|
| 395 |
+
"heuristic_patch": patch,
|
| 396 |
+
},
|
| 397 |
+
ensure_ascii=False,
|
| 398 |
+
),
|
| 399 |
+
}
|
| 400 |
+
request = urllib.request.Request(
|
| 401 |
+
responses_url(args.base_url),
|
| 402 |
+
data=json.dumps(payload, ensure_ascii=False).encode("utf-8"),
|
| 403 |
+
headers={
|
| 404 |
+
"Authorization": f"Bearer {args.api_key}",
|
| 405 |
+
"Content-Type": "application/json",
|
| 406 |
+
},
|
| 407 |
+
method="POST",
|
| 408 |
+
)
|
| 409 |
+
try:
|
| 410 |
+
with urllib.request.urlopen(request, timeout=args.http_timeout) as response:
|
| 411 |
+
raw = response.read().decode("utf-8")
|
| 412 |
+
except urllib.error.HTTPError as exc:
|
| 413 |
+
body = exc.read().decode("utf-8", errors="replace")
|
| 414 |
+
raise RuntimeError(f"Responses API HTTP {exc.code}: {body[:500]}") from exc
|
| 415 |
+
except urllib.error.URLError as exc:
|
| 416 |
+
raise RuntimeError(f"Responses API request failed: {exc}") from exc
|
| 417 |
+
|
| 418 |
+
try:
|
| 419 |
+
data = json.loads(raw)
|
| 420 |
+
text = extract_response_text(data)
|
| 421 |
+
annotation = json.loads(strip_json_fence(text))
|
| 422 |
+
except (json.JSONDecodeError, TypeError) as exc:
|
| 423 |
+
raise RuntimeError(f"Responses API returned non-JSON annotation: {raw[:500]}") from exc
|
| 424 |
+
|
| 425 |
+
if not isinstance(annotation, dict):
|
| 426 |
+
raise RuntimeError("Responses API annotation must be a JSON object")
|
| 427 |
+
merged = dict(patch)
|
| 428 |
+
for key in ("episode_title_suffixes", "media_suffixes", "title_candidates"):
|
| 429 |
+
if key in annotation:
|
| 430 |
+
merged[key] = unique_keep_order(str(item) for item in annotation.get(key) or [])
|
| 431 |
+
if "llm_label" in annotation:
|
| 432 |
+
merged["llm_label"] = annotation["llm_label"]
|
| 433 |
+
if "notes" in annotation:
|
| 434 |
+
merged["notes"] = str(annotation["notes"])
|
| 435 |
+
merged["source"] = LLM_SOURCE
|
| 436 |
+
return merged
|
| 437 |
+
|
| 438 |
+
|
| 439 |
+
def strip_json_fence(text: str) -> str:
|
| 440 |
+
text = text.strip()
|
| 441 |
+
text = re.sub(r"^```(?:json)?\s*", "", text)
|
| 442 |
+
text = re.sub(r"\s*```$", "", text)
|
| 443 |
+
return text.strip()
|
| 444 |
+
|
| 445 |
+
|
| 446 |
+
PREFIX_BOUNDARY_CHARS = set(" \t\r\n-_.~/\\|::[]()【】《》「」『』\"'")
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
def prefix_boundary_ok(value: str, prefix: str) -> bool:
|
| 450 |
+
if not prefix or not value.startswith(prefix):
|
| 451 |
+
return False
|
| 452 |
+
if len(value) == len(prefix):
|
| 453 |
+
return True
|
| 454 |
+
next_char = value[len(prefix)]
|
| 455 |
+
last_char = prefix[-1]
|
| 456 |
+
return next_char in PREFIX_BOUNDARY_CHARS or last_char in PREFIX_BOUNDARY_CHARS
|
| 457 |
+
|
| 458 |
+
|
| 459 |
+
class PrefixTrieNode:
|
| 460 |
+
__slots__ = ("children", "terminal_ordinals")
|
| 461 |
+
|
| 462 |
+
def __init__(self) -> None:
|
| 463 |
+
self.children: dict[str, PrefixTrieNode] = {}
|
| 464 |
+
self.terminal_ordinals: list[int] = []
|
| 465 |
+
|
| 466 |
+
|
| 467 |
+
def build_prefix_trie(selected: list[tuple[int, dict[str, Any], dict[str, Any]]]) -> PrefixTrieNode:
|
| 468 |
+
root = PrefixTrieNode()
|
| 469 |
+
for ordinal, (_index, terminal, _patch) in enumerate(selected):
|
| 470 |
+
prefix = str(terminal.get("prefix") or "")
|
| 471 |
+
if not prefix:
|
| 472 |
+
continue
|
| 473 |
+
node = root
|
| 474 |
+
for char in prefix:
|
| 475 |
+
node = node.children.setdefault(char, PrefixTrieNode())
|
| 476 |
+
node.terminal_ordinals.append(ordinal)
|
| 477 |
+
return root
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
def matching_terminal_ordinal(value: str, trie: PrefixTrieNode, selected: list[tuple[int, dict[str, Any], dict[str, Any]]]) -> int | None:
|
| 481 |
+
node = trie
|
| 482 |
+
best: int | None = None
|
| 483 |
+
for char in value:
|
| 484 |
+
node = node.children.get(char)
|
| 485 |
+
if node is None:
|
| 486 |
+
break
|
| 487 |
+
for ordinal in node.terminal_ordinals:
|
| 488 |
+
prefix = str(selected[ordinal][1].get("prefix") or "")
|
| 489 |
+
if prefix_boundary_ok(value, prefix):
|
| 490 |
+
best = ordinal
|
| 491 |
+
return best
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
def source_list_matches(
|
| 495 |
+
source_list: Path,
|
| 496 |
+
selected: list[tuple[int, dict[str, Any], dict[str, Any]]],
|
| 497 |
+
) -> dict[int, list[tuple[int, str]]]:
|
| 498 |
+
if not source_list.exists():
|
| 499 |
+
raise SystemExit(f"source list not found: {source_list}")
|
| 500 |
+
|
| 501 |
+
trie = build_prefix_trie(selected)
|
| 502 |
+
matches: dict[int, list[tuple[int, str]]] = {ordinal: [] for ordinal in range(len(selected))}
|
| 503 |
+
with source_list.open("r", encoding="utf-8") as handle:
|
| 504 |
+
for line_number, line in enumerate(handle, start=1):
|
| 505 |
+
if not line.strip():
|
| 506 |
+
continue
|
| 507 |
+
try:
|
| 508 |
+
row = json.loads(line)
|
| 509 |
+
except json.JSONDecodeError as exc:
|
| 510 |
+
raise SystemExit(f"invalid JSON in {source_list}:{line_number}: {exc}") from exc
|
| 511 |
+
if not isinstance(row, dict):
|
| 512 |
+
continue
|
| 513 |
+
value = row.get("value")
|
| 514 |
+
if not isinstance(value, str) or not value.strip():
|
| 515 |
+
continue
|
| 516 |
+
ordinal = matching_terminal_ordinal(value, trie, selected)
|
| 517 |
+
if ordinal is not None:
|
| 518 |
+
matches[ordinal].append((line_number, value))
|
| 519 |
+
return matches
|
| 520 |
+
|
| 521 |
+
|
| 522 |
+
def dataset_records(
|
| 523 |
+
terminal: dict[str, Any],
|
| 524 |
+
index: int,
|
| 525 |
+
patch: dict[str, Any],
|
| 526 |
+
tokenizer: AnimeTokenizer,
|
| 527 |
+
*,
|
| 528 |
+
filenames: Iterable[tuple[int, str]] | None = None,
|
| 529 |
+
preserve_i_labels: bool = False,
|
| 530 |
+
) -> list[dict[str, Any]]:
|
| 531 |
+
records: list[dict[str, Any]] = []
|
| 532 |
+
seen: set[str] = set()
|
| 533 |
+
if filenames is None:
|
| 534 |
+
filenames = enumerate(string_list(terminal.get("value_examples")))
|
| 535 |
+
for source_index, filename in filenames:
|
| 536 |
+
if filename in seen:
|
| 537 |
+
continue
|
| 538 |
+
seen.add(filename)
|
| 539 |
+
sample = weak_label_filename(filename, tokenizer)
|
| 540 |
+
if sample is None:
|
| 541 |
+
continue
|
| 542 |
+
tokens, labels = normalize_generated_tokens(
|
| 543 |
+
sample["tokens"],
|
| 544 |
+
sample["labels"],
|
| 545 |
+
preserve_i_labels=preserve_i_labels,
|
| 546 |
+
)
|
| 547 |
+
records.append(
|
| 548 |
+
{
|
| 549 |
+
"file_id": f"prefix-graph:{patch['terminal_id']}:{source_index}",
|
| 550 |
+
"filename": filename,
|
| 551 |
+
"tokens": tokens,
|
| 552 |
+
"labels": labels,
|
| 553 |
+
"terminal_id": patch["terminal_id"],
|
| 554 |
+
"terminal_index": index,
|
| 555 |
+
"source": patch["source"],
|
| 556 |
+
"needs_llm_review": patch["needs_llm_review"],
|
| 557 |
+
"episode_title_suffixes": patch["episode_title_suffixes"],
|
| 558 |
+
"media_suffixes": patch["media_suffixes"],
|
| 559 |
+
"title_candidates": patch["title_candidates"],
|
| 560 |
+
"annotations": {
|
| 561 |
+
"terminal_id": patch["terminal_id"],
|
| 562 |
+
"terminal_index": index,
|
| 563 |
+
"source": patch["source"],
|
| 564 |
+
"needs_llm_review": patch["needs_llm_review"],
|
| 565 |
+
"episode_title_suffixes": patch["episode_title_suffixes"],
|
| 566 |
+
"media_suffixes": patch["media_suffixes"],
|
| 567 |
+
"title_candidates": patch["title_candidates"],
|
| 568 |
+
"llm_label": patch["llm_label"],
|
| 569 |
+
"notes": patch["notes"],
|
| 570 |
+
},
|
| 571 |
+
}
|
| 572 |
+
)
|
| 573 |
+
return records
|
| 574 |
+
|
| 575 |
+
|
| 576 |
+
def is_standalone_separator(token: str) -> bool:
|
| 577 |
+
return len(token) == 1 and (token.isspace() or not token.isalnum())
|
| 578 |
+
|
| 579 |
+
|
| 580 |
+
def split_generated_token(token: str) -> list[str]:
|
| 581 |
+
pieces: list[str] = []
|
| 582 |
+
current: list[str] = []
|
| 583 |
+
for char in token:
|
| 584 |
+
if char.isspace() or not char.isalnum():
|
| 585 |
+
if current:
|
| 586 |
+
pieces.append("".join(current))
|
| 587 |
+
current.clear()
|
| 588 |
+
pieces.append(char)
|
| 589 |
+
else:
|
| 590 |
+
current.append(char)
|
| 591 |
+
if current:
|
| 592 |
+
pieces.append("".join(current))
|
| 593 |
+
return pieces
|
| 594 |
+
|
| 595 |
+
|
| 596 |
+
def b_only_label(label: str) -> str:
|
| 597 |
+
if label.startswith(("B-", "I-")):
|
| 598 |
+
return "B-" + label.split("-", 1)[1]
|
| 599 |
+
return "O" if label == "O" else str(label)
|
| 600 |
+
|
| 601 |
+
|
| 602 |
+
def normalize_generated_tokens(
|
| 603 |
+
tokens: list[str],
|
| 604 |
+
labels: list[str],
|
| 605 |
+
*,
|
| 606 |
+
preserve_i_labels: bool = False,
|
| 607 |
+
) -> tuple[list[str], list[str]]:
|
| 608 |
+
normalized_tokens: list[str] = []
|
| 609 |
+
normalized_labels: list[str] = []
|
| 610 |
+
for token, label in zip(tokens, labels):
|
| 611 |
+
source_label = str(label)
|
| 612 |
+
entity_label = source_label if preserve_i_labels else b_only_label(source_label)
|
| 613 |
+
for piece in split_generated_token(str(token)):
|
| 614 |
+
normalized_tokens.append(piece)
|
| 615 |
+
if entity_label == "O" or is_standalone_separator(piece):
|
| 616 |
+
normalized_labels.append("O")
|
| 617 |
+
else:
|
| 618 |
+
normalized_labels.append(entity_label)
|
| 619 |
+
return normalized_tokens, normalized_labels
|
| 620 |
+
|
| 621 |
+
|
| 622 |
+
def write_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> int:
|
| 623 |
+
path.parent.mkdir(parents=True, exist_ok=True)
|
| 624 |
+
count = 0
|
| 625 |
+
with path.open("w", encoding="utf-8", newline="\n") as handle:
|
| 626 |
+
for row in rows:
|
| 627 |
+
handle.write(json.dumps(row, ensure_ascii=False, separators=(",", ":")) + "\n")
|
| 628 |
+
count += 1
|
| 629 |
+
return count
|
| 630 |
+
|
| 631 |
+
|
| 632 |
+
def merge_annotations(graph: dict[str, Any], patches_by_id: dict[str, dict[str, Any]]) -> dict[str, Any]:
|
| 633 |
+
merged = json.loads(json.dumps(graph, ensure_ascii=False))
|
| 634 |
+
for index, terminal in enumerate(merged.get("terminals") or []):
|
| 635 |
+
if not isinstance(terminal, dict):
|
| 636 |
+
continue
|
| 637 |
+
patch = patches_by_id.get(terminal_id(terminal, index))
|
| 638 |
+
if patch is None:
|
| 639 |
+
continue
|
| 640 |
+
terminal["annotations"] = {
|
| 641 |
+
"episode_title_suffixes": patch["episode_title_suffixes"],
|
| 642 |
+
"media_suffixes": patch["media_suffixes"],
|
| 643 |
+
"title_candidates": patch["title_candidates"],
|
| 644 |
+
"needs_llm_review": patch["needs_llm_review"],
|
| 645 |
+
"llm_label": patch["llm_label"],
|
| 646 |
+
"notes": patch["notes"],
|
| 647 |
+
"source": patch["source"],
|
| 648 |
+
"annotated_at": datetime.now(timezone.utc).isoformat(),
|
| 649 |
+
}
|
| 650 |
+
return merged
|
| 651 |
+
|
| 652 |
+
|
| 653 |
+
def main() -> None:
|
| 654 |
+
args = parse_args()
|
| 655 |
+
graph = load_graph(args.graph)
|
| 656 |
+
selected = selected_terminals(graph, args)
|
| 657 |
+
if not selected:
|
| 658 |
+
raise SystemExit("no terminals selected; adjust --limit/--min-weight/--only-needs-review")
|
| 659 |
+
|
| 660 |
+
tokenizer = AnimeTokenizer()
|
| 661 |
+
llm_requests = 0
|
| 662 |
+
patches: list[dict[str, Any]] = []
|
| 663 |
+
records: list[dict[str, Any]] = []
|
| 664 |
+
source_matches = None if args.examples_only else source_list_matches(args.source_list, selected)
|
| 665 |
+
|
| 666 |
+
for ordinal, (index, terminal, patch) in enumerate(selected):
|
| 667 |
+
if args.llm and patch["needs_llm_review"]:
|
| 668 |
+
if args.max_requests is None or llm_requests < args.max_requests:
|
| 669 |
+
try:
|
| 670 |
+
llm_patch = call_llm(terminal, patch, args)
|
| 671 |
+
if llm_patch is not None:
|
| 672 |
+
patch = llm_patch
|
| 673 |
+
llm_requests += 1
|
| 674 |
+
except RuntimeError as exc:
|
| 675 |
+
print(f"warning: terminal {patch['terminal_id']}: {exc}; using heuristic patch", file=sys.stderr)
|
| 676 |
+
patch["notes"] = f"{patch['notes']}; llm_error={exc}"
|
| 677 |
+
patches.append(patch)
|
| 678 |
+
records.extend(
|
| 679 |
+
dataset_records(
|
| 680 |
+
terminal,
|
| 681 |
+
index,
|
| 682 |
+
patch,
|
| 683 |
+
tokenizer,
|
| 684 |
+
filenames=None if args.examples_only else source_matches.get(ordinal, []),
|
| 685 |
+
preserve_i_labels=args.preserve_i_labels,
|
| 686 |
+
)
|
| 687 |
+
)
|
| 688 |
+
|
| 689 |
+
record_count = write_jsonl(args.output, records)
|
| 690 |
+
patch_count = 0
|
| 691 |
+
if args.patch_output is not None:
|
| 692 |
+
patch_count = write_jsonl(args.patch_output, patches)
|
| 693 |
+
if args.merge_output is not None:
|
| 694 |
+
args.merge_output.parent.mkdir(parents=True, exist_ok=True)
|
| 695 |
+
merged = merge_annotations(graph, {patch["terminal_id"]: patch for patch in patches})
|
| 696 |
+
args.merge_output.write_text(json.dumps(merged, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
|
| 697 |
+
|
| 698 |
+
summary = {
|
| 699 |
+
"graph": str(args.graph),
|
| 700 |
+
"source_list": None if args.examples_only else str(args.source_list),
|
| 701 |
+
"output": str(args.output),
|
| 702 |
+
"patch_output": str(args.patch_output) if args.patch_output is not None else None,
|
| 703 |
+
"merge_output": str(args.merge_output) if args.merge_output is not None else None,
|
| 704 |
+
"selected_terminals": len(selected),
|
| 705 |
+
"examples_only": args.examples_only,
|
| 706 |
+
"dataset_records": record_count,
|
| 707 |
+
"patches": patch_count,
|
| 708 |
+
"llm_enabled": args.llm,
|
| 709 |
+
"llm_requests": llm_requests,
|
| 710 |
+
}
|
| 711 |
+
print(json.dumps(summary, ensure_ascii=False, indent=2))
|
| 712 |
+
|
| 713 |
+
|
| 714 |
+
if __name__ == "__main__":
|
| 715 |
+
main()
|
tools/convert_annotated_dmhy_dataset.py
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Convert annotated DMHY graph JSONL into the character-tokenized dataset.
|
| 2 |
+
|
| 3 |
+
The annotated graph workflow is expected to produce records compatible with
|
| 4 |
+
``dmhy_weak.jsonl``: each row has ``filename``, ``tokens``, and ``labels``.
|
| 5 |
+
This wrapper validates that contract, then reuses ``tools.convert_to_char_dataset``
|
| 6 |
+
for the token-to-character projection and manifest statistics.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import argparse
|
| 12 |
+
import json
|
| 13 |
+
from collections import Counter
|
| 14 |
+
from datetime import datetime, timezone
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from statistics import mean
|
| 17 |
+
from typing import Iterable
|
| 18 |
+
|
| 19 |
+
from tools.convert_to_char_dataset import (
|
| 20 |
+
build_vocab,
|
| 21 |
+
convert_record,
|
| 22 |
+
coverage,
|
| 23 |
+
percentile,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
DEFAULT_INPUT = Path("datasets/AnimeName/dmhy_weak.generated.jsonl")
|
| 28 |
+
DEFAULT_OUTPUT = Path("datasets/AnimeName/dmhy_weak.generated_char.jsonl")
|
| 29 |
+
DEFAULT_VOCAB_OUTPUT = Path("datasets/AnimeName/vocab.generated.char.json")
|
| 30 |
+
DEFAULT_MANIFEST_OUTPUT = Path(
|
| 31 |
+
"datasets/AnimeName/dmhy_weak.generated_char.manifest.json"
|
| 32 |
+
)
|
| 33 |
+
REQUIRED_FIELDS = ("filename", "tokens", "labels")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def is_separator_or_space(char: str) -> bool:
|
| 37 |
+
return char.isspace() or not char.isalnum()
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def token_has_embedded_separator(token: str) -> bool:
|
| 41 |
+
return len(token) > 1 and any(is_separator_or_space(char) for char in token)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def is_bioish_label(label: object) -> bool:
|
| 45 |
+
if not isinstance(label, str):
|
| 46 |
+
return False
|
| 47 |
+
if label == "O":
|
| 48 |
+
return True
|
| 49 |
+
prefix, sep, entity = label.partition("-")
|
| 50 |
+
return sep == "-" and prefix in {"B", "I"} and bool(entity)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def validate_record(
|
| 54 |
+
record: object,
|
| 55 |
+
path: Path,
|
| 56 |
+
line_no: int,
|
| 57 |
+
*,
|
| 58 |
+
check_punctuation: bool = True,
|
| 59 |
+
) -> dict:
|
| 60 |
+
if not isinstance(record, dict):
|
| 61 |
+
raise ValueError(f"{path}:{line_no}: record must be a JSON object")
|
| 62 |
+
|
| 63 |
+
missing = [field for field in REQUIRED_FIELDS if field not in record]
|
| 64 |
+
if missing:
|
| 65 |
+
raise ValueError(
|
| 66 |
+
f"{path}:{line_no}: missing required field(s): {', '.join(missing)}"
|
| 67 |
+
)
|
| 68 |
+
|
| 69 |
+
filename = record["filename"]
|
| 70 |
+
tokens = record["tokens"]
|
| 71 |
+
labels = record["labels"]
|
| 72 |
+
|
| 73 |
+
if not isinstance(filename, str) or not filename:
|
| 74 |
+
raise ValueError(f"{path}:{line_no}: filename must be a non-empty string")
|
| 75 |
+
if not isinstance(tokens, list):
|
| 76 |
+
raise ValueError(f"{path}:{line_no}: tokens must be a list")
|
| 77 |
+
if not isinstance(labels, list):
|
| 78 |
+
raise ValueError(f"{path}:{line_no}: labels must be a list")
|
| 79 |
+
if len(tokens) != len(labels):
|
| 80 |
+
raise ValueError(
|
| 81 |
+
f"{path}:{line_no}: token/label length mismatch: "
|
| 82 |
+
f"{len(tokens)} tokens, {len(labels)} labels"
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
for index, token in enumerate(tokens):
|
| 86 |
+
if not isinstance(token, str):
|
| 87 |
+
raise ValueError(f"{path}:{line_no}: tokens[{index}] must be a string")
|
| 88 |
+
if check_punctuation and token_has_embedded_separator(token):
|
| 89 |
+
raise ValueError(
|
| 90 |
+
f"{path}:{line_no}: tokens[{index}] contains punctuation, symbol, or "
|
| 91 |
+
f"whitespace that should be a standalone token: {token!r}"
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
for index, label in enumerate(labels):
|
| 95 |
+
if not is_bioish_label(label):
|
| 96 |
+
raise ValueError(
|
| 97 |
+
f"{path}:{line_no}: labels[{index}] is not BIO-ish: {label!r}"
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
return record
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def iter_validated_jsonl(path: Path, *, check_punctuation: bool = True) -> Iterable[dict]:
|
| 104 |
+
with path.open("r", encoding="utf-8") as handle:
|
| 105 |
+
for line_no, line in enumerate(handle, 1):
|
| 106 |
+
line = line.strip()
|
| 107 |
+
if not line:
|
| 108 |
+
continue
|
| 109 |
+
try:
|
| 110 |
+
record = json.loads(line)
|
| 111 |
+
except json.JSONDecodeError as exc:
|
| 112 |
+
raise ValueError(f"{path}:{line_no}: invalid JSON") from exc
|
| 113 |
+
yield validate_record(
|
| 114 |
+
record,
|
| 115 |
+
path,
|
| 116 |
+
line_no,
|
| 117 |
+
check_punctuation=check_punctuation,
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def parse_args() -> argparse.Namespace:
|
| 122 |
+
parser = argparse.ArgumentParser(
|
| 123 |
+
description=(
|
| 124 |
+
"Validate annotated DMHY graph JSONL and convert it to the "
|
| 125 |
+
"character-tokenized training format."
|
| 126 |
+
),
|
| 127 |
+
epilog=(
|
| 128 |
+
"Equivalent projection logic is provided by "
|
| 129 |
+
"tools.convert_to_char_dataset.convert_record."
|
| 130 |
+
),
|
| 131 |
+
)
|
| 132 |
+
parser.add_argument(
|
| 133 |
+
"--input",
|
| 134 |
+
default=str(DEFAULT_INPUT),
|
| 135 |
+
help=f"Input dmhy_weak-compatible JSONL (default: {DEFAULT_INPUT})",
|
| 136 |
+
)
|
| 137 |
+
parser.add_argument(
|
| 138 |
+
"--output",
|
| 139 |
+
default=str(DEFAULT_OUTPUT),
|
| 140 |
+
help=f"Output character-level JSONL (default: {DEFAULT_OUTPUT})",
|
| 141 |
+
)
|
| 142 |
+
parser.add_argument(
|
| 143 |
+
"--vocab-output",
|
| 144 |
+
default=str(DEFAULT_VOCAB_OUTPUT),
|
| 145 |
+
help=f"Output character vocab JSON (default: {DEFAULT_VOCAB_OUTPUT})",
|
| 146 |
+
)
|
| 147 |
+
parser.add_argument(
|
| 148 |
+
"--manifest-output",
|
| 149 |
+
default=str(DEFAULT_MANIFEST_OUTPUT),
|
| 150 |
+
help=(
|
| 151 |
+
"Output conversion manifest JSON "
|
| 152 |
+
f"(default: {DEFAULT_MANIFEST_OUTPUT})"
|
| 153 |
+
),
|
| 154 |
+
)
|
| 155 |
+
parser.add_argument(
|
| 156 |
+
"--max-vocab-size",
|
| 157 |
+
type=int,
|
| 158 |
+
default=None,
|
| 159 |
+
help="Optional vocab cap including special tokens",
|
| 160 |
+
)
|
| 161 |
+
parser.add_argument("--limit", type=int, default=None, help="Convert only N rows")
|
| 162 |
+
parser.add_argument(
|
| 163 |
+
"--progress",
|
| 164 |
+
type=int,
|
| 165 |
+
default=50_000,
|
| 166 |
+
help="Print progress every N records",
|
| 167 |
+
)
|
| 168 |
+
parser.add_argument(
|
| 169 |
+
"--validate-only",
|
| 170 |
+
action="store_true",
|
| 171 |
+
help="Validate input records without writing converted outputs",
|
| 172 |
+
)
|
| 173 |
+
parser.add_argument(
|
| 174 |
+
"--allow-embedded-punctuation",
|
| 175 |
+
action="store_true",
|
| 176 |
+
help=(
|
| 177 |
+
"Skip the generated-workflow check that punctuation and whitespace "
|
| 178 |
+
"must be standalone tokens."
|
| 179 |
+
),
|
| 180 |
+
)
|
| 181 |
+
return parser.parse_args()
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def main() -> None:
|
| 185 |
+
args = parse_args()
|
| 186 |
+
input_path = Path(args.input)
|
| 187 |
+
output_path = Path(args.output)
|
| 188 |
+
vocab_path = Path(args.vocab_output)
|
| 189 |
+
manifest_path = Path(args.manifest_output)
|
| 190 |
+
|
| 191 |
+
if not input_path.exists():
|
| 192 |
+
raise FileNotFoundError(f"input JSONL does not exist: {input_path}")
|
| 193 |
+
|
| 194 |
+
if not args.validate_only:
|
| 195 |
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 196 |
+
vocab_path.parent.mkdir(parents=True, exist_ok=True)
|
| 197 |
+
manifest_path.parent.mkdir(parents=True, exist_ok=True)
|
| 198 |
+
|
| 199 |
+
char_counter: Counter[str] = Counter()
|
| 200 |
+
label_counter: Counter[str] = Counter()
|
| 201 |
+
row_count = 0
|
| 202 |
+
source_token_count = 0
|
| 203 |
+
char_token_count = 0
|
| 204 |
+
lengths: list[int] = []
|
| 205 |
+
examples: list[dict] = []
|
| 206 |
+
|
| 207 |
+
output_handle = None
|
| 208 |
+
try:
|
| 209 |
+
if not args.validate_only:
|
| 210 |
+
output_handle = output_path.open("w", encoding="utf-8", newline="\n")
|
| 211 |
+
|
| 212 |
+
for record in iter_validated_jsonl(
|
| 213 |
+
input_path,
|
| 214 |
+
check_punctuation=not args.allow_embedded_punctuation,
|
| 215 |
+
):
|
| 216 |
+
converted = convert_record(record)
|
| 217 |
+
|
| 218 |
+
if output_handle is not None:
|
| 219 |
+
output_handle.write(
|
| 220 |
+
json.dumps(converted, ensure_ascii=False, separators=(",", ":"))
|
| 221 |
+
+ "\n"
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
row_count += 1
|
| 225 |
+
source_token_count += converted["source_token_count"]
|
| 226 |
+
char_len = converted["char_token_count"]
|
| 227 |
+
char_token_count += char_len
|
| 228 |
+
lengths.append(char_len)
|
| 229 |
+
char_counter.update(converted["tokens"])
|
| 230 |
+
label_counter.update(converted["labels"])
|
| 231 |
+
if len(examples) < 5:
|
| 232 |
+
examples.append(converted)
|
| 233 |
+
|
| 234 |
+
if args.limit is not None and row_count >= args.limit:
|
| 235 |
+
break
|
| 236 |
+
if args.progress and row_count % args.progress == 0:
|
| 237 |
+
print(f"converted {row_count:,} rows; unique chars={len(char_counter):,}")
|
| 238 |
+
finally:
|
| 239 |
+
if output_handle is not None:
|
| 240 |
+
output_handle.close()
|
| 241 |
+
|
| 242 |
+
vocab = build_vocab(char_counter, args.max_vocab_size)
|
| 243 |
+
|
| 244 |
+
manifest = {
|
| 245 |
+
"created_at": datetime.now(timezone.utc).isoformat(),
|
| 246 |
+
"input": str(input_path),
|
| 247 |
+
"output": None if args.validate_only else str(output_path),
|
| 248 |
+
"vocab_output": None if args.validate_only else str(vocab_path),
|
| 249 |
+
"manifest_output": None if args.validate_only else str(manifest_path),
|
| 250 |
+
"tokenizer_variant": "char",
|
| 251 |
+
"source_workflow": "annotated_dmhy_graph",
|
| 252 |
+
"validation": {
|
| 253 |
+
"required_fields": list(REQUIRED_FIELDS),
|
| 254 |
+
"label_contract": "O or B-*/I-* with a non-empty entity name; B/O-only is accepted",
|
| 255 |
+
"punctuation_standalone": not args.allow_embedded_punctuation,
|
| 256 |
+
},
|
| 257 |
+
"projection": {
|
| 258 |
+
"B-X": "first char keeps B-X; remaining chars become I-X",
|
| 259 |
+
"I-X": "all chars keep I-X",
|
| 260 |
+
"O": "all chars keep O",
|
| 261 |
+
},
|
| 262 |
+
"row_count": row_count,
|
| 263 |
+
"source_token_count": source_token_count,
|
| 264 |
+
"char_token_count": char_token_count,
|
| 265 |
+
"unique_char_count": len(char_counter),
|
| 266 |
+
"vocab_size": len(vocab),
|
| 267 |
+
"max_vocab_size": args.max_vocab_size,
|
| 268 |
+
"vocab_coverage": coverage(char_counter, vocab),
|
| 269 |
+
"label_counts": dict(label_counter),
|
| 270 |
+
"char_length": {
|
| 271 |
+
"min": min(lengths) if lengths else 0,
|
| 272 |
+
"mean": mean(lengths) if lengths else 0,
|
| 273 |
+
"p50": percentile(lengths, 50),
|
| 274 |
+
"p90": percentile(lengths, 90),
|
| 275 |
+
"p95": percentile(lengths, 95),
|
| 276 |
+
"p99": percentile(lengths, 99),
|
| 277 |
+
"max": max(lengths) if lengths else 0,
|
| 278 |
+
},
|
| 279 |
+
"examples": examples,
|
| 280 |
+
}
|
| 281 |
+
|
| 282 |
+
if not args.validate_only:
|
| 283 |
+
vocab_path.write_text(
|
| 284 |
+
json.dumps(vocab, ensure_ascii=False, indent=2) + "\n",
|
| 285 |
+
encoding="utf-8",
|
| 286 |
+
)
|
| 287 |
+
manifest_path.write_text(
|
| 288 |
+
json.dumps(manifest, ensure_ascii=False, indent=2) + "\n",
|
| 289 |
+
encoding="utf-8",
|
| 290 |
+
)
|
| 291 |
+
|
| 292 |
+
print(
|
| 293 |
+
json.dumps(
|
| 294 |
+
{key: value for key, value in manifest.items() if key != "examples"},
|
| 295 |
+
ensure_ascii=False,
|
| 296 |
+
indent=2,
|
| 297 |
+
)
|
| 298 |
+
)
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
if __name__ == "__main__":
|
| 302 |
+
main()
|
tools/dmhy_prefix_grouper/Cargo.lock
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file is automatically @generated by Cargo.
|
| 2 |
+
# It is not intended for manual editing.
|
| 3 |
+
version = 4
|
| 4 |
+
|
| 5 |
+
[[package]]
|
| 6 |
+
name = "aho-corasick"
|
| 7 |
+
version = "1.1.4"
|
| 8 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 9 |
+
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"memchr",
|
| 12 |
+
]
|
| 13 |
+
|
| 14 |
+
[[package]]
|
| 15 |
+
name = "anstream"
|
| 16 |
+
version = "1.0.0"
|
| 17 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 18 |
+
checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
|
| 19 |
+
dependencies = [
|
| 20 |
+
"anstyle",
|
| 21 |
+
"anstyle-parse",
|
| 22 |
+
"anstyle-query",
|
| 23 |
+
"anstyle-wincon",
|
| 24 |
+
"colorchoice",
|
| 25 |
+
"is_terminal_polyfill",
|
| 26 |
+
"utf8parse",
|
| 27 |
+
]
|
| 28 |
+
|
| 29 |
+
[[package]]
|
| 30 |
+
name = "anstyle"
|
| 31 |
+
version = "1.0.14"
|
| 32 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 33 |
+
checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
|
| 34 |
+
|
| 35 |
+
[[package]]
|
| 36 |
+
name = "anstyle-parse"
|
| 37 |
+
version = "1.0.0"
|
| 38 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 39 |
+
checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
|
| 40 |
+
dependencies = [
|
| 41 |
+
"utf8parse",
|
| 42 |
+
]
|
| 43 |
+
|
| 44 |
+
[[package]]
|
| 45 |
+
name = "anstyle-query"
|
| 46 |
+
version = "1.1.5"
|
| 47 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 48 |
+
checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
|
| 49 |
+
dependencies = [
|
| 50 |
+
"windows-sys",
|
| 51 |
+
]
|
| 52 |
+
|
| 53 |
+
[[package]]
|
| 54 |
+
name = "anstyle-wincon"
|
| 55 |
+
version = "3.0.11"
|
| 56 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 57 |
+
checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
|
| 58 |
+
dependencies = [
|
| 59 |
+
"anstyle",
|
| 60 |
+
"once_cell_polyfill",
|
| 61 |
+
"windows-sys",
|
| 62 |
+
]
|
| 63 |
+
|
| 64 |
+
[[package]]
|
| 65 |
+
name = "anyhow"
|
| 66 |
+
version = "1.0.102"
|
| 67 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 68 |
+
checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
|
| 69 |
+
|
| 70 |
+
[[package]]
|
| 71 |
+
name = "clap"
|
| 72 |
+
version = "4.6.1"
|
| 73 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 74 |
+
checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
|
| 75 |
+
dependencies = [
|
| 76 |
+
"clap_builder",
|
| 77 |
+
"clap_derive",
|
| 78 |
+
]
|
| 79 |
+
|
| 80 |
+
[[package]]
|
| 81 |
+
name = "clap_builder"
|
| 82 |
+
version = "4.6.0"
|
| 83 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 84 |
+
checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
|
| 85 |
+
dependencies = [
|
| 86 |
+
"anstream",
|
| 87 |
+
"anstyle",
|
| 88 |
+
"clap_lex",
|
| 89 |
+
"strsim",
|
| 90 |
+
]
|
| 91 |
+
|
| 92 |
+
[[package]]
|
| 93 |
+
name = "clap_derive"
|
| 94 |
+
version = "4.6.1"
|
| 95 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 96 |
+
checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9"
|
| 97 |
+
dependencies = [
|
| 98 |
+
"heck",
|
| 99 |
+
"proc-macro2",
|
| 100 |
+
"quote",
|
| 101 |
+
"syn",
|
| 102 |
+
]
|
| 103 |
+
|
| 104 |
+
[[package]]
|
| 105 |
+
name = "clap_lex"
|
| 106 |
+
version = "1.1.0"
|
| 107 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 108 |
+
checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
|
| 109 |
+
|
| 110 |
+
[[package]]
|
| 111 |
+
name = "colorchoice"
|
| 112 |
+
version = "1.0.5"
|
| 113 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 114 |
+
checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
|
| 115 |
+
|
| 116 |
+
[[package]]
|
| 117 |
+
name = "crossbeam-deque"
|
| 118 |
+
version = "0.8.6"
|
| 119 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 120 |
+
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
|
| 121 |
+
dependencies = [
|
| 122 |
+
"crossbeam-epoch",
|
| 123 |
+
"crossbeam-utils",
|
| 124 |
+
]
|
| 125 |
+
|
| 126 |
+
[[package]]
|
| 127 |
+
name = "crossbeam-epoch"
|
| 128 |
+
version = "0.9.18"
|
| 129 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 130 |
+
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
| 131 |
+
dependencies = [
|
| 132 |
+
"crossbeam-utils",
|
| 133 |
+
]
|
| 134 |
+
|
| 135 |
+
[[package]]
|
| 136 |
+
name = "crossbeam-utils"
|
| 137 |
+
version = "0.8.21"
|
| 138 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 139 |
+
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
| 140 |
+
|
| 141 |
+
[[package]]
|
| 142 |
+
name = "dmhy_prefix_grouper"
|
| 143 |
+
version = "0.1.0"
|
| 144 |
+
dependencies = [
|
| 145 |
+
"anyhow",
|
| 146 |
+
"clap",
|
| 147 |
+
"rayon",
|
| 148 |
+
"regex",
|
| 149 |
+
"serde",
|
| 150 |
+
"serde_json",
|
| 151 |
+
]
|
| 152 |
+
|
| 153 |
+
[[package]]
|
| 154 |
+
name = "either"
|
| 155 |
+
version = "1.16.0"
|
| 156 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 157 |
+
checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
|
| 158 |
+
|
| 159 |
+
[[package]]
|
| 160 |
+
name = "heck"
|
| 161 |
+
version = "0.5.0"
|
| 162 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 163 |
+
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
| 164 |
+
|
| 165 |
+
[[package]]
|
| 166 |
+
name = "is_terminal_polyfill"
|
| 167 |
+
version = "1.70.2"
|
| 168 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 169 |
+
checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
|
| 170 |
+
|
| 171 |
+
[[package]]
|
| 172 |
+
name = "itoa"
|
| 173 |
+
version = "1.0.18"
|
| 174 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 175 |
+
checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
|
| 176 |
+
|
| 177 |
+
[[package]]
|
| 178 |
+
name = "memchr"
|
| 179 |
+
version = "2.8.1"
|
| 180 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 181 |
+
checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
|
| 182 |
+
|
| 183 |
+
[[package]]
|
| 184 |
+
name = "once_cell_polyfill"
|
| 185 |
+
version = "1.70.2"
|
| 186 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 187 |
+
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
|
| 188 |
+
|
| 189 |
+
[[package]]
|
| 190 |
+
name = "proc-macro2"
|
| 191 |
+
version = "1.0.106"
|
| 192 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 193 |
+
checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
|
| 194 |
+
dependencies = [
|
| 195 |
+
"unicode-ident",
|
| 196 |
+
]
|
| 197 |
+
|
| 198 |
+
[[package]]
|
| 199 |
+
name = "quote"
|
| 200 |
+
version = "1.0.45"
|
| 201 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 202 |
+
checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
|
| 203 |
+
dependencies = [
|
| 204 |
+
"proc-macro2",
|
| 205 |
+
]
|
| 206 |
+
|
| 207 |
+
[[package]]
|
| 208 |
+
name = "rayon"
|
| 209 |
+
version = "1.12.0"
|
| 210 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 211 |
+
checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
|
| 212 |
+
dependencies = [
|
| 213 |
+
"either",
|
| 214 |
+
"rayon-core",
|
| 215 |
+
]
|
| 216 |
+
|
| 217 |
+
[[package]]
|
| 218 |
+
name = "rayon-core"
|
| 219 |
+
version = "1.13.0"
|
| 220 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 221 |
+
checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
|
| 222 |
+
dependencies = [
|
| 223 |
+
"crossbeam-deque",
|
| 224 |
+
"crossbeam-utils",
|
| 225 |
+
]
|
| 226 |
+
|
| 227 |
+
[[package]]
|
| 228 |
+
name = "regex"
|
| 229 |
+
version = "1.12.3"
|
| 230 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 231 |
+
checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
|
| 232 |
+
dependencies = [
|
| 233 |
+
"aho-corasick",
|
| 234 |
+
"memchr",
|
| 235 |
+
"regex-automata",
|
| 236 |
+
"regex-syntax",
|
| 237 |
+
]
|
| 238 |
+
|
| 239 |
+
[[package]]
|
| 240 |
+
name = "regex-automata"
|
| 241 |
+
version = "0.4.14"
|
| 242 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 243 |
+
checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
|
| 244 |
+
dependencies = [
|
| 245 |
+
"aho-corasick",
|
| 246 |
+
"memchr",
|
| 247 |
+
"regex-syntax",
|
| 248 |
+
]
|
| 249 |
+
|
| 250 |
+
[[package]]
|
| 251 |
+
name = "regex-syntax"
|
| 252 |
+
version = "0.8.10"
|
| 253 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 254 |
+
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
|
| 255 |
+
|
| 256 |
+
[[package]]
|
| 257 |
+
name = "serde"
|
| 258 |
+
version = "1.0.228"
|
| 259 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 260 |
+
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
|
| 261 |
+
dependencies = [
|
| 262 |
+
"serde_core",
|
| 263 |
+
"serde_derive",
|
| 264 |
+
]
|
| 265 |
+
|
| 266 |
+
[[package]]
|
| 267 |
+
name = "serde_core"
|
| 268 |
+
version = "1.0.228"
|
| 269 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 270 |
+
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
|
| 271 |
+
dependencies = [
|
| 272 |
+
"serde_derive",
|
| 273 |
+
]
|
| 274 |
+
|
| 275 |
+
[[package]]
|
| 276 |
+
name = "serde_derive"
|
| 277 |
+
version = "1.0.228"
|
| 278 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 279 |
+
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
|
| 280 |
+
dependencies = [
|
| 281 |
+
"proc-macro2",
|
| 282 |
+
"quote",
|
| 283 |
+
"syn",
|
| 284 |
+
]
|
| 285 |
+
|
| 286 |
+
[[package]]
|
| 287 |
+
name = "serde_json"
|
| 288 |
+
version = "1.0.150"
|
| 289 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 290 |
+
checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
|
| 291 |
+
dependencies = [
|
| 292 |
+
"itoa",
|
| 293 |
+
"memchr",
|
| 294 |
+
"serde",
|
| 295 |
+
"serde_core",
|
| 296 |
+
"zmij",
|
| 297 |
+
]
|
| 298 |
+
|
| 299 |
+
[[package]]
|
| 300 |
+
name = "strsim"
|
| 301 |
+
version = "0.11.1"
|
| 302 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 303 |
+
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
| 304 |
+
|
| 305 |
+
[[package]]
|
| 306 |
+
name = "syn"
|
| 307 |
+
version = "2.0.117"
|
| 308 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 309 |
+
checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
|
| 310 |
+
dependencies = [
|
| 311 |
+
"proc-macro2",
|
| 312 |
+
"quote",
|
| 313 |
+
"unicode-ident",
|
| 314 |
+
]
|
| 315 |
+
|
| 316 |
+
[[package]]
|
| 317 |
+
name = "unicode-ident"
|
| 318 |
+
version = "1.0.24"
|
| 319 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 320 |
+
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
|
| 321 |
+
|
| 322 |
+
[[package]]
|
| 323 |
+
name = "utf8parse"
|
| 324 |
+
version = "0.2.2"
|
| 325 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 326 |
+
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
| 327 |
+
|
| 328 |
+
[[package]]
|
| 329 |
+
name = "windows-link"
|
| 330 |
+
version = "0.2.1"
|
| 331 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 332 |
+
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
|
| 333 |
+
|
| 334 |
+
[[package]]
|
| 335 |
+
name = "windows-sys"
|
| 336 |
+
version = "0.61.2"
|
| 337 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 338 |
+
checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
|
| 339 |
+
dependencies = [
|
| 340 |
+
"windows-link",
|
| 341 |
+
]
|
| 342 |
+
|
| 343 |
+
[[package]]
|
| 344 |
+
name = "zmij"
|
| 345 |
+
version = "1.0.21"
|
| 346 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 347 |
+
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
|
tools/dmhy_prefix_grouper/Cargo.toml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[package]
|
| 2 |
+
name = "dmhy_prefix_grouper"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
edition = "2021"
|
| 5 |
+
|
| 6 |
+
[dependencies]
|
| 7 |
+
anyhow = "1.0"
|
| 8 |
+
clap = { version = "4.5", features = ["derive"] }
|
| 9 |
+
rayon = "1.10"
|
| 10 |
+
regex = "1.11"
|
| 11 |
+
serde = { version = "1.0", features = ["derive"] }
|
| 12 |
+
serde_json = "1.0"
|
tools/dmhy_prefix_grouper/src/main.rs
ADDED
|
@@ -0,0 +1,1070 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
use std::collections::{BTreeMap, BTreeSet};
|
| 2 |
+
use std::fs::{self, File};
|
| 3 |
+
use std::io::{BufRead, BufReader, BufWriter};
|
| 4 |
+
use std::path::{Path, PathBuf};
|
| 5 |
+
use std::sync::LazyLock;
|
| 6 |
+
|
| 7 |
+
use anyhow::{Context, Result};
|
| 8 |
+
use clap::Parser;
|
| 9 |
+
use rayon::prelude::*;
|
| 10 |
+
use regex::Regex;
|
| 11 |
+
use serde::{Deserialize, Serialize};
|
| 12 |
+
|
| 13 |
+
const RADIX_VERSION: &str = "prefix-radix-v1";
|
| 14 |
+
const DAG_VERSION: &str = "prefix-dag-v1";
|
| 15 |
+
|
| 16 |
+
static EPISODE_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
|
| 17 |
+
vec![
|
| 18 |
+
Regex::new(r"(?i)\bS\d{1,2}E\d{1,4}\b").unwrap(),
|
| 19 |
+
Regex::new(r"(?i)\bEP?\.?\s*\d{1,4}\b").unwrap(),
|
| 20 |
+
Regex::new(r"第\s*\d{1,4}\s*[話话集回]").unwrap(),
|
| 21 |
+
Regex::new(r"\b\d{1,4}\s*[話话集回]").unwrap(),
|
| 22 |
+
Regex::new(r"[\[(]\s*\d{1,4}\s*[\])]").unwrap(),
|
| 23 |
+
]
|
| 24 |
+
});
|
| 25 |
+
|
| 26 |
+
static DIGITS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\d+").unwrap());
|
| 27 |
+
|
| 28 |
+
#[derive(Parser)]
|
| 29 |
+
#[command(about = "Build a deterministic DMHY episode-prefix trie graph")]
|
| 30 |
+
struct Args {
|
| 31 |
+
#[arg(long, default_value = "datasets\\AnimeName\\dmhy_list.jsonl")]
|
| 32 |
+
input: PathBuf,
|
| 33 |
+
|
| 34 |
+
#[arg(long, default_value = "datasets\\AnimeName\\dmhy_prefix_graph.json")]
|
| 35 |
+
output: PathBuf,
|
| 36 |
+
|
| 37 |
+
#[arg(long, default_value_t = 2)]
|
| 38 |
+
min_count: usize,
|
| 39 |
+
|
| 40 |
+
#[arg(long, default_value_t = 5)]
|
| 41 |
+
example_count: usize,
|
| 42 |
+
|
| 43 |
+
#[arg(long)]
|
| 44 |
+
from_prefix_graph: Option<PathBuf>,
|
| 45 |
+
|
| 46 |
+
#[arg(long)]
|
| 47 |
+
dag_output: Option<PathBuf>,
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
#[derive(Clone, Deserialize)]
|
| 51 |
+
struct InputRecord {
|
| 52 |
+
value: String,
|
| 53 |
+
#[serde(default)]
|
| 54 |
+
uses_path: bool,
|
| 55 |
+
#[serde(default)]
|
| 56 |
+
has_trailing_hash: bool,
|
| 57 |
+
#[serde(default, rename = "has_digits")]
|
| 58 |
+
_has_digits: bool,
|
| 59 |
+
#[serde(default)]
|
| 60 |
+
#[serde(rename = "digit_skeleton")]
|
| 61 |
+
_digit_skeleton: String,
|
| 62 |
+
#[serde(default = "default_count")]
|
| 63 |
+
count: usize,
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
#[derive(Clone)]
|
| 67 |
+
struct PatternObservation {
|
| 68 |
+
source_index: usize,
|
| 69 |
+
prefix: String,
|
| 70 |
+
digit_skeleton: String,
|
| 71 |
+
suffix: String,
|
| 72 |
+
value: String,
|
| 73 |
+
uses_path: bool,
|
| 74 |
+
has_trailing_hash: bool,
|
| 75 |
+
count: usize,
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
#[derive(Default)]
|
| 79 |
+
struct GroupBuilder {
|
| 80 |
+
prefix: String,
|
| 81 |
+
digit_skeleton: String,
|
| 82 |
+
count: usize,
|
| 83 |
+
weight: usize,
|
| 84 |
+
uses_path_count: usize,
|
| 85 |
+
has_trailing_hash_count: usize,
|
| 86 |
+
suffix_examples: Vec<String>,
|
| 87 |
+
value_examples: Vec<String>,
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
struct TrieNode {
|
| 91 |
+
id: usize,
|
| 92 |
+
edge_label: String,
|
| 93 |
+
depth: usize,
|
| 94 |
+
parent: Option<usize>,
|
| 95 |
+
children_by_label: BTreeMap<String, usize>,
|
| 96 |
+
subtree_patterns: usize,
|
| 97 |
+
subtree_weight: usize,
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
#[derive(Serialize)]
|
| 101 |
+
struct GraphOutput {
|
| 102 |
+
meta: Meta,
|
| 103 |
+
nodes: Vec<OutputNode>,
|
| 104 |
+
terminals: Vec<OutputTerminal>,
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
#[derive(Serialize)]
|
| 108 |
+
struct Meta {
|
| 109 |
+
input: String,
|
| 110 |
+
output: String,
|
| 111 |
+
input_records: usize,
|
| 112 |
+
observations: usize,
|
| 113 |
+
no_episode_prefix: usize,
|
| 114 |
+
groups: usize,
|
| 115 |
+
nodes: usize,
|
| 116 |
+
max_depth: usize,
|
| 117 |
+
grouped_weight: usize,
|
| 118 |
+
min_count: usize,
|
| 119 |
+
example_count: usize,
|
| 120 |
+
tokenizer: TokenizerMeta,
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
#[derive(Serialize)]
|
| 124 |
+
struct TokenizerMeta {
|
| 125 |
+
version: &'static str,
|
| 126 |
+
notes: Vec<&'static str>,
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
#[derive(Serialize)]
|
| 130 |
+
struct OutputNode {
|
| 131 |
+
id: usize,
|
| 132 |
+
edge_label: String,
|
| 133 |
+
depth: usize,
|
| 134 |
+
children: Vec<usize>,
|
| 135 |
+
subtree_patterns: usize,
|
| 136 |
+
subtree_weight: usize,
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
#[derive(Serialize)]
|
| 140 |
+
struct OutputTerminal {
|
| 141 |
+
node_id: usize,
|
| 142 |
+
prefix: String,
|
| 143 |
+
digit_skeleton: String,
|
| 144 |
+
count: usize,
|
| 145 |
+
weight: usize,
|
| 146 |
+
uses_path_count: usize,
|
| 147 |
+
has_trailing_hash_count: usize,
|
| 148 |
+
suffix_examples: Vec<String>,
|
| 149 |
+
value_examples: Vec<String>,
|
| 150 |
+
annotations: TerminalAnnotations,
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
#[derive(Default, Serialize)]
|
| 154 |
+
struct TerminalAnnotations {
|
| 155 |
+
episode_title_suffixes: Vec<String>,
|
| 156 |
+
media_suffixes: Vec<String>,
|
| 157 |
+
title_candidates: Vec<String>,
|
| 158 |
+
needs_llm_review: bool,
|
| 159 |
+
llm_label: Option<String>,
|
| 160 |
+
notes: Option<String>,
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
#[derive(Deserialize)]
|
| 164 |
+
struct SourceGraph {
|
| 165 |
+
#[allow(dead_code)]
|
| 166 |
+
meta: serde_json::Value,
|
| 167 |
+
nodes: Vec<SourceNode>,
|
| 168 |
+
terminals: Vec<SourceTerminal>,
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
#[derive(Deserialize)]
|
| 172 |
+
struct SourceNode {
|
| 173 |
+
id: usize,
|
| 174 |
+
edge_label: String,
|
| 175 |
+
#[allow(dead_code)]
|
| 176 |
+
depth: usize,
|
| 177 |
+
#[serde(default)]
|
| 178 |
+
children: Vec<usize>,
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
#[derive(Clone, Deserialize, Serialize)]
|
| 182 |
+
struct SourceTerminal {
|
| 183 |
+
#[serde(default, skip_serializing_if = "Option::is_none")]
|
| 184 |
+
terminal_id: Option<String>,
|
| 185 |
+
node_id: usize,
|
| 186 |
+
prefix: String,
|
| 187 |
+
digit_skeleton: String,
|
| 188 |
+
count: usize,
|
| 189 |
+
weight: usize,
|
| 190 |
+
uses_path_count: usize,
|
| 191 |
+
has_trailing_hash_count: usize,
|
| 192 |
+
suffix_examples: Vec<String>,
|
| 193 |
+
value_examples: Vec<String>,
|
| 194 |
+
#[serde(default = "default_annotations_value")]
|
| 195 |
+
annotations: serde_json::Value,
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
#[derive(Serialize)]
|
| 199 |
+
struct DagOutput {
|
| 200 |
+
meta: DagMeta,
|
| 201 |
+
root: usize,
|
| 202 |
+
nodes: Vec<DagNode>,
|
| 203 |
+
terminals: Vec<DagTerminal>,
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
#[derive(Serialize)]
|
| 207 |
+
struct DagMeta {
|
| 208 |
+
version: &'static str,
|
| 209 |
+
input: String,
|
| 210 |
+
output: String,
|
| 211 |
+
source_nodes: usize,
|
| 212 |
+
source_terminals: usize,
|
| 213 |
+
dag_nodes: usize,
|
| 214 |
+
dag_edges: usize,
|
| 215 |
+
root: usize,
|
| 216 |
+
max_depth: usize,
|
| 217 |
+
merged_nodes: usize,
|
| 218 |
+
preserves_digits: bool,
|
| 219 |
+
merge_strategy: &'static str,
|
| 220 |
+
notes: Vec<&'static str>,
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
#[derive(Clone, Serialize)]
|
| 224 |
+
struct DagNode {
|
| 225 |
+
id: usize,
|
| 226 |
+
terminal: bool,
|
| 227 |
+
children: Vec<DagEdge>,
|
| 228 |
+
incoming_count: usize,
|
| 229 |
+
reachable_terminals: usize,
|
| 230 |
+
reachable_weight: usize,
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
#[derive(Clone, Serialize)]
|
| 234 |
+
struct DagEdge {
|
| 235 |
+
label: String,
|
| 236 |
+
target: usize,
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
#[derive(Serialize)]
|
| 240 |
+
struct DagTerminal {
|
| 241 |
+
terminal_id: String,
|
| 242 |
+
node_id: usize,
|
| 243 |
+
prefix: String,
|
| 244 |
+
digit_skeleton: String,
|
| 245 |
+
count: usize,
|
| 246 |
+
weight: usize,
|
| 247 |
+
uses_path_count: usize,
|
| 248 |
+
has_trailing_hash_count: usize,
|
| 249 |
+
suffix_examples: Vec<String>,
|
| 250 |
+
value_examples: Vec<String>,
|
| 251 |
+
annotations: serde_json::Value,
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
#[derive(Clone, Eq, Ord, PartialEq, PartialOrd)]
|
| 255 |
+
struct NodeSignature {
|
| 256 |
+
terminal: bool,
|
| 257 |
+
children: Vec<(String, usize)>,
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
#[derive(Clone)]
|
| 261 |
+
struct TempDagNode {
|
| 262 |
+
terminal: bool,
|
| 263 |
+
children: Vec<DagEdge>,
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
fn main() -> Result<()> {
|
| 267 |
+
let args = Args::parse();
|
| 268 |
+
if args.from_prefix_graph.is_some() || args.dag_output.is_some() {
|
| 269 |
+
let dag = build_dag_from_args(&args)?;
|
| 270 |
+
println!("{}", serde_json::to_string_pretty(&dag.meta)?);
|
| 271 |
+
} else {
|
| 272 |
+
let graph = build_graph(&args)?;
|
| 273 |
+
println!("{}", serde_json::to_string_pretty(&graph.meta)?);
|
| 274 |
+
}
|
| 275 |
+
Ok(())
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
fn build_graph(args: &Args) -> Result<GraphOutput> {
|
| 279 |
+
ensure_output_parent(&args.output)?;
|
| 280 |
+
|
| 281 |
+
let records = read_records(&args.input)?;
|
| 282 |
+
let input_records = records.len();
|
| 283 |
+
|
| 284 |
+
let mut observations = records
|
| 285 |
+
.par_iter()
|
| 286 |
+
.enumerate()
|
| 287 |
+
.filter_map(|(source_index, record)| build_observation(source_index, record))
|
| 288 |
+
.collect::<Vec<_>>();
|
| 289 |
+
|
| 290 |
+
observations.sort_by(|left, right| {
|
| 291 |
+
left.prefix
|
| 292 |
+
.cmp(&right.prefix)
|
| 293 |
+
.then(left.source_index.cmp(&right.source_index))
|
| 294 |
+
});
|
| 295 |
+
|
| 296 |
+
let no_episode_prefix = input_records.saturating_sub(observations.len());
|
| 297 |
+
let groups = aggregate_observations(&observations, args.min_count, args.example_count);
|
| 298 |
+
let (nodes, terminals, max_depth, grouped_weight) = build_trie(&groups);
|
| 299 |
+
|
| 300 |
+
let graph = GraphOutput {
|
| 301 |
+
meta: Meta {
|
| 302 |
+
input: display_path(&args.input),
|
| 303 |
+
output: display_path(&args.output),
|
| 304 |
+
input_records,
|
| 305 |
+
observations: observations.len(),
|
| 306 |
+
no_episode_prefix,
|
| 307 |
+
groups: terminals.len(),
|
| 308 |
+
nodes: nodes.len(),
|
| 309 |
+
max_depth,
|
| 310 |
+
grouped_weight,
|
| 311 |
+
min_count: args.min_count,
|
| 312 |
+
example_count: args.example_count,
|
| 313 |
+
tokenizer: TokenizerMeta {
|
| 314 |
+
version: RADIX_VERSION,
|
| 315 |
+
notes: vec![
|
| 316 |
+
"Episode prefixes are detected with the legacy regex/boundary rules.",
|
| 317 |
+
"Graph insertion preserves original prefix digits; digit_skeleton is secondary metadata only.",
|
| 318 |
+
"Graph nodes form a compressed radix trie over complete original prefix strings.",
|
| 319 |
+
"Edge labels split only at actual branch points; punctuation and separators do not force levels.",
|
| 320 |
+
],
|
| 321 |
+
},
|
| 322 |
+
},
|
| 323 |
+
nodes,
|
| 324 |
+
terminals,
|
| 325 |
+
};
|
| 326 |
+
|
| 327 |
+
let output = File::create(&args.output)
|
| 328 |
+
.with_context(|| format!("failed to create {}", args.output.display()))?;
|
| 329 |
+
let writer = BufWriter::new(output);
|
| 330 |
+
serde_json::to_writer_pretty(writer, &graph)
|
| 331 |
+
.with_context(|| format!("failed to write {}", args.output.display()))?;
|
| 332 |
+
|
| 333 |
+
Ok(graph)
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
fn build_dag_from_args(args: &Args) -> Result<DagOutput> {
|
| 337 |
+
let input = args
|
| 338 |
+
.from_prefix_graph
|
| 339 |
+
.as_ref()
|
| 340 |
+
.unwrap_or(&args.output)
|
| 341 |
+
.to_path_buf();
|
| 342 |
+
let output = args
|
| 343 |
+
.dag_output
|
| 344 |
+
.as_ref()
|
| 345 |
+
.cloned()
|
| 346 |
+
.unwrap_or_else(|| PathBuf::from("datasets\\AnimeName\\dmhy_prefix_dag.json"));
|
| 347 |
+
|
| 348 |
+
build_dag(&input, &output)
|
| 349 |
+
}
|
| 350 |
+
|
| 351 |
+
fn build_dag(input_path: &Path, output_path: &Path) -> Result<DagOutput> {
|
| 352 |
+
ensure_output_parent(output_path)?;
|
| 353 |
+
|
| 354 |
+
let input = File::open(input_path)
|
| 355 |
+
.with_context(|| format!("failed to open {}", input_path.display()))?;
|
| 356 |
+
let reader = BufReader::new(input);
|
| 357 |
+
let source: SourceGraph = serde_json::from_reader(reader)
|
| 358 |
+
.with_context(|| format!("failed to parse {}", input_path.display()))?;
|
| 359 |
+
|
| 360 |
+
validate_source_graph(&source)?;
|
| 361 |
+
let dag = minimize_source_graph(&source, input_path, output_path)?;
|
| 362 |
+
|
| 363 |
+
let output = File::create(output_path)
|
| 364 |
+
.with_context(|| format!("failed to create {}", output_path.display()))?;
|
| 365 |
+
let writer = BufWriter::new(output);
|
| 366 |
+
serde_json::to_writer_pretty(writer, &dag)
|
| 367 |
+
.with_context(|| format!("failed to write {}", output_path.display()))?;
|
| 368 |
+
|
| 369 |
+
Ok(dag)
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
fn validate_source_graph(source: &SourceGraph) -> Result<()> {
|
| 373 |
+
for (index, node) in source.nodes.iter().enumerate() {
|
| 374 |
+
anyhow::ensure!(
|
| 375 |
+
node.id == index,
|
| 376 |
+
"source node id {} appears at index {}",
|
| 377 |
+
node.id,
|
| 378 |
+
index
|
| 379 |
+
);
|
| 380 |
+
for &child_id in &node.children {
|
| 381 |
+
anyhow::ensure!(
|
| 382 |
+
child_id < source.nodes.len(),
|
| 383 |
+
"source node {} references missing child {}",
|
| 384 |
+
node.id,
|
| 385 |
+
child_id
|
| 386 |
+
);
|
| 387 |
+
}
|
| 388 |
+
}
|
| 389 |
+
|
| 390 |
+
for terminal in &source.terminals {
|
| 391 |
+
anyhow::ensure!(
|
| 392 |
+
terminal.node_id < source.nodes.len(),
|
| 393 |
+
"terminal prefix {:?} references missing node {}",
|
| 394 |
+
terminal.prefix,
|
| 395 |
+
terminal.node_id
|
| 396 |
+
);
|
| 397 |
+
}
|
| 398 |
+
|
| 399 |
+
Ok(())
|
| 400 |
+
}
|
| 401 |
+
|
| 402 |
+
fn minimize_source_graph(
|
| 403 |
+
source: &SourceGraph,
|
| 404 |
+
input_path: &Path,
|
| 405 |
+
output_path: &Path,
|
| 406 |
+
) -> Result<DagOutput> {
|
| 407 |
+
let terminal_source_nodes = source
|
| 408 |
+
.terminals
|
| 409 |
+
.iter()
|
| 410 |
+
.map(|terminal| terminal.node_id)
|
| 411 |
+
.collect::<BTreeSet<_>>();
|
| 412 |
+
let postorder = source_postorder(source);
|
| 413 |
+
let mut source_to_temp_dag = vec![usize::MAX; source.nodes.len()];
|
| 414 |
+
let mut signatures = BTreeMap::<NodeSignature, usize>::new();
|
| 415 |
+
let mut temp_nodes = Vec::<TempDagNode>::new();
|
| 416 |
+
|
| 417 |
+
for source_id in postorder {
|
| 418 |
+
let source_node = &source.nodes[source_id];
|
| 419 |
+
let mut children = source_node
|
| 420 |
+
.children
|
| 421 |
+
.iter()
|
| 422 |
+
.map(|&child_id| {
|
| 423 |
+
let child = &source.nodes[child_id];
|
| 424 |
+
let target = source_to_temp_dag[child_id];
|
| 425 |
+
anyhow::ensure!(
|
| 426 |
+
target != usize::MAX,
|
| 427 |
+
"source child {} was not canonicalized before parent {}",
|
| 428 |
+
child_id,
|
| 429 |
+
source_id
|
| 430 |
+
);
|
| 431 |
+
Ok((child.edge_label.clone(), target))
|
| 432 |
+
})
|
| 433 |
+
.collect::<Result<Vec<_>>>()?;
|
| 434 |
+
children.sort();
|
| 435 |
+
|
| 436 |
+
let signature = NodeSignature {
|
| 437 |
+
terminal: terminal_source_nodes.contains(&source_id),
|
| 438 |
+
children,
|
| 439 |
+
};
|
| 440 |
+
|
| 441 |
+
let temp_id = if let Some(&existing) = signatures.get(&signature) {
|
| 442 |
+
existing
|
| 443 |
+
} else {
|
| 444 |
+
let temp_id = temp_nodes.len();
|
| 445 |
+
temp_nodes.push(TempDagNode {
|
| 446 |
+
terminal: signature.terminal,
|
| 447 |
+
children: signature
|
| 448 |
+
.children
|
| 449 |
+
.iter()
|
| 450 |
+
.map(|(label, target)| DagEdge {
|
| 451 |
+
label: label.clone(),
|
| 452 |
+
target: *target,
|
| 453 |
+
})
|
| 454 |
+
.collect(),
|
| 455 |
+
});
|
| 456 |
+
signatures.insert(signature, temp_id);
|
| 457 |
+
temp_id
|
| 458 |
+
};
|
| 459 |
+
|
| 460 |
+
source_to_temp_dag[source_id] = temp_id;
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
let root_temp_id = source_to_temp_dag[0];
|
| 464 |
+
let (temp_to_dag, mut dag_nodes) = renumber_dag(root_temp_id, &temp_nodes);
|
| 465 |
+
for node in &mut dag_nodes {
|
| 466 |
+
for edge in &mut node.children {
|
| 467 |
+
edge.target = temp_to_dag[edge.target];
|
| 468 |
+
}
|
| 469 |
+
}
|
| 470 |
+
|
| 471 |
+
let terminals = source
|
| 472 |
+
.terminals
|
| 473 |
+
.iter()
|
| 474 |
+
.enumerate()
|
| 475 |
+
.map(|(index, terminal)| {
|
| 476 |
+
let temp_id = source_to_temp_dag[terminal.node_id];
|
| 477 |
+
DagTerminal {
|
| 478 |
+
terminal_id: terminal
|
| 479 |
+
.terminal_id
|
| 480 |
+
.clone()
|
| 481 |
+
.unwrap_or_else(|| format!("t{}", index)),
|
| 482 |
+
node_id: temp_to_dag[temp_id],
|
| 483 |
+
prefix: terminal.prefix.clone(),
|
| 484 |
+
digit_skeleton: terminal.digit_skeleton.clone(),
|
| 485 |
+
count: terminal.count,
|
| 486 |
+
weight: terminal.weight,
|
| 487 |
+
uses_path_count: terminal.uses_path_count,
|
| 488 |
+
has_trailing_hash_count: terminal.has_trailing_hash_count,
|
| 489 |
+
suffix_examples: terminal.suffix_examples.clone(),
|
| 490 |
+
value_examples: terminal.value_examples.clone(),
|
| 491 |
+
annotations: terminal.annotations.clone(),
|
| 492 |
+
}
|
| 493 |
+
})
|
| 494 |
+
.collect::<Vec<_>>();
|
| 495 |
+
|
| 496 |
+
fill_dag_counts(&source, &source_to_temp_dag, &temp_to_dag, &mut dag_nodes);
|
| 497 |
+
let dag_edges = dag_nodes.iter().map(|node| node.children.len()).sum();
|
| 498 |
+
let max_depth = dag_max_depth(&dag_nodes);
|
| 499 |
+
|
| 500 |
+
Ok(DagOutput {
|
| 501 |
+
meta: DagMeta {
|
| 502 |
+
version: DAG_VERSION,
|
| 503 |
+
input: display_path(input_path),
|
| 504 |
+
output: display_path(output_path),
|
| 505 |
+
source_nodes: source.nodes.len(),
|
| 506 |
+
source_terminals: source.terminals.len(),
|
| 507 |
+
dag_nodes: dag_nodes.len(),
|
| 508 |
+
dag_edges,
|
| 509 |
+
root: 0,
|
| 510 |
+
max_depth,
|
| 511 |
+
merged_nodes: source.nodes.len().saturating_sub(dag_nodes.len()),
|
| 512 |
+
preserves_digits: true,
|
| 513 |
+
merge_strategy:
|
| 514 |
+
"bottom-up suffix equivalence over radix trie nodes; edge labels remain raw substrings",
|
| 515 |
+
notes: vec![
|
| 516 |
+
"Terminal prefixes and digit_skeleton values are copied from the source graph; digits are not normalized for merging.",
|
| 517 |
+
"Node reachable_terminals and reachable_weight count terminal instances mapped onto the shared DAG suffix, so shared nodes report aggregate suffix usage rather than one caller-specific path.",
|
| 518 |
+
],
|
| 519 |
+
},
|
| 520 |
+
root: 0,
|
| 521 |
+
nodes: dag_nodes,
|
| 522 |
+
terminals,
|
| 523 |
+
})
|
| 524 |
+
}
|
| 525 |
+
|
| 526 |
+
fn source_postorder(source: &SourceGraph) -> Vec<usize> {
|
| 527 |
+
let mut order = Vec::with_capacity(source.nodes.len());
|
| 528 |
+
let mut stack = vec![(0, false)];
|
| 529 |
+
while let Some((node_id, expanded)) = stack.pop() {
|
| 530 |
+
if expanded {
|
| 531 |
+
order.push(node_id);
|
| 532 |
+
} else {
|
| 533 |
+
stack.push((node_id, true));
|
| 534 |
+
for &child_id in source.nodes[node_id].children.iter().rev() {
|
| 535 |
+
stack.push((child_id, false));
|
| 536 |
+
}
|
| 537 |
+
}
|
| 538 |
+
}
|
| 539 |
+
order
|
| 540 |
+
}
|
| 541 |
+
|
| 542 |
+
fn renumber_dag(root_temp_id: usize, temp_nodes: &[TempDagNode]) -> (Vec<usize>, Vec<DagNode>) {
|
| 543 |
+
let mut temp_to_dag = vec![usize::MAX; temp_nodes.len()];
|
| 544 |
+
let mut dag_nodes = Vec::<DagNode>::new();
|
| 545 |
+
let mut stack = vec![root_temp_id];
|
| 546 |
+
|
| 547 |
+
while let Some(temp_id) = stack.pop() {
|
| 548 |
+
if temp_to_dag[temp_id] != usize::MAX {
|
| 549 |
+
continue;
|
| 550 |
+
}
|
| 551 |
+
|
| 552 |
+
let dag_id = dag_nodes.len();
|
| 553 |
+
temp_to_dag[temp_id] = dag_id;
|
| 554 |
+
dag_nodes.push(DagNode {
|
| 555 |
+
id: dag_id,
|
| 556 |
+
terminal: temp_nodes[temp_id].terminal,
|
| 557 |
+
children: temp_nodes[temp_id].children.clone(),
|
| 558 |
+
incoming_count: 0,
|
| 559 |
+
reachable_terminals: 0,
|
| 560 |
+
reachable_weight: 0,
|
| 561 |
+
});
|
| 562 |
+
|
| 563 |
+
for edge in temp_nodes[temp_id].children.iter().rev() {
|
| 564 |
+
stack.push(edge.target);
|
| 565 |
+
}
|
| 566 |
+
}
|
| 567 |
+
|
| 568 |
+
(temp_to_dag, dag_nodes)
|
| 569 |
+
}
|
| 570 |
+
|
| 571 |
+
fn fill_dag_counts(
|
| 572 |
+
source: &SourceGraph,
|
| 573 |
+
source_to_temp_dag: &[usize],
|
| 574 |
+
temp_to_dag: &[usize],
|
| 575 |
+
nodes: &mut [DagNode],
|
| 576 |
+
) {
|
| 577 |
+
let mut terminal_ids_by_node = vec![BTreeSet::<usize>::new(); nodes.len()];
|
| 578 |
+
|
| 579 |
+
let all_edges = nodes
|
| 580 |
+
.iter()
|
| 581 |
+
.flat_map(|node| node.children.iter().map(|edge| edge.target))
|
| 582 |
+
.collect::<Vec<_>>();
|
| 583 |
+
for target in all_edges {
|
| 584 |
+
nodes[target].incoming_count += 1;
|
| 585 |
+
}
|
| 586 |
+
|
| 587 |
+
let source_parents = source_parent_map(source);
|
| 588 |
+
for (terminal_index, terminal) in source.terminals.iter().enumerate() {
|
| 589 |
+
let mut source_node_id = Some(terminal.node_id);
|
| 590 |
+
while let Some(node_id) = source_node_id {
|
| 591 |
+
let temp_id = source_to_temp_dag[node_id];
|
| 592 |
+
let dag_id = temp_to_dag[temp_id];
|
| 593 |
+
terminal_ids_by_node[dag_id].insert(terminal_index);
|
| 594 |
+
source_node_id = source_parents[node_id];
|
| 595 |
+
}
|
| 596 |
+
}
|
| 597 |
+
|
| 598 |
+
for (node_id, terminal_ids) in terminal_ids_by_node.into_iter().enumerate() {
|
| 599 |
+
nodes[node_id].reachable_terminals = terminal_ids.len();
|
| 600 |
+
nodes[node_id].reachable_weight = terminal_ids
|
| 601 |
+
.into_iter()
|
| 602 |
+
.map(|terminal_id| source.terminals[terminal_id].weight)
|
| 603 |
+
.sum();
|
| 604 |
+
}
|
| 605 |
+
}
|
| 606 |
+
|
| 607 |
+
fn source_parent_map(source: &SourceGraph) -> Vec<Option<usize>> {
|
| 608 |
+
let mut parents = vec![None; source.nodes.len()];
|
| 609 |
+
for node in &source.nodes {
|
| 610 |
+
for &child_id in &node.children {
|
| 611 |
+
parents[child_id] = Some(node.id);
|
| 612 |
+
}
|
| 613 |
+
}
|
| 614 |
+
parents
|
| 615 |
+
}
|
| 616 |
+
|
| 617 |
+
fn dag_max_depth(nodes: &[DagNode]) -> usize {
|
| 618 |
+
let mut max_depth = 0;
|
| 619 |
+
let mut stack = vec![(0, 0)];
|
| 620 |
+
while let Some((node_id, depth)) = stack.pop() {
|
| 621 |
+
max_depth = max_depth.max(depth);
|
| 622 |
+
for edge in &nodes[node_id].children {
|
| 623 |
+
stack.push((edge.target, depth + 1));
|
| 624 |
+
}
|
| 625 |
+
}
|
| 626 |
+
max_depth
|
| 627 |
+
}
|
| 628 |
+
|
| 629 |
+
fn ensure_output_parent(output: &Path) -> Result<()> {
|
| 630 |
+
if let Some(parent) = output.parent() {
|
| 631 |
+
if !parent.as_os_str().is_empty() {
|
| 632 |
+
fs::create_dir_all(parent)
|
| 633 |
+
.with_context(|| format!("failed to create {}", parent.display()))?;
|
| 634 |
+
}
|
| 635 |
+
}
|
| 636 |
+
Ok(())
|
| 637 |
+
}
|
| 638 |
+
|
| 639 |
+
fn read_records(input_path: &Path) -> Result<Vec<InputRecord>> {
|
| 640 |
+
let input = File::open(input_path)
|
| 641 |
+
.with_context(|| format!("failed to open {}", input_path.display()))?;
|
| 642 |
+
let reader = BufReader::new(input);
|
| 643 |
+
let mut records = Vec::new();
|
| 644 |
+
|
| 645 |
+
for (line_number, line) in reader.lines().enumerate() {
|
| 646 |
+
let line = line.with_context(|| {
|
| 647 |
+
format!(
|
| 648 |
+
"failed to read line {} from {}",
|
| 649 |
+
line_number + 1,
|
| 650 |
+
input_path.display()
|
| 651 |
+
)
|
| 652 |
+
})?;
|
| 653 |
+
let line = line.trim();
|
| 654 |
+
if line.is_empty() {
|
| 655 |
+
continue;
|
| 656 |
+
}
|
| 657 |
+
|
| 658 |
+
let record = serde_json::from_str(line).with_context(|| {
|
| 659 |
+
format!(
|
| 660 |
+
"failed to parse line {} from {}",
|
| 661 |
+
line_number + 1,
|
| 662 |
+
input_path.display()
|
| 663 |
+
)
|
| 664 |
+
})?;
|
| 665 |
+
records.push(record);
|
| 666 |
+
}
|
| 667 |
+
|
| 668 |
+
Ok(records)
|
| 669 |
+
}
|
| 670 |
+
|
| 671 |
+
fn build_observation(source_index: usize, record: &InputRecord) -> Option<PatternObservation> {
|
| 672 |
+
let (prefix, suffix) = find_episode_prefix(&record.value)?;
|
| 673 |
+
if prefix.is_empty() {
|
| 674 |
+
return None;
|
| 675 |
+
}
|
| 676 |
+
let digit_skeleton = digit_skeleton(&prefix);
|
| 677 |
+
|
| 678 |
+
Some(PatternObservation {
|
| 679 |
+
source_index,
|
| 680 |
+
prefix,
|
| 681 |
+
digit_skeleton,
|
| 682 |
+
suffix,
|
| 683 |
+
value: record.value.clone(),
|
| 684 |
+
uses_path: record.uses_path,
|
| 685 |
+
has_trailing_hash: record.has_trailing_hash,
|
| 686 |
+
count: record.count,
|
| 687 |
+
})
|
| 688 |
+
}
|
| 689 |
+
|
| 690 |
+
fn aggregate_observations(
|
| 691 |
+
observations: &[PatternObservation],
|
| 692 |
+
min_count: usize,
|
| 693 |
+
example_count: usize,
|
| 694 |
+
) -> Vec<GroupBuilder> {
|
| 695 |
+
let mut groups = BTreeMap::<String, GroupBuilder>::new();
|
| 696 |
+
|
| 697 |
+
for observation in observations {
|
| 698 |
+
let group = groups
|
| 699 |
+
.entry(observation.prefix.clone())
|
| 700 |
+
.or_insert_with(|| GroupBuilder {
|
| 701 |
+
prefix: observation.prefix.clone(),
|
| 702 |
+
digit_skeleton: observation.digit_skeleton.clone(),
|
| 703 |
+
..GroupBuilder::default()
|
| 704 |
+
});
|
| 705 |
+
|
| 706 |
+
group.count += 1;
|
| 707 |
+
group.weight += observation.count;
|
| 708 |
+
group.uses_path_count += observation.count * usize::from(observation.uses_path);
|
| 709 |
+
group.has_trailing_hash_count +=
|
| 710 |
+
observation.count * usize::from(observation.has_trailing_hash);
|
| 711 |
+
if !observation.suffix.is_empty() && group.suffix_examples.len() < example_count {
|
| 712 |
+
group.suffix_examples.push(observation.suffix.clone());
|
| 713 |
+
}
|
| 714 |
+
if group.value_examples.len() < example_count {
|
| 715 |
+
group.value_examples.push(observation.value.clone());
|
| 716 |
+
}
|
| 717 |
+
}
|
| 718 |
+
|
| 719 |
+
groups
|
| 720 |
+
.into_values()
|
| 721 |
+
.filter(|group| group.count >= min_count)
|
| 722 |
+
.collect()
|
| 723 |
+
}
|
| 724 |
+
|
| 725 |
+
fn build_trie(groups: &[GroupBuilder]) -> (Vec<OutputNode>, Vec<OutputTerminal>, usize, usize) {
|
| 726 |
+
let mut nodes = vec![TrieNode::root()];
|
| 727 |
+
let mut terminal_node_ids = Vec::with_capacity(groups.len());
|
| 728 |
+
|
| 729 |
+
for group in groups {
|
| 730 |
+
let (terminal_node_id, path) = insert_prefix(&mut nodes, &group.prefix);
|
| 731 |
+
terminal_node_ids.push(terminal_node_id);
|
| 732 |
+
|
| 733 |
+
for id in path {
|
| 734 |
+
nodes[id].subtree_patterns += group.count;
|
| 735 |
+
nodes[id].subtree_weight += group.weight;
|
| 736 |
+
}
|
| 737 |
+
}
|
| 738 |
+
|
| 739 |
+
assign_depths(&mut nodes);
|
| 740 |
+
|
| 741 |
+
let terminals = groups
|
| 742 |
+
.iter()
|
| 743 |
+
.zip(terminal_node_ids)
|
| 744 |
+
.map(|(group, node_id)| OutputTerminal {
|
| 745 |
+
node_id,
|
| 746 |
+
prefix: group.prefix.clone(),
|
| 747 |
+
digit_skeleton: group.digit_skeleton.clone(),
|
| 748 |
+
count: group.count,
|
| 749 |
+
weight: group.weight,
|
| 750 |
+
uses_path_count: group.uses_path_count,
|
| 751 |
+
has_trailing_hash_count: group.has_trailing_hash_count,
|
| 752 |
+
suffix_examples: group.suffix_examples.clone(),
|
| 753 |
+
value_examples: group.value_examples.clone(),
|
| 754 |
+
annotations: TerminalAnnotations::default(),
|
| 755 |
+
})
|
| 756 |
+
.collect::<Vec<_>>();
|
| 757 |
+
|
| 758 |
+
let max_depth = nodes.iter().map(|node| node.depth).max().unwrap_or(0);
|
| 759 |
+
let grouped_weight = groups.iter().map(|group| group.weight).sum();
|
| 760 |
+
let output_nodes = nodes
|
| 761 |
+
.into_iter()
|
| 762 |
+
.map(|node| OutputNode {
|
| 763 |
+
id: node.id,
|
| 764 |
+
edge_label: node.edge_label,
|
| 765 |
+
depth: node.depth,
|
| 766 |
+
children: node.children_by_label.into_values().collect(),
|
| 767 |
+
subtree_patterns: node.subtree_patterns,
|
| 768 |
+
subtree_weight: node.subtree_weight,
|
| 769 |
+
})
|
| 770 |
+
.collect();
|
| 771 |
+
|
| 772 |
+
(output_nodes, terminals, max_depth, grouped_weight)
|
| 773 |
+
}
|
| 774 |
+
|
| 775 |
+
impl TrieNode {
|
| 776 |
+
fn root() -> Self {
|
| 777 |
+
Self {
|
| 778 |
+
id: 0,
|
| 779 |
+
edge_label: String::new(),
|
| 780 |
+
depth: 0,
|
| 781 |
+
parent: None,
|
| 782 |
+
children_by_label: BTreeMap::new(),
|
| 783 |
+
subtree_patterns: 0,
|
| 784 |
+
subtree_weight: 0,
|
| 785 |
+
}
|
| 786 |
+
}
|
| 787 |
+
}
|
| 788 |
+
|
| 789 |
+
fn insert_prefix(nodes: &mut Vec<TrieNode>, prefix: &str) -> (usize, Vec<usize>) {
|
| 790 |
+
let mut current = 0;
|
| 791 |
+
let mut remaining = prefix;
|
| 792 |
+
let mut path = vec![current];
|
| 793 |
+
|
| 794 |
+
loop {
|
| 795 |
+
if remaining.is_empty() {
|
| 796 |
+
return (current, path);
|
| 797 |
+
}
|
| 798 |
+
|
| 799 |
+
let matching_child =
|
| 800 |
+
nodes[current]
|
| 801 |
+
.children_by_label
|
| 802 |
+
.iter()
|
| 803 |
+
.find_map(|(label, &child_id)| {
|
| 804 |
+
let common_len = common_prefix_len(remaining, label);
|
| 805 |
+
(common_len > 0).then(|| (label.clone(), child_id, common_len))
|
| 806 |
+
});
|
| 807 |
+
|
| 808 |
+
let Some((child_label, child_id, common_len)) = matching_child else {
|
| 809 |
+
let child_id = push_node(nodes, current, remaining.to_owned());
|
| 810 |
+
nodes[current]
|
| 811 |
+
.children_by_label
|
| 812 |
+
.insert(remaining.to_owned(), child_id);
|
| 813 |
+
path.push(child_id);
|
| 814 |
+
return (child_id, path);
|
| 815 |
+
};
|
| 816 |
+
|
| 817 |
+
if common_len == child_label.len() {
|
| 818 |
+
current = child_id;
|
| 819 |
+
remaining = &remaining[common_len..];
|
| 820 |
+
path.push(current);
|
| 821 |
+
continue;
|
| 822 |
+
}
|
| 823 |
+
|
| 824 |
+
let shared_label = child_label[..common_len].to_owned();
|
| 825 |
+
let old_suffix = child_label[common_len..].to_owned();
|
| 826 |
+
let new_suffix = remaining[common_len..].to_owned();
|
| 827 |
+
|
| 828 |
+
let split_id = push_node(nodes, current, shared_label.clone());
|
| 829 |
+
nodes[current].children_by_label.remove(&child_label);
|
| 830 |
+
nodes[current]
|
| 831 |
+
.children_by_label
|
| 832 |
+
.insert(shared_label, split_id);
|
| 833 |
+
|
| 834 |
+
nodes[child_id].edge_label = old_suffix.clone();
|
| 835 |
+
nodes[child_id].parent = Some(split_id);
|
| 836 |
+
nodes[split_id].subtree_patterns = nodes[child_id].subtree_patterns;
|
| 837 |
+
nodes[split_id].subtree_weight = nodes[child_id].subtree_weight;
|
| 838 |
+
nodes[split_id]
|
| 839 |
+
.children_by_label
|
| 840 |
+
.insert(old_suffix, child_id);
|
| 841 |
+
|
| 842 |
+
path.push(split_id);
|
| 843 |
+
if new_suffix.is_empty() {
|
| 844 |
+
return (split_id, path);
|
| 845 |
+
}
|
| 846 |
+
|
| 847 |
+
let new_child_id = push_node(nodes, split_id, new_suffix.clone());
|
| 848 |
+
nodes[split_id]
|
| 849 |
+
.children_by_label
|
| 850 |
+
.insert(new_suffix, new_child_id);
|
| 851 |
+
path.push(new_child_id);
|
| 852 |
+
return (new_child_id, path);
|
| 853 |
+
}
|
| 854 |
+
}
|
| 855 |
+
|
| 856 |
+
fn push_node(nodes: &mut Vec<TrieNode>, parent_id: usize, edge_label: String) -> usize {
|
| 857 |
+
let child_id = nodes.len();
|
| 858 |
+
nodes.push(TrieNode {
|
| 859 |
+
id: child_id,
|
| 860 |
+
edge_label,
|
| 861 |
+
depth: nodes[parent_id].depth + 1,
|
| 862 |
+
parent: Some(parent_id),
|
| 863 |
+
children_by_label: BTreeMap::new(),
|
| 864 |
+
subtree_patterns: 0,
|
| 865 |
+
subtree_weight: 0,
|
| 866 |
+
});
|
| 867 |
+
child_id
|
| 868 |
+
}
|
| 869 |
+
|
| 870 |
+
fn common_prefix_len(left: &str, right: &str) -> usize {
|
| 871 |
+
let mut len = 0;
|
| 872 |
+
for ((left_index, left_ch), (_, right_ch)) in left.char_indices().zip(right.char_indices()) {
|
| 873 |
+
if left_ch != right_ch {
|
| 874 |
+
break;
|
| 875 |
+
}
|
| 876 |
+
len = left_index + left_ch.len_utf8();
|
| 877 |
+
}
|
| 878 |
+
len
|
| 879 |
+
}
|
| 880 |
+
|
| 881 |
+
fn assign_depths(nodes: &mut [TrieNode]) {
|
| 882 |
+
let mut stack = vec![(0, 0)];
|
| 883 |
+
while let Some((node_id, depth)) = stack.pop() {
|
| 884 |
+
nodes[node_id].depth = depth;
|
| 885 |
+
let child_ids = nodes[node_id]
|
| 886 |
+
.children_by_label
|
| 887 |
+
.values()
|
| 888 |
+
.copied()
|
| 889 |
+
.collect::<Vec<_>>();
|
| 890 |
+
for child_id in child_ids {
|
| 891 |
+
stack.push((child_id, depth + 1));
|
| 892 |
+
}
|
| 893 |
+
}
|
| 894 |
+
}
|
| 895 |
+
|
| 896 |
+
fn find_episode_prefix(value: &str) -> Option<(String, String)> {
|
| 897 |
+
let best_end = EPISODE_PATTERNS
|
| 898 |
+
.iter()
|
| 899 |
+
.filter_map(|pattern| pattern.find(value).map(|matched| matched.end()))
|
| 900 |
+
.chain(find_delimited_number_episode_end(value))
|
| 901 |
+
.max()?;
|
| 902 |
+
|
| 903 |
+
let prefix = value[..best_end].trim_end().to_owned();
|
| 904 |
+
let suffix = value[best_end..].trim().to_owned();
|
| 905 |
+
Some((prefix, suffix))
|
| 906 |
+
}
|
| 907 |
+
|
| 908 |
+
fn find_delimited_number_episode_end(value: &str) -> Option<usize> {
|
| 909 |
+
let mut digits_start = None;
|
| 910 |
+
let mut digit_count = 0;
|
| 911 |
+
|
| 912 |
+
for (index, ch) in value
|
| 913 |
+
.char_indices()
|
| 914 |
+
.chain(std::iter::once((value.len(), '\0')))
|
| 915 |
+
{
|
| 916 |
+
if ch.is_ascii_digit() {
|
| 917 |
+
if digits_start.is_none() {
|
| 918 |
+
digits_start = Some(index);
|
| 919 |
+
}
|
| 920 |
+
digit_count += 1;
|
| 921 |
+
continue;
|
| 922 |
+
}
|
| 923 |
+
|
| 924 |
+
if let Some(start) = digits_start {
|
| 925 |
+
if (1..=4).contains(&digit_count)
|
| 926 |
+
&& has_episode_left_boundary(value, start)
|
| 927 |
+
&& has_episode_right_boundary(ch)
|
| 928 |
+
{
|
| 929 |
+
return Some(index);
|
| 930 |
+
}
|
| 931 |
+
}
|
| 932 |
+
|
| 933 |
+
digits_start = None;
|
| 934 |
+
digit_count = 0;
|
| 935 |
+
}
|
| 936 |
+
|
| 937 |
+
None
|
| 938 |
+
}
|
| 939 |
+
|
| 940 |
+
fn has_episode_left_boundary(value: &str, digits_start: usize) -> bool {
|
| 941 |
+
if digits_start == 0 {
|
| 942 |
+
return true;
|
| 943 |
+
}
|
| 944 |
+
|
| 945 |
+
value[..digits_start]
|
| 946 |
+
.chars()
|
| 947 |
+
.next_back()
|
| 948 |
+
.is_some_and(|ch| ch.is_whitespace() || matches!(ch, '.' | '_' | '-'))
|
| 949 |
+
}
|
| 950 |
+
|
| 951 |
+
fn has_episode_right_boundary(ch: char) -> bool {
|
| 952 |
+
ch == '\0'
|
| 953 |
+
|| ch.is_whitespace()
|
| 954 |
+
|| matches!(ch, '.' | '_' | '-' | ']' | ')' | '【' | '】' | '[')
|
| 955 |
+
}
|
| 956 |
+
|
| 957 |
+
fn digit_skeleton(text: &str) -> String {
|
| 958 |
+
DIGITS.replace_all(text, "<NUM>").into_owned()
|
| 959 |
+
}
|
| 960 |
+
|
| 961 |
+
fn display_path(path: &Path) -> String {
|
| 962 |
+
path.display().to_string()
|
| 963 |
+
}
|
| 964 |
+
|
| 965 |
+
fn default_count() -> usize {
|
| 966 |
+
1
|
| 967 |
+
}
|
| 968 |
+
|
| 969 |
+
fn default_annotations_value() -> serde_json::Value {
|
| 970 |
+
serde_json::json!({})
|
| 971 |
+
}
|
| 972 |
+
|
| 973 |
+
#[cfg(test)]
|
| 974 |
+
mod tests {
|
| 975 |
+
use super::*;
|
| 976 |
+
|
| 977 |
+
#[test]
|
| 978 |
+
fn dag_merges_equal_suffix_nodes_without_normalizing_digits() {
|
| 979 |
+
let source = SourceGraph {
|
| 980 |
+
meta: serde_json::json!({}),
|
| 981 |
+
nodes: vec![
|
| 982 |
+
SourceNode {
|
| 983 |
+
id: 0,
|
| 984 |
+
edge_label: String::new(),
|
| 985 |
+
depth: 0,
|
| 986 |
+
children: vec![1, 3],
|
| 987 |
+
},
|
| 988 |
+
SourceNode {
|
| 989 |
+
id: 1,
|
| 990 |
+
edge_label: "A01".to_owned(),
|
| 991 |
+
depth: 1,
|
| 992 |
+
children: vec![2],
|
| 993 |
+
},
|
| 994 |
+
SourceNode {
|
| 995 |
+
id: 2,
|
| 996 |
+
edge_label: "X".to_owned(),
|
| 997 |
+
depth: 2,
|
| 998 |
+
children: vec![],
|
| 999 |
+
},
|
| 1000 |
+
SourceNode {
|
| 1001 |
+
id: 3,
|
| 1002 |
+
edge_label: "B02".to_owned(),
|
| 1003 |
+
depth: 1,
|
| 1004 |
+
children: vec![4],
|
| 1005 |
+
},
|
| 1006 |
+
SourceNode {
|
| 1007 |
+
id: 4,
|
| 1008 |
+
edge_label: "X".to_owned(),
|
| 1009 |
+
depth: 2,
|
| 1010 |
+
children: vec![],
|
| 1011 |
+
},
|
| 1012 |
+
],
|
| 1013 |
+
terminals: vec![
|
| 1014 |
+
SourceTerminal {
|
| 1015 |
+
terminal_id: None,
|
| 1016 |
+
node_id: 2,
|
| 1017 |
+
prefix: "A01X".to_owned(),
|
| 1018 |
+
digit_skeleton: "A<NUM>X".to_owned(),
|
| 1019 |
+
count: 1,
|
| 1020 |
+
weight: 1,
|
| 1021 |
+
uses_path_count: 0,
|
| 1022 |
+
has_trailing_hash_count: 0,
|
| 1023 |
+
suffix_examples: vec![],
|
| 1024 |
+
value_examples: vec!["A01X".to_owned()],
|
| 1025 |
+
annotations: serde_json::json!({}),
|
| 1026 |
+
},
|
| 1027 |
+
SourceTerminal {
|
| 1028 |
+
terminal_id: None,
|
| 1029 |
+
node_id: 4,
|
| 1030 |
+
prefix: "B02X".to_owned(),
|
| 1031 |
+
digit_skeleton: "B<NUM>X".to_owned(),
|
| 1032 |
+
count: 1,
|
| 1033 |
+
weight: 1,
|
| 1034 |
+
uses_path_count: 0,
|
| 1035 |
+
has_trailing_hash_count: 0,
|
| 1036 |
+
suffix_examples: vec![],
|
| 1037 |
+
value_examples: vec!["B02X".to_owned()],
|
| 1038 |
+
annotations: serde_json::json!({}),
|
| 1039 |
+
},
|
| 1040 |
+
],
|
| 1041 |
+
};
|
| 1042 |
+
|
| 1043 |
+
let dag = minimize_source_graph(
|
| 1044 |
+
&source,
|
| 1045 |
+
Path::new("dmhy_prefix_graph.json"),
|
| 1046 |
+
Path::new("dmhy_prefix_dag.json"),
|
| 1047 |
+
)
|
| 1048 |
+
.unwrap();
|
| 1049 |
+
|
| 1050 |
+
assert!(dag.meta.preserves_digits);
|
| 1051 |
+
assert!(dag.meta.merged_nodes >= 2);
|
| 1052 |
+
assert_eq!(dag.terminals[0].prefix, "A01X");
|
| 1053 |
+
assert_eq!(dag.terminals[1].prefix, "B02X");
|
| 1054 |
+
assert_eq!(dag.terminals[0].node_id, dag.terminals[1].node_id);
|
| 1055 |
+
|
| 1056 |
+
let shared_parent_targets = dag.nodes[0]
|
| 1057 |
+
.children
|
| 1058 |
+
.iter()
|
| 1059 |
+
.filter(|edge| edge.label == "A01" || edge.label == "B02")
|
| 1060 |
+
.map(|edge| edge.target)
|
| 1061 |
+
.collect::<BTreeSet<_>>();
|
| 1062 |
+
assert_eq!(shared_parent_targets.len(), 1);
|
| 1063 |
+
|
| 1064 |
+
let shared_parent = shared_parent_targets.into_iter().next().unwrap();
|
| 1065 |
+
assert_eq!(dag.nodes[shared_parent].children.len(), 1);
|
| 1066 |
+
assert_eq!(dag.nodes[shared_parent].children[0].label, "X");
|
| 1067 |
+
assert_eq!(dag.nodes[shared_parent].reachable_terminals, 2);
|
| 1068 |
+
assert_eq!(dag.nodes[shared_parent].reachable_weight, 2);
|
| 1069 |
+
}
|
| 1070 |
+
}
|
tools/test_annotated_dmhy_workflow.py
ADDED
|
@@ -0,0 +1,445 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Smoke tests for annotated DMHY graph dataset helpers."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import tempfile
|
| 6 |
+
import json
|
| 7 |
+
import subprocess
|
| 8 |
+
import sys
|
| 9 |
+
import unittest
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
from tools.annotate_dmhy_prefix_graph import normalize_generated_tokens
|
| 13 |
+
from tools.convert_annotated_dmhy_dataset import (
|
| 14 |
+
iter_validated_jsonl,
|
| 15 |
+
validate_record,
|
| 16 |
+
)
|
| 17 |
+
from tools.convert_to_char_dataset import convert_record
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class AnnotatedDmhyWorkflowTests(unittest.TestCase):
|
| 21 |
+
def test_generated_tokens_split_punctuation_and_use_b_only_labels(self) -> None:
|
| 22 |
+
tokens, labels = normalize_generated_tokens(
|
| 23 |
+
["[ANi]", " ", "Title-Name", "07"],
|
| 24 |
+
["B-GROUP", "O", "I-TITLE", "B-EPISODE"],
|
| 25 |
+
)
|
| 26 |
+
self.assertEqual(tokens, ["[", "ANi", "]", " ", "Title", "-", "Name", "07"])
|
| 27 |
+
self.assertEqual(
|
| 28 |
+
labels,
|
| 29 |
+
["O", "B-GROUP", "O", "O", "B-TITLE", "O", "B-TITLE", "B-EPISODE"],
|
| 30 |
+
)
|
| 31 |
+
self.assertTrue(all(label == "O" or label.startswith("B-") for label in labels))
|
| 32 |
+
|
| 33 |
+
def test_preserve_i_labels_keeps_i_on_non_separator_pieces(self) -> None:
|
| 34 |
+
tokens, labels = normalize_generated_tokens(
|
| 35 |
+
["Title-Name"],
|
| 36 |
+
["I-TITLE"],
|
| 37 |
+
preserve_i_labels=True,
|
| 38 |
+
)
|
| 39 |
+
self.assertEqual(tokens, ["Title", "-", "Name"])
|
| 40 |
+
self.assertEqual(labels, ["I-TITLE", "O", "I-TITLE"])
|
| 41 |
+
|
| 42 |
+
def test_validation_rejects_embedded_punctuation(self) -> None:
|
| 43 |
+
record = {
|
| 44 |
+
"filename": "Title-Name 07",
|
| 45 |
+
"tokens": ["Title-Name", "07"],
|
| 46 |
+
"labels": ["B-TITLE", "B-EPISODE"],
|
| 47 |
+
}
|
| 48 |
+
with self.assertRaisesRegex(ValueError, "contains punctuation"):
|
| 49 |
+
validate_record(record, Path("sample.jsonl"), 1)
|
| 50 |
+
|
| 51 |
+
def test_validation_rejects_embedded_symbol_separator(self) -> None:
|
| 52 |
+
record = {
|
| 53 |
+
"filename": "Title 1920×1080 07",
|
| 54 |
+
"tokens": ["Title", "1920×1080", "07"],
|
| 55 |
+
"labels": ["B-TITLE", "B-RESOLUTION", "B-EPISODE"],
|
| 56 |
+
}
|
| 57 |
+
with self.assertRaisesRegex(ValueError, "contains punctuation"):
|
| 58 |
+
validate_record(record, Path("sample.jsonl"), 1)
|
| 59 |
+
|
| 60 |
+
def test_b_only_input_converts_to_char_i_labels(self) -> None:
|
| 61 |
+
record = {
|
| 62 |
+
"filename": "Title-Name 07",
|
| 63 |
+
"tokens": ["Title", "-", "Name", " ", "07"],
|
| 64 |
+
"labels": ["B-TITLE", "O", "B-TITLE", "O", "B-EPISODE"],
|
| 65 |
+
}
|
| 66 |
+
validate_record(record, Path("sample.jsonl"), 1)
|
| 67 |
+
converted = convert_record(record)
|
| 68 |
+
self.assertIn("I-TITLE", converted["labels"])
|
| 69 |
+
self.assertEqual(converted["tokens"][:5], ["T", "i", "t", "l", "e"])
|
| 70 |
+
|
| 71 |
+
def test_iter_validated_jsonl_accepts_generated_shape(self) -> None:
|
| 72 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 73 |
+
path = Path(tmpdir) / "records.jsonl"
|
| 74 |
+
path.write_text(
|
| 75 |
+
'{"filename":"A 01","tokens":["A"," ","01"],"labels":["B-TITLE","O","B-EPISODE"]}\n',
|
| 76 |
+
encoding="utf-8",
|
| 77 |
+
)
|
| 78 |
+
rows = list(iter_validated_jsonl(path))
|
| 79 |
+
self.assertEqual(len(rows), 1)
|
| 80 |
+
self.assertEqual(rows[0]["filename"], "A 01")
|
| 81 |
+
|
| 82 |
+
def test_cli_smoke_annotate_then_convert_with_temp_files(self) -> None:
|
| 83 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 84 |
+
tmp = Path(tmpdir)
|
| 85 |
+
graph_path = tmp / "graph.json"
|
| 86 |
+
dataset_path = tmp / "dmhy_weak.generated.jsonl"
|
| 87 |
+
char_path = tmp / "dmhy_weak.generated_char.jsonl"
|
| 88 |
+
vocab_path = tmp / "vocab.generated.char.json"
|
| 89 |
+
manifest_path = tmp / "manifest.json"
|
| 90 |
+
graph_path.write_text(
|
| 91 |
+
json.dumps(
|
| 92 |
+
{
|
| 93 |
+
"terminals": [
|
| 94 |
+
{
|
| 95 |
+
"terminal_id": "t0",
|
| 96 |
+
"weight": 1,
|
| 97 |
+
"value_examples": [
|
| 98 |
+
"[ANi] Test Show - 01 [1080P][WEB-DL].mkv"
|
| 99 |
+
],
|
| 100 |
+
"suffix_examples": [" [1080P][WEB-DL]"],
|
| 101 |
+
}
|
| 102 |
+
]
|
| 103 |
+
},
|
| 104 |
+
ensure_ascii=False,
|
| 105 |
+
),
|
| 106 |
+
encoding="utf-8",
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
annotate = subprocess.run(
|
| 110 |
+
[
|
| 111 |
+
sys.executable,
|
| 112 |
+
"-m",
|
| 113 |
+
"tools.annotate_dmhy_prefix_graph",
|
| 114 |
+
"--graph",
|
| 115 |
+
str(graph_path),
|
| 116 |
+
"--output",
|
| 117 |
+
str(dataset_path),
|
| 118 |
+
"--patch-output",
|
| 119 |
+
"",
|
| 120 |
+
"--examples-only",
|
| 121 |
+
],
|
| 122 |
+
check=False,
|
| 123 |
+
capture_output=True,
|
| 124 |
+
text=True,
|
| 125 |
+
)
|
| 126 |
+
self.assertEqual(annotate.returncode, 0, annotate.stderr)
|
| 127 |
+
|
| 128 |
+
rows = [
|
| 129 |
+
json.loads(line)
|
| 130 |
+
for line in dataset_path.read_text(encoding="utf-8").splitlines()
|
| 131 |
+
if line.strip()
|
| 132 |
+
]
|
| 133 |
+
self.assertEqual(len(rows), 1)
|
| 134 |
+
self.assertIn("annotations", rows[0])
|
| 135 |
+
self.assertEqual(rows[0]["tokens"][0], "[")
|
| 136 |
+
self.assertEqual(rows[0]["labels"][0], "O")
|
| 137 |
+
|
| 138 |
+
convert = subprocess.run(
|
| 139 |
+
[
|
| 140 |
+
sys.executable,
|
| 141 |
+
"-m",
|
| 142 |
+
"tools.convert_annotated_dmhy_dataset",
|
| 143 |
+
"--input",
|
| 144 |
+
str(dataset_path),
|
| 145 |
+
"--output",
|
| 146 |
+
str(char_path),
|
| 147 |
+
"--vocab-output",
|
| 148 |
+
str(vocab_path),
|
| 149 |
+
"--manifest-output",
|
| 150 |
+
str(manifest_path),
|
| 151 |
+
"--progress",
|
| 152 |
+
"0",
|
| 153 |
+
],
|
| 154 |
+
check=False,
|
| 155 |
+
capture_output=True,
|
| 156 |
+
text=True,
|
| 157 |
+
)
|
| 158 |
+
self.assertEqual(convert.returncode, 0, convert.stderr)
|
| 159 |
+
self.assertTrue(char_path.exists())
|
| 160 |
+
self.assertTrue(vocab_path.exists())
|
| 161 |
+
self.assertTrue(manifest_path.exists())
|
| 162 |
+
|
| 163 |
+
def test_cli_source_list_mode_expands_beyond_value_examples(self) -> None:
|
| 164 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 165 |
+
tmp = Path(tmpdir)
|
| 166 |
+
graph_path = tmp / "graph.json"
|
| 167 |
+
source_path = tmp / "dmhy_list.jsonl"
|
| 168 |
+
dataset_path = tmp / "dmhy_weak.generated.jsonl"
|
| 169 |
+
graph_path.write_text(
|
| 170 |
+
json.dumps(
|
| 171 |
+
{
|
| 172 |
+
"terminals": [
|
| 173 |
+
{
|
| 174 |
+
"terminal_id": "t0",
|
| 175 |
+
"prefix": "[ANi] Full Show - ",
|
| 176 |
+
"weight": 10,
|
| 177 |
+
"value_examples": [
|
| 178 |
+
"[ANi] Full Show - 01 [1080P][WEB-DL].mkv"
|
| 179 |
+
],
|
| 180 |
+
"suffix_examples": ["01 [1080P][WEB-DL]"],
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"terminal_id": "t1",
|
| 184 |
+
"prefix": "[ANi] Other Show - ",
|
| 185 |
+
"weight": 10,
|
| 186 |
+
"value_examples": [
|
| 187 |
+
"[ANi] Other Show - 01 [1080P][WEB-DL].mkv"
|
| 188 |
+
],
|
| 189 |
+
"suffix_examples": ["01 [1080P][WEB-DL]"],
|
| 190 |
+
},
|
| 191 |
+
]
|
| 192 |
+
},
|
| 193 |
+
ensure_ascii=False,
|
| 194 |
+
),
|
| 195 |
+
encoding="utf-8",
|
| 196 |
+
)
|
| 197 |
+
source_path.write_text(
|
| 198 |
+
"\n".join(
|
| 199 |
+
json.dumps({"value": value}, ensure_ascii=False)
|
| 200 |
+
for value in [
|
| 201 |
+
"[ANi] Full Show - 01 [1080P][WEB-DL].mkv",
|
| 202 |
+
"[ANi] Full Show - 02 [1080P][WEB-DL].mkv",
|
| 203 |
+
"[ANi] Full Show - 03 [1080P][WEB-DL].mkv",
|
| 204 |
+
"[ANi] Other Show - 01 [1080P][WEB-DL].mkv",
|
| 205 |
+
]
|
| 206 |
+
)
|
| 207 |
+
+ "\n",
|
| 208 |
+
encoding="utf-8",
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
annotate = subprocess.run(
|
| 212 |
+
[
|
| 213 |
+
sys.executable,
|
| 214 |
+
"-m",
|
| 215 |
+
"tools.annotate_dmhy_prefix_graph",
|
| 216 |
+
"--graph",
|
| 217 |
+
str(graph_path),
|
| 218 |
+
"--source-list",
|
| 219 |
+
str(source_path),
|
| 220 |
+
"--output",
|
| 221 |
+
str(dataset_path),
|
| 222 |
+
"--patch-output",
|
| 223 |
+
"",
|
| 224 |
+
"--limit",
|
| 225 |
+
"1",
|
| 226 |
+
],
|
| 227 |
+
check=False,
|
| 228 |
+
capture_output=True,
|
| 229 |
+
text=True,
|
| 230 |
+
)
|
| 231 |
+
self.assertEqual(annotate.returncode, 0, annotate.stderr)
|
| 232 |
+
rows = [
|
| 233 |
+
json.loads(line)
|
| 234 |
+
for line in dataset_path.read_text(encoding="utf-8").splitlines()
|
| 235 |
+
if line.strip()
|
| 236 |
+
]
|
| 237 |
+
self.assertEqual(len(rows), 3)
|
| 238 |
+
self.assertEqual([row["filename"] for row in rows], [
|
| 239 |
+
"[ANi] Full Show - 01 [1080P][WEB-DL].mkv",
|
| 240 |
+
"[ANi] Full Show - 02 [1080P][WEB-DL].mkv",
|
| 241 |
+
"[ANi] Full Show - 03 [1080P][WEB-DL].mkv",
|
| 242 |
+
])
|
| 243 |
+
self.assertTrue(all(row["terminal_id"] == "t0" for row in rows))
|
| 244 |
+
|
| 245 |
+
def test_cli_examples_only_uses_terminal_value_examples(self) -> None:
|
| 246 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 247 |
+
tmp = Path(tmpdir)
|
| 248 |
+
graph_path = tmp / "graph.json"
|
| 249 |
+
source_path = tmp / "dmhy_list.jsonl"
|
| 250 |
+
dataset_path = tmp / "dmhy_weak.generated.jsonl"
|
| 251 |
+
graph_path.write_text(
|
| 252 |
+
json.dumps(
|
| 253 |
+
{
|
| 254 |
+
"terminals": [
|
| 255 |
+
{
|
| 256 |
+
"terminal_id": "t0",
|
| 257 |
+
"prefix": "[ANi] Example Show - ",
|
| 258 |
+
"weight": 10,
|
| 259 |
+
"value_examples": [
|
| 260 |
+
"[ANi] Example Show - 01 [1080P][WEB-DL].mkv"
|
| 261 |
+
],
|
| 262 |
+
"suffix_examples": ["01 [1080P][WEB-DL]"],
|
| 263 |
+
}
|
| 264 |
+
]
|
| 265 |
+
},
|
| 266 |
+
ensure_ascii=False,
|
| 267 |
+
),
|
| 268 |
+
encoding="utf-8",
|
| 269 |
+
)
|
| 270 |
+
source_path.write_text(
|
| 271 |
+
"\n".join(
|
| 272 |
+
json.dumps({"value": value}, ensure_ascii=False)
|
| 273 |
+
for value in [
|
| 274 |
+
"[ANi] Example Show - 01 [1080P][WEB-DL].mkv",
|
| 275 |
+
"[ANi] Example Show - 02 [1080P][WEB-DL].mkv",
|
| 276 |
+
]
|
| 277 |
+
)
|
| 278 |
+
+ "\n",
|
| 279 |
+
encoding="utf-8",
|
| 280 |
+
)
|
| 281 |
+
|
| 282 |
+
annotate = subprocess.run(
|
| 283 |
+
[
|
| 284 |
+
sys.executable,
|
| 285 |
+
"-m",
|
| 286 |
+
"tools.annotate_dmhy_prefix_graph",
|
| 287 |
+
"--graph",
|
| 288 |
+
str(graph_path),
|
| 289 |
+
"--source-list",
|
| 290 |
+
str(source_path),
|
| 291 |
+
"--output",
|
| 292 |
+
str(dataset_path),
|
| 293 |
+
"--patch-output",
|
| 294 |
+
"",
|
| 295 |
+
"--examples-only",
|
| 296 |
+
],
|
| 297 |
+
check=False,
|
| 298 |
+
capture_output=True,
|
| 299 |
+
text=True,
|
| 300 |
+
)
|
| 301 |
+
self.assertEqual(annotate.returncode, 0, annotate.stderr)
|
| 302 |
+
rows = [
|
| 303 |
+
json.loads(line)
|
| 304 |
+
for line in dataset_path.read_text(encoding="utf-8").splitlines()
|
| 305 |
+
if line.strip()
|
| 306 |
+
]
|
| 307 |
+
self.assertEqual(len(rows), 1)
|
| 308 |
+
self.assertEqual(rows[0]["filename"], "[ANi] Example Show - 01 [1080P][WEB-DL].mkv")
|
| 309 |
+
|
| 310 |
+
def test_cli_dag_annotation_units_include_shared_node_terminals(self) -> None:
|
| 311 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 312 |
+
tmp = Path(tmpdir)
|
| 313 |
+
dag_path = tmp / "dmhy_prefix_dag.json"
|
| 314 |
+
output_path = tmp / "dmhy_prefix_dag.annotation_units.jsonl"
|
| 315 |
+
dag_path.write_text(
|
| 316 |
+
json.dumps(
|
| 317 |
+
{
|
| 318 |
+
"meta": {"version": "prefix-dag-v1"},
|
| 319 |
+
"root": 0,
|
| 320 |
+
"nodes": [
|
| 321 |
+
{
|
| 322 |
+
"id": 0,
|
| 323 |
+
"terminal": False,
|
| 324 |
+
"children": [
|
| 325 |
+
{"label": "A", "target": 1},
|
| 326 |
+
{"label": "B", "target": 2},
|
| 327 |
+
],
|
| 328 |
+
"incoming_count": 0,
|
| 329 |
+
"reachable_terminals": 2,
|
| 330 |
+
"reachable_weight": 20,
|
| 331 |
+
},
|
| 332 |
+
{
|
| 333 |
+
"id": 1,
|
| 334 |
+
"terminal": False,
|
| 335 |
+
"children": [{"label": " shared", "target": 3}],
|
| 336 |
+
"incoming_count": 1,
|
| 337 |
+
"reachable_terminals": 1,
|
| 338 |
+
"reachable_weight": 10,
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"id": 2,
|
| 342 |
+
"terminal": False,
|
| 343 |
+
"children": [{"label": " shared", "target": 3}],
|
| 344 |
+
"incoming_count": 1,
|
| 345 |
+
"reachable_terminals": 1,
|
| 346 |
+
"reachable_weight": 10,
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"id": 3,
|
| 350 |
+
"terminal": False,
|
| 351 |
+
"children": [
|
| 352 |
+
{"label": " 01", "target": 4},
|
| 353 |
+
{"label": " 02", "target": 5},
|
| 354 |
+
],
|
| 355 |
+
"incoming_count": 2,
|
| 356 |
+
"reachable_terminals": 2,
|
| 357 |
+
"reachable_weight": 20,
|
| 358 |
+
},
|
| 359 |
+
{
|
| 360 |
+
"id": 4,
|
| 361 |
+
"terminal": True,
|
| 362 |
+
"children": [],
|
| 363 |
+
"incoming_count": 1,
|
| 364 |
+
"reachable_terminals": 1,
|
| 365 |
+
"reachable_weight": 10,
|
| 366 |
+
},
|
| 367 |
+
{
|
| 368 |
+
"id": 5,
|
| 369 |
+
"terminal": True,
|
| 370 |
+
"children": [],
|
| 371 |
+
"incoming_count": 1,
|
| 372 |
+
"reachable_terminals": 1,
|
| 373 |
+
"reachable_weight": 10,
|
| 374 |
+
},
|
| 375 |
+
],
|
| 376 |
+
"terminals": [
|
| 377 |
+
{
|
| 378 |
+
"terminal_id": "t0",
|
| 379 |
+
"node_id": 4,
|
| 380 |
+
"prefix": "Show A shared 01",
|
| 381 |
+
"digit_skeleton": "Show A shared <NUM>",
|
| 382 |
+
"count": 10,
|
| 383 |
+
"weight": 10,
|
| 384 |
+
"suffix_examples": [" [1080P][WEB-DL]"],
|
| 385 |
+
"value_examples": ["Show A shared 01 [1080P][WEB-DL].mkv"],
|
| 386 |
+
"annotations": {},
|
| 387 |
+
},
|
| 388 |
+
{
|
| 389 |
+
"terminal_id": "t1",
|
| 390 |
+
"node_id": 5,
|
| 391 |
+
"prefix": "Show B shared 02",
|
| 392 |
+
"digit_skeleton": "Show B shared <NUM>",
|
| 393 |
+
"count": 10,
|
| 394 |
+
"weight": 10,
|
| 395 |
+
"suffix_examples": [" [1080P][WEB-DL]"],
|
| 396 |
+
"value_examples": ["Show B shared 02 [1080P][WEB-DL].mkv"],
|
| 397 |
+
"annotations": {},
|
| 398 |
+
},
|
| 399 |
+
],
|
| 400 |
+
},
|
| 401 |
+
ensure_ascii=False,
|
| 402 |
+
),
|
| 403 |
+
encoding="utf-8",
|
| 404 |
+
)
|
| 405 |
+
|
| 406 |
+
annotate = subprocess.run(
|
| 407 |
+
[
|
| 408 |
+
sys.executable,
|
| 409 |
+
"-m",
|
| 410 |
+
"tools.annotate_dmhy_prefix_dag",
|
| 411 |
+
"--dag",
|
| 412 |
+
str(dag_path),
|
| 413 |
+
"--output",
|
| 414 |
+
str(output_path),
|
| 415 |
+
"--min-reachable-terminals",
|
| 416 |
+
"2",
|
| 417 |
+
"--min-incoming-count",
|
| 418 |
+
"2",
|
| 419 |
+
"--limit",
|
| 420 |
+
"1",
|
| 421 |
+
],
|
| 422 |
+
check=False,
|
| 423 |
+
capture_output=True,
|
| 424 |
+
text=True,
|
| 425 |
+
)
|
| 426 |
+
self.assertEqual(annotate.returncode, 0, annotate.stderr)
|
| 427 |
+
rows = [
|
| 428 |
+
json.loads(line)
|
| 429 |
+
for line in output_path.read_text(encoding="utf-8").splitlines()
|
| 430 |
+
if line.strip()
|
| 431 |
+
]
|
| 432 |
+
self.assertEqual(len(rows), 1)
|
| 433 |
+
self.assertEqual(rows[0]["unit_id"], "dag-node-3")
|
| 434 |
+
self.assertEqual(rows[0]["kind"], "shared_suffix")
|
| 435 |
+
self.assertEqual(rows[0]["terminal_ids"], ["t0", "t1"])
|
| 436 |
+
self.assertEqual(
|
| 437 |
+
rows[0]["prefix_examples"],
|
| 438 |
+
["Show A shared 01", "Show B shared 02"],
|
| 439 |
+
)
|
| 440 |
+
self.assertEqual(rows[0]["common_edge_labels"], [" 01", " 02"])
|
| 441 |
+
self.assertIn("annotations", rows[0])
|
| 442 |
+
|
| 443 |
+
|
| 444 |
+
if __name__ == "__main__":
|
| 445 |
+
unittest.main()
|