Add robust LLM relabel pipeline and enforce contiguous title

Browse files

Files changed (3) hide show

datasets/AnimeName +1 -1
tools/enforce_contiguous_title.py +176 -0
tools/llm_relabel_rows.py +159 -25

datasets/AnimeName CHANGED Viewed

	@@ -1 +1 @@
1	- Subproject commit ~~5de6ddeed7dafd43207953072a9e197f13b32077~~


1	+ Subproject commit ad48d8da74cf8e611a14f22ffc2a9734872e1f03

tools/enforce_contiguous_title.py ADDED Viewed

	@@ -0,0 +1,176 @@

+#!/usr/bin/env python3
+"""
+Enforce a single contiguous TITLE span for every JSONL row.
+This script is deterministic and streaming-friendly for very large datasets.
+It is intended as a hard safety pass before/alongside LLM relabeling.
+"""
+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+from typing import Dict, List, Sequence, Tuple
+from anifilebert.label_repairs import repair_jsonl_item
+def parse_args() -> argparse.Namespace:
+    """Parse the command line: required input/output JSONL paths plus optional knobs."""
+    cli = argparse.ArgumentParser(description="Force contiguous TITLE spans in JSONL labels")
+    # The two mandatory paths share the same shape, so register them in one pass.
+    for flag, help_text in (("--input", "Input JSONL"), ("--output", "Output JSONL")):
+        cli.add_argument(flag, required=True, help=help_text)
+    cli.add_argument("--manifest-output", default="", help="Optional manifest JSON")
+    cli.add_argument("--progress", type=int, default=50000, help="Progress print interval")
+    return cli.parse_args()
+def normalize_iob2(labels: Sequence[str]) -> List[str]:
+    """Rewrite *labels* into strict IOB2 form.
+
+    Anything that is not a ``B-``/``I-`` string with a non-empty entity name
+    becomes ``"O"``. A tagged token gets ``I-`` when the token immediately
+    before it carries the same entity, otherwise ``B-``.
+    """
+    out: List[str] = []
+    # None (not "") marks "no open entity", so a malformed tag with an empty
+    # entity name such as "B-" can never be mistaken for a continuation.
+    prev = None
+    for lb in labels:
+        entity = lb[2:] if isinstance(lb, str) and lb.startswith(("B-", "I-")) else ""
+        if not entity:
+            out.append("O")
+            prev = None
+            continue
+        prefix = "I" if prev == entity else "B"
+        out.append(f"{prefix}-{entity}")
+        prev = entity
+    return out
+def is_discontinuous_title(labels: Sequence[str]) -> bool:
+    """Return True when TITLE-suffixed labels form more than one run.
+
+    Non-string entries count as non-TITLE, so they break a run.
+    """
+    run_count = 0
+    inside_run = False
+    for lb in labels:
+        hit = isinstance(lb, str) and lb.endswith("TITLE")
+        if hit and not inside_run:
+            # A new run begins here; a second one proves the title is split.
+            run_count += 1
+            if run_count > 1:
+                return True
+        inside_run = hit
+    return False
+def title_segments(labels: Sequence[str]) -> List[Tuple[int, int]]:
+    """List every maximal run of TITLE-suffixed labels as half-open ``(start, end)`` pairs."""
+    spans: List[Tuple[int, int]] = []
+    run_start = -1  # -1 means no run is currently open
+    for pos, lb in enumerate(labels):
+        if str(lb).endswith("TITLE"):
+            if run_start < 0:
+                run_start = pos
+        elif run_start >= 0:
+            spans.append((run_start, pos))
+            run_start = -1
+    if run_start >= 0:
+        # The sequence ended while still inside a run.
+        spans.append((run_start, len(labels)))
+    return spans
+def first_episode_or_special_index(labels: Sequence[str]) -> int:
+    """Index of the first EPISODE- or SPECIAL-suffixed label, or ``len(labels)`` if none."""
+    boundary_suffixes = ("EPISODE", "SPECIAL")
+    return next(
+        (pos for pos, lb in enumerate(labels) if str(lb).endswith(boundary_suffixes)),
+        len(labels),
+    )
+def pick_primary_title_segment(labels: Sequence[str], segs: Sequence[Tuple[int, int]]) -> Tuple[int, int]:
+    """Choose the TITLE span to keep, or ``(-1, -1)`` when *segs* is empty.
+
+    Spans starting before the first episode/special label are preferred and the
+    earliest of them wins; with none before that boundary, the earliest span
+    overall is returned.
+    NOTE(review): whenever any span starts before the boundary, the earliest
+    span overall does too, so both paths select the earliest span.
+    """
+    if not segs:
+        return (-1, -1)
+    boundary = first_episode_or_special_index(labels)
+    # Fall back to every span when nothing precedes the boundary.
+    candidates = [span for span in segs if span[0] < boundary] or segs
+    return min(candidates, key=lambda span: span[0])
+def enforce_contiguous_title(labels: Sequence[str]) -> List[str]:
+    """Return IOB2-normalized labels with at most one TITLE span.
+
+    When several TITLE spans exist, the one chosen by
+    ``pick_primary_title_segment`` is kept and every other TITLE label is
+    replaced by ``"O"``; the result is normalized again.
+    """
+    normalized = normalize_iob2(labels)
+    spans = title_segments(normalized)
+    if len(spans) <= 1:
+        return normalized
+    keep_start, keep_end = pick_primary_title_segment(normalized, spans)
+    if keep_start < 0:
+        return normalized
+    pruned = [
+        "O" if str(lb).endswith("TITLE") and not (keep_start <= pos < keep_end) else lb
+        for pos, lb in enumerate(normalized)
+    ]
+    return normalize_iob2(pruned)
+def main() -> None:
+    """Stream the input JSONL, force one contiguous TITLE span per row, and write a manifest.
+
+    Each non-blank line is parsed as one record. Records whose ``tokens`` and
+    ``labels`` are equal-length lists get their labels rewritten by
+    ``enforce_contiguous_title`` and then passed through ``repair_jsonl_item``.
+    Any other row (unparseable JSON, a non-object value, or mismatched
+    tokens/labels) is counted as invalid and copied through, so one bad line
+    cannot abort a long run. Output goes to a ``.tmp`` sibling that replaces
+    the real output only after the whole input has been processed; the temp
+    file is removed if the run fails.
+    """
+    args = parse_args()
+    input_path = Path(args.input)
+    output_path = Path(args.output)
+    manifest_path = Path(args.manifest_output) if args.manifest_output else output_path.with_suffix(".contiguous_title.manifest.json")
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    manifest_path.parent.mkdir(parents=True, exist_ok=True)
+    rows = 0
+    changed_rows = 0
+    bad_before = 0
+    bad_after = 0
+    invalid_rows = 0
+    tmp_path = output_path.with_suffix(output_path.suffix + ".tmp")
+    try:
+        with input_path.open("r", encoding="utf-8") as src, tmp_path.open("w", encoding="utf-8", newline="\n") as dst:
+            for line in src:
+                line = line.rstrip("\n")
+                # Whitespace-only lines are treated like empty ones instead of reaching json.loads.
+                if not line.strip():
+                    continue
+                rows += 1
+                try:
+                    rec = json.loads(line)
+                except json.JSONDecodeError:
+                    rec = None
+                if not isinstance(rec, dict):
+                    # Unparseable or non-object row: keep the raw text and carry on.
+                    invalid_rows += 1
+                    dst.write(line + "\n")
+                    continue
+                tokens = rec.get("tokens", [])
+                labels = rec.get("labels", [])
+                if not isinstance(tokens, list) or not isinstance(labels, list) or len(tokens) != len(labels):
+                    invalid_rows += 1
+                    dst.write(json.dumps(rec, ensure_ascii=False, separators=(",", ":")) + "\n")
+                    continue
+                if is_discontinuous_title(labels):
+                    bad_before += 1
+                new_labels = enforce_contiguous_title(labels)
+                out_rec: Dict = dict(rec)
+                out_rec["labels"] = new_labels
+                repaired, _ = repair_jsonl_item(out_rec)
+                out_labels = repaired.get("labels", new_labels)
+                # Re-check after the repair pass, since it may have touched the labels again.
+                if is_discontinuous_title(out_labels):
+                    bad_after += 1
+                if out_labels != labels:
+                    changed_rows += 1
+                repaired["labels"] = out_labels
+                dst.write(json.dumps(repaired, ensure_ascii=False, separators=(",", ":")) + "\n")
+                if args.progress > 0 and rows % args.progress == 0:
+                    print(
+                        f"rows={rows} changed={changed_rows} "
+                        f"bad_before={bad_before} bad_after={bad_after} invalid={invalid_rows}"
+                    )
+    except BaseException:
+        # Do not leave a half-written temp file behind on failure or interrupt.
+        tmp_path.unlink(missing_ok=True)
+        raise
+    tmp_path.replace(output_path)
+    manifest = {
+        "input": str(input_path),
+        "output": str(output_path),
+        "rows": rows,
+        "changed_rows": changed_rows,
+        "discontinuous_before": bad_before,
+        "discontinuous_after": bad_after,
+        "invalid_rows": invalid_rows,
+    }
+    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")
+    print(json.dumps(manifest, ensure_ascii=False, indent=2))
+if __name__ == "__main__":
+    main()

tools/llm_relabel_rows.py CHANGED Viewed

@@ -20,6 +20,7 @@ from pathlib import Path
 from typing import Any, Dict, List, Sequence
 import requests
 ALLOWED_LABELS = {
@@ -151,6 +152,7 @@ def parse_args() -> argparse.Namespace:
     p.add_argument("--checkpoint-rows", type=int, default=100, help="Write checkpoint every N processed rows")
     p.add_argument("--failure-log", default="reports/llm_relabel_failures.log", help="Failure log path")
     p.add_argument("--perf-log", default="", help="Optional JSON perf summary path")
     p.add_argument("--usd-per-1m-input", type=float, default=0.75, help="Input token price (USD per 1M tokens)")
     p.add_argument("--usd-per-1m-output", type=float, default=4.5, help="Output token price (USD per 1M tokens)")
     p.add_argument(
@@ -244,6 +246,64 @@ def validate_labels(tokens: Sequence[str], labels: Sequence[str]) -> bool:
     return True
 def response_schema() -> Dict[str, Any]:
     return {
         "type": "object",
@@ -276,6 +336,44 @@ def append_failure_log(path: str, message: str) -> None:
         f.write(message.rstrip() + "\n")
 def parse_usage(response_obj: Dict[str, Any]) -> UsageStats:
     usage = response_obj.get("usage", {}) or {}
     in_details = usage.get("input_tokens_details", {}) or {}
@@ -299,6 +397,7 @@ def relabel_batch(
     user_agent: str,
     retries: int,
     failure_log: str,
 ) -> tuple[Dict[int, List[str]], UsageStats]:
     url = f"{api_base.rstrip('/')}/responses"
     headers = {
@@ -308,29 +407,30 @@ def relabel_batch(
     }
     user_payload = build_user_payload(batch_rows)
-    body = {
-        "model": model,
-        "instructions": SYSTEM_INSTRUCTIONS,
-        "input": user_payload,
-        "prompt_cache_key": prompt_cache_key,
-        "prompt_cache_retention": prompt_cache_retention,
-        "reasoning": {"effort": reasoning_effort},
-        "tools": [
-            {
-                "type": "function",
-                "name": "submit_labels",
-                "description": "Submit relabeled BIO labels.",
-                "parameters": response_schema(),
-                "strict": True,
-            }
-        ],
-        "tool_choice": {"type": "function", "name": "submit_labels"},
     }
     last_error: Exception | None = None
     for attempt in range(1, retries + 1):
         try:
-            resp = requests.post(url, headers=headers, json=body, timeout=120)
             resp.raise_for_status()
             obj = resp.json()
             usage_stats = parse_usage(obj)
@@ -378,12 +478,25 @@ def relabel_batch(
             return mapping, usage_stats
         except Exception as exc:  # noqa: BLE001
             last_error = exc
-            # Some compatible gateways may not support prompt caching or reasoning fields.
             if isinstance(exc, requests.HTTPError) and exc.response is not None and exc.response.status_code == 400:
-                body.pop("prompt_cache_retention", None)
-                body.pop("reasoning", None)
-                body.pop("tools", None)
-                body.pop("tool_choice", None)
             if attempt == retries:
                 break
             time.sleep(0.8 * attempt)
@@ -410,6 +523,7 @@ def process_batch_with_fallback(
     user_agent: str,
     retries: int,
     failure_log: str,
 ) -> List[tuple[Row, List[str]]]:
     usage_total = UsageStats()
     try:
@@ -424,6 +538,7 @@ def process_batch_with_fallback(
             user_agent=user_agent,
             retries=retries,
             failure_log=failure_log,
         )
         usage_total.add(usage)
     except RuntimeError:
@@ -441,6 +556,7 @@ def process_batch_with_fallback(
                     user_agent=user_agent,
                     retries=max(retries, 4),
                     failure_log=failure_log,
                 )
                 usage_total.add(usage)
                 mapping[idx] = single[0]
@@ -449,8 +565,23 @@ def process_batch_with_fallback(
                     failure_log,
                     f"[row-skip] file_id={row.record.get('file_id')} line={row.line_no} reason={exc}",
                 )
-                mapping[idx] = row.record.get("labels", [])
-    return [(batch[row_id], labels) for row_id, labels in mapping.items()], usage_total
 def process_batch_timed(
@@ -465,6 +596,7 @@ def process_batch_timed(
     user_agent: str,
     retries: int,
     failure_log: str,
 ) -> Dict[str, Any]:
     meter.task_start()
     t0 = time.time()
@@ -480,6 +612,7 @@ def process_batch_timed(
             user_agent=user_agent,
             retries=retries,
             failure_log=failure_log,
         )
         return {
             "updates": updates,
@@ -552,6 +685,7 @@ def main() -> None:
                 user_agent=args.user_agent,
                 retries=args.retries,
                 failure_log=args.failure_log,
             )
             for batch in batches
         ]

 from typing import Any, Dict, List, Sequence
 import requests
+from anifilebert.label_repairs import repair_jsonl_item
 ALLOWED_LABELS = {
     p.add_argument("--checkpoint-rows", type=int, default=100, help="Write checkpoint every N processed rows")
     p.add_argument("--failure-log", default="reports/llm_relabel_failures.log", help="Failure log path")
     p.add_argument("--perf-log", default="", help="Optional JSON perf summary path")
+    p.add_argument("--http-timeout", type=int, default=240, help="HTTP timeout in seconds per request")
     p.add_argument("--usd-per-1m-input", type=float, default=0.75, help="Input token price (USD per 1M tokens)")
     p.add_argument("--usd-per-1m-output", type=float, default=4.5, help="Output token price (USD per 1M tokens)")
     p.add_argument(
     return True
+def normalize_iob2_labels(labels: Sequence[str]) -> List[str]:
+    """Rewrite *labels* into strict IOB2 form.
+
+    Entries that are not ``B-``/``I-`` strings with a non-empty entity name
+    become ``"O"``. A tagged token gets ``I-`` only when the token right before
+    it carries the same entity; otherwise it starts a new span with ``B-``.
+    """
+    normalized: List[str] = []
+    # None (not "") marks "no open entity", so a malformed tag with an empty
+    # entity name such as "B-" can never be mistaken for a continuation.
+    prev_entity = None
+    for lb in labels:
+        entity = lb[2:] if isinstance(lb, str) and lb.startswith(("B-", "I-")) else ""
+        if not entity:
+            normalized.append("O")
+            prev_entity = None
+            continue
+        prefix = "I" if prev_entity == entity else "B"
+        normalized.append(f"{prefix}-{entity}")
+        prev_entity = entity
+    return normalized
+def title_segments(labels: Sequence[str]) -> List[tuple[int, int]]:
+    """Collect each maximal run of TITLE-suffixed labels as a half-open ``(start, end)`` pair."""
+    found: List[tuple[int, int]] = []
+    open_at = -1  # -1 means no run is currently open
+    for pos, lb in enumerate(labels):
+        if str(lb).endswith("TITLE"):
+            if open_at < 0:
+                open_at = pos
+        elif open_at >= 0:
+            found.append((open_at, pos))
+            open_at = -1
+    if open_at >= 0:
+        # Close a run that extends to the end of the sequence.
+        found.append((open_at, len(labels)))
+    return found
+def force_single_title_segment(tokens: Sequence[str], labels: Sequence[str]) -> List[str]:
+    """Guarantee TITLE is a single contiguous segment.
+
+    If *tokens* and *labels* differ in length, the labels are returned as a
+    plain list copy, untouched. Otherwise they are IOB2-normalized and, when
+    more than one TITLE run exists, only the best-ranked run survives: runs
+    starting before the first EPISODE label rank highest, then longer runs,
+    then earlier ones. Every other TITLE label becomes ``"O"``.
+    """
+    if len(tokens) != len(labels):
+        return list(labels)
+    normalized = normalize_iob2_labels(labels)
+    runs = title_segments(normalized)
+    if len(runs) <= 1:
+        return normalized
+    # Position of the first EPISODE label; defaults to "past the end" when absent.
+    episode_at = len(normalized)
+    for pos, tag in enumerate(normalized):
+        if str(tag).endswith("EPISODE"):
+            episode_at = pos
+            break
+    keep_from, keep_to = max(
+        runs,
+        key=lambda run: (1 if run[0] < episode_at else 0, run[1] - run[0], -run[0]),
+    )
+    pruned = [
+        "O" if str(tag).endswith("TITLE") and not (keep_from <= pos < keep_to) else tag
+        for pos, tag in enumerate(normalized)
+    ]
+    return normalize_iob2_labels(pruned)
 def response_schema() -> Dict[str, Any]:
     return {
         "type": "object",
         f.write(message.rstrip() + "\n")
+def build_request_body(
+    model: str,
+    user_payload: str,
+    prompt_cache_key: str,
+    prompt_cache_retention: str,
+    reasoning_effort: str,
+    include_tools: bool = True,
+    include_tool_choice: bool = True,
+    include_reasoning: bool = True,
+    include_cache_key: bool = True,
+    include_cache_retention: bool = True,
+) -> Dict[str, Any]:
+    """Assemble the request JSON body, adding each optional field only when its flag is set.
+
+    ``model``, ``instructions`` and ``input`` are always present. ``tool_choice``
+    is emitted only when the tool definition itself is included.
+    """
+    payload: Dict[str, Any] = {
+        "model": model,
+        "instructions": SYSTEM_INSTRUCTIONS,
+        "input": user_payload,
+    }
+    if include_cache_key:
+        payload["prompt_cache_key"] = prompt_cache_key
+    if include_cache_retention:
+        payload["prompt_cache_retention"] = prompt_cache_retention
+    if include_reasoning:
+        payload["reasoning"] = {"effort": reasoning_effort}
+    if not include_tools:
+        # Without a tool definition there is nothing for tool_choice to point at.
+        return payload
+    submit_tool = {
+        "type": "function",
+        "name": "submit_labels",
+        "description": "Submit relabeled BIO labels.",
+        "parameters": response_schema(),
+        "strict": True,
+    }
+    payload["tools"] = [submit_tool]
+    if include_tool_choice:
+        payload["tool_choice"] = {"type": "function", "name": "submit_labels"}
+    return payload
 def parse_usage(response_obj: Dict[str, Any]) -> UsageStats:
     usage = response_obj.get("usage", {}) or {}
     in_details = usage.get("input_tokens_details", {}) or {}
     user_agent: str,
     retries: int,
     failure_log: str,
+    http_timeout: int,
 ) -> tuple[Dict[int, List[str]], UsageStats]:
     url = f"{api_base.rstrip('/')}/responses"
     headers = {
     }
     user_payload = build_user_payload(batch_rows)
+    cfg = {
+        "include_tools": True,
+        "include_tool_choice": True,
+        "include_reasoning": True,
+        "include_cache_key": True,
+        "include_cache_retention": True,
     }
     last_error: Exception | None = None
     for attempt in range(1, retries + 1):
         try:
+            body = build_request_body(
+                model=model,
+                user_payload=user_payload,
+                prompt_cache_key=prompt_cache_key,
+                prompt_cache_retention=prompt_cache_retention,
+                reasoning_effort=reasoning_effort,
+                include_tools=cfg["include_tools"],
+                include_tool_choice=cfg["include_tool_choice"],
+                include_reasoning=cfg["include_reasoning"],
+                include_cache_key=cfg["include_cache_key"],
+                include_cache_retention=cfg["include_cache_retention"],
+            )
+            resp = requests.post(url, headers=headers, json=body, timeout=http_timeout)
             resp.raise_for_status()
             obj = resp.json()
             usage_stats = parse_usage(obj)
             return mapping, usage_stats
         except Exception as exc:  # noqa: BLE001
             last_error = exc
+            # Some compatible gateways may not support all optional fields.
+            # Downgrade progressively and keep structured tool output whenever possible.
             if isinstance(exc, requests.HTTPError) and exc.response is not None and exc.response.status_code == 400:
+                response_text = (exc.response.text or "")[:1200]
+                lowered = response_text.lower()
+                append_failure_log(
+                    failure_log,
+                    f"[http400] attempt={attempt} model={model} body_cfg={cfg} response={response_text!r}",
+                )
+                if "prompt_cache_retention" in lowered and cfg["include_cache_retention"]:
+                    cfg["include_cache_retention"] = False
+                elif "prompt_cache_key" in lowered and cfg["include_cache_key"]:
+                    cfg["include_cache_key"] = False
+                elif "reasoning" in lowered and cfg["include_reasoning"]:
+                    cfg["include_reasoning"] = False
+                elif "tool_choice" in lowered and cfg["include_tool_choice"]:
+                    cfg["include_tool_choice"] = False
+                elif "tools" in lowered and cfg["include_tools"]:
+                    cfg["include_tools"] = False
             if attempt == retries:
                 break
             time.sleep(0.8 * attempt)
     user_agent: str,
     retries: int,
     failure_log: str,
+    http_timeout: int,
 ) -> List[tuple[Row, List[str]]]:
     usage_total = UsageStats()
     try:
             user_agent=user_agent,
             retries=retries,
             failure_log=failure_log,
+            http_timeout=http_timeout,
         )
         usage_total.add(usage)
     except RuntimeError:
                     user_agent=user_agent,
                     retries=max(retries, 4),
                     failure_log=failure_log,
+                    http_timeout=http_timeout,
                 )
                 usage_total.add(usage)
                 mapping[idx] = single[0]
                     failure_log,
                     f"[row-skip] file_id={row.record.get('file_id')} line={row.line_no} reason={exc}",
                 )
+                # Hard fallback: enforce contiguous TITLE rather than keeping polluted labels.
+                toks = row.record.get("tokens", [])
+                lbs = row.record.get("labels", [])
+                if isinstance(toks, list) and isinstance(lbs, list) and len(toks) == len(lbs):
+                    mapping[idx] = force_single_title_segment(toks, lbs)
+                else:
+                    mapping[idx] = lbs
+    updates: List[tuple[Row, List[str]]] = []
+    for row_id, labels in mapping.items():
+        row = batch[row_id]
+        rec = dict(row.record)
+        rec["labels"] = force_single_title_segment(rec.get("tokens", []), labels)
+        repaired, _repairs = repair_jsonl_item(rec)
+        new_labels = repaired.get("labels", rec.get("labels", []))
+        updates.append((row, new_labels))
+    return updates, usage_total
 def process_batch_timed(
     user_agent: str,
     retries: int,
     failure_log: str,
+    http_timeout: int,
 ) -> Dict[str, Any]:
     meter.task_start()
     t0 = time.time()
             user_agent=user_agent,
             retries=retries,
             failure_log=failure_log,
+            http_timeout=http_timeout,
         )
         return {
             "updates": updates,
                 user_agent=args.user_agent,
                 retries=args.retries,
                 failure_log=args.failure_log,
+                http_timeout=args.http_timeout,
             )
             for batch in batches
         ]