feat: add tool-call based llm relabel pipeline and update dataset pointer

Browse files

Files changed (2) hide show

datasets/AnimeName +1 -1
tools/llm_relabel_rows.py +444 -0

datasets/AnimeName CHANGED Viewed

	@@ -1 +1 @@
1	- Subproject commit ~~56c54f9fb664335fc0c98f6c9dce8f2fbcc145a0~~


1	+ Subproject commit 9987cc8d7b7bf829d0022ee6e6a0b08de5327975

tools/llm_relabel_rows.py ADDED Viewed

	@@ -0,0 +1,444 @@

+#!/usr/bin/env python3
+"""
+Relabel selected rows in a JSONL dataset via an OpenAI-compatible Responses API.
+Designed for high-throughput cleanup with a stable prompt prefix and
+`prompt_cache_key` to improve cache hit rates across calls.
+"""
+from __future__ import annotations
+import argparse
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import json
+import os
+import re
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Sequence
+import requests
+# Closed set of BIO tags a relabeled row may contain. validate_labels() rejects
+# any label outside this set, and response_schema() exposes it as a JSON enum.
+ALLOWED_LABELS = {
+    "O",
+    "B-TITLE", "I-TITLE",
+    "B-SEASON", "I-SEASON",
+    "B-EPISODE", "I-EPISODE",
+    "B-SPECIAL", "I-SPECIAL",
+    "B-GROUP", "I-GROUP",
+    "B-RESOLUTION", "I-RESOLUTION",
+    "B-SOURCE", "I-SOURCE",
+}
+# Language/dub markers looked for as substrings of a record's filename; the
+# default "language" selector in select_row() picks rows containing any of them.
+LANG_MARKERS = (
+    "中文版",
+    "日语版",
+    "国语版",
+    "粤语版",
+    "英语版",
+    "英配版",
+    "中配版",
+    "日配版",
+)
+# Instruction text sent unchanged as the `instructions` field of every request
+# built by relabel_batch(); the per-batch rows travel separately in `input`.
+SYSTEM_INSTRUCTIONS = """You relabel anime filename tokens with BIO tags.
+Allowed labels only:
+O, B/I-TITLE, B/I-SEASON, B/I-EPISODE, B/I-SPECIAL, B/I-GROUP, B/I-RESOLUTION, B/I-SOURCE.
+Hard rules:
+1) Output exactly one label per token.
+2) Language markers like 中文版/日语版/国语版/粤语版/英语版/英配版/中配版/日配版 must be SOURCE.
+3) Episode identifiers (e.g. 01, 13, EP13, 第13集/話/话) must be EPISODE.
+4) If title already appears before episode number, episode-name text after the episode number should be O (not TITLE).
+5) Preserve obvious GROUP/RESOLUTION/SOURCE tags when present.
+Return strict JSON only:
+{"results":[{"row_id":int,"labels":[str,...]}]}
+No markdown. No explanation.
+"""
+@dataclass
+class Row:
+    """A selected dataset record paired with its position in the input file."""
+    # 1-based line number of the record in the input JSONL (load_rows uses enumerate(f, 1)).
+    line_no: int
+    # The parsed JSON object. load_rows stores this same dict object in its
+    # all_records list, so mutating it here also changes what gets written out.
+    record: Dict[str, Any]
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options for the relabel pipeline.
+
+    Returns:
+        The populated argparse namespace.
+
+    Exits (via argparse) with status 2 when a required option is missing or
+    when ``--batch-size`` is not an integer >= 1. A batch size of 0 would
+    otherwise crash later when it is used as a ``range`` step, and a negative
+    one would silently produce no batches at all.
+    """
+    def positive_int(value: str) -> int:
+        # argparse `type=` callable: accept only integers >= 1.
+        try:
+            number = int(value)
+        except ValueError:
+            raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
+        if number < 1:
+            raise argparse.ArgumentTypeError(f"must be >= 1, got {number}")
+        return number
+    p = argparse.ArgumentParser(description="Relabel selected JSONL rows via Responses API")
+    p.add_argument("--input", required=True, help="Input JSONL")
+    p.add_argument("--output", required=True, help="Output JSONL (can equal input)")
+    p.add_argument("--api-base", required=True, help="API base URL, e.g. http://host:port/v1")
+    p.add_argument("--api-key", default=None, help="API key; falls back to env ANIFILEBERT_RELABEL_API_KEY")
+    p.add_argument("--model", default="gpt-5.4-mini", help="Model name")
+    p.add_argument(
+        "--selector",
+        choices=("language", "discontinuous_title", "all"),
+        default="language",
+        help="Row selector",
+    )
+    p.add_argument("--batch-size", type=positive_int, default=12, help="Rows per request")
+    p.add_argument("--concurrency", type=int, default=4, help="Parallel request workers")
+    p.add_argument("--max-rows", type=int, default=0, help="Optional cap; 0 means no cap")
+    p.add_argument("--skip-selected", type=int, default=0, help="Skip this many selected rows before processing")
+    p.add_argument("--retries", type=int, default=3, help="Retries per batch")
+    p.add_argument("--sleep-ms", type=int, default=150, help="Delay between successful calls")
+    p.add_argument("--prompt-cache-key", default="anifilebert-relabel-v1", help="Stable prompt cache key")
+    p.add_argument("--prompt-cache-retention", default="24h", help="Prompt cache retention hint")
+    p.add_argument("--reasoning-effort", default="medium", help="Reasoning effort (e.g. low/medium/high)")
+    p.add_argument("--checkpoint-rows", type=int, default=100, help="Write checkpoint every N processed rows")
+    p.add_argument("--failure-log", default="reports/llm_relabel_failures.log", help="Failure log path")
+    p.add_argument(
+        "--user-agent",
+        default="Codex Desktop/0.133.0-alpha.1 (Windows 10.0.22631; x86_64) unknown (Codex Desktop; 26.519.41501)",
+        help="User-Agent header",
+    )
+    return p.parse_args()
+def select_row(record: Dict[str, Any], selector: str) -> bool:
+    """Decide whether *record* should be sent for relabeling.
+
+    Args:
+        record: One parsed JSONL row.
+        selector: ``"all"`` selects every row; ``"discontinuous_title"`` selects
+            rows whose TITLE-labelled tokens are interrupted by at least one
+            non-TITLE token; any other value selects rows whose ``filename``
+            contains one of ``LANG_MARKERS``.
+
+    Returns:
+        True when the row matches the selector.
+    """
+    if selector == "all":
+        return True
+    if selector == "discontinuous_title":
+        labels = record.get("labels", [])
+        if not isinstance(labels, list):
+            return False
+        seen_title = False
+        seen_gap = False
+        for lb in labels:
+            # Non-string entries (e.g. null) are treated as non-title tokens
+            # instead of raising AttributeError on `.endswith`.
+            if isinstance(lb, str) and lb.endswith("TITLE"):
+                # seen_gap can only be set after a title was seen, so a title
+                # token here means the title span resumed after a break.
+                if seen_gap:
+                    return True
+                seen_title = True
+            elif seen_title:
+                seen_gap = True
+        return False
+    filename = str(record.get("filename", ""))
+    return any(marker in filename for marker in LANG_MARKERS)
+def load_rows(path: Path, selector: str) -> tuple[List[Dict[str, Any]], List[Row]]:
+    """Read a JSONL file and pick the rows matching *selector*.
+
+    Args:
+        path: Input JSONL file, read as UTF-8.
+        selector: Selector name passed through to ``select_row``.
+
+    Returns:
+        ``(all_records, selected)``: every parsed record in file order, and a
+        ``Row`` for each selected record. Each ``Row.record`` is the same dict
+        object that sits in ``all_records``.
+
+    Raises:
+        ValueError: a non-blank line is not valid JSON or is not a JSON object;
+            the message carries ``path:line``.
+    """
+    all_records: List[Dict[str, Any]] = []
+    selected: List[Row] = []
+    with path.open("r", encoding="utf-8") as f:
+        for line_no, line in enumerate(f, 1):
+            # Blank or whitespace-only lines are skipped rather than handed to
+            # json.loads, which would fail on them.
+            if not line.strip():
+                continue
+            try:
+                rec = json.loads(line)
+            except json.JSONDecodeError as exc:
+                raise ValueError(f"{path}:{line_no}: invalid JSON: {exc}") from exc
+            # The rest of the pipeline calls rec.get(...), so reject non-objects here.
+            if not isinstance(rec, dict):
+                raise ValueError(f"{path}:{line_no}: expected a JSON object, got {type(rec).__name__}")
+            all_records.append(rec)
+            if select_row(rec, selector):
+                selected.append(Row(line_no=line_no, record=rec))
+    return all_records, selected
+def parse_model_json(text: str) -> Dict[str, Any]:
+    """Decode the model's text reply as JSON, tolerating a Markdown code fence.
+
+    Surrounding whitespace is trimmed, then a leading fence (optionally tagged
+    ``json``) and a trailing fence are removed before the remainder is handed to
+    ``json.loads``. Raises ``json.JSONDecodeError`` when what is left is not JSON.
+    """
+    opening_fence = r"^```(?:json)?\s*"
+    closing_fence = r"\s*```$"
+    without_opening = re.sub(opening_fence, "", text.strip())
+    payload = re.sub(closing_fence, "", without_opening)
+    return json.loads(payload)
+def build_user_payload(batch_rows: Sequence[Row]) -> str:
+    """Serialize a batch of rows into the JSON string sent as the request input.
+
+    Each row is described by its index within the batch (``row_id``) together
+    with the record's ``file_id``, ``filename``, ``tokens`` and current labels
+    (emitted under ``current_labels``). Missing record keys become ``null``.
+    Non-ASCII text is written as-is rather than escaped.
+    """
+    def describe(position: int, record: Dict[str, Any]) -> Dict[str, Any]:
+        return {
+            "row_id": position,
+            "file_id": record.get("file_id"),
+            "filename": record.get("filename"),
+            "tokens": record.get("tokens"),
+            "current_labels": record.get("labels"),
+        }
+    payload = {"rows": [describe(pos, entry.record) for pos, entry in enumerate(batch_rows)]}
+    return json.dumps(payload, ensure_ascii=False)
+def extract_output_text(response_obj: Dict[str, Any]) -> str:
+    """Return the text of the first ``output_text`` content part in a response.
+
+    Walks ``response_obj["output"]`` in order and, within each item, its
+    ``content`` list. A matching part without a ``text`` key yields ``""``.
+
+    Raises:
+        ValueError: no part of type ``output_text`` exists.
+    """
+    text_parts = (
+        part
+        for entry in response_obj.get("output", [])
+        for part in entry.get("content", [])
+        if part.get("type") == "output_text"
+    )
+    first_part = next(text_parts, None)
+    if first_part is None:
+        raise ValueError("No output_text found in response")
+    return first_part.get("text", "")
+def extract_function_args(response_obj: Dict[str, Any], func_name: str) -> Dict[str, Any]:
+    """Return the decoded arguments of the first call to *func_name* in a response.
+
+    Scans ``response_obj["output"]`` for the first item whose ``type`` is
+    ``function_call`` and whose ``name`` equals *func_name*, then JSON-decodes
+    its ``arguments`` string (an absent key is treated as ``"{}"``).
+
+    Raises:
+        ValueError: no such item exists, or the arguments are not valid JSON.
+    """
+    matching_calls = (
+        entry
+        for entry in response_obj.get("output", [])
+        if entry.get("type") == "function_call" and entry.get("name") == func_name
+    )
+    call = next(matching_calls, None)
+    if call is None:
+        raise ValueError(f"No function_call '{func_name}' found in response")
+    return json.loads(call.get("arguments", "{}"))
+def validate_labels(tokens: Sequence[str], labels: Sequence[str]) -> bool:
+    """Check that *labels* is a usable labelling for *tokens*.
+
+    Returns True only when there is exactly one label per token and every
+    label is a member of ``ALLOWED_LABELS``.
+    """
+    same_length = len(tokens) == len(labels)
+    return same_length and all(candidate in ALLOWED_LABELS for candidate in labels)
+def response_schema() -> Dict[str, Any]:
+    """Build the JSON Schema used as the ``submit_labels`` tool parameters.
+
+    The schema describes an object with a required ``results`` array; each
+    element is an object with a required integer ``row_id`` and a required
+    ``labels`` array whose strings are restricted to ``ALLOWED_LABELS`` (listed
+    in sorted order). Extra properties are disallowed at both object levels.
+    """
+    label_list = {
+        "type": "array",
+        "items": {"type": "string", "enum": sorted(ALLOWED_LABELS)},
+    }
+    result_entry = {
+        "type": "object",
+        "additionalProperties": False,
+        "properties": {
+            "row_id": {"type": "integer"},
+            "labels": label_list,
+        },
+        "required": ["row_id", "labels"],
+    }
+    return {
+        "type": "object",
+        "additionalProperties": False,
+        "properties": {
+            "results": {"type": "array", "items": result_entry},
+        },
+        "required": ["results"],
+    }
+def append_failure_log(path: str, message: str) -> None:
+    """Append *message* as one line to the failure log at *path*.
+
+    Missing parent directories are created first. Trailing whitespace on the
+    message is removed and a single newline is added.
+    """
+    log_file = Path(path)
+    log_file.parent.mkdir(parents=True, exist_ok=True)
+    entry = f"{message.rstrip()}\n"
+    with open(log_file, mode="a", encoding="utf-8") as handle:
+        handle.write(entry)
+def relabel_batch(
+    api_base: str,
+    api_key: str,
+    model: str,
+    batch_rows: Sequence[Row],
+    prompt_cache_key: str,
+    prompt_cache_retention: str,
+    reasoning_effort: str,
+    user_agent: str,
+    retries: int,
+    failure_log: str,
+) -> Dict[int, List[str]]:
+    """Ask the model to relabel one batch and return validated labels per row.
+
+    POSTs to ``{api_base}/responses`` with a forced ``submit_labels`` function
+    call, then checks every returned row: the ``row_id`` must index into
+    *batch_rows* and the labels must pass ``validate_labels`` against that
+    row's tokens. The whole request is retried up to *retries* times.
+
+    Args:
+        api_base: Base URL; a trailing slash is stripped before ``/responses`` is appended.
+        api_key: Sent as a Bearer token.
+        model: Model name placed in the request body.
+        batch_rows: Rows to relabel; a row's index in this sequence is its ``row_id``.
+        prompt_cache_key: Forwarded as ``prompt_cache_key``.
+        prompt_cache_retention: Forwarded as ``prompt_cache_retention``.
+        reasoning_effort: Forwarded as ``reasoning.effort``.
+        user_agent: Value of the ``User-Agent`` header.
+        retries: Maximum number of attempts.
+        failure_log: Path passed to ``append_failure_log`` for diagnostics.
+
+    Returns:
+        A mapping from batch index to label list, with an entry for every row.
+
+    Raises:
+        RuntimeError: no attempt produced a complete, valid mapping; the
+            message includes the last underlying error.
+    """
+    url = f"{api_base.rstrip('/')}/responses"
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+        "User-Agent": user_agent,
+    }
+    user_payload = build_user_payload(batch_rows)
+    body = {
+        "model": model,
+        "instructions": SYSTEM_INSTRUCTIONS,
+        "input": user_payload,
+        "prompt_cache_key": prompt_cache_key,
+        "prompt_cache_retention": prompt_cache_retention,
+        "reasoning": {"effort": reasoning_effort},
+        "tools": [
+            {
+                "type": "function",
+                "name": "submit_labels",
+                "description": "Submit relabeled BIO labels.",
+                "parameters": response_schema(),
+                "strict": True,
+            }
+        ],
+        "tool_choice": {"type": "function", "name": "submit_labels"},
+    }
+    last_error: Exception | None = None
+    for attempt in range(1, retries + 1):
+        try:
+            resp = requests.post(url, headers=headers, json=body, timeout=120)
+            resp.raise_for_status()
+            obj = resp.json()
+            # Prefer the function-call arguments; if none can be extracted,
+            # fall back to parsing JSON out of the plain text output.
+            try:
+                parsed = extract_function_args(obj, "submit_labels")
+            except Exception:
+                text = extract_output_text(obj)
+                parsed = parse_model_json(text)
+            results = parsed.get("results")
+            if not isinstance(results, list):
+                append_failure_log(
+                    failure_log,
+                    f"[invalid-results] model={model} batch={len(batch_rows)} parsed_keys={list(parsed.keys())}",
+                )
+                raise ValueError("response JSON missing 'results' list")
+            mapping: Dict[int, List[str]] = {}
+            for item in results:
+                # Malformed or out-of-range entries are dropped silently here;
+                # the completeness check below turns any gap into a retry.
+                if not isinstance(item, dict):
+                    continue
+                row_id = item.get("row_id")
+                labels = item.get("labels")
+                if not isinstance(row_id, int) or not isinstance(labels, list):
+                    continue
+                if row_id < 0 or row_id >= len(batch_rows):
+                    continue
+                tokens = batch_rows[row_id].record.get("tokens", [])
+                if not validate_labels(tokens, labels):
+                    append_failure_log(
+                        failure_log,
+                        f"[invalid-labels] file_id={batch_rows[row_id].record.get('file_id')} "
+                        f"tokens_len={len(tokens)} labels_len={len(labels)}",
+                    )
+                    continue
+                # A repeated row_id overwrites the earlier entry.
+                mapping[row_id] = labels
+            # All-or-nothing: a batch only succeeds when every row got valid labels.
+            if len(mapping) != len(batch_rows):
+                missing = sorted(set(range(len(batch_rows))) - set(mapping))
+                append_failure_log(
+                    failure_log,
+                    f"[missing] model={model} batch={len(batch_rows)} missing={missing}",
+                )
+                raise ValueError(f"incomplete/invalid rows from model: missing={missing}")
+            return mapping
+        except Exception as exc:  # noqa: BLE001
+            last_error = exc
+            # Some compatible gateways may not support prompt caching or reasoning fields.
+            # On HTTP 400 the optional fields are removed so later attempts send a
+            # plain request; without tools, the reply is read via the text fallback above.
+            # NOTE(review): prompt_cache_key stays in the body here — confirm that is intended.
+            if isinstance(exc, requests.HTTPError) and exc.response is not None and exc.response.status_code == 400:
+                body.pop("prompt_cache_retention", None)
+                body.pop("reasoning", None)
+                body.pop("tools", None)
+                body.pop("tool_choice", None)
+            if attempt == retries:
+                break
+            # Wait grows linearly with the attempt number (0.8s, 1.6s, ...).
+            time.sleep(0.8 * attempt)
+    raise RuntimeError(f"failed relabel batch after {retries} attempts: {last_error}")
+def write_jsonl(path: Path, records: Sequence[Dict[str, Any]]) -> None:
+    """Write *records* to *path* as compact JSON, one object per line.
+
+    The data is written to a sibling file named like *path* with ``.tmp``
+    appended, which then replaces *path*. Lines end with ``\\n`` regardless of
+    platform, and non-ASCII text is written unescaped as UTF-8.
+    """
+    scratch = path.with_suffix(path.suffix + ".tmp")
+    with open(scratch, "w", encoding="utf-8", newline="") as sink:
+        sink.writelines(
+            json.dumps(entry, ensure_ascii=False, separators=(",", ":")) + "\n"
+            for entry in records
+        )
+    scratch.replace(path)
+def process_batch_with_fallback(
+    api_base: str,
+    api_key: str,
+    model: str,
+    batch: Sequence[Row],
+    prompt_cache_key: str,
+    prompt_cache_retention: str,
+    reasoning_effort: str,
+    user_agent: str,
+    retries: int,
+    failure_log: str,
+) -> List[tuple[Row, List[str]]]:
+    """Relabel a batch, degrading to one-row requests when the batch call fails.
+
+    The batch is first sent as a whole through ``relabel_batch``. If that raises
+    ``RuntimeError``, each row is retried alone with at least four attempts; a
+    row that still fails is logged as ``[row-skip]`` and keeps its current labels.
+
+    Returns:
+        ``(row, labels)`` pairs, one per row for which a mapping entry exists.
+    """
+    # Everything except the rows and the retry budget is identical for all calls.
+    shared = {
+        "api_base": api_base,
+        "api_key": api_key,
+        "model": model,
+        "prompt_cache_key": prompt_cache_key,
+        "prompt_cache_retention": prompt_cache_retention,
+        "reasoning_effort": reasoning_effort,
+        "user_agent": user_agent,
+        "failure_log": failure_log,
+    }
+    try:
+        labels_by_index = relabel_batch(batch_rows=batch, retries=retries, **shared)
+    except RuntimeError:
+        labels_by_index = {}
+        single_row_retries = max(retries, 4)
+        for position, entry in enumerate(batch):
+            try:
+                alone = relabel_batch(batch_rows=[entry], retries=single_row_retries, **shared)
+            except RuntimeError as exc:
+                append_failure_log(
+                    failure_log,
+                    f"[row-skip] file_id={entry.record.get('file_id')} line={entry.line_no} reason={exc}",
+                )
+                labels_by_index[position] = entry.record.get("labels", [])
+            else:
+                # A one-row request numbers its only row 0.
+                labels_by_index[position] = alone[0]
+    return [(batch[position], new_labels) for position, new_labels in labels_by_index.items()]
+def main() -> None:
+    """Run the relabel pipeline end to end.
+
+    Loads the input JSONL, narrows the selected rows with ``--skip-selected``
+    and ``--max-rows``, splits them into batches, relabels the batches on a
+    thread pool (capped at 8 workers), applies the returned labels to the
+    in-memory records, and writes the full record list to the output path.
+
+    A checkpoint of the output is written each time the processed-row count
+    crosses another multiple of ``--checkpoint-rows``. Batch sizes rarely divide
+    the checkpoint interval evenly, so an exact-multiple test would almost
+    never fire; the threshold test below does.
+
+    Raises:
+        SystemExit: no API key was supplied by flag or environment.
+    """
+    args = parse_args()
+    api_key = args.api_key or os.environ.get("ANIFILEBERT_RELABEL_API_KEY")
+    if not api_key:
+        raise SystemExit("Missing API key. Use --api-key or env ANIFILEBERT_RELABEL_API_KEY")
+    input_path = Path(args.input)
+    output_path = Path(args.output)
+    all_records, selected_rows = load_rows(input_path, args.selector)
+    if args.skip_selected > 0:
+        selected_rows = selected_rows[args.skip_selected:]
+    if args.max_rows > 0:
+        selected_rows = selected_rows[: args.max_rows]
+    if not selected_rows:
+        print("selected_rows=0; nothing to do")
+        if output_path != input_path:
+            write_jsonl(output_path, all_records)
+        return
+    total = len(selected_rows)
+    changed = 0
+    concurrency = max(1, min(args.concurrency, 8))
+    # range() rejects a zero step, and a negative step would yield no batches.
+    batch_size = max(1, args.batch_size)
+    batches: List[List[Row]] = [
+        selected_rows[i:i + batch_size]
+        for i in range(0, total, batch_size)
+    ]
+    done_rows = 0
+    checkpoint_every = args.checkpoint_rows
+    next_checkpoint = checkpoint_every
+    with ThreadPoolExecutor(max_workers=concurrency) as executor:
+        futures = [
+            executor.submit(
+                process_batch_with_fallback,
+                api_base=args.api_base,
+                api_key=api_key,
+                model=args.model,
+                batch=batch,
+                prompt_cache_key=args.prompt_cache_key,
+                prompt_cache_retention=args.prompt_cache_retention,
+                reasoning_effort=args.reasoning_effort,
+                user_agent=args.user_agent,
+                retries=args.retries,
+                failure_log=args.failure_log,
+            )
+            for batch in batches
+        ]
+        # Results are applied here, in the submitting thread, as batches finish.
+        for fut in as_completed(futures):
+            updates = fut.result()
+            for row, new_labels in updates:
+                rec = row.record
+                if rec.get("labels") != new_labels:
+                    rec["labels"] = new_labels
+                    changed += 1
+            done_rows += len(updates)
+            print(f"processed={done_rows}/{total} changed={changed}")
+            if checkpoint_every > 0 and done_rows >= next_checkpoint:
+                write_jsonl(output_path, all_records)
+                # Jump to the first multiple strictly above the current count so
+                # one large batch cannot trigger several back-to-back writes.
+                next_checkpoint = (done_rows // checkpoint_every + 1) * checkpoint_every
+            if args.sleep_ms > 0:
+                time.sleep(args.sleep_ms / 1000.0)
+    # rows in selected_rows reference dicts in all_records by identity, so changes are already reflected.
+    write_jsonl(output_path, all_records)
+    print(f"done selected_rows={total} changed_rows={changed} output={output_path}")
+if __name__ == "__main__":
+    main()