Spaces:

Syncre
/

arabic-audio-reader-worker

Running

File size: 7,702 Bytes

2e1a095

from __future__ import annotations

import argparse
import json
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any


# Markers that identify a license as permissive. license_status() looks for
# them in the stripped, lowercased license text.
PERMISSIVE_LICENSE_MARKERS = ("apache-2.0", "mit", "bsd", "openrail")
# Markers that identify a license as restricted or unclear. license_status()
# checks this list before the permissive one, so a text that matches both is
# reported as restricted.
RESTRICTED_LICENSE_MARKERS = (
    "cc-by-nc",
    "non-commercial",
    "noncommercial",
    "llama",
    "gemma",
    "gpl",
    # NOTE(review): the entries below look like placeholder phrases for an
    # unknown license rather than license names - confirm with the data source.
    "license not declared",
    "not established",
    "check model card",
    "other",
)


@dataclass(frozen=True)
class GateCheck:
    """Outcome of a single promotion-gate check.

    Instances are immutable (frozen dataclass). evaluate_promotion() turns
    them into plain dicts with ``asdict`` when it assembles the gate result.
    """

    # Short identifier of the check, e.g. "license" or "score".
    name: str
    # Verdict; the functions in this file produce "PASS", "WARN" or "FAIL".
    status: str
    # Human-readable explanation reported next to the verdict.
    detail: str


def load_json(path: Path) -> dict[str, Any]:
    """Read the score report at *path* and return its top-level JSON object.

    The file is decoded as UTF-8.

    Raises:
        FileNotFoundError: if *path* does not exist.
        ValueError: if the top-level JSON value is not an object. Malformed
            JSON propagates as ``json.JSONDecodeError`` (a ``ValueError``
            subclass) from the parser.
    """
    if path.exists():
        with path.open(encoding="utf-8") as handle:
            document = json.load(handle)
        if isinstance(document, dict):
            return document
        raise ValueError(f"Score JSON must contain an object: {path}")
    raise FileNotFoundError(f"Score JSON not found: {path}")


def _contains_marker_token(text: str, marker: str) -> bool:
    """Return True if *marker* occurs in *text* with no letter touching it.

    Digits, punctuation and whitespace may sit next to the marker, so "mit"
    is found in "mit-0" and "bsd" in "0bsd" or "bsd-3-clause", but "mit" is
    not found inside "limited" or "permitted".
    """
    start = text.find(marker)
    while start != -1:
        end = start + len(marker)
        letter_before = start > 0 and text[start - 1].isalpha()
        letter_after = end < len(text) and text[end].isalpha()
        if not (letter_before or letter_after):
            return True
        start = text.find(marker, start + 1)
    return False


def license_status(license_text: str, *, allow_restricted: bool = False) -> GateCheck:
    """Classify *license_text* for the promotion gate.

    Args:
        license_text: License name or description as given by the operator.
        allow_restricted: When True, restricted and unrecognized licenses are
            reported as PASS instead of FAIL/WARN.

    Returns:
        A "license" GateCheck: FAIL for an empty text, FAIL (or PASS when
        *allow_restricted*) for a restricted/unclear license, PASS for a
        permissive license, and WARN (or PASS when *allow_restricted*) for
        anything unrecognized.
    """
    normalized = license_text.strip().lower()
    if not normalized:
        return GateCheck("license", "FAIL", "missing license")
    # Restricted markers stay plain substring matches: over-matching here only
    # makes the gate stricter (e.g. "gpl" must keep catching "lgpl"/"gplv3").
    if any(marker in normalized for marker in RESTRICTED_LICENSE_MARKERS):
        status = "PASS" if allow_restricted else "FAIL"
        return GateCheck("license", status, f"restricted or unclear: {license_text}")
    # Permissive markers must stand alone. A raw substring test would let
    # "mit" inside "limited"/"permitted" wave a proprietary license through.
    if any(_contains_marker_token(normalized, marker) for marker in PERMISSIVE_LICENSE_MARKERS):
        return GateCheck("license", "PASS", license_text)
    status = "PASS" if allow_restricted else "WARN"
    return GateCheck("license", status, f"unrecognized license: {license_text}")


def score_status(payload: dict[str, Any], kind: str) -> GateCheck:
    """Turn a score report into the "score" gate check.

    The report counts as promotion-ready when either its "promotionReady" or
    its "ready" entry is truthy. OCR reports must additionally have a truthy
    ``comparison["beatsBaseline"]``.

    Args:
        payload: Parsed score report.
        kind: Candidate type; "ocr" selects the OCR-specific rules.

    Returns:
        A FAIL check describing the first unmet condition, otherwise a PASS
        check summarising the best entry of the report.
    """
    is_ocr = kind == "ocr"
    ready_flag = payload.get("promotionReady") or payload.get("ready")
    if not ready_flag:
        if is_ocr:
            return GateCheck("score", "FAIL", "OCR score report is not promotion-ready")
        return GateCheck("score", "FAIL", f"{kind} score report is not promotion-ready")

    # Missing or null sections are treated as empty mappings.
    winner = payload.get("best") or {}

    if is_ocr:
        versus = payload.get("comparison") or {}
        if not versus.get("beatsBaseline"):
            return GateCheck("score", "FAIL", "OCR candidate does not beat the wired baseline")
        label = winner.get("label", "-")
        quality = winner.get("quality", "-")
        delta = versus.get("scoreDelta", "-")
        return GateCheck("score", "PASS", f"best={label} quality={quality} delta={delta}")

    weighted = winner.get("weightedScore", "-")
    # Prefer the voice id, fall back to the label, then to a dash.
    winner_name = winner.get("voiceId") or winner.get("label") or "-"
    return GateCheck("score", "PASS", f"best={winner_name} weighted={weighted}")


def bool_check(name: str, ok: bool, detail: str) -> GateCheck:
    """Build a check named *name* that is PASS when *ok* is truthy, else FAIL."""
    if ok:
        return GateCheck(name, "PASS", detail)
    return GateCheck(name, "FAIL", detail)


def evaluate_promotion(
    *,
    candidate_name: str,
    kind: str,
    license_text: str,
    score_payload: dict[str, Any],
    same_sample: bool,
    runtime_ok: bool,
    privacy_ok: bool,
    human_reviewed: bool,
    allow_restricted_license: bool = False,
) -> dict[str, Any]:
    """Run every promotion-gate check for one candidate and collect the result.

    Args:
        candidate_name: Name reported under "candidate".
        kind: Candidate type, forwarded to score_status().
        license_text: License string, forwarded to license_status().
        score_payload: Parsed score report.
        same_sample: Baseline and candidate used the same pages/text.
        runtime_ok: The worker can run the candidate acceptably.
        privacy_ok: Privacy/deployment terms are acceptable.
        human_reviewed: Output was manually reviewed.
        allow_restricted_license: Forwarded to license_status().

    Returns:
        A dict with "ready", "candidate", "kind", "checks" (one dict per
        check, in evaluation order) and "summary". "ready" is True only when
        no check is FAIL or WARN.
    """
    score_check = score_status(score_payload, kind)
    sample_check = bool_check("same sample", same_sample, "same pages/text used for baseline and candidate")
    license_check = license_status(license_text, allow_restricted=allow_restricted_license)
    runtime_check = bool_check("runtime", runtime_ok, "worker can handle model size, speed, cold starts, and memory")
    privacy_check = bool_check("privacy/deployment", privacy_ok, "no unsafe external upload path or unclear service/API terms")
    review_check = bool_check("human review", human_reviewed, "Arabic text/audio manually reviewed for meaning, order, and comfort")
    results = (
        score_check,
        sample_check,
        license_check,
        runtime_check,
        privacy_check,
        review_check,
    )

    # A single FAIL or WARN keeps the candidate out of promotion.
    blocking_statuses = ("FAIL", "WARN")
    is_ready = all(result.status not in blocking_statuses for result in results)

    return {
        "ready": is_ready,
        "candidate": candidate_name,
        "kind": kind,
        "checks": list(map(asdict, results)),
        "summary": "promotion ready" if is_ready else "keep benchmark-only",
    }


def markdown_value(value: Any) -> str:
    """Render *value* as text that is safe inside a one-line Markdown field.

    Args:
        value: Any object; ``None`` and the empty string mean "no value".

    Returns:
        "-" when there is nothing to show; otherwise ``str(value)`` with line
        breaks folded into spaces and "|" escaped as "\\|", so the text cannot
        split a table row or open an extra table column.
    """
    if value is None or value == "":
        return "-"
    # A raw newline would end the table row; fold every line break to a space.
    single_line = " ".join(str(value).splitlines())
    # An unescaped pipe would start a new table cell.
    escaped = single_line.replace("|", "\\|")
    # A value made only of line breaks collapses to nothing; show the placeholder.
    return escaped or "-"


def write_report(path: Path, payload: dict[str, Any]) -> None:
    """Write the gate result in *payload* to *path* as a Markdown report.

    The report has a title, the candidate/type/decision lines, one table row
    per entry of ``payload["checks"]`` and a fixed "Rule" section. Missing
    parent directories are created, and the file is written as UTF-8 with a
    single trailing newline.
    """
    candidate = markdown_value(payload.get("candidate"))
    kind = markdown_value(payload.get("kind"))
    decision = markdown_value(payload.get("summary"))
    header = [
        "# Model Promotion Gate",
        "",
        f"Candidate: {candidate}",
        f"Type: {kind}",
        f"Decision: {decision}",
        "",
        "| Check | Status | Detail |",
        "| --- | --- | --- |",
    ]

    columns = ("name", "status", "detail")
    table_rows = [
        "| " + " | ".join(markdown_value(entry.get(column)) for column in columns) + " |"
        for entry in payload.get("checks", [])
    ]

    footer = [
        "",
        "## Rule",
        "",
        "Promote only when the candidate wins on the exact same Arabic sample, the license is acceptable, the worker can run it, privacy/deployment terms are clear, and a human review confirms the Arabic text or audio is comfortable and faithful.",
        "Anything else stays benchmark-only.",
    ]

    document = "\n".join([*header, *table_rows, *footer])
    path.parent.mkdir(parents=True, exist_ok=True)
    # Strip trailing whitespace so the file ends with exactly one newline.
    path.write_text(document.rstrip() + "\n", encoding="utf-8")


def main() -> None:
    """Command-line entry point for the promotion gate.

    Parses the arguments, evaluates the candidate, optionally writes the
    Markdown report, prints the result as JSON or as plain text, and exits
    with status 1 when the candidate is not promotion-ready.
    """
    parser = argparse.ArgumentParser(description="Gate OCR/TTS candidates before promoting them into the Arabic audio reader stack.")
    parser.add_argument("--candidate-name", required=True)
    parser.add_argument("--kind", choices=["ocr", "tts", "preprocessor"], required=True)
    parser.add_argument("--license", required=True, dest="license_text")
    parser.add_argument("--score-json", type=Path, required=True, help="JSON output from score_external_ocr.py, score_voice_listening.py, or score_tts_preprocessor.py.")

    # Boolean switches, registered in the order they appear in --help.
    switches = (
        ("--same-sample", "Confirm the candidate and baseline used the exact same pages/text."),
        ("--runtime-ok", "Confirm the target worker can run the candidate acceptably."),
        ("--privacy-ok", "Confirm external service/API or local deployment terms are acceptable."),
        ("--human-reviewed", "Confirm Arabic text/audio was manually reviewed."),
        ("--allow-restricted-license", "Allow restricted licenses for personal-only experiments."),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    parser.add_argument("--write-report", type=Path)
    parser.add_argument("--json", action="store_true")
    args = parser.parse_args()

    result = evaluate_promotion(
        candidate_name=args.candidate_name,
        kind=args.kind,
        license_text=args.license_text,
        score_payload=load_json(args.score_json),
        same_sample=args.same_sample,
        runtime_ok=args.runtime_ok,
        privacy_ok=args.privacy_ok,
        human_reviewed=args.human_reviewed,
        allow_restricted_license=args.allow_restricted_license,
    )

    report_path = args.write_report
    if report_path:
        write_report(report_path, result)
        # Added after writing, so the path only shows up in the printed JSON.
        result["reportPath"] = str(report_path)

    if not args.json:
        print(f"{result['candidate']}: {result['summary']}")
        for check in result["checks"]:
            print(f"- {check['status']} {check['name']}: {check['detail']}")
    else:
        print(json.dumps(result, ensure_ascii=False, indent=2))

    if result["ready"]:
        return
    # Non-zero exit status signals "not promotion-ready" to the caller.
    raise SystemExit(1)


# Run the gate CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()