arabic-audio-reader-worker / scripts /model_promotion_gate.py
Syncre's picture
Deploy Arabic Audio Reader worker
2e1a095 verified
from __future__ import annotations
import argparse
import json
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
# Markers that identify a permissive license. license_status looks for them in
# the stripped, lower-cased license text.
PERMISSIVE_LICENSE_MARKERS = ("apache-2.0", "mit", "bsd", "openrail")
# Markers that identify a restricted or unclear license. license_status checks
# these before the permissive markers, so a restricted match takes precedence.
RESTRICTED_LICENSE_MARKERS = (
    "cc-by-nc",
    "non-commercial",
    "noncommercial",
    "llama",
    "gemma",
    "gpl",
    "license not declared",
    "not established",
    "check model card",
    "other",
)
@dataclass(frozen=True)
class GateCheck:
    """Immutable outcome of one promotion-gate check."""

    # Short identifier of the check, e.g. "license" or "score".
    name: str
    # Verdict string; the helpers in this file produce "PASS", "FAIL" or "WARN".
    status: str
    # Human-readable explanation printed to the console and written to the report.
    detail: str
def load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 JSON and return its top-level object.

    Raises:
        FileNotFoundError: If *path* does not exist.
        ValueError: If the file parses but its top-level value is not an
            object (JSON syntax errors also surface as ``ValueError`` via
            ``json.JSONDecodeError``).
    """
    if not path.exists():
        raise FileNotFoundError(f"Score JSON not found: {path}")
    with path.open(encoding="utf-8") as handle:
        document = json.load(handle)
    if isinstance(document, dict):
        return document
    raise ValueError(f"Score JSON must contain an object: {path}")
def _has_standalone_marker(text: str, marker: str) -> bool:
    """Return True when *marker* occurs in *text* not flanked by letters.

    A plain substring test would find "mit" inside "limited" or "permitted"
    and wrongly treat such text as a permissive license. Digits and
    punctuation are accepted as neighbours so identifiers such as
    "bsd-3-clause", "0bsd" or "creativeml-openrail-m" still match.
    """
    start = text.find(marker)
    while start != -1:
        end = start + len(marker)
        clean_before = start == 0 or not text[start - 1].isalpha()
        clean_after = end == len(text) or not text[end].isalpha()
        if clean_before and clean_after:
            return True
        start = text.find(marker, start + 1)
    return False


def license_status(license_text: str, *, allow_restricted: bool = False) -> GateCheck:
    """Classify a license string for the promotion gate.

    Args:
        license_text: License identifier or free-form description.
        allow_restricted: When true, restricted and unrecognized licenses
            are reported as PASS instead of FAIL/WARN.

    Returns:
        A ``GateCheck`` named "license": FAIL for empty text; FAIL (or PASS
        with ``allow_restricted``) when a restricted marker is present; PASS
        when a permissive marker is present; otherwise WARN (or PASS with
        ``allow_restricted``).
    """
    normalized = license_text.strip().lower()
    if not normalized:
        return GateCheck("license", "FAIL", "missing license")
    # Restricted markers stay as loose substring matches and are checked
    # first: a false positive here only blocks promotion (fails closed).
    if any(marker in normalized for marker in RESTRICTED_LICENSE_MARKERS):
        status = "PASS" if allow_restricted else "FAIL"
        return GateCheck("license", status, f"restricted or unclear: {license_text}")
    # Permissive markers must be standalone tokens; a false positive here
    # would wave an unknown license through the gate.
    if any(_has_standalone_marker(normalized, marker) for marker in PERMISSIVE_LICENSE_MARKERS):
        return GateCheck("license", "PASS", license_text)
    status = "PASS" if allow_restricted else "WARN"
    return GateCheck("license", status, f"unrecognized license: {license_text}")
def score_status(payload: dict[str, Any], kind: str) -> GateCheck:
    """Derive the "score" gate check from a score-report payload.

    The report counts as promotion-ready when either ``promotionReady`` or
    ``ready`` is truthy. OCR reports must additionally have a truthy
    ``comparison.beatsBaseline``; other kinds only need the readiness flag.
    """
    is_ready = bool(payload.get("promotionReady") or payload.get("ready"))
    winner = payload.get("best") or {}

    if kind != "ocr":
        if not is_ready:
            return GateCheck("score", "FAIL", f"{kind} score report is not promotion-ready")
        weighted = winner.get("weightedScore", "-")
        winner_name = winner.get("voiceId") or winner.get("label") or "-"
        return GateCheck("score", "PASS", f"best={winner_name} weighted={weighted}")

    # OCR path: readiness first, then the baseline comparison.
    if not is_ready:
        return GateCheck("score", "FAIL", "OCR score report is not promotion-ready")
    baseline_comparison = payload.get("comparison") or {}
    if not baseline_comparison.get("beatsBaseline"):
        return GateCheck("score", "FAIL", "OCR candidate does not beat the wired baseline")
    winner_label = winner.get("label", "-")
    winner_quality = winner.get("quality", "-")
    score_delta = baseline_comparison.get("scoreDelta", "-")
    return GateCheck(
        "score",
        "PASS",
        f"best={winner_label} quality={winner_quality} delta={score_delta}",
    )
def bool_check(name: str, ok: bool, detail: str) -> GateCheck:
    """Turn a yes/no confirmation into a PASS/FAIL ``GateCheck``."""
    if ok:
        return GateCheck(name, "PASS", detail)
    return GateCheck(name, "FAIL", detail)
def evaluate_promotion(
    *,
    candidate_name: str,
    kind: str,
    license_text: str,
    score_payload: dict[str, Any],
    same_sample: bool,
    runtime_ok: bool,
    privacy_ok: bool,
    human_reviewed: bool,
    allow_restricted_license: bool = False,
) -> dict[str, Any]:
    """Run every gate check for a candidate and summarize the decision.

    Returns:
        A dict with ``ready`` (true only when no check is FAIL or WARN),
        ``candidate``, ``kind``, ``checks`` (each check as a plain dict) and
        a ``summary`` string.
    """
    gate_results = [
        score_status(score_payload, kind),
        bool_check("same sample", same_sample, "same pages/text used for baseline and candidate"),
        license_status(license_text, allow_restricted=allow_restricted_license),
        bool_check("runtime", runtime_ok, "worker can handle model size, speed, cold starts, and memory"),
        bool_check("privacy/deployment", privacy_ok, "no unsafe external upload path or unclear service/API terms"),
        bool_check("human review", human_reviewed, "Arabic text/audio manually reviewed for meaning, order, and comfort"),
    ]
    # A single FAIL or WARN is enough to keep the candidate benchmark-only.
    blocking_statuses = ("FAIL", "WARN")
    ready = not any(result.status in blocking_statuses for result in gate_results)
    summary = "promotion ready" if ready else "keep benchmark-only"
    return {
        "ready": ready,
        "candidate": candidate_name,
        "kind": kind,
        "checks": [asdict(result) for result in gate_results],
        "summary": summary,
    }
def markdown_value(value: Any) -> str:
    """Return ``str(value)``, or "-" when *value* is ``None`` or an empty string."""
    is_blank = value is None or value == ""
    return "-" if is_blank else str(value)
def _markdown_table_cell(value: Any) -> str:
    """Render *value* so it cannot break out of a Markdown table cell.

    Line breaks would end the table row and an unescaped ``|`` would start a
    new column, so line breaks are collapsed to spaces and pipes are
    backslash-escaped.
    """
    text = " ".join(markdown_value(value).splitlines()) or "-"
    return text.replace("|", "\\|")


def write_report(path: Path, payload: dict[str, Any]) -> None:
    """Write the gate decision in *payload* to *path* as a Markdown report.

    Args:
        path: Destination file; missing parent directories are created.
        payload: Result dict with ``candidate``, ``kind``, ``summary`` and a
            ``checks`` list of dicts carrying ``name``, ``status`` and
            ``detail``. A missing or ``None`` ``checks`` yields an empty table.
    """
    lines = [
        "# Model Promotion Gate",
        "",
        f"Candidate: {markdown_value(payload.get('candidate'))}",
        f"Type: {markdown_value(payload.get('kind'))}",
        f"Decision: {markdown_value(payload.get('summary'))}",
        "",
        "| Check | Status | Detail |",
        "| --- | --- | --- |",
    ]
    for check in payload.get("checks") or []:
        # Details can carry free-form text (license strings, model labels),
        # so every cell is escaped before it is joined into the row.
        cells = [_markdown_table_cell(check.get(key)) for key in ("name", "status", "detail")]
        lines.append("| " + " | ".join(cells) + " |")
    lines.extend(
        [
            "",
            "## Rule",
            "",
            "Promote only when the candidate wins on the exact same Arabic sample, the license is acceptable, the worker can run it, privacy/deployment terms are clear, and a human review confirms the Arabic text or audio is comfortable and faithful.",
            "Anything else stays benchmark-only.",
        ]
    )
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")
def main() -> None:
    """Parse CLI arguments, run the promotion gate, and report the result.

    Prints either the JSON payload or a short text summary, optionally writes
    a Markdown report, and exits with status 1 when the candidate is not
    promotion-ready.
    """
    cli = argparse.ArgumentParser(description="Gate OCR/TTS candidates before promoting them into the Arabic audio reader stack.")
    cli.add_argument("--candidate-name", required=True)
    cli.add_argument("--kind", choices=["ocr", "tts", "preprocessor"], required=True)
    cli.add_argument("--license", required=True, dest="license_text")
    cli.add_argument("--score-json", type=Path, required=True, help="JSON output from score_external_ocr.py, score_voice_listening.py, or score_tts_preprocessor.py.")

    # Opt-in confirmation flags: all store_true, registered in display order.
    confirmation_flags = (
        ("--same-sample", "Confirm the candidate and baseline used the exact same pages/text."),
        ("--runtime-ok", "Confirm the target worker can run the candidate acceptably."),
        ("--privacy-ok", "Confirm external service/API or local deployment terms are acceptable."),
        ("--human-reviewed", "Confirm Arabic text/audio was manually reviewed."),
        ("--allow-restricted-license", "Allow restricted licenses for personal-only experiments."),
    )
    for flag, help_text in confirmation_flags:
        cli.add_argument(flag, action="store_true", help=help_text)

    cli.add_argument("--write-report", type=Path)
    cli.add_argument("--json", action="store_true")
    options = cli.parse_args()

    result = evaluate_promotion(
        candidate_name=options.candidate_name,
        kind=options.kind,
        license_text=options.license_text,
        score_payload=load_json(options.score_json),
        same_sample=options.same_sample,
        runtime_ok=options.runtime_ok,
        privacy_ok=options.privacy_ok,
        human_reviewed=options.human_reviewed,
        allow_restricted_license=options.allow_restricted_license,
    )

    report_path = options.write_report
    if report_path:
        write_report(report_path, result)
        # Added after writing, so the report path appears only in console/JSON output.
        result["reportPath"] = str(report_path)

    if options.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        print(f"{result['candidate']}: {result['summary']}")
        for entry in result["checks"]:
            print(f"- {entry['status']} {entry['name']}: {entry['detail']}")

    if result["ready"]:
        return
    # Non-zero exit lets calling scripts treat a blocked promotion as a failure.
    raise SystemExit(1)


if __name__ == "__main__":
    main()