File size: 7,702 Bytes
2e1a095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
from __future__ import annotations

import argparse
import json
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any


# Lowercase markers that license_status() looks for in the normalized license
# text to recognize a permissive license. They are consulted only after the
# restricted markers below, so a restricted match always takes precedence.
PERMISSIVE_LICENSE_MARKERS = ("apache-2.0", "mit", "bsd", "openrail")
# Lowercase markers that license_status() treats as a restricted or unclear
# license. A match fails the license check unless the caller explicitly allows
# restricted licenses.
RESTRICTED_LICENSE_MARKERS = (
    "cc-by-nc",
    "non-commercial",
    "noncommercial",
    "llama",
    "gemma",
    "gpl",
    "license not declared",
    "not established",
    "check model card",
    "other",
)


@dataclass(frozen=True)
class GateCheck:
    """Immutable outcome of a single promotion-gate check."""
    name: str  # label of the check, e.g. "score" or "license"
    status: str  # "PASS", "FAIL", or "WARN" as emitted by the checks in this file
    detail: str  # human-readable explanation printed and written to the report


def load_json(path: Path) -> dict[str, Any]:
    """Read a UTF-8 JSON file and return its top-level object.

    Args:
        path: Location of the score JSON file.

    Returns:
        The decoded JSON object as a dictionary.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the document's top-level value is not a JSON object.
    """
    if path.exists():
        decoded = json.loads(path.read_text(encoding="utf-8"))
        if isinstance(decoded, dict):
            return decoded
        raise ValueError(f"Score JSON must contain an object: {path}")
    raise FileNotFoundError(f"Score JSON not found: {path}")


def _has_whole_word(text: str, marker: str) -> bool:
    """Return True when *marker* occurs in *text* with no letter on either side."""
    start = text.find(marker)
    while start != -1:
        end = start + len(marker)
        letter_before = start > 0 and text[start - 1].isalpha()
        letter_after = end < len(text) and text[end].isalpha()
        if not (letter_before or letter_after):
            return True
        start = text.find(marker, start + 1)
    return False


def license_status(license_text: str, *, allow_restricted: bool = False) -> GateCheck:
    """Classify a license string for the promotion gate.

    Args:
        license_text: License name or description supplied by the caller.
        allow_restricted: When True, restricted and unrecognized licenses
            pass instead of failing or warning.

    Returns:
        A ``GateCheck`` named ``"license"``: FAIL for an empty license, FAIL
        (or PASS when ``allow_restricted``) for a restricted marker, PASS for
        a permissive marker, otherwise WARN (or PASS when
        ``allow_restricted``).
    """
    normalized = license_text.strip().lower()
    if not normalized:
        return GateCheck("license", "FAIL", "missing license")
    # Restricted markers stay plain substring matches: a false positive here
    # makes the default gate stricter, which is the safe direction.
    if any(marker in normalized for marker in RESTRICTED_LICENSE_MARKERS):
        status = "PASS" if allow_restricted else "FAIL"
        return GateCheck("license", status, f"restricted or unclear: {license_text}")
    # Permissive markers must stand alone. A raw substring test would accept
    # "commercial use not permitted" or "limited" because they contain "mit".
    if any(_has_whole_word(normalized, marker) for marker in PERMISSIVE_LICENSE_MARKERS):
        return GateCheck("license", "PASS", license_text)
    status = "PASS" if allow_restricted else "WARN"
    return GateCheck("license", status, f"unrecognized license: {license_text}")


def score_status(payload: dict[str, Any], kind: str) -> GateCheck:
    """Turn a score report into the ``"score"`` gate check.

    Args:
        payload: Decoded score report; read for ``promotionReady``/``ready``,
            ``best`` and, for OCR, ``comparison``.
        kind: Candidate type. ``"ocr"`` additionally requires the report's
            ``comparison.beatsBaseline`` to be truthy.

    Returns:
        A FAIL check when the report is not promotion-ready (or, for OCR,
        does not beat the baseline); otherwise a PASS check summarising the
        best entry.
    """
    is_ocr = kind == "ocr"
    if not (payload.get("promotionReady") or payload.get("ready")):
        # The OCR message uses an upper-case label; other kinds echo ``kind``.
        subject = "OCR" if is_ocr else kind
        return GateCheck("score", "FAIL", f"{subject} score report is not promotion-ready")

    top = payload.get("best") or {}
    if is_ocr:
        versus = payload.get("comparison") or {}
        if versus.get("beatsBaseline"):
            summary = (
                f"best={top.get('label', '-')} "
                f"quality={top.get('quality', '-')} "
                f"delta={versus.get('scoreDelta', '-')}"
            )
            return GateCheck("score", "PASS", summary)
        return GateCheck("score", "FAIL", "OCR candidate does not beat the wired baseline")

    weighted = top.get("weightedScore", "-")
    winner = top.get("voiceId") or top.get("label") or "-"
    return GateCheck("score", "PASS", f"best={winner} weighted={weighted}")


def bool_check(name: str, ok: bool, detail: str) -> GateCheck:
    """Build a PASS check when ``ok`` is truthy, otherwise a FAIL check."""
    if ok:
        return GateCheck(name, "PASS", detail)
    return GateCheck(name, "FAIL", detail)


def evaluate_promotion(
    *,
    candidate_name: str,
    kind: str,
    license_text: str,
    score_payload: dict[str, Any],
    same_sample: bool,
    runtime_ok: bool,
    privacy_ok: bool,
    human_reviewed: bool,
    allow_restricted_license: bool = False,
) -> dict[str, Any]:
    """Run every gate check and summarise whether the candidate may be promoted.

    Args:
        candidate_name: Name echoed back under ``"candidate"``.
        kind: Candidate type, forwarded to the score check.
        license_text: License string forwarded to the license check.
        score_payload: Decoded score report forwarded to the score check.
        same_sample: Confirmation for the "same sample" check.
        runtime_ok: Confirmation for the "runtime" check.
        privacy_ok: Confirmation for the "privacy/deployment" check.
        human_reviewed: Confirmation for the "human review" check.
        allow_restricted_license: Forwarded to the license check.

    Returns:
        A dictionary with ``ready``, ``candidate``, ``kind``, ``checks`` (one
        plain dict per check) and ``summary``. ``ready`` is True only when no
        check is FAIL or WARN.
    """
    results = []
    results.append(score_status(score_payload, kind))
    results.append(
        bool_check("same sample", same_sample, "same pages/text used for baseline and candidate")
    )
    results.append(license_status(license_text, allow_restricted=allow_restricted_license))
    results.append(
        bool_check("runtime", runtime_ok, "worker can handle model size, speed, cold starts, and memory")
    )
    results.append(
        bool_check("privacy/deployment", privacy_ok, "no unsafe external upload path or unclear service/API terms")
    )
    results.append(
        bool_check("human review", human_reviewed, "Arabic text/audio manually reviewed for meaning, order, and comfort")
    )

    # Both failures and warnings keep the candidate out of promotion.
    blocking = ("FAIL", "WARN")
    is_ready = all(result.status not in blocking for result in results)
    if is_ready:
        verdict = "promotion ready"
    else:
        verdict = "keep benchmark-only"
    return {
        "ready": is_ready,
        "candidate": candidate_name,
        "kind": kind,
        "checks": [asdict(result) for result in results],
        "summary": verdict,
    }


def markdown_value(value: Any) -> str:
    """Render ``value`` as text, using ``"-"`` for ``None`` or an empty string."""
    is_blank = value is None or value == ""
    return "-" if is_blank else str(value)


def write_report(path: Path, payload: dict[str, Any]) -> None:
    """Write the gate decision as a Markdown report.

    Args:
        path: Destination file; missing parent directories are created.
        payload: Gate result; read for ``candidate``, ``kind``, ``summary``
            and ``checks`` (each check read for ``name``, ``status``,
            ``detail``).
    """
    header = [
        "# Model Promotion Gate",
        "",
        f"Candidate: {markdown_value(payload.get('candidate'))}",
        f"Type: {markdown_value(payload.get('kind'))}",
        f"Decision: {markdown_value(payload.get('summary'))}",
        "",
        "| Check | Status | Detail |",
        "| --- | --- | --- |",
    ]
    columns = ("name", "status", "detail")
    table_rows = [
        "| " + " | ".join(markdown_value(entry.get(column)) for column in columns) + " |"
        for entry in payload.get("checks", [])
    ]
    footer = [
        "",
        "## Rule",
        "",
        "Promote only when the candidate wins on the exact same Arabic sample, the license is acceptable, the worker can run it, privacy/deployment terms are clear, and a human review confirms the Arabic text or audio is comfortable and faithful.",
        "Anything else stays benchmark-only.",
    ]
    # Trim trailing whitespace, then end the file with exactly one newline.
    document = "\n".join(header + table_rows + footer).rstrip() + "\n"
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(document, encoding="utf-8")


def main() -> None:
    """Parse CLI arguments, evaluate the gate, and print the outcome.

    Optionally writes a Markdown report. Prints either the JSON payload or a
    short text summary.

    Raises:
        SystemExit: With exit code 1 when the candidate is not promotion-ready.
    """
    cli = argparse.ArgumentParser(description="Gate OCR/TTS candidates before promoting them into the Arabic audio reader stack.")
    cli.add_argument("--candidate-name", required=True)
    cli.add_argument("--kind", choices=["ocr", "tts", "preprocessor"], required=True)
    cli.add_argument("--license", required=True, dest="license_text")
    cli.add_argument("--score-json", type=Path, required=True, help="JSON output from score_external_ocr.py, score_voice_listening.py, or score_tts_preprocessor.py.")
    cli.add_argument("--same-sample", action="store_true", help="Confirm the candidate and baseline used the exact same pages/text.")
    cli.add_argument("--runtime-ok", action="store_true", help="Confirm the target worker can run the candidate acceptably.")
    cli.add_argument("--privacy-ok", action="store_true", help="Confirm external service/API or local deployment terms are acceptable.")
    cli.add_argument("--human-reviewed", action="store_true", help="Confirm Arabic text/audio was manually reviewed.")
    cli.add_argument("--allow-restricted-license", action="store_true", help="Allow restricted licenses for personal-only experiments.")
    cli.add_argument("--write-report", type=Path)
    cli.add_argument("--json", action="store_true")
    opts = cli.parse_args()

    result = evaluate_promotion(
        candidate_name=opts.candidate_name,
        kind=opts.kind,
        license_text=opts.license_text,
        score_payload=load_json(opts.score_json),
        same_sample=opts.same_sample,
        runtime_ok=opts.runtime_ok,
        privacy_ok=opts.privacy_ok,
        human_reviewed=opts.human_reviewed,
        allow_restricted_license=opts.allow_restricted_license,
    )

    report_path = opts.write_report
    if report_path:
        write_report(report_path, result)
        # Recorded after writing, so it appears only in the printed JSON.
        result["reportPath"] = str(report_path)

    if not opts.json:
        print(f"{result['candidate']}: {result['summary']}")
        for entry in result["checks"]:
            print(f"- {entry['status']} {entry['name']}: {entry['detail']}")
    else:
        print(json.dumps(result, ensure_ascii=False, indent=2))

    # A non-zero exit code lets callers treat "not ready" as a failure.
    if not result["ready"]:
        raise SystemExit(1)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()