File size: 9,200 Bytes
88d2f2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
"""Backfill canonical per-judge keys into ``quality_scores`` JSON columns.

W13-B Phase 4 closure. Historical rows persisted before this wave have
``translation_scores`` keyed by the raw panel emissions (``bleu``,
``comet``, ``mqm``) and ``style_alignment_passes`` keyed by the short
``d1`` .. ``d8`` aliases. The W12 audit flagged these as missing the
canonical judge names (``mqm_llm``, ``d1_structural`` .. ``d8_duplicate_detection``).

This script walks every ``quality_scores`` row and, for each row,
derives the canonical name -> score/passed mapping from the existing
``translation_scores._judges`` dossier (W9-A smuggle). Rows that
predate the dossier emission are tagged with ``backfill_unknown=True``
under both columns so consumers can distinguish "never had a dossier"
from "dossier present, canonical keys derived".

The script is idempotent: rows that already carry the canonical keys
are skipped (no rewrite). Safe to re-run.

Usage::

    .venv/bin/python scripts/backfill_qualityscore_json.py

Add ``--dry-run`` to preview the change set without writing.
"""

from __future__ import annotations

import argparse
import json
import sqlite3
import sys
from pathlib import Path
from typing import Any, Optional

# Canonical translation judge names. Order must match
# ``judges/panel.py::_MOCK_JUDGE_NAMES`` for the first three entries.
_TRANSLATION_CANONICAL: tuple[str, ...] = ("bleu", "comet", "mqm_llm")

# Short alias -> canonical full name for the 8 style judges.
_STYLE_CANONICAL_BY_SHORT: dict[str, str] = {
    "d1": "d1_structural",
    "d2": "d2_stylistic",
    "d3": "d3_framing",
    "d4": "d4_granularity",
    "d5": "d5_resolution_clarity",
    "d6": "d6_source_reliability",
    "d7": "d7_leading_check",
    "d8": "d8_duplicate_detection",
}


def _derive_canonical(
    translation_scores: dict[str, Any],
    style_alignment_passes: dict[str, Any],
) -> tuple[dict[str, Any], dict[str, Any], bool]:
    """Return updated copies + a ``changed`` flag.

    Strategy:
      * Prefer the ``_judges`` dossier when present (carries the
        canonical names + normalized 0-1 ``score`` field).
      * Fallback: map short ``d1``..``d8`` aliases to their canonical
        names by hardcoded lookup; for ``mqm_llm`` synthesize from the
        existing ``mqm`` dict's ``score`` (0-100 -> 0-1).
      * Last-resort: stamp ``backfill_unknown`` so consumers know the
        row could not be enriched.
    """

    ts_out = dict(translation_scores)
    sap_out = dict(style_alignment_passes)
    changed = False

    dossier = translation_scores.get("_judges") if isinstance(
        translation_scores, dict
    ) else None

    if isinstance(dossier, list) and dossier:
        for j in dossier:
            if not isinstance(j, dict):
                continue
            raw_name = j.get("name")
            if not isinstance(raw_name, str):
                continue
            # Legacy live-verify rows wrote uppercase names (BLEU/COMET/MQM).
            # Normalize to lowercase so the canonical-name lookup matches.
            name_lower = raw_name.lower()
            # Map legacy ``mqm`` (without _llm suffix) onto the canonical
            # judge name. The dossier score is on the 0-1 scale (or 0-100 for
            # legacy rows that smuggled the raw MQM); both fall through to
            # the float conversion below — the consumer normalizes display.
            if name_lower == "mqm":
                name_lower = "mqm_llm"
            if name_lower in _TRANSLATION_CANONICAL:
                raw = j.get("score")
                if name_lower not in ts_out and isinstance(raw, (int, float)):
                    score = float(raw)
                    # Legacy uppercase MQM rows stored 0-100 directly; map
                    # to 0-1 so all canonical translation scores share scale.
                    if name_lower == "mqm_llm" and score > 1:
                        score = score / 100.0
                    ts_out[name_lower] = score
                    changed = True
            elif name_lower.startswith("d") and name_lower not in sap_out:
                sap_out[name_lower] = bool(j.get("passed"))
                changed = True
        return ts_out, sap_out, changed

    # No dossier — try direct alias mapping for style passes.
    for short, full in _STYLE_CANONICAL_BY_SHORT.items():
        if short in sap_out and full not in sap_out:
            sap_out[full] = bool(sap_out[short])
            changed = True

    # mqm_llm fallback from the panel-emitted ``mqm`` dict.
    if "mqm_llm" not in ts_out:
        mqm_raw = ts_out.get("mqm")
        if isinstance(mqm_raw, dict):
            mqm_score = mqm_raw.get("score")
            if isinstance(mqm_score, (int, float)):
                # 0-100 -> 0-1 normalized.
                normalized = float(mqm_score) / 100.0 if mqm_score > 1 else float(
                    mqm_score
                )
                ts_out["mqm_llm"] = normalized
                changed = True

    # If we still couldn't fill in any canonical translation key, tag
    # the row so consumers can flag it instead of treating absence as
    # "judge didn't run".
    has_any_canonical_ts = any(k in ts_out for k in _TRANSLATION_CANONICAL)
    has_any_canonical_sap = any(
        full in sap_out for full in _STYLE_CANONICAL_BY_SHORT.values()
    )
    if not has_any_canonical_ts and "backfill_unknown" not in ts_out:
        ts_out["backfill_unknown"] = True
        changed = True
    if not has_any_canonical_sap and "backfill_unknown" not in sap_out:
        sap_out["backfill_unknown"] = True
        changed = True

    return ts_out, sap_out, changed


def _row_already_canonical(
    ts: dict[str, Any], sap: dict[str, Any]
) -> bool:
    """Return True when both columns already carry canonical keys."""

    ts_has_all = all(k in ts for k in _TRANSLATION_CANONICAL)
    sap_has_all = all(
        full in sap for full in _STYLE_CANONICAL_BY_SHORT.values()
    )
    return ts_has_all and sap_has_all


def backfill(db_path: Path, dry_run: bool = False) -> dict[str, int]:
    """Walk every quality_scores row, derive canonical keys, write back.

    Returns a stats dict: ``{"total", "already_canonical", "updated",
    "tagged_unknown", "no_change"}``.
    """

    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(
            "SELECT event_id, translation_scores, style_alignment_passes "
            "FROM quality_scores"
        ).fetchall()
    finally:
        # we'll re-open for writes below
        pass

    stats = {
        "total": len(rows),
        "already_canonical": 0,
        "updated": 0,
        "tagged_unknown": 0,
        "no_change": 0,
    }

    updates: list[tuple[int, str, str]] = []
    for event_id, ts_raw, sap_raw in rows:
        try:
            ts_obj: dict[str, Any] = json.loads(ts_raw) if ts_raw else {}
        except (TypeError, json.JSONDecodeError):
            ts_obj = {}
        try:
            sap_obj: dict[str, Any] = json.loads(sap_raw) if sap_raw else {}
        except (TypeError, json.JSONDecodeError):
            sap_obj = {}

        if _row_already_canonical(ts_obj, sap_obj):
            stats["already_canonical"] += 1
            continue

        new_ts, new_sap, changed = _derive_canonical(ts_obj, sap_obj)
        if not changed:
            stats["no_change"] += 1
            continue

        if new_ts.get("backfill_unknown") or new_sap.get("backfill_unknown"):
            stats["tagged_unknown"] += 1
        else:
            stats["updated"] += 1

        updates.append(
            (event_id, json.dumps(new_ts), json.dumps(new_sap))
        )

    if not dry_run and updates:
        with conn:
            conn.executemany(
                "UPDATE quality_scores SET translation_scores = ?, "
                "style_alignment_passes = ? WHERE event_id = ?",
                [(ts, sap, eid) for eid, ts, sap in updates],
            )
    conn.close()
    return stats


def _parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--db",
        type=Path,
        default=Path("polyglot_alpha.db"),
        help="Path to the SQLite database (default: polyglot_alpha.db).",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Audit and print stats without writing.",
    )
    return parser.parse_args(argv)


def main(argv: Optional[list[str]] = None) -> int:
    args = _parse_args(argv)
    if not args.db.exists():
        print(f"error: database not found: {args.db}", file=sys.stderr)
        return 1

    stats = backfill(args.db, dry_run=args.dry_run)
    mode = "DRY RUN" if args.dry_run else "APPLIED"
    print(f"[{mode}] backfill_qualityscore_json on {args.db}")
    print(f"  total rows:         {stats['total']}")
    print(f"  already canonical:  {stats['already_canonical']}")
    print(f"  updated (dossier):  {stats['updated']}")
    print(f"  tagged unknown:     {stats['tagged_unknown']}")
    print(f"  no-change:          {stats['no_change']}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())