"""Backfill canonical per-judge keys into ``quality_scores`` JSON columns. W13-B Phase 4 closure. Historical rows persisted before this wave have ``translation_scores`` keyed by the raw panel emissions (``bleu``, ``comet``, ``mqm``) and ``style_alignment_passes`` keyed by the short ``d1`` .. ``d8`` aliases. The W12 audit flagged these as missing the canonical judge names (``mqm_llm``, ``d1_structural`` .. ``d8_duplicate_detection``). This script walks every ``quality_scores`` row and, for each row, derives the canonical name -> score/passed mapping from the existing ``translation_scores._judges`` dossier (W9-A smuggle). Rows that predate the dossier emission are tagged with ``backfill_unknown=True`` under both columns so consumers can distinguish "never had a dossier" from "dossier present, canonical keys derived". The script is idempotent: rows that already carry the canonical keys are skipped (no rewrite). Safe to re-run. Usage:: .venv/bin/python scripts/backfill_qualityscore_json.py Add ``--dry-run`` to preview the change set without writing. """ from __future__ import annotations import argparse import json import sqlite3 import sys from pathlib import Path from typing import Any, Optional # Canonical translation judge names. Order must match # ``judges/panel.py::_MOCK_JUDGE_NAMES`` for the first three entries. _TRANSLATION_CANONICAL: tuple[str, ...] = ("bleu", "comet", "mqm_llm") # Short alias -> canonical full name for the 8 style judges. _STYLE_CANONICAL_BY_SHORT: dict[str, str] = { "d1": "d1_structural", "d2": "d2_stylistic", "d3": "d3_framing", "d4": "d4_granularity", "d5": "d5_resolution_clarity", "d6": "d6_source_reliability", "d7": "d7_leading_check", "d8": "d8_duplicate_detection", } def _derive_canonical( translation_scores: dict[str, Any], style_alignment_passes: dict[str, Any], ) -> tuple[dict[str, Any], dict[str, Any], bool]: """Return updated copies + a ``changed`` flag. Strategy: * Prefer the ``_judges`` dossier when present (carries the canonical names + normalized 0-1 ``score`` field). * Fallback: map short ``d1``..``d8`` aliases to their canonical names by hardcoded lookup; for ``mqm_llm`` synthesize from the existing ``mqm`` dict's ``score`` (0-100 -> 0-1). * Last-resort: stamp ``backfill_unknown`` so consumers know the row could not be enriched. """ ts_out = dict(translation_scores) sap_out = dict(style_alignment_passes) changed = False dossier = translation_scores.get("_judges") if isinstance( translation_scores, dict ) else None if isinstance(dossier, list) and dossier: for j in dossier: if not isinstance(j, dict): continue raw_name = j.get("name") if not isinstance(raw_name, str): continue # Legacy live-verify rows wrote uppercase names (BLEU/COMET/MQM). # Normalize to lowercase so the canonical-name lookup matches. name_lower = raw_name.lower() # Map legacy ``mqm`` (without _llm suffix) onto the canonical # judge name. The dossier score is on the 0-1 scale (or 0-100 for # legacy rows that smuggled the raw MQM); both fall through to # the float conversion below — the consumer normalizes display. if name_lower == "mqm": name_lower = "mqm_llm" if name_lower in _TRANSLATION_CANONICAL: raw = j.get("score") if name_lower not in ts_out and isinstance(raw, (int, float)): score = float(raw) # Legacy uppercase MQM rows stored 0-100 directly; map # to 0-1 so all canonical translation scores share scale. if name_lower == "mqm_llm" and score > 1: score = score / 100.0 ts_out[name_lower] = score changed = True elif name_lower.startswith("d") and name_lower not in sap_out: sap_out[name_lower] = bool(j.get("passed")) changed = True return ts_out, sap_out, changed # No dossier — try direct alias mapping for style passes. for short, full in _STYLE_CANONICAL_BY_SHORT.items(): if short in sap_out and full not in sap_out: sap_out[full] = bool(sap_out[short]) changed = True # mqm_llm fallback from the panel-emitted ``mqm`` dict. if "mqm_llm" not in ts_out: mqm_raw = ts_out.get("mqm") if isinstance(mqm_raw, dict): mqm_score = mqm_raw.get("score") if isinstance(mqm_score, (int, float)): # 0-100 -> 0-1 normalized. normalized = float(mqm_score) / 100.0 if mqm_score > 1 else float( mqm_score ) ts_out["mqm_llm"] = normalized changed = True # If we still couldn't fill in any canonical translation key, tag # the row so consumers can flag it instead of treating absence as # "judge didn't run". has_any_canonical_ts = any(k in ts_out for k in _TRANSLATION_CANONICAL) has_any_canonical_sap = any( full in sap_out for full in _STYLE_CANONICAL_BY_SHORT.values() ) if not has_any_canonical_ts and "backfill_unknown" not in ts_out: ts_out["backfill_unknown"] = True changed = True if not has_any_canonical_sap and "backfill_unknown" not in sap_out: sap_out["backfill_unknown"] = True changed = True return ts_out, sap_out, changed def _row_already_canonical( ts: dict[str, Any], sap: dict[str, Any] ) -> bool: """Return True when both columns already carry canonical keys.""" ts_has_all = all(k in ts for k in _TRANSLATION_CANONICAL) sap_has_all = all( full in sap for full in _STYLE_CANONICAL_BY_SHORT.values() ) return ts_has_all and sap_has_all def backfill(db_path: Path, dry_run: bool = False) -> dict[str, int]: """Walk every quality_scores row, derive canonical keys, write back. Returns a stats dict: ``{"total", "already_canonical", "updated", "tagged_unknown", "no_change"}``. """ conn = sqlite3.connect(str(db_path)) try: rows = conn.execute( "SELECT event_id, translation_scores, style_alignment_passes " "FROM quality_scores" ).fetchall() finally: # we'll re-open for writes below pass stats = { "total": len(rows), "already_canonical": 0, "updated": 0, "tagged_unknown": 0, "no_change": 0, } updates: list[tuple[int, str, str]] = [] for event_id, ts_raw, sap_raw in rows: try: ts_obj: dict[str, Any] = json.loads(ts_raw) if ts_raw else {} except (TypeError, json.JSONDecodeError): ts_obj = {} try: sap_obj: dict[str, Any] = json.loads(sap_raw) if sap_raw else {} except (TypeError, json.JSONDecodeError): sap_obj = {} if _row_already_canonical(ts_obj, sap_obj): stats["already_canonical"] += 1 continue new_ts, new_sap, changed = _derive_canonical(ts_obj, sap_obj) if not changed: stats["no_change"] += 1 continue if new_ts.get("backfill_unknown") or new_sap.get("backfill_unknown"): stats["tagged_unknown"] += 1 else: stats["updated"] += 1 updates.append( (event_id, json.dumps(new_ts), json.dumps(new_sap)) ) if not dry_run and updates: with conn: conn.executemany( "UPDATE quality_scores SET translation_scores = ?, " "style_alignment_passes = ? WHERE event_id = ?", [(ts, sap, eid) for eid, ts, sap in updates], ) conn.close() return stats def _parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--db", type=Path, default=Path("polyglot_alpha.db"), help="Path to the SQLite database (default: polyglot_alpha.db).", ) parser.add_argument( "--dry-run", action="store_true", help="Audit and print stats without writing.", ) return parser.parse_args(argv) def main(argv: Optional[list[str]] = None) -> int: args = _parse_args(argv) if not args.db.exists(): print(f"error: database not found: {args.db}", file=sys.stderr) return 1 stats = backfill(args.db, dry_run=args.dry_run) mode = "DRY RUN" if args.dry_run else "APPLIED" print(f"[{mode}] backfill_qualityscore_json on {args.db}") print(f" total rows: {stats['total']}") print(f" already canonical: {stats['already_canonical']}") print(f" updated (dossier): {stats['updated']}") print(f" tagged unknown: {stats['tagged_unknown']}") print(f" no-change: {stats['no_change']}") return 0 if __name__ == "__main__": raise SystemExit(main())