Spaces:

messili
/

polyglot-alpha

Running

File size: 9,200 Bytes

88d2f2a

"""Backfill canonical per-judge keys into ``quality_scores`` JSON columns.

W13-B Phase 4 closure. Historical rows persisted before this wave have
``translation_scores`` keyed by the raw panel emissions (``bleu``,
``comet``, ``mqm``) and ``style_alignment_passes`` keyed by the short
``d1`` .. ``d8`` aliases. The W12 audit flagged these as missing the
canonical judge names (``mqm_llm``, ``d1_structural`` .. ``d8_duplicate_detection``).

This script walks every ``quality_scores`` row and, for each row,
derives the canonical name -> score/passed mapping from the existing
``translation_scores._judges`` dossier (W9-A smuggle). Rows that
predate the dossier emission are tagged with ``backfill_unknown=True``
in each column for which no canonical key could be derived, so
consumers can distinguish "never had a dossier" from "dossier present,
canonical keys derived".

The script is idempotent: rows that already carry the canonical keys
are skipped (no rewrite). Safe to re-run.

Usage::

    .venv/bin/python scripts/backfill_qualityscore_json.py

Add ``--dry-run`` to preview the change set without writing.
"""

from __future__ import annotations

import argparse
import json
import sqlite3
import sys
from pathlib import Path
from typing import Any, Optional

# Canonical translation judge names. Order must match
# ``judges/panel.py::_MOCK_JUDGE_NAMES`` for the first three entries.
# ``_derive_canonical`` treats a dossier entry as a translation judge only
# when its normalized name appears in this tuple, and
# ``_row_already_canonical`` requires every name here to be present.
_TRANSLATION_CANONICAL: tuple[str, ...] = ("bleu", "comet", "mqm_llm")

# Short alias -> canonical full name for the 8 style judges.
# Keys are the legacy ``d1`` .. ``d8`` aliases looked up in
# ``style_alignment_passes``; values are the canonical names that the
# backfill writes and that ``_row_already_canonical`` checks for.
_STYLE_CANONICAL_BY_SHORT: dict[str, str] = {
    "d1": "d1_structural",
    "d2": "d2_stylistic",
    "d3": "d3_framing",
    "d4": "d4_granularity",
    "d5": "d5_resolution_clarity",
    "d6": "d6_source_reliability",
    "d7": "d7_leading_check",
    "d8": "d8_duplicate_detection",
}


def _is_score(value: Any) -> bool:
    """Return True for real numeric scores.

    ``bool`` is a subclass of ``int`` in Python, so it is rejected
    explicitly: a JSON ``true``/``false`` is not a judge score.
    """

    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _derive_canonical(
    translation_scores: dict[str, Any],
    style_alignment_passes: dict[str, Any],
) -> tuple[dict[str, Any], dict[str, Any], bool]:
    """Return updated copies of both columns plus a ``changed`` flag.

    The inputs are never mutated; shallow copies are returned. A value
    that is not a ``dict`` (e.g. a column that decoded to ``null`` or a
    list) is treated as an empty mapping instead of raising.

    Strategy, in order of precedence (later steps only fill keys that
    are still missing):
      * Prefer the ``_judges`` dossier when present. Judge names are
        lower-cased, legacy ``mqm`` maps to ``mqm_llm``, and short
        ``d1``..``d8`` names map to their canonical style names. Dossier
        entries whose name is neither a canonical translation judge nor
        a known style judge are ignored.
      * Fallback: map short ``d1``..``d8`` aliases already stored in
        ``style_alignment_passes`` to their canonical names; for
        ``mqm_llm`` synthesize from the existing ``mqm`` dict's
        ``score`` (values above 1 are treated as 0-100 and divided
        by 100).
      * Last-resort (rows without a dossier only): stamp
        ``backfill_unknown`` on each column that still has no canonical
        key so consumers know the row could not be enriched.

    Args:
        translation_scores: Decoded ``translation_scores`` column.
        style_alignment_passes: Decoded ``style_alignment_passes`` column.

    Returns:
        ``(translation_scores_out, style_alignment_passes_out, changed)``
        where ``changed`` is True when at least one key was added.
    """

    ts_out: dict[str, Any] = (
        dict(translation_scores) if isinstance(translation_scores, dict) else {}
    )
    sap_out: dict[str, Any] = (
        dict(style_alignment_passes)
        if isinstance(style_alignment_passes, dict)
        else {}
    )
    changed = False

    style_canonical = set(_STYLE_CANONICAL_BY_SHORT.values())
    dossier = ts_out.get("_judges")
    has_dossier = isinstance(dossier, list) and bool(dossier)

    if has_dossier:
        for judge in dossier:
            if not isinstance(judge, dict):
                continue
            raw_name = judge.get("name")
            if not isinstance(raw_name, str):
                continue
            # Legacy live-verify rows wrote uppercase names (BLEU/COMET/MQM).
            name = raw_name.lower()
            # Legacy ``mqm`` (without the _llm suffix) is the same judge.
            if name == "mqm":
                name = "mqm_llm"

            if name in _TRANSLATION_CANONICAL:
                raw_score = judge.get("score")
                if name not in ts_out and _is_score(raw_score):
                    score = float(raw_score)
                    # Legacy MQM rows stored 0-100 directly; map to 0-1 so
                    # all canonical translation scores share one scale.
                    if name == "mqm_llm" and score > 1:
                        score /= 100.0
                    ts_out[name] = score
                    changed = True
                continue

            # Style judges: accept the canonical name or its short alias.
            # Anything else (even if it starts with "d") is not a style
            # judge and must not leak into ``style_alignment_passes``.
            style_name = _STYLE_CANONICAL_BY_SHORT.get(name, name)
            if style_name in style_canonical and style_name not in sap_out:
                sap_out[style_name] = bool(judge.get("passed"))
                changed = True

    # Direct alias mapping for style passes the dossier did not supply.
    for short, full in _STYLE_CANONICAL_BY_SHORT.items():
        if short in sap_out and full not in sap_out:
            sap_out[full] = bool(sap_out[short])
            changed = True

    # mqm_llm fallback from the panel-emitted ``mqm`` dict.
    if "mqm_llm" not in ts_out:
        mqm_raw = ts_out.get("mqm")
        if isinstance(mqm_raw, dict):
            mqm_score = mqm_raw.get("score")
            if _is_score(mqm_score):
                value = float(mqm_score)
                # 0-100 -> 0-1 normalized.
                ts_out["mqm_llm"] = value / 100.0 if value > 1 else value
                changed = True

    # ``backfill_unknown`` means "never had a dossier"; rows that carry a
    # dossier are returned as-is even when nothing could be derived.
    if has_dossier:
        return ts_out, sap_out, changed

    # If we still couldn't fill in any canonical key, tag the column so
    # consumers can flag it instead of treating absence as
    # "judge didn't run".
    has_any_canonical_ts = any(k in ts_out for k in _TRANSLATION_CANONICAL)
    has_any_canonical_sap = any(full in sap_out for full in style_canonical)
    if not has_any_canonical_ts and "backfill_unknown" not in ts_out:
        ts_out["backfill_unknown"] = True
        changed = True
    if not has_any_canonical_sap and "backfill_unknown" not in sap_out:
        sap_out["backfill_unknown"] = True
        changed = True

    return ts_out, sap_out, changed


def _row_already_canonical(
    ts: dict[str, Any], sap: dict[str, Any]
) -> bool:
    """Tell whether a row needs no backfill at all.

    A row counts as canonical only when ``ts`` contains every name in
    ``_TRANSLATION_CANONICAL`` and ``sap`` contains every canonical style
    name (the values of ``_STYLE_CANONICAL_BY_SHORT``).
    """

    missing_translation = [
        name for name in _TRANSLATION_CANONICAL if name not in ts
    ]
    missing_style = [
        canonical
        for canonical in _STYLE_CANONICAL_BY_SHORT.values()
        if canonical not in sap
    ]
    return not missing_translation and not missing_style


def _load_json_object(raw: Any) -> dict[str, Any]:
    """Decode one JSON column value into a dict, falling back to ``{}``.

    Empty/NULL values, undecodable payloads, and valid JSON that is not
    an object (``null``, lists, numbers, strings) all yield ``{}`` so the
    downstream key checks always operate on a mapping.
    """

    if not raw:
        return {}
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):
        # ``json.JSONDecodeError`` is a ``ValueError`` subclass.
        return {}
    return parsed if isinstance(parsed, dict) else {}


def backfill(db_path: Path, dry_run: bool = False) -> dict[str, int]:
    """Walk every quality_scores row, derive canonical keys, write back.

    Args:
        db_path: Path to the SQLite database file.
        dry_run: When True, compute the stats but do not issue any
            ``UPDATE``.

    Returns:
        A stats dict: ``{"total", "already_canonical", "updated",
        "tagged_unknown", "no_change"}``.

    Raises:
        sqlite3.Error: Propagated from the SELECT/UPDATE statements. The
            connection is closed before the exception leaves this function.
    """

    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(
            "SELECT event_id, translation_scores, style_alignment_passes "
            "FROM quality_scores"
        ).fetchall()

        stats = {
            "total": len(rows),
            "already_canonical": 0,
            "updated": 0,
            "tagged_unknown": 0,
            "no_change": 0,
        }

        # Parameter tuples are built directly in the UPDATE's placeholder
        # order: (translation_scores, style_alignment_passes, event_id).
        updates: list[tuple[str, str, Any]] = []
        for event_id, ts_raw, sap_raw in rows:
            ts_obj = _load_json_object(ts_raw)
            sap_obj = _load_json_object(sap_raw)

            if _row_already_canonical(ts_obj, sap_obj):
                stats["already_canonical"] += 1
                continue

            new_ts, new_sap, changed = _derive_canonical(ts_obj, sap_obj)
            if not changed:
                stats["no_change"] += 1
                continue

            if new_ts.get("backfill_unknown") or new_sap.get("backfill_unknown"):
                stats["tagged_unknown"] += 1
            else:
                stats["updated"] += 1

            updates.append((json.dumps(new_ts), json.dumps(new_sap), event_id))

        if updates and not dry_run:
            # ``with conn`` commits on success and rolls back on error; it
            # does not close the connection, hence the ``finally`` below.
            with conn:
                conn.executemany(
                    "UPDATE quality_scores SET translation_scores = ?, "
                    "style_alignment_passes = ? WHERE event_id = ?",
                    updates,
                )
    finally:
        conn.close()
    return stats


def _parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace:
    """Parse the command line for the backfill script.

    Recognized options are ``--db`` (a ``Path``, defaulting to
    ``polyglot_alpha.db``) and the ``--dry-run`` flag. ``argv`` is passed
    straight to ``ArgumentParser.parse_args``; ``None`` lets argparse
    read ``sys.argv``.
    """

    cli = argparse.ArgumentParser(description=__doc__)

    db_option = {
        "type": Path,
        "default": Path("polyglot_alpha.db"),
        "help": "Path to the SQLite database (default: polyglot_alpha.db).",
    }
    cli.add_argument("--db", **db_option)

    dry_run_option = {
        "action": "store_true",
        "help": "Audit and print stats without writing.",
    }
    cli.add_argument("--dry-run", **dry_run_option)

    namespace = cli.parse_args(argv)
    return namespace


def main(argv: Optional[list[str]] = None) -> int:
    """Run the backfill from the command line and report the stats.

    Returns ``1`` (after printing an error to stderr) when the database
    path does not exist, otherwise ``0`` once the stats summary has been
    printed to stdout.
    """

    args = _parse_args(argv)

    database_present = args.db.exists()
    if not database_present:
        print(f"error: database not found: {args.db}", file=sys.stderr)
        return 1

    stats = backfill(args.db, dry_run=args.dry_run)

    if args.dry_run:
        mode = "DRY RUN"
    else:
        mode = "APPLIED"

    report = (
        f"[{mode}] backfill_qualityscore_json on {args.db}",
        f"  total rows:         {stats['total']}",
        f"  already canonical:  {stats['already_canonical']}",
        f"  updated (dossier):  {stats['updated']}",
        f"  tagged unknown:     {stats['tagged_unknown']}",
        f"  no-change:          {stats['no_change']}",
    )
    for line in report:
        print(line)
    return 0


# Script entry point: run ``main`` and use its integer return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())