polyglot-alpha / scripts /backfill_qualityscore_json.py
licaomeng
deploy: main@8970ffb → HF Spaces (2026-05-27T05:19Z)
88d2f2a
"""Backfill canonical per-judge keys into ``quality_scores`` JSON columns.
W13-B Phase 4 closure. Historical rows persisted before this wave have
``translation_scores`` keyed by the raw panel emissions (``bleu``,
``comet``, ``mqm``) and ``style_alignment_passes`` keyed by the short
``d1`` .. ``d8`` aliases. The W12 audit flagged these as missing the
canonical judge names (``mqm_llm``, ``d1_structural`` .. ``d8_duplicate_detection``).
This script walks every ``quality_scores`` row and, for each row,
derives the canonical name -> score/passed mapping from the existing
``translation_scores._judges`` dossier (W9-A smuggle). Rows that
predate the dossier emission fall back to alias mapping; any column
for which no canonical key can be derived is then tagged with
``backfill_unknown=True`` so consumers can distinguish "never had a
dossier" from "dossier present, canonical keys derived".
The script is idempotent: rows that already carry the canonical keys
are skipped (no rewrite). Safe to re-run.
Usage::
.venv/bin/python scripts/backfill_qualityscore_json.py
Add ``--dry-run`` to preview the change set without writing.
"""
from __future__ import annotations
import argparse
import json
import sqlite3
import sys
from pathlib import Path
from typing import Any, Optional
# Canonical names of the translation judges. The first three entries must
# stay in the same order as ``judges/panel.py::_MOCK_JUDGE_NAMES``.
_TRANSLATION_CANONICAL: tuple[str, ...] = ("bleu", "comet", "mqm_llm")

# Lookup from the short style-judge alias (``d1`` .. ``d8``) to its canonical
# full name. The alias is the part of the canonical name before the first
# underscore, so the table is derived from the list of canonical names.
_STYLE_CANONICAL_BY_SHORT: dict[str, str] = {
    canonical.partition("_")[0]: canonical
    for canonical in (
        "d1_structural",
        "d2_stylistic",
        "d3_framing",
        "d4_granularity",
        "d5_resolution_clarity",
        "d6_source_reliability",
        "d7_leading_check",
        "d8_duplicate_detection",
    )
}
def _is_number(value: Any) -> bool:
    """Return True for a real ``int``/``float``; ``bool`` does not count."""
    # ``bool`` is a subclass of ``int``; without this exclusion a JSON
    # ``true`` would be recorded as the score 1.0.
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _normalize_mqm(score: float) -> float:
    """Return an MQM score on the 0-1 scale.

    Values greater than 1 are treated as legacy 0-100 scores and divided
    by 100; anything else is returned unchanged (as a float).
    """
    value = float(score)
    return value / 100.0 if value > 1 else value


def _derive_canonical(
    translation_scores: dict[str, Any],
    style_alignment_passes: dict[str, Any],
) -> tuple[dict[str, Any], dict[str, Any], bool]:
    """Return enriched copies of both columns plus a ``changed`` flag.

    The inputs are never mutated. An input that is not a dict is treated
    as an empty column rather than raising.

    Strategy:

    * When ``translation_scores["_judges"]`` is a non-empty list, it is
      the only source used. Each entry's ``name`` is lower-cased, legacy
      ``mqm`` is renamed to ``mqm_llm`` and short ``d1``..``d8`` aliases
      are expanded to their canonical names. Translation judges
      contribute a float ``score`` (``mqm_llm`` normalized to 0-1);
      style judges contribute ``bool(passed)``. Names that are neither a
      canonical translation judge nor a canonical style judge are
      ignored. The function returns right after this step.
    * Without a dossier, short ``d1``..``d8`` keys already present in
      the style column are copied to their canonical names, and
      ``mqm_llm`` is synthesized from the ``mqm`` dict's ``score``.
    * Still without a dossier, a column that ends up with no canonical
      key at all is stamped with ``backfill_unknown = True``.

    Existing keys are never overwritten, so calling the function on its
    own output reports ``changed == False``.

    Args:
        translation_scores: decoded ``translation_scores`` column.
        style_alignment_passes: decoded ``style_alignment_passes`` column.

    Returns:
        ``(translation_scores_copy, style_alignment_passes_copy, changed)``.
    """
    # Validate before copying: ``dict(x)`` on a non-mapping either raises
    # or silently builds a dict from pairs, neither of which is wanted.
    ts_out: dict[str, Any] = (
        dict(translation_scores) if isinstance(translation_scores, dict) else {}
    )
    sap_out: dict[str, Any] = (
        dict(style_alignment_passes)
        if isinstance(style_alignment_passes, dict)
        else {}
    )
    changed = False

    dossier = ts_out.get("_judges")
    if isinstance(dossier, list) and dossier:
        style_names = set(_STYLE_CANONICAL_BY_SHORT.values())
        for judge in dossier:
            if not isinstance(judge, dict):
                continue
            raw_name = judge.get("name")
            if not isinstance(raw_name, str):
                continue
            # Legacy live-verify rows wrote uppercase names (BLEU/COMET/MQM).
            name = raw_name.lower()
            if name == "mqm":
                name = "mqm_llm"
            # Expand a short style alias (``d1``) to its canonical name.
            name = _STYLE_CANONICAL_BY_SHORT.get(name, name)

            if name in _TRANSLATION_CANONICAL:
                raw_score = judge.get("score")
                if name not in ts_out and _is_number(raw_score):
                    score = float(raw_score)
                    if name == "mqm_llm":
                        # Legacy rows stored raw 0-100 MQM; keep every
                        # canonical translation score on the 0-1 scale.
                        score = _normalize_mqm(score)
                    ts_out[name] = score
                    changed = True
            elif name in style_names and name not in sap_out:
                # Only known style judges are accepted: a bare "starts
                # with d" test would let unrelated judge names leak into
                # the style column.
                sap_out[name] = bool(judge.get("passed"))
                changed = True
        return ts_out, sap_out, changed

    # No dossier: expand short style aliases that are already in the column.
    for short, full in _STYLE_CANONICAL_BY_SHORT.items():
        if short in sap_out and full not in sap_out:
            sap_out[full] = bool(sap_out[short])
            changed = True

    # ``mqm_llm`` fallback from the panel-emitted ``mqm`` dict.
    if "mqm_llm" not in ts_out:
        legacy_mqm = ts_out.get("mqm")
        if isinstance(legacy_mqm, dict) and _is_number(legacy_mqm.get("score")):
            ts_out["mqm_llm"] = _normalize_mqm(legacy_mqm["score"])
            changed = True

    # If a column still has no canonical key, tag it so consumers can
    # flag the row instead of reading absence as "judge didn't run".
    has_canonical_ts = any(key in ts_out for key in _TRANSLATION_CANONICAL)
    has_canonical_sap = any(
        full in sap_out for full in _STYLE_CANONICAL_BY_SHORT.values()
    )
    if not has_canonical_ts and "backfill_unknown" not in ts_out:
        ts_out["backfill_unknown"] = True
        changed = True
    if not has_canonical_sap and "backfill_unknown" not in sap_out:
        sap_out["backfill_unknown"] = True
        changed = True
    return ts_out, sap_out, changed
def _row_already_canonical(
    ts: dict[str, Any], sap: dict[str, Any]
) -> bool:
    """Tell whether a row needs no backfill at all.

    A row qualifies only when ``ts`` holds every canonical translation
    judge name and ``sap`` holds every canonical style judge name.
    """
    absent_translation = [
        name for name in _TRANSLATION_CANONICAL if name not in ts
    ]
    absent_style = [
        name for name in _STYLE_CANONICAL_BY_SHORT.values() if name not in sap
    ]
    return not absent_translation and not absent_style
def _load_json_object(raw: Any) -> dict[str, Any]:
    """Decode a JSON column value into a dict.

    Returns ``{}`` when the value is empty, cannot be decoded, or decodes
    to something other than a JSON object (``null``, a number, a list...).
    """
    if not raw:
        return {}
    try:
        decoded = json.loads(raw)
    except (TypeError, ValueError):
        # ``json.JSONDecodeError`` is a ``ValueError``; so is the
        # ``UnicodeDecodeError`` raised for undecodable bytes.
        return {}
    return decoded if isinstance(decoded, dict) else {}


def backfill(db_path: Path, dry_run: bool = False) -> dict[str, int]:
    """Walk every ``quality_scores`` row, derive canonical keys, write back.

    All rows are read first; rows whose derived columns differ from the
    stored ones are then rewritten in a single transaction. The
    connection is closed on every exit path, including errors.

    Args:
        db_path: path to the SQLite database file.
        dry_run: when True, compute the statistics but write nothing.

    Returns:
        A stats dict with the keys ``total``, ``already_canonical``,
        ``updated``, ``tagged_unknown`` and ``no_change``. A changed row
        counts as ``tagged_unknown`` when either resulting column carries
        a truthy ``backfill_unknown``, otherwise as ``updated``.

    Raises:
        sqlite3.Error: propagated from the SELECT or the UPDATE.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(
            "SELECT event_id, translation_scores, style_alignment_passes "
            "FROM quality_scores"
        ).fetchall()

        stats = {
            "total": len(rows),
            "already_canonical": 0,
            "updated": 0,
            "tagged_unknown": 0,
            "no_change": 0,
        }
        # Parameter tuples in the order the UPDATE statement expects:
        # (translation_scores, style_alignment_passes, event_id).
        updates: list[tuple[str, str, Any]] = []

        for event_id, ts_raw, sap_raw in rows:
            ts_obj = _load_json_object(ts_raw)
            sap_obj = _load_json_object(sap_raw)

            if _row_already_canonical(ts_obj, sap_obj):
                stats["already_canonical"] += 1
                continue

            new_ts, new_sap, changed = _derive_canonical(ts_obj, sap_obj)
            if not changed:
                stats["no_change"] += 1
                continue

            if new_ts.get("backfill_unknown") or new_sap.get("backfill_unknown"):
                stats["tagged_unknown"] += 1
            else:
                stats["updated"] += 1
            updates.append((json.dumps(new_ts), json.dumps(new_sap), event_id))

        if not dry_run and updates:
            # ``with conn`` commits on success and rolls back on error; it
            # does not close the connection, hence the ``finally`` below.
            with conn:
                conn.executemany(
                    "UPDATE quality_scores SET translation_scores = ?, "
                    "style_alignment_passes = ? WHERE event_id = ?",
                    updates,
                )
    finally:
        conn.close()
    return stats
def _parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace:
    """Parse the command line.

    Recognized options are ``--db`` (a ``Path``, defaulting to
    ``polyglot_alpha.db``) and the ``--dry-run`` flag. ``argv`` is handed
    to ``ArgumentParser.parse_args`` unchanged.
    """
    cli = argparse.ArgumentParser(description=__doc__)

    db_options = {
        "type": Path,
        "default": Path("polyglot_alpha.db"),
        "help": "Path to the SQLite database (default: polyglot_alpha.db).",
    }
    cli.add_argument("--db", **db_options)

    dry_run_options = {
        "action": "store_true",
        "help": "Audit and print stats without writing.",
    }
    cli.add_argument("--dry-run", **dry_run_options)

    namespace = cli.parse_args(argv)
    return namespace
def main(argv: Optional[list[str]] = None) -> int:
    """Run the backfill from the command line and print a summary.

    Returns 1, after writing an error to stderr, when the database file
    does not exist; otherwise runs ``backfill`` and returns 0.
    """
    options = _parse_args(argv)
    database = options.db
    if not database.exists():
        print(f"error: database not found: {database}", file=sys.stderr)
        return 1

    counts = backfill(database, dry_run=options.dry_run)

    if options.dry_run:
        mode = "DRY RUN"
    else:
        mode = "APPLIED"

    report = (
        f"[{mode}] backfill_qualityscore_json on {database}",
        f" total rows: {counts['total']}",
        f" already canonical: {counts['already_canonical']}",
        f" updated (dossier): {counts['updated']}",
        f" tagged unknown: {counts['tagged_unknown']}",
        f" no-change: {counts['no_change']}",
    )
    for line in report:
        print(line)
    return 0
# Script entry point: the process exit status is whatever ``main`` returns.
if __name__ == "__main__":
    sys.exit(main())