"""Backfill canonical per-judge keys into ``quality_scores`` JSON columns.

W13-B Phase 4 closure. Historical rows persisted before this wave have
``translation_scores`` keyed by the raw panel emissions (``bleu``,
``comet``, ``mqm``) and ``style_alignment_passes`` keyed by the short
``d1`` .. ``d8`` aliases. The W12 audit flagged these as missing the
canonical judge names (``mqm_llm``, ``d1_structural`` .. ``d8_duplicate_detection``).
This script walks every ``quality_scores`` row and, for each row,
derives the canonical name -> score/passed mapping from the existing
``translation_scores._judges`` dossier (W9-A smuggle). Rows that
predate the dossier emission fall back to mapping the short ``d1`` ..
``d8`` aliases and the raw ``mqm`` score; any column that still has no
canonical key afterwards is tagged with ``backfill_unknown=True`` so
consumers can distinguish "never had a dossier" from "dossier present,
canonical keys derived".
The script is idempotent: rows that already carry the canonical keys
are skipped (no rewrite). Safe to re-run.

Usage::

    .venv/bin/python scripts/backfill_qualityscore_json.py

Add ``--dry-run`` to preview the change set without writing.
"""
from __future__ import annotations
import argparse
import json
import sqlite3
import sys
from pathlib import Path
from typing import Any, Optional
# Canonical translation judge names. Order must match
# ``judges/panel.py::_MOCK_JUDGE_NAMES`` for the first three entries.
_TRANSLATION_CANONICAL: tuple[str, ...] = ("bleu", "comet", "mqm_llm")
# Short alias -> canonical full name for the 8 style judges.
_STYLE_CANONICAL_BY_SHORT: dict[str, str] = {
"d1": "d1_structural",
"d2": "d2_stylistic",
"d3": "d3_framing",
"d4": "d4_granularity",
"d5": "d5_resolution_clarity",
"d6": "d6_source_reliability",
"d7": "d7_leading_check",
"d8": "d8_duplicate_detection",
}
def _derive_canonical(
    translation_scores: dict[str, Any],
    style_alignment_passes: dict[str, Any],
) -> tuple[dict[str, Any], dict[str, Any], bool]:
    """Return enriched copies of both columns plus a ``changed`` flag.

    Neither input is mutated, and a key that already exists in a column is
    never overwritten. A payload that is not a ``dict`` is treated as an
    empty column instead of raising.

    Strategy:

    * Dossier present (``translation_scores["_judges"]`` is a non-empty
      list): each named entry is matched case-insensitively. Translation
      judges contribute their numeric ``score`` (legacy ``mqm`` is stored
      as ``mqm_llm``; an MQM score above 1 is divided by 100). Style
      judges contribute ``bool(passed)``; short ``d1``..``d8`` names are
      translated to their canonical names and names outside the canonical
      style set are ignored. The function returns right after the dossier
      pass, so dossier rows are never tagged ``backfill_unknown``.
    * No dossier: map short ``d1``..``d8`` aliases already stored in the
      style column to their canonical names, and synthesize ``mqm_llm``
      from the existing ``mqm`` dict's ``score`` (0-100 -> 0-1).
    * Last resort: stamp ``backfill_unknown`` on each column that still
      has no canonical key, so consumers know it could not be enriched.

    Returns:
        ``(translation_scores_copy, style_alignment_passes_copy, changed)``
        where ``changed`` is True when at least one key was added.
    """
    # Copy defensively. The guard has to run *before* ``dict(...)``:
    # calling ``dict`` on ``None`` or a scalar raises TypeError.
    ts_out: dict[str, Any] = (
        dict(translation_scores) if isinstance(translation_scores, dict) else {}
    )
    sap_out: dict[str, Any] = (
        dict(style_alignment_passes)
        if isinstance(style_alignment_passes, dict)
        else {}
    )
    changed = False

    dossier = ts_out.get("_judges")
    if isinstance(dossier, list) and dossier:
        canonical_style_names = set(_STYLE_CANONICAL_BY_SHORT.values())
        for judge in dossier:
            if not isinstance(judge, dict):
                continue
            raw_name = judge.get("name")
            if not isinstance(raw_name, str):
                continue
            # Legacy live-verify rows wrote uppercase names (BLEU/COMET/MQM).
            # Normalize to lowercase so the canonical-name lookup matches.
            name = raw_name.lower()
            # Legacy ``mqm`` (without the _llm suffix) maps onto the
            # canonical judge name.
            if name == "mqm":
                name = "mqm_llm"

            if name in _TRANSLATION_CANONICAL:
                raw_score = judge.get("score")
                if name not in ts_out and isinstance(raw_score, (int, float)):
                    score = float(raw_score)
                    # Legacy MQM rows stored 0-100 directly; map to 0-1 so
                    # all canonical translation scores share one scale.
                    if name == "mqm_llm" and score > 1:
                        score = score / 100.0
                    ts_out[name] = score
                    changed = True
                continue

            # Style judges: accept either the short alias or the canonical
            # name, but only write canonical keys. Anything else that merely
            # happens to start with "d" is not a style judge and is skipped.
            style_name = _STYLE_CANONICAL_BY_SHORT.get(name, name)
            if style_name in canonical_style_names and style_name not in sap_out:
                sap_out[style_name] = bool(judge.get("passed"))
                changed = True
        # Dossier rows stop here: the alias fallback and the unknown tag
        # below only apply to rows without a usable dossier.
        return ts_out, sap_out, changed

    # No dossier: try direct alias mapping for style passes.
    for short, full in _STYLE_CANONICAL_BY_SHORT.items():
        if short in sap_out and full not in sap_out:
            sap_out[full] = bool(sap_out[short])
            changed = True

    # mqm_llm fallback from the panel-emitted ``mqm`` dict.
    if "mqm_llm" not in ts_out:
        mqm_raw = ts_out.get("mqm")
        if isinstance(mqm_raw, dict):
            mqm_score = mqm_raw.get("score")
            if isinstance(mqm_score, (int, float)):
                # 0-100 -> 0-1 normalized; values <= 1 are kept as-is.
                normalized = (
                    float(mqm_score) / 100.0 if mqm_score > 1 else float(mqm_score)
                )
                ts_out["mqm_llm"] = normalized
                changed = True

    # If a column still has no canonical key, tag it so consumers can flag
    # the row instead of treating absence as "judge didn't run".
    has_any_canonical_ts = any(k in ts_out for k in _TRANSLATION_CANONICAL)
    has_any_canonical_sap = any(
        full in sap_out for full in _STYLE_CANONICAL_BY_SHORT.values()
    )
    if not has_any_canonical_ts and "backfill_unknown" not in ts_out:
        ts_out["backfill_unknown"] = True
        changed = True
    if not has_any_canonical_sap and "backfill_unknown" not in sap_out:
        sap_out["backfill_unknown"] = True
        changed = True
    return ts_out, sap_out, changed
def _row_already_canonical(
    ts: dict[str, Any], sap: dict[str, Any]
) -> bool:
    """Tell whether a row needs no backfill at all.

    True only when ``ts`` contains every canonical translation judge name
    and ``sap`` contains every canonical style judge name.
    """
    missing_translation = [
        judge for judge in _TRANSLATION_CANONICAL if judge not in ts
    ]
    missing_style = [
        judge
        for judge in _STYLE_CANONICAL_BY_SHORT.values()
        if judge not in sap
    ]
    return not missing_translation and not missing_style
def _load_json_object(raw: Any) -> dict[str, Any]:
    """Decode a JSON column value, returning ``{}`` unless it is a JSON object."""
    if not raw:
        return {}
    try:
        decoded = json.loads(raw)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError and undecodable bytes.
        return {}
    # ``null``, arrays and scalars are valid JSON but not a usable column
    # payload; treat them like an empty object so the helpers that expect
    # a dict do not raise.
    return decoded if isinstance(decoded, dict) else {}


def backfill(db_path: Path, dry_run: bool = False) -> dict[str, int]:
    """Walk every ``quality_scores`` row, derive canonical keys, write back.

    Args:
        db_path: Path to the SQLite database file.
        dry_run: When True, compute the statistics but issue no UPDATE.

    Returns:
        A stats dict with the keys ``total``, ``already_canonical``,
        ``updated``, ``tagged_unknown`` and ``no_change``. A changed row
        counts as ``tagged_unknown`` when either column carries
        ``backfill_unknown`` after derivation, otherwise as ``updated``.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(
            "SELECT event_id, translation_scores, style_alignment_passes "
            "FROM quality_scores"
        ).fetchall()

        stats = {
            "total": len(rows),
            "already_canonical": 0,
            "updated": 0,
            "tagged_unknown": 0,
            "no_change": 0,
        }
        # Parameter tuples already in UPDATE order: (ts_json, sap_json, event_id).
        updates: list[tuple[str, str, Any]] = []

        for event_id, ts_raw, sap_raw in rows:
            ts_obj = _load_json_object(ts_raw)
            sap_obj = _load_json_object(sap_raw)

            if _row_already_canonical(ts_obj, sap_obj):
                stats["already_canonical"] += 1
                continue

            new_ts, new_sap, changed = _derive_canonical(ts_obj, sap_obj)
            if not changed:
                stats["no_change"] += 1
                continue

            if new_ts.get("backfill_unknown") or new_sap.get("backfill_unknown"):
                stats["tagged_unknown"] += 1
            else:
                stats["updated"] += 1
            updates.append((json.dumps(new_ts), json.dumps(new_sap), event_id))

        if not dry_run and updates:
            # ``with conn`` commits on success and rolls back on error; it
            # does not close the connection, hence the ``finally`` below.
            with conn:
                conn.executemany(
                    "UPDATE quality_scores SET translation_scores = ?, "
                    "style_alignment_passes = ? WHERE event_id = ?",
                    updates,
                )
    finally:
        # Always release the connection, including when the SELECT, the
        # derivation or the UPDATE raises.
        conn.close()
    return stats
def _parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--db",
type=Path,
default=Path("polyglot_alpha.db"),
help="Path to the SQLite database (default: polyglot_alpha.db).",
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Audit and print stats without writing.",
)
return parser.parse_args(argv)
def main(argv: Optional[list[str]] = None) -> int:
    """Run the backfill from the command line and print a summary.

    Returns 1, after printing an error to stderr, when the database file
    does not exist; otherwise runs ``backfill`` and returns 0.
    """
    args = _parse_args(argv)

    # Guard clause: bail out before touching a database that is not there.
    if not args.db.exists():
        print(f"error: database not found: {args.db}", file=sys.stderr)
        return 1

    if args.dry_run:
        mode = "DRY RUN"
    else:
        mode = "APPLIED"

    stats = backfill(args.db, dry_run=args.dry_run)

    print(f"[{mode}] backfill_qualityscore_json on {args.db}")
    print(f" total rows: {stats['total']}")
    print(f" already canonical: {stats['already_canonical']}")
    print(f" updated (dossier): {stats['updated']}")
    print(f" tagged unknown: {stats['tagged_unknown']}")
    print(f" no-change: {stats['no_change']}")
    return 0
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())