"""Capture user feedback on answers — the self-improving flywheel. The MVP's core idea: a small shop owner can't audit SQL, but they CAN say "that's right" or "no, that's wrong — it should be X". Because the app already shows the exact query it ran, each thumbs-down + correction is a high-signal *candidate training example*: the question, the SQL the model actually produced, and what the human says is correct. These accumulate into exactly the data the next training round needs — notably the multi-hop-join / error-recovery cases the model is weakest on (see docs/guides/training_playbook.md). Dep-light, stdlib only: one JSON object per line (JSONL), append-only, so it is trivially greppable and feeds straight into scripts/generate_sft_data.py-style curation later. No model or heavy deps imported here. """ from __future__ import annotations import json import time from pathlib import Path from typing import Any # Runtime artifact (not source). Append-only; one record per feedback click. DEFAULT_FEEDBACK_PATH = Path("data/feedback/feedback.jsonl") _VERDICTS = ("up", "down") def record_feedback( *, question: str, dataset: str, shown_sql: str, result: str, verdict: str, correction: str = "", path: str | Path | None = None, ) -> dict[str, Any]: """Append one feedback record as a JSONL line; return the stored record. Args: question: The user's natural-language question. dataset: Which dataset/DB it was asked against. shown_sql: The SQL the model actually ran (the audit surface). result: The answer/result the user is reacting to. verdict: "up" (correct) or "down" (wrong). correction: Free-text "what it should be" (the training signal on a down-vote; optional on an up-vote). path: Override the JSONL location (tests pass a tmp path). Returns: The record dict (also written to disk). Raises: ValueError: if ``verdict`` is not "up"/"down". """ if verdict not in _VERDICTS: raise ValueError(f"verdict must be one of {_VERDICTS}, got {verdict!r}") correction = (correction or "").strip() record = { "ts": time.time(), "question": question, "dataset": dataset, "shown_sql": shown_sql, "result": result, "verdict": verdict, "correction": correction, # A down-vote WITH a correction is the gold: a labelled "the model said # X, the truth is Y" pair the next training round can learn from. "is_training_candidate": verdict == "down" and bool(correction), } out = Path(path) if path is not None else DEFAULT_FEEDBACK_PATH out.parent.mkdir(parents=True, exist_ok=True) with out.open("a", encoding="utf-8") as fh: fh.write(json.dumps(record, ensure_ascii=False) + "\n") return record def load_feedback(path: str | Path | None = None) -> list[dict[str, Any]]: """Read all feedback records (for inspection / the demo's flywheel counter).""" src = Path(path) if path is not None else DEFAULT_FEEDBACK_PATH if not src.exists(): return [] records: list[dict[str, Any]] = [] for line in src.read_text(encoding="utf-8").splitlines(): line = line.strip() if not line: continue # Skip a malformed/truncated line (e.g. an interrupted write) rather than # let one bad record break feedback_summary() inside a UI click handler. try: records.append(json.loads(line)) except json.JSONDecodeError: continue return records def feedback_summary(path: str | Path | None = None) -> dict[str, int]: """Counts for a "flywheel" status line: total, 👍, 👎, training candidates.""" records = load_feedback(path) return { "total": len(records), "up": sum(1 for r in records if r.get("verdict") == "up"), "down": sum(1 for r in records if r.get("verdict") == "down"), "training_candidates": sum( 1 for r in records if r.get("is_training_candidate") ), }