analyst-buddy / server /feedback.py
hjerpe's picture
F006/F008: serve Qwen models + model switcher (vanilla-first)
656f91e verified
Raw
History Blame Contribute Delete
4.11 kB
"""Capture user feedback on answers — the self-improving flywheel.
The MVP's core idea: a small shop owner can't audit SQL, but they CAN say
"that's right" or "no, that's wrong — it should be X". Because the app already
shows the exact query it ran, each thumbs-down + correction is a high-signal
*candidate training example*: the question, the SQL the model actually produced,
and what the human says is correct. These accumulate into exactly the data the
next training round needs β€” notably the multi-hop-join / error-recovery cases
the model is weakest on (see docs/guides/training_playbook.md).
Dep-light, stdlib only: one JSON object per line (JSONL), append-only, so it is
trivially greppable and feeds straight into scripts/generate_sft_data.py-style
curation later. No model or heavy deps imported here.
"""
from __future__ import annotations
import json
import time
from pathlib import Path
from typing import Any
# Runtime artifact (not source). Append-only; one record per feedback click.
# Used by record_feedback() / load_feedback() whenever no explicit ``path`` is
# passed. It is a relative path, so it resolves against the process's current
# working directory.
DEFAULT_FEEDBACK_PATH = Path("data/feedback/feedback.jsonl")
# The only verdict strings record_feedback() accepts; any other value makes it
# raise ValueError.
_VERDICTS = ("up", "down")
def record_feedback(
    *,
    question: str,
    dataset: str,
    shown_sql: str,
    result: str,
    verdict: str,
    correction: str = "",
    path: str | Path | None = None,
) -> dict[str, Any]:
    """Store a single feedback click as one JSONL line and hand back the record.

    All arguments are keyword-only.

    Args:
        question: The user's natural-language question.
        dataset: Which dataset/DB the question was asked against.
        shown_sql: The SQL the model actually ran (what the user could audit).
        result: The answer/result the user is reacting to.
        verdict: "up" (correct) or "down" (wrong).
        correction: Free-text "what it should be". Surrounding whitespace is
            stripped before storing; a falsy value is stored as "".
        path: Override the JSONL location (tests pass a tmp path). When
            omitted, DEFAULT_FEEDBACK_PATH is used.

    Returns:
        The record dict that was appended to the file. Besides the inputs it
        carries ``ts`` (``time.time()`` at call time) and
        ``is_training_candidate`` (True only for a down-vote that comes with a
        non-empty correction).

    Raises:
        ValueError: if ``verdict`` is not "up"/"down".
    """
    # Reject unknown verdicts before anything touches the disk.
    if verdict not in _VERDICTS:
        raise ValueError(f"verdict must be one of {_VERDICTS}, got {verdict!r}")

    cleaned = (correction or "").strip()
    entry: dict[str, Any] = {
        "ts": time.time(),
        "question": question,
        "dataset": dataset,
        "shown_sql": shown_sql,
        "result": result,
        "verdict": verdict,
        "correction": cleaned,
        # The valuable case: the model said X and the human supplied the
        # truth Y -- a labelled pair a later training round can learn from.
        "is_training_candidate": verdict == "down" and bool(cleaned),
    }

    target = DEFAULT_FEEDBACK_PATH if path is None else Path(path)
    # The feedback directory is a runtime artifact; create it on first use.
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("a", encoding="utf-8") as sink:
        sink.write(json.dumps(entry, ensure_ascii=False) + "\n")
    return entry
def load_feedback(path: str | Path | None = None) -> list[dict[str, Any]]:
    """Read all feedback records (for inspection / the demo's flywheel counter).

    The file is parsed one physical line at a time, splitting on ``"\\n"``
    only, and anything unusable is skipped instead of raised, so a single bad
    record cannot break a caller such as a UI click handler.

    Args:
        path: Override the JSONL location. When omitted, DEFAULT_FEEDBACK_PATH
            is used.

    Returns:
        The parsed records in file order. A missing file yields ``[]``. Lines
        that are blank, not valid UTF-8, not valid JSON, or valid JSON that is
        not an object are left out.
    """
    src = Path(path) if path is not None else DEFAULT_FEEDBACK_PATH
    try:
        raw = src.read_bytes()
    except (FileNotFoundError, NotADirectoryError):
        # No feedback recorded yet (or the parent path is not a directory).
        return []

    records: list[dict[str, Any]] = []
    # Split on "\n" only. The writer uses ensure_ascii=False, so characters
    # such as U+2028/U+2029/U+0085 can appear raw inside a record;
    # str.splitlines() would treat them as line breaks and tear the record
    # into two unparseable halves.
    for raw_line in raw.split(b"\n"):
        # Decode per line so that a write interrupted in the middle of a
        # multi-byte character only costs that one line, not the whole file.
        try:
            line = raw_line.decode("utf-8").strip()
        except UnicodeDecodeError:
            continue
        if not line:
            continue
        # Skip a malformed/truncated line (e.g. an interrupted write) rather
        # than let one bad record break the caller.
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Callers use dict methods on every record; drop stray scalars/arrays.
        if isinstance(parsed, dict):
            records.append(parsed)
    return records
def feedback_summary(path: str | Path | None = None) -> dict[str, int]:
    """Tally stored feedback for a "flywheel" status line.

    Args:
        path: Override the JSONL location; passed straight to load_feedback().

    Returns:
        A dict with four counts: ``total`` (all loaded records), ``up``
        (thumbs-up verdicts), ``down`` (thumbs-down verdicts) and
        ``training_candidates`` (records whose ``is_training_candidate`` value
        is truthy).
    """
    n_total = n_up = n_down = n_candidates = 0
    for entry in load_feedback(path):
        n_total += 1
        vote = entry.get("verdict")
        if vote == "up":
            n_up += 1
        if vote == "down":
            n_down += 1
        if entry.get("is_training_candidate"):
            n_candidates += 1
    return {
        "total": n_total,
        "up": n_up,
        "down": n_down,
        "training_candidates": n_candidates,
    }