Spaces:

build-small-hackathon
/

analyst-buddy

Running on Zero

App Files Files Community

analyst-buddy / server /feedback.py

hjerpe

F006/F008: serve Qwen models + model switcher (vanilla-first)

656f91e verified 18 days ago

Raw

History Blame Contribute Delete

4.11 kB

	"""Capture user feedback on answers — the self-improving flywheel.

	The MVP's core idea: a small shop owner can't audit SQL, but they CAN say
	"that's right" or "no, that's wrong — it should be X". Because the app already
	shows the exact query it ran, each thumbs-down + correction is a high-signal
	candidate training example: the question, the SQL the model actually produced,
	and what the human says is correct. These accumulate into exactly the data the
	next training round needs — notably the multi-hop-join / error-recovery cases
	the model is weakest on (see docs/guides/training_playbook.md).

	Dep-light, stdlib only: one JSON object per line (JSONL), append-only, so it is
	trivially greppable and feeds straight into scripts/generate_sft_data.py-style
	curation later. No model or heavy deps imported here.
	"""

	from __future__ import annotations

	import json
	import time
	from pathlib import Path
	from typing import Any

	# Runtime artifact (not source). Append-only; one record per feedback click.
	# Used by the functions in this module whenever the caller passes no ``path``.
	DEFAULT_FEEDBACK_PATH = Path("data/feedback/feedback.jsonl")

	# The only accepted values for a record's "verdict" field; record_feedback()
	# raises ValueError for anything else.
	_VERDICTS = ("up", "down")


	def record_feedback(
	    *,
	    question: str,
	    dataset: str,
	    shown_sql: str,
	    result: str,
	    verdict: str,
	    correction: str = "",
	    path: str | Path | None = None,
	) -> dict[str, Any]:
	    """Append one feedback record as a JSONL line; return the stored record.

	    All arguments are keyword-only.

	    Args:
	        question: The user's natural-language question.
	        dataset: Which dataset/DB it was asked against.
	        shown_sql: The SQL the model actually ran (the audit surface).
	        result: The answer/result the user is reacting to.
	        verdict: "up" (correct) or "down" (wrong).
	        correction: Free-text "what it should be" (the training signal on a
	            down-vote; optional on an up-vote). Surrounding whitespace is
	            stripped before storing.
	        path: Override the JSONL location (tests pass a tmp path). Defaults
	            to ``DEFAULT_FEEDBACK_PATH``.

	    Returns:
	        The record dict (also written to disk). Besides the arguments it
	        carries ``ts`` (``time.time()`` at call time) and
	        ``is_training_candidate``.

	    Raises:
	        ValueError: if ``verdict`` is not "up"/"down".
	    """
	    if verdict not in _VERDICTS:
	        raise ValueError(f"verdict must be one of {_VERDICTS}, got {verdict!r}")

	    # ``or ""`` keeps a falsy value (e.g. None from an empty UI field) from
	    # blowing up on ``.strip()``.
	    correction = (correction or "").strip()
	    record = {
	        "ts": time.time(),
	        "question": question,
	        "dataset": dataset,
	        "shown_sql": shown_sql,
	        "result": result,
	        "verdict": verdict,
	        "correction": correction,
	        # A down-vote WITH a correction is the gold: a labelled "the model said
	        # X, the truth is Y" pair the next training round can learn from.
	        "is_training_candidate": verdict == "down" and bool(correction),
	    }

	    out = Path(path) if path is not None else DEFAULT_FEEDBACK_PATH
	    # Create the parent directory on first use so a fresh checkout works.
	    out.parent.mkdir(parents=True, exist_ok=True)
	    with out.open("a", encoding="utf-8") as fh:
	        # One write call per record: the whole line (including the newline)
	        # is handed to the file object at once.
	        fh.write(json.dumps(record, ensure_ascii=False) + "\n")
	    return record


	def load_feedback(path: str \| Path \| None = None) -> list[dict[str, Any]]:
	"""Read all feedback records (for inspection / the demo's flywheel counter)."""
	src = Path(path) if path is not None else DEFAULT_FEEDBACK_PATH
	if not src.exists():
	return []
	records: list[dict[str, Any]] = []
	for line in src.read_text(encoding="utf-8").splitlines():
	line = line.strip()
	if not line:
	continue
	# Skip a malformed/truncated line (e.g. an interrupted write) rather than
	# let one bad record break feedback_summary() inside a UI click handler.
	try:
	records.append(json.loads(line))
	except json.JSONDecodeError:
	continue
	return records


	def feedback_summary(path: str | Path | None = None) -> dict[str, int]:
	    """Return counts for a "flywheel" status line.

	    Args:
	        path: JSONL file to summarise; passed straight to ``load_feedback``.

	    Returns:
	        A dict with four integer counts: ``total`` (records considered),
	        ``up`` and ``down`` (records whose ``verdict`` equals that string),
	        and ``training_candidates`` (records with a truthy
	        ``is_training_candidate``).
	    """
	    # Only mapping-shaped entries can be inspected with ``.get``; ignore
	    # anything else so one odd entry cannot raise inside a UI click handler.
	    records = [r for r in load_feedback(path) if isinstance(r, dict)]
	    return {
	        "total": len(records),
	        "up": sum(1 for r in records if r.get("verdict") == "up"),
	        "down": sum(1 for r in records if r.get("verdict") == "down"),
	        "training_candidates": sum(
	            1 for r in records if r.get("is_training_candidate")
	        ),
	    }