Spaces:

crazycrazypete
/

Masters-four-Tab-OpenAI

Running

Masters-four-Tab-OpenAI / backend /scripts /export_eval_qna_csv.py

Pete Dunn

Sync pending app, RAG, and evaluation updates

b72f98e 4 months ago

8.4 kB

	#!/usr/bin/env python3
	from __future__ import annotations

	import argparse
	import csv
	import json
	import re
	from datetime import datetime, timezone
	from pathlib import Path
	from typing import Any, Dict, Iterable, List, Optional


	def _safe_str(value: Any) -> str:
	if value is None:
	return ""
	if isinstance(value, (dict, list)):
	return json.dumps(value, ensure_ascii=False)
	return str(value)


	def _extract_why(answer: str) -> str:
	text = str(answer or "")
	if not text:
	return ""
	patterns = [
	r"\\Why\\\s(.?)(?:\n\s\\Next action\\*\|\Z)",
	r"###\sWhy this recommendation\s(.?)(?:\n\s###\s*Next step\|\Z)",
	r"###\sWhy\s(.?)(?:\n\s###\s*Next\|\Z)",
	]
	for p in patterns:
	m = re.search(p, text, flags=re.IGNORECASE \| re.DOTALL)
	if m:
	return re.sub(r"\s+", " ", m.group(1)).strip()
	return ""


	def _sources_summary(sources: Any) -> str:
	if not isinstance(sources, list):
	return ""
	names: List[str] = []
	for src in sources:
	if not isinstance(src, dict):
	continue
	doc = src.get("doc") or src.get("document") or src.get("relative_path") or src.get("href") or src.get("id")
	if doc:
	names.append(str(doc))
	# de-dupe preserve order
	seen = set()
	out = []
	for n in names:
	if n in seen:
	continue
	seen.add(n)
	out.append(n)
	return " \| ".join(out)


	def _iter_result_rows(data: Dict[str, Any], source_file: Path) -> Iterable[Dict[str, Any]]:
	    """Yield one flat row per answered entry in ``data["results"]``.

	    Nothing is yielded when ``results`` is not a list. Entries that are not
	    dicts, or that have neither ``assistant`` nor ``assistant_preview``
	    text, are skipped. Every value in a yielded row is a string.

	    Args:
	        data: Parsed eval JSON document.
	        source_file: Path the document was loaded from; recorded per row.

	    Yields:
	        Dicts keyed by CSV column name.
	    """
	    results = data.get("results")
	    if not isinstance(results, list):
	        return
	    # Suite-level values are identical for every row of this document.
	    run_generated_at = _safe_str(data.get("generated_at"))
	    suite_total = _safe_str(data.get("total"))
	    suite_pass_rate = _safe_str(data.get("pass_rate"))
	    for r in results:
	        if not isinstance(r, dict):
	            continue
	        assistant = _safe_str(r.get("assistant"))
	        assistant_preview = _safe_str(r.get("assistant_preview"))
	        query = _safe_str(r.get("query") or r.get("question") or r.get("message"))
	        # only include rows where an answer was produced
	        if not assistant and not assistant_preview:
	            continue
	        # Fall back to empty containers when a nested field has an
	        # unexpected type, so the lookups below never fail.
	        semantic = r.get("semantic") if isinstance(r.get("semantic"), dict) else {}
	        timing = r.get("timing_ms") if isinstance(r.get("timing_ms"), dict) else {}
	        sources = r.get("sources") if isinstance(r.get("sources"), list) else []
	        issues = r.get("issues") if isinstance(r.get("issues"), list) else []
	        notes = r.get("notes") if isinstance(r.get("notes"), list) else []
	        yield {
	            "source_file": str(source_file),
	            "run_generated_at": run_generated_at,
	            "suite_total": suite_total,
	            "suite_pass_rate": suite_pass_rate,
	            "case_id": _safe_str(r.get("id")),
	            "query": query,
	            "assistant": assistant,
	            "assistant_preview": assistant_preview,
	            "why_extracted": _extract_why(assistant),
	            "domain": _safe_str(r.get("domain")),
	            "mode": _safe_str(r.get("mode")),
	            "difficulty": _safe_str(r.get("difficulty")),
	            "web_assisted": _safe_str(r.get("web_assisted")),
	            "pass": _safe_str(r.get("pass")),
	            "grade": _safe_str(r.get("grade")),
	            "rule_grade": _safe_str(r.get("rule_grade")),
	            "rule_score": _safe_str(r.get("rule_score")),
	            "semantic_score": _safe_str(semantic.get("score")),
	            "semantic_grade": _safe_str(semantic.get("grade")),
	            "semantic_reason": _safe_str(
	                semantic.get("reason") or semantic.get("notes") or semantic.get("explanation")
	            ),
	            "final_score": _safe_str(r.get("final_score")),
	            "latency_ms": _safe_str(r.get("latency_ms")),
	            "timing_total_ms": _safe_str(timing.get("total")),
	            "timing_ms_json": _safe_str(timing),
	            # Plain " | " separator: an escaped pipe would put a stray
	            # backslash into every multi-item cell.
	            "issues": " | ".join(_safe_str(x) for x in issues),
	            "notes": " | ".join(_safe_str(x) for x in notes),
	            "source_count": str(len(sources)),
	            "sources_summary": _sources_summary(sources),
	            "sources_json": _safe_str(sources),
	        }


	def _iter_cases_rows(data: Dict[str, Any], source_file: Path) -> Iterable[Dict[str, Any]]:
	    """Yield one flat row per answered entry in ``data["cases"]``.

	    Nothing is yielded when ``cases`` is not a list. Non-dict entries and
	    entries with neither ``assistant`` nor ``assistant_preview`` text are
	    skipped. Columns that this layout does not populate are emitted as
	    empty strings, so each row carries the same keys as a results row.
	    """
	    entries = data.get("cases")
	    if not isinstance(entries, list):
	        return
	    for case in entries:
	        if not isinstance(case, dict):
	            continue
	        answer = _safe_str(case.get("assistant"))
	        preview = _safe_str(case.get("assistant_preview"))
	        question = _safe_str(
	            case.get("query") or case.get("question") or case.get("q") or case.get("message")
	        )
	        # Cases without any answer text are not exported.
	        if not (answer or preview):
	            continue
	        row = {
	            "source_file": str(source_file),
	            "run_generated_at": _safe_str(data.get("generated_at")),
	            "suite_total": "",
	            "suite_pass_rate": "",
	            "case_id": _safe_str(case.get("id")),
	            "query": question,
	            "assistant": answer,
	            "assistant_preview": preview,
	            "why_extracted": _extract_why(answer),
	            "domain": _safe_str(case.get("domain")),
	            "mode": _safe_str(case.get("mode")),
	            "difficulty": _safe_str(case.get("difficulty")),
	            "web_assisted": _safe_str(case.get("web_assisted")),
	            "pass": _safe_str(case.get("pass")),
	            "grade": "",
	            "rule_grade": "",
	            "rule_score": "",
	            "semantic_score": "",
	            "semantic_grade": "",
	            "semantic_reason": _safe_str(case.get("weak_reason")),
	            "final_score": "",
	            "latency_ms": _safe_str(case.get("latency_ms")),
	            "timing_total_ms": "",
	            "timing_ms_json": "",
	            "issues": "",
	            "notes": _safe_str(case.get("notes")),
	            "source_count": _safe_str(case.get("source_count")),
	            "sources_summary": "",
	            "sources_json": "",
	        }
	        yield row


	def export_eval_csv(evals_root: Path, out_csv: Path) -> int:
	    """Collect eval rows from every JSON file under *evals_root* into a CSV.

	    Files are visited recursively in sorted path order. A file that cannot
	    be opened or parsed, or whose top-level value is not a dict, is skipped.
	    Rows are sorted by source file and then by case id compared as text,
	    and every row is stamped with the same UTC export time.

	    Args:
	        evals_root: Directory searched recursively for ``*.json`` files.
	        out_csv: Destination CSV path; parent directories are created.

	    Returns:
	        The number of data rows written (the header is not counted).
	    """
	    collected: List[Dict[str, Any]] = []
	    for candidate in sorted(evals_root.rglob("*.json")):
	        try:
	            with candidate.open("r", encoding="utf-8") as handle:
	                payload = json.load(handle)
	        except Exception:
	            # Best-effort scan: unreadable or malformed files are ignored.
	            continue
	        if isinstance(payload, dict):
	            collected.extend(_iter_result_rows(payload, candidate))
	            collected.extend(_iter_cases_rows(payload, candidate))

	    # stable ordering for review
	    collected.sort(
	        key=lambda row: (row.get("source_file", ""), str(row.get("case_id", "")))
	    )

	    out_csv.parent.mkdir(parents=True, exist_ok=True)
	    headers = [
	        "exported_at",
	        "source_file",
	        "run_generated_at",
	        "suite_total",
	        "suite_pass_rate",
	        "case_id",
	        "query",
	        "assistant",
	        "assistant_preview",
	        "why_extracted",
	        "domain",
	        "mode",
	        "difficulty",
	        "web_assisted",
	        "pass",
	        "grade",
	        "rule_grade",
	        "rule_score",
	        "semantic_score",
	        "semantic_grade",
	        "semantic_reason",
	        "final_score",
	        "latency_ms",
	        "timing_total_ms",
	        "timing_ms_json",
	        "issues",
	        "notes",
	        "source_count",
	        "sources_summary",
	        "sources_json",
	    ]
	    exported_at = datetime.now(timezone.utc).isoformat()
	    with out_csv.open("w", encoding="utf-8", newline="") as sink:
	        writer = csv.DictWriter(sink, fieldnames=headers)
	        writer.writeheader()
	        # Project each row onto the header set, then overwrite the
	        # timestamp column with the shared export time.
	        writer.writerows(
	            {**{col: row.get(col, "") for col in headers}, "exported_at": exported_at}
	            for row in collected
	        )
	    return len(collected)


	def main() -> int:
	    """Parse the command line, run the export, and report the row count.

	    Options: ``--evals-root`` (folder scanned for eval JSON files) and
	    ``--out`` (CSV path to write). Always returns 0.
	    """
	    cli = argparse.ArgumentParser(description="Export eval Q/A rows to CSV")
	    cli.add_argument(
	        "--evals-root",
	        default="docs/evals",
	        help="Folder to scan for eval JSON files",
	    )
	    cli.add_argument(
	        "--out",
	        default="docs/evals/all_eval_questions_answers.csv",
	        help="Output CSV path",
	    )
	    opts = cli.parse_args()

	    destination = Path(opts.out)
	    written = export_eval_csv(Path(opts.evals_root), destination)
	    print(f"Wrote {written} rows to {destination}")
	    return 0


	# Script entry point: main()'s return value becomes the process exit status.
	if __name__ == "__main__":
	    raise SystemExit(main())