Spaces:
Sleeping
Sleeping
Create rag_eval_metrics.py
Browse files- rag_eval_metrics.py +59 -0
rag_eval_metrics.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ====== Eval tab (optional, tiny and safe) ======
|
| 2 |
+
import json
import subprocess
import sys
import textwrap
|
| 3 |
+
|
| 4 |
+
def _run_eval_inproc(gold_path: str, k: int = 8):
|
| 5 |
+
# Run the evaluator script and return file paths + JSON summary
|
| 6 |
+
out_dir = str(ARTIFACT_DIR)
|
| 7 |
+
logs = str(LOG_PATH)
|
| 8 |
+
cmd = [
|
| 9 |
+
sys.executable, "rag_eval_metrics.py",
|
| 10 |
+
"--gold_csv", gold_path,
|
| 11 |
+
"--logs_jsonl", logs,
|
| 12 |
+
"--k", str(k),
|
| 13 |
+
"--out_dir", out_dir
|
| 14 |
+
]
|
| 15 |
+
try:
|
| 16 |
+
p = subprocess.run(cmd, capture_output=True, text=True, check=False)
|
| 17 |
+
stdout = p.stdout or ""
|
| 18 |
+
stderr = p.stderr or ""
|
| 19 |
+
# Read outputs
|
| 20 |
+
perq = ARTIFACT_DIR / "metrics_per_question.csv"
|
| 21 |
+
agg = ARTIFACT_DIR / "metrics_aggregate.json"
|
| 22 |
+
agg_json = {}
|
| 23 |
+
if agg.exists():
|
| 24 |
+
import json
|
| 25 |
+
agg_json = json.loads(agg.read_text(encoding="utf-8"))
|
| 26 |
+
# Minimal report for the UI
|
| 27 |
+
report = "```\n" + (stdout.strip() or "(no stdout)") + "\n" + (stderr.strip() or "") + "\n```"
|
| 28 |
+
return (str(perq) if perq.exists() else None,
|
| 29 |
+
str(agg) if agg.exists() else None,
|
| 30 |
+
agg_json,
|
| 31 |
+
report)
|
| 32 |
+
except Exception as e:
|
| 33 |
+
return (None, None, {}, f"**Eval error:** {e}")
|
| 34 |
+
|
| 35 |
+
# Eval tab: upload a gold CSV, score it against the app logs, download metrics.
with gr.Tab("📏 Evaluate (Gold vs Logs)"):
    gr.Markdown("Upload your **gold.csv** (question, reference_answer, support_docs_pages, type) and compute metrics against the app logs.")

    # --- input row: gold file + retrieval cutoff ---
    with gr.Row():
        gold_file = gr.File(label="gold.csv", file_types=[".csv"], interactive=True)
        k_slider = gr.Slider(3, 12, value=8, step=1, label="k for Hit/Recall/nDCG")

    # --- action row ---
    with gr.Row():
        btn_eval = gr.Button("Compute Metrics", variant="primary")

    # --- output row: downloadable artifacts ---
    with gr.Row():
        out_perq = gr.File(label="Per-question metrics (CSV)")
        out_agg = gr.File(label="Aggregate metrics (JSON)")
    out_json = gr.JSON(label="Aggregate summary")
    out_log = gr.Markdown(label="Run log")

    def _eval_wrapper(gf, k):
        # Resolve the gold file — the upload wins; otherwise fall back to a
        # repo-root gold.csv — then delegate to the subprocess runner.
        if gf is not None:
            return _run_eval_inproc(gf.name, int(k))
        fallback = Path("gold.csv")
        if fallback.exists():
            return _run_eval_inproc(str(fallback), int(k))
        return None, None, {}, "**No gold.csv provided or found in repo root.**"

    btn_eval.click(_eval_wrapper, inputs=[gold_file, k_slider], outputs=[out_perq, out_agg, out_json, out_log])
|