Inframat-x committed on
Commit
d1791d5
·
verified ·
1 Parent(s): 65df9cc

Create rag_eval_metrics.py

Browse files
Files changed (1) hide show
  1. rag_eval_metrics.py +59 -0
rag_eval_metrics.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ====== Eval tab (optional, tiny and safe) ======
2
+ import subprocess, sys, textwrap
3
+
4
def _run_eval_inproc(gold_path: str, k: int = 8):
    """Run the rag_eval_metrics.py evaluator as a subprocess and collect its outputs.

    Parameters
    ----------
    gold_path : str
        Path to the gold CSV (question, reference_answer, support_docs_pages, type).
    k : int
        Retrieval cutoff forwarded to the evaluator (Hit/Recall/nDCG@k).

    Returns
    -------
    tuple
        (per_question_csv_path or None, aggregate_json_path or None,
         aggregate_summary_dict, markdown_report_str)
    """
    import json  # hoisted from inside the conditional; used for the aggregate summary

    out_dir = str(ARTIFACT_DIR)
    logs = str(LOG_PATH)
    cmd = [
        sys.executable, "rag_eval_metrics.py",
        "--gold_csv", gold_path,
        "--logs_jsonl", logs,
        "--k", str(k),
        "--out_dir", out_dir,
    ]
    try:
        # check=False: a non-zero exit is reported via the captured stderr, not an exception.
        proc = subprocess.run(cmd, capture_output=True, text=True, check=False)
        stdout = proc.stdout or ""
        stderr = proc.stderr or ""

        # Output artifacts the evaluator is expected to write.
        perq = ARTIFACT_DIR / "metrics_per_question.csv"
        agg = ARTIFACT_DIR / "metrics_aggregate.json"

        agg_json = {}
        decode_note = ""
        if agg.exists():
            try:
                agg_json = json.loads(agg.read_text(encoding="utf-8"))
            except (json.JSONDecodeError, OSError) as e:
                # A corrupt/unreadable aggregate file should not discard the
                # per-question CSV path or the subprocess log — surface it instead.
                decode_note = f"\n**Warning:** could not parse aggregate JSON: {e}"

        # Minimal fenced-code report for the UI.
        report = ("```\n" + (stdout.strip() or "(no stdout)") + "\n"
                  + (stderr.strip() or "") + "\n```" + decode_note)
        return (str(perq) if perq.exists() else None,
                str(agg) if agg.exists() else None,
                agg_json,
                report)
    except Exception as e:
        # Last-resort guard: keep the UI alive and show the error in the log pane.
        return (None, None, {}, f"**Eval error:** {e}")
34
+
35
# ---- Evaluation tab: compare gold answers against the app's logged runs ----
with gr.Tab("📏 Evaluate (Gold vs Logs)"):
    gr.Markdown("Upload your **gold.csv** (question, reference_answer, support_docs_pages, type) and compute metrics against the app logs.")
    with gr.Row():
        gold_file = gr.File(label="gold.csv", file_types=[".csv"], interactive=True)
        k_slider = gr.Slider(3, 12, value=8, step=1, label="k for Hit/Recall/nDCG")
    with gr.Row():
        btn_eval = gr.Button("Compute Metrics", variant="primary")
    with gr.Row():
        out_perq = gr.File(label="Per-question metrics (CSV)")
        out_agg = gr.File(label="Aggregate metrics (JSON)")
        out_json = gr.JSON(label="Aggregate summary")
        out_log = gr.Markdown(label="Run log")

    def _eval_wrapper(uploaded, topk):
        """Resolve the gold CSV (upload first, repo-root fallback) and run the evaluator."""
        if uploaded is not None:
            gold_path = uploaded.name
        else:
            # No upload: fall back to a gold.csv sitting at the repo root.
            fallback = Path("gold.csv")
            if not fallback.exists():
                return None, None, {}, "**No gold.csv provided or found in repo root.**"
            gold_path = str(fallback)
        return _run_eval_inproc(gold_path, int(topk))

    btn_eval.click(
        _eval_wrapper,
        inputs=[gold_file, k_slider],
        outputs=[out_perq, out_agg, out_json, out_log],
    )