Spaces:
Sleeping
Sleeping
| """ | |
| Launch with: | |
| streamlit run scripts/dashboard.py | |
| Relies on the directory structure produced by analysis.py: | |
| outputs/grid/<dataset>/<config>/{aggregates.yaml, rq1.yaml, ...} | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import yaml | |
| from pathlib import Path | |
| import pandas as pd | |
| import streamlit as st | |
| import matplotlib.pyplot as plt | |
| BASE_DIR = Path("outputs/grid") | |
| METRIC_KEY = "rag_score" | |
# --------------------------------------------------------------------- Sidebar
st.sidebar.title("RAG-Eval Dashboard")

if not BASE_DIR.exists():
    # No experiment outputs yet -- nothing to render, stop the script run.
    # (Fixed mojibake: the dash in the message had been corrupted to a beta.)
    st.sidebar.error(f"Folder {BASE_DIR} not found — run experiments first.")
    st.stop()

# One sub-directory per dataset, each holding one folder per configuration.
datasets = sorted(p.name for p in BASE_DIR.iterdir() if p.is_dir())
dataset = st.sidebar.selectbox("Dataset", datasets)
conf_dir = BASE_DIR / dataset

configs = sorted(p.name for p in conf_dir.iterdir() if p.is_dir())
sel_cfgs = st.sidebar.multiselect("Configurations", configs, default=configs)
if not sel_cfgs:
    st.warning("Select at least one configuration.")
    st.stop()
# ---------------------------------------------------------------- Load helpers
def _yaml(path: Path):
    """Read *path* and return its YAML content as a Python object."""
    text = path.read_text()
    return yaml.safe_load(text)
| def _jsonl(path: Path): return [json.loads(l) for l in path.read_text().splitlines()] | |
# ---------------------------------------------------------------- Main view
st.title(f"Dataset: {dataset}")

# -- Aggregated metrics table -------------------------------------------------
agg = {c: _yaml(conf_dir / c / "aggregates.yaml") for c in sel_cfgs}
agg_df = pd.DataFrame(agg).T  # rows = configs, columns = metric names
st.subheader("Aggregated metrics")
st.dataframe(agg_df, use_container_width=True)

# -- Bar chart of metric means ------------------------------------------------
st.subheader(f"Mean {METRIC_KEY}")
if METRIC_KEY in agg_df.columns:
    fig, ax = plt.subplots()
    agg_df[METRIC_KEY].plot.bar(ax=ax)
    ax.set_ylabel(METRIC_KEY)
    ax.set_ylim(0, 1)  # metric is presumably in [0, 1] -- TODO confirm
    st.pyplot(fig)
    plt.close(fig)  # release the figure; Streamlit reruns would otherwise leak
else:
    # Previously this raised KeyError and blanked the whole page.
    st.info(f"Column '{METRIC_KEY}' not present in aggregates.")
# -- Scatter: MRR vs human correctness, one panel per config ------------------
st.subheader("MRR vs Human Correctness")
cols = st.columns(len(sel_cfgs))
for col, cfg in zip(cols, sel_cfgs):
    res_path = conf_dir / cfg / "results.jsonl"
    if not res_path.exists():
        # Previously a missing file raised FileNotFoundError and killed the page.
        col.warning(f"{cfg}: results.jsonl missing")
        continue
    rows = _jsonl(res_path)
    # x: retrieval quality (MRR, NaN when absent); y: binary human judgement.
    x = [r["metrics"].get("mrr", float("nan")) for r in rows]
    y = [1 if r.get("human_correct") else 0 for r in rows]
    fig, ax = plt.subplots()
    ax.scatter(x, y, alpha=0.5)
    ax.set(title=cfg, xlabel="MRR", ylabel="Correct?")
    col.pyplot(fig)
    plt.close(fig)  # avoid accumulating figures across Streamlit reruns
# -- Pairwise Wilcoxon-Holm table (rag_score) ---------------------------------
wh_path = conf_dir / "wilcoxon_rag_holm.yaml"
if wh_path.exists():
    st.subheader("Pairwise Wilcoxon-Holm (rag_score)")
    # Mapping of config-pair -> Holm-adjusted p-value, shown as a 1-col table.
    wh_df = pd.Series(_yaml(wh_path), name="p_adj").to_frame()
    st.dataframe(wh_df)
else:
    # Fixed mojibake: the dash in the message had been corrupted to a beta.
    st.info("Wilcoxon table not found — run_grid_experiments.py computes it.")
# -- Research-question YAMLs: one tab per config, one section per RQ ----------
rq_tabs = st.tabs(list(sel_cfgs))  # labels are the config names themselves
for tab, cfg in zip(rq_tabs, sel_cfgs):
    with tab:
        for rq in ("rq1", "rq2", "rq3", "rq4"):
            path = conf_dir / cfg / f"{rq}.yaml"
            if path.exists():
                st.markdown(f"**{rq.upper()}**")
                st.json(_yaml(path))
            else:
                # Fixed mojibake: the dash had been corrupted to a beta.
                st.markdown(f"*{rq.upper()} — not available*")
# -- Raw results download -----------------------------------------------------
st.sidebar.subheader("Download")
for cfg in sel_cfgs:
    res_path = conf_dir / cfg / "results.jsonl"
    if not res_path.exists():
        # Previously read_bytes() raised FileNotFoundError for incomplete runs.
        continue
    st.sidebar.download_button(
        label=f"{cfg} results.jsonl",
        data=res_path.read_bytes(),
        file_name=f"{dataset}_{cfg}_results.jsonl",
        mime="application/jsonl",
    )