"""
Launch with:
streamlit run scripts/dashboard.py
Relies on the directory structure produced by analysis.py:
outputs/grid/<dataset>/<config>/{aggregates.yaml, rq1.yaml, ...}
"""
from __future__ import annotations
import json
import yaml
from pathlib import Path
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
# Root of the grid-experiment outputs written by analysis.py.
BASE_DIR = Path("outputs/grid")
# Headline metric plotted in the bar chart below.
METRIC_KEY = "rag_score"
# --------------------------------------------------------------------- Sidebar
st.sidebar.title("RAG-Eval Dashboard")
if not BASE_DIR.exists():
    # NOTE: the original message contained a mojibake "β" where the dash belongs.
    st.sidebar.error(f"Folder {BASE_DIR} not found — run experiments first.")
    st.stop()
# Each sub-directory of BASE_DIR is a dataset; each sub-directory of a
# dataset is one experiment configuration.
datasets = sorted(p.name for p in BASE_DIR.iterdir() if p.is_dir())
dataset = st.sidebar.selectbox("Dataset", datasets)
conf_dir = BASE_DIR / dataset
configs = sorted(p.name for p in conf_dir.iterdir() if p.is_dir())
sel_cfgs = st.sidebar.multiselect("Configurations", configs, default=configs)
if not sel_cfgs:
    # Everything below assumes at least one configuration is selected.
    st.warning("Select at least one configuration.")
    st.stop()
# ---------------------------------------------------------------- Load helpers
def _yaml(path: Path):
    """Parse a YAML file and return its content (dict/list/scalar).

    Reads as UTF-8 explicitly so non-ASCII metric names don't break on
    platforms whose default encoding is not UTF-8.
    """
    return yaml.safe_load(path.read_text(encoding="utf-8"))
def _jsonl(path: Path): return [json.loads(l) for l in path.read_text().splitlines()]
# ---------------------------------------------------------------- Main view
st.title(f"Dataset: {dataset}")

# -- Aggregated metrics table -------------------------------------------------
# One aggregates.yaml per configuration; rows = configs, columns = metrics.
agg = {cfg: _yaml(conf_dir / cfg / "aggregates.yaml") for cfg in sel_cfgs}
agg_df = pd.DataFrame.from_dict(agg, orient="index")
st.subheader("Aggregated metrics")
st.dataframe(agg_df, use_container_width=True)
# -- Bar chart of rag_score means ---------------------------------------------
st.subheader(f"Mean {METRIC_KEY}")
if METRIC_KEY in agg_df.columns:
    fig, ax = plt.subplots()
    agg_df[METRIC_KEY].plot.bar(ax=ax)
    ax.set_ylabel(METRIC_KEY)
    # rag_score is assumed to lie in [0, 1] — TODO confirm against analysis.py
    ax.set_ylim(0, 1)
    st.pyplot(fig)
else:
    # Previously a missing metric raised an uncaught KeyError and killed the app.
    st.info(f"'{METRIC_KEY}' not found in the aggregates of the selected configs.")
# -- Scatter MRR vs Correctness per config ------------------------------------
st.subheader("MRR vs Human Correctness")
# One column (and one scatter plot) per selected configuration.
cols = st.columns(len(sel_cfgs))
for col, cfg in zip(cols, sel_cfgs):
    records = _jsonl(conf_dir / cfg / "results.jsonl")
    # Missing MRR values become NaN so matplotlib simply omits those points;
    # human_correct is rendered as a 0/1 indicator on the y-axis.
    mrr_vals = [rec["metrics"].get("mrr", float("nan")) for rec in records]
    correct_vals = [int(bool(rec.get("human_correct"))) for rec in records]
    fig, ax = plt.subplots()
    ax.scatter(mrr_vals, correct_vals, alpha=0.5)
    ax.set(title=cfg, xlabel="MRR", ylabel="Correct?")
    col.pyplot(fig)
# -- Pairwise Wilcoxon-Holm table (rag_score) ---------------------------------
# Holm-adjusted p-values live at the dataset level (one file per dataset,
# keyed by config pair), not inside a config directory.
wh_path = conf_dir / "wilcoxon_rag_holm.yaml"
if wh_path.exists():
    st.subheader("Pairwise Wilcoxon-Holm (rag_score)")
    wh_df = pd.Series(_yaml(wh_path), name="p_adj").to_frame()
    st.dataframe(wh_df)
else:
    # NOTE: fixed a mojibake "β" (was an em dash) in the user-facing message.
    st.info("Wilcoxon table not found — run_grid_experiments.py computes it.")
# -- Research-question YAMLs --------------------------------------------------
# One tab per configuration; each tab shows whichever rq*.yaml files exist.
rq_tabs = st.tabs(list(sel_cfgs))  # redundant f"{cfg}" wrapper removed
for tab, cfg in zip(rq_tabs, sel_cfgs):
    with tab:
        for rq in ("rq1", "rq2", "rq3", "rq4"):
            path = conf_dir / cfg / f"{rq}.yaml"
            if path.exists():
                st.markdown(f"**{rq.upper()}**")
                st.json(_yaml(path))
            else:
                # NOTE: fixed a mojibake "β" (was an em dash) in this message.
                st.markdown(f"*{rq.upper()} — not available*")
# -- Raw results download -----------------------------------------------------
st.sidebar.subheader("Download")
for cfg in sel_cfgs:
    results_path = conf_dir / cfg / "results.jsonl"
    if not results_path.exists():
        # Previously a config without results.jsonl raised FileNotFoundError
        # and crashed the whole dashboard; skip it instead.
        continue
    st.sidebar.download_button(
        label=f"{cfg} results.jsonl",
        data=results_path.read_bytes(),
        file_name=f"{dataset}_{cfg}_results.jsonl",
        mime="application/jsonl",
    )