# rag-qa-command-cente / src /views /trace_explorer.py
# Tarek Masryo
# chore: update project files
# 6bef416
from __future__ import annotations
import pandas as pd
import streamlit as st
from src.analytics import RETRIEVAL_OK_THRESHOLD, top_examples
from src.text_search import literal_text_mask
from src.ui import callout, section_title, trace_box
class TraceExplorerViewMixin:
    """Example-level trace review page.

    Mixin: the host class is expected to provide ``self.ctx`` (read here for
    ``filtered_eval`` and ``filtered_retrieval``) and ``self.data`` (read here
    for ``eval_runs``, ``retrieval_events`` and ``chunks``).
    """

    def _page_trace_explorer(self) -> None:
        """Render the queue picker, question search and example selector.

        Shows a warning callout and stops when the chosen queue (after the
        optional text search) contains no examples; otherwise renders the
        detail view for the selected example.
        """
        section_title("Trace Explorer", "Inspect examples, retrieved chunks, diagnosis, and metadata.")
        mode = st.radio(
            "Example queue",
            ["High risk", "Incorrect", "Hallucination", "Low retrieval"],
            horizontal=True,
            key="trace_mode",
        )
        examples = top_examples(self.ctx.filtered_eval, mode=mode, n=150, reference_df=self.data.eval_runs)

        search = st.text_input("Search questions", "", key="trace_search")
        if search and "query" in examples.columns:
            examples = examples[literal_text_mask(examples["query"], search)]
        if examples.empty:
            callout("warn", "No examples", "Try another queue or relax filters.")
            return

        # Fall back to the first column when the frame carries no explicit id.
        id_col = "example_id" if "example_id" in examples.columns else examples.columns[0]
        # Ids are compared as strings because the selectbox options are strings.
        example_ids = examples[id_col].astype(str)
        selected_id = st.selectbox("Select example", example_ids.tolist(), key="trace_example_id")
        ex = examples[example_ids == selected_id].iloc[0]
        self._render_trace_detail(ex, selected_id)

    def _render_trace_detail(self, ex: pd.Series, selected_id: str) -> None:
        """Render question/answer, metadata, diagnosis and retrieved chunks.

        Args:
            ex: The selected example row.
            selected_id: String form of the example id, used to look up the
                matching retrieval rows.
        """
        left, middle, right = st.columns([1.15, 1.0, 0.85], gap="large")
        with left:
            trace_box("Question", str(ex.get("query", "No query column available.")))
            trace_box("Gold answer", str(ex.get("gold_answer", "No gold answer available.")))
        with middle:
            # Display label -> source column; values are normalised to plain
            # Python / JSON-friendly objects before being handed to the widget.
            metadata_fields = {
                "domain": "domain",
                "scenario_type": "scenario_type",
                "difficulty": "difficulty",
                "correct": "is_correct",
                "hallucination": "hallucination_flag",
                "recall_at_10": "recall_at_10",
                "mrr_at_10": "mrr_at_10",
            }
            st.json({label: self._trace_json_value(ex.get(column)) for label, column in metadata_fields.items()})
        with right:
            diagnosis, kind = self._diagnose_example(ex)
            callout(kind, diagnosis, "Use the retrieved chunks below to validate evidence and decide the next debugging action.")
        self._render_retrieved_chunks(selected_id)

    @staticmethod
    def _trace_json_value(value: object) -> object:
        """Return *value* as a plain, JSON-friendly Python object.

        Zero-dimensional numpy-style scalars are unwrapped via ``.item()`` and
        missing values (``NaN`` / ``pd.NA``) become ``None``, since ``NaN`` is
        not a valid JSON literal. Everything else is returned unchanged.
        """
        if getattr(value, "shape", None) == () and hasattr(value, "item"):
            value = value.item()
        if value is pd.NA:
            return None
        # NaN is the only float that is not equal to itself.
        if isinstance(value, float) and value != value:
            return None
        return value

    @staticmethod
    def _diagnose_example(ex: pd.Series, *, retrieval_ok_threshold: float | None = None) -> tuple[str, str]:
        """Classify an example into a ``(diagnosis, callout kind)`` pair.

        Missing metrics (absent key, ``None``, ``NaN`` or ``pd.NA``) count as
        ``0`` so that an example without a recall or correctness value is
        never reported as healthy by accident.

        Args:
            ex: Example row; read via ``.get`` for ``is_correct``,
                ``hallucination_flag`` and ``recall_at_10``.
            retrieval_ok_threshold: Optional override for the recall cut-off;
                defaults to ``RETRIEVAL_OK_THRESHOLD``.

        Returns:
            The diagnosis title and one of ``"bad"``, ``"warn"``, ``"good"``.
        """

        def metric(name: str) -> float:
            value = ex.get(name, 0)
            # NaN is truthy, so ``value or 0`` alone would let it through and
            # make every comparison below evaluate to False.
            if value is None or pd.isna(value):
                return 0.0
            return float(value or 0)

        if metric("hallucination_flag") >= 0.5:
            return "Hallucination review needed", "bad"

        threshold = RETRIEVAL_OK_THRESHOLD if retrieval_ok_threshold is None else retrieval_ok_threshold
        if metric("is_correct") >= 0.5:
            return "Healthy or recovered case", "good"
        if metric("recall_at_10") < threshold:
            return "Likely retrieval failure", "bad"
        return "Likely generation failure", "warn"

    def _render_retrieved_chunks(self, selected_id: str) -> None:
        """Show up to 12 retrieval rows for *selected_id*, enriched with chunk data.

        Emits an info callout instead of a table when the retrieval data lacks
        the ``example_id`` / ``chunk_id`` columns or when no row matches the
        example under the current filters.
        """
        required = {"example_id", "chunk_id"}
        filtered = self.ctx.filtered_retrieval
        # The filtered frame is the one actually indexed below, so it must
        # carry the key columns as well as the packaged table.
        if not required.issubset(self.data.retrieval_events.columns) or not required.issubset(filtered.columns):
            callout("info", "No retrieval table", "Retrieval events are unavailable in the packaged data.")
            return

        ret = filtered[filtered["example_id"].astype(str) == str(selected_id)].copy()
        if ret.empty:
            callout("info", "No retrieval rows", "This example has no matching retrieval event rows under the current filters.")
            return

        chunks = self.data.chunks
        if "chunk_id" in chunks.columns:
            # Only pull columns the retrieval rows do not already have: merge
            # would otherwise suffix the overlap (``doc_id_x`` / ``doc_id_y``)
            # and the column would silently disappear from the table.
            extra_cols = [c for c in ("doc_id", "chunk_text") if c in chunks.columns and c not in ret.columns]
            if extra_cols:
                # One lookup row per chunk so the left join cannot multiply
                # retrieval rows.
                chunk_lookup = chunks[["chunk_id", *extra_cols]].drop_duplicates(subset="chunk_id")
                ret = ret.merge(chunk_lookup, on="chunk_id", how="left")

        show_cols = [c for c in ["rank", "chunk_id", "retrieval_score", "is_relevant", "doc_id", "chunk_text"] if c in ret.columns]
        table = ret[show_cols]
        if "rank" in table.columns:
            table = table.sort_values("rank")
        st.dataframe(table.head(12), use_container_width=True, hide_index=True)