from __future__ import annotations import pandas as pd import streamlit as st from src.analytics import RETRIEVAL_OK_THRESHOLD, top_examples from src.text_search import literal_text_mask from src.ui import callout, section_title, trace_box class TraceExplorerViewMixin: """Example-level trace review page.""" def _page_trace_explorer(self) -> None: section_title("Trace Explorer", "Inspect examples, retrieved chunks, diagnosis, and metadata.") mode = st.radio("Example queue", ["High risk", "Incorrect", "Hallucination", "Low retrieval"], horizontal=True, key="trace_mode") examples = top_examples(self.ctx.filtered_eval, mode=mode, n=150, reference_df=self.data.eval_runs) search = st.text_input("Search questions", "", key="trace_search") if search and "query" in examples.columns: examples = examples[literal_text_mask(examples["query"], search)] if examples.empty: callout("warn", "No examples", "Try another queue or relax filters.") return id_col = "example_id" if "example_id" in examples.columns else examples.columns[0] selected_id = st.selectbox("Select example", examples[id_col].astype(str).tolist(), key="trace_example_id") ex = examples[examples[id_col].astype(str) == selected_id].iloc[0] self._render_trace_detail(ex, selected_id) def _render_trace_detail(self, ex: pd.Series, selected_id: str) -> None: left, middle, right = st.columns([1.15, 1.0, 0.85], gap="large") with left: trace_box("Question", str(ex.get("query", "No query column available."))) trace_box("Gold answer", str(ex.get("gold_answer", "No gold answer available."))) with middle: st.json( { "domain": ex.get("domain"), "scenario_type": ex.get("scenario_type"), "difficulty": ex.get("difficulty"), "correct": ex.get("is_correct"), "hallucination": ex.get("hallucination_flag"), "recall_at_10": ex.get("recall_at_10"), "mrr_at_10": ex.get("mrr_at_10"), } ) with right: diagnosis, kind = self._diagnose_example(ex) callout(kind, diagnosis, "Use the retrieved chunks below to validate evidence and decide the next debugging action.") self._render_retrieved_chunks(selected_id) @staticmethod def _diagnose_example(ex: pd.Series) -> tuple[str, str]: correct = float(ex.get("is_correct", 0) or 0) hallucination = float(ex.get("hallucination_flag", 0) or 0) recall = float(ex.get("recall_at_10", 0) or 0) if hallucination >= 0.5: return "Hallucination review needed", "bad" if recall < RETRIEVAL_OK_THRESHOLD and correct < 0.5: return "Likely retrieval failure", "bad" if recall >= RETRIEVAL_OK_THRESHOLD and correct < 0.5: return "Likely generation failure", "warn" return "Healthy or recovered case", "good" def _render_retrieved_chunks(self, selected_id: str) -> None: if "example_id" not in self.data.retrieval_events.columns or "chunk_id" not in self.data.retrieval_events.columns: callout("info", "No retrieval table", "Retrieval events are unavailable in the packaged data.") return ret = self.ctx.filtered_retrieval[self.ctx.filtered_retrieval["example_id"].astype(str) == str(selected_id)].copy() if ret.empty: callout("info", "No retrieval rows", "This example has no matching retrieval event rows under the current filters.") return chunk_lookup = self.data.chunks[[c for c in ["chunk_id", "doc_id", "chunk_text"] if c in self.data.chunks.columns]].copy() ret = ret.merge(chunk_lookup, on="chunk_id", how="left") show_cols = [c for c in ["rank", "chunk_id", "retrieval_score", "is_relevant", "doc_id", "chunk_text"] if c in ret.columns] st.dataframe(ret[show_cols].sort_values("rank").head(12), use_container_width=True, hide_index=True)