from __future__ import annotations

import numpy as np
import pandas as pd
import streamlit as st

from src.charts import bubble_quality, heatmap
from src.ui import callout, section_title


class QualityMapViewMixin:
    """Quality maps across domain, difficulty, and risk slices.

    This is a mixin: it reads ``self.ctx`` (``filtered_eval``, ``risk_slices``
    and ``demand_coverage``) and calls ``self._plot``, none of which are
    defined here, so the host class is expected to provide them.
    """

    def _page_quality_map(self) -> None:
        """Render the "Quality Map" page.

        Draws two side-by-side heatmaps (correctness and hallucination rate
        by domain and difficulty), then the risk-slice bubble chart, and
        finally an informational callout when demand-coverage data is
        present.
        """
        section_title(
            "Quality Map",
            "Where quality is strong, unstable, or worth deeper review.",
        )

        # Aggregate once; both heatmaps read different metrics off the same table.
        domain_diff = self._domain_difficulty_table(self.ctx.filtered_eval)

        left, right = st.columns(2, gap="large")
        with left:
            self._plot(
                heatmap(
                    domain_diff,
                    "difficulty",
                    "domain",
                    "correct_rate",
                    "Correctness · domain × difficulty",
                ),
                "quality_correctness",
            )
        with right:
            self._plot(
                heatmap(
                    domain_diff,
                    "difficulty",
                    "domain",
                    "hallucination_rate",
                    "Hallucination · domain × difficulty",
                ),
                "quality_hallucination",
            )

        self._plot(
            bubble_quality(self.ctx.risk_slices, "Risk slice map"),
            "quality_risk_bubble",
        )

        if not self.ctx.demand_coverage.empty:
            callout(
                "info",
                "Coverage interpretation",
                "Positive demand-minus-corpus means the domain receives more evaluation demand than its corpus document share.",
            )

    @staticmethod
    def _domain_difficulty_table(df: pd.DataFrame) -> pd.DataFrame:
        """Aggregate evaluation rows into one row per (domain, difficulty).

        Args:
            df: Evaluation rows. Must contain ``domain`` and ``difficulty``
                to be aggregated; ``is_correct``, ``hallucination_flag`` and
                ``recall_at_10`` are each optional.

        Returns:
            A frame with columns ``domain``, ``difficulty``, ``n`` (group
            size) and the mean-based metrics ``correct_rate``,
            ``hallucination_rate`` and ``recall_at_10``. A metric whose source
            column is absent from ``df`` is filled with NaN. When ``df`` is
            empty or lacks a grouping column, an empty frame with the same
            column set is returned, so callers can reference the metric
            columns unconditionally.
        """
        # Output metric name -> source column that is averaged to produce it.
        metric_sources = {
            "correct_rate": "is_correct",
            "hallucination_rate": "hallucination_flag",
            "recall_at_10": "recall_at_10",
        }
        group_keys = ["domain", "difficulty"]

        if df.empty or not set(group_keys).issubset(df.columns):
            # Keep the schema on the empty path: the page hands this table to
            # heatmap() together with metric column names, and a column-less
            # frame would not match what the populated path returns.
            return pd.DataFrame(columns=[*group_keys, "n", *metric_sources])

        aggs: dict[str, tuple[str, str]] = {"n": ("domain", "size")}
        aggs.update(
            {
                metric: (source, "mean")
                for metric, source in metric_sources.items()
                if source in df.columns
            }
        )

        # dropna=False keeps rows whose domain/difficulty is missing as their
        # own group instead of silently dropping them from the counts.
        table = df.groupby(group_keys, dropna=False).agg(**aggs).reset_index()

        for metric in metric_sources:
            if metric not in table.columns:
                table[metric] = np.nan
        return table