| from __future__ import annotations |
|
|
| import numpy as np |
| import pandas as pd |
| import streamlit as st |
|
|
| from src.charts import bubble_quality, heatmap |
| from src.ui import callout, section_title |
|
|
|
|
class QualityMapViewMixin:
    """Quality maps across domain, difficulty, and risk slices.

    This is a mixin: the host class is expected to provide ``self.ctx``
    (read here for ``filtered_eval``, ``risk_slices`` and
    ``demand_coverage``) and a ``self._plot(figure, key)`` method.
    """

    # (metric column, chart title, plot key) for the two side-by-side heatmaps.
    # The separators are U+00B7 (middle dot) and U+00D7 (multiplication sign);
    # they are written as escapes so the titles survive any re-encoding of
    # this file instead of degrading into mojibake.
    _HEATMAP_PANELS = (
        (
            "correct_rate",
            "Correctness \u00b7 domain \u00d7 difficulty",
            "quality_correctness",
        ),
        (
            "hallucination_rate",
            "Hallucination \u00b7 domain \u00d7 difficulty",
            "quality_hallucination",
        ),
    )

    def _page_quality_map(self) -> None:
        """Render the Quality Map page.

        Draws, in order: the section title, two heatmaps (one per entry of
        ``_HEATMAP_PANELS``) in a two-column layout built from the
        domain/difficulty aggregate of ``self.ctx.filtered_eval``, a bubble
        chart of ``self.ctx.risk_slices``, and an informational callout
        when ``self.ctx.demand_coverage`` is not empty.
        """
        section_title(
            "Quality Map",
            "Where quality is strong, unstable, or worth deeper review.",
        )
        domain_diff = self._domain_difficulty_table(self.ctx.filtered_eval)

        columns = st.columns(2, gap="large")
        for column, (metric, title, plot_key) in zip(columns, self._HEATMAP_PANELS):
            with column:
                self._plot(
                    heatmap(domain_diff, "difficulty", "domain", metric, title),
                    plot_key,
                )

        self._plot(
            bubble_quality(self.ctx.risk_slices, "Risk slice map"),
            "quality_risk_bubble",
        )

        if not self.ctx.demand_coverage.empty:
            callout(
                "info",
                "Coverage interpretation",
                "Positive demand-minus-corpus means the domain receives more evaluation demand than its corpus document share.",
            )

    @staticmethod
    def _domain_difficulty_table(df: pd.DataFrame) -> pd.DataFrame:
        """Aggregate evaluation rows per (domain, difficulty) pair.

        Args:
            df: Frame that should contain ``domain`` and ``difficulty``
                columns and, optionally, ``is_correct``,
                ``hallucination_flag`` and ``recall_at_10``.

        Returns:
            A column-less empty frame when ``df`` is empty or lacks either
            grouping column. Otherwise one row per (domain, difficulty)
            group with ``n`` (row count) and the group means
            ``correct_rate``, ``hallucination_rate`` and ``recall_at_10``.
            A metric whose source column is absent from ``df`` is still
            present in the result, filled with NaN.
        """
        group_keys = ["domain", "difficulty"]
        if df.empty or not set(group_keys).issubset(df.columns):
            return pd.DataFrame()

        # Output metric name -> source column averaged to produce it.
        metric_sources = {
            "correct_rate": "is_correct",
            "hallucination_rate": "hallucination_flag",
            "recall_at_10": "recall_at_10",
        }

        aggs: dict[str, tuple[str, str]] = {"n": ("domain", "size")}
        aggs.update(
            (metric, (source, "mean"))
            for metric, source in metric_sources.items()
            if source in df.columns
        )

        # dropna=False keeps rows whose domain/difficulty is missing as
        # their own group instead of silently discarding them.
        out = df.groupby(group_keys, dropna=False).agg(**aggs).reset_index()

        # Guarantee a stable schema for downstream charts.
        for metric in metric_sources:
            if metric not in out.columns:
                out[metric] = np.nan
        return out
|
|
|
|