# Streamlit dashboard for inspecting LLM-as-a-judge monitoring output served
# by the article data API (see API_BASE_URL below).

import pandas as pd
import requests
import streamlit as st

st.set_page_config(
    page_title="Monitoring Dashboard",
    page_icon="🛠️",
    layout="wide",
)

API_BASE_URL = "https://Signe22-Article-Data-API.hf.space"

# Columns of the monitoring results that are parsed into UTC timestamps.
_DATETIME_COLUMNS = ("published_at", "classified_at", "evaluated_at")

# Display format shared by every per-article timestamp in the dashboard.
_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M UTC"


@st.cache_data(ttl=300)
def load_monitoring_results() -> pd.DataFrame:
    """Fetch up to 500 monitoring results from the API as a DataFrame.

    The timestamp columns are parsed as UTC datetimes (unparseable values
    become ``NaT``) and two derived columns, ``published_date`` and
    ``evaluated_date``, are added. An empty response is returned as an empty
    DataFrame without any of that post-processing.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(
        f"{API_BASE_URL}/monitoring/results",
        params={"limit": 500},
        timeout=30,
    )
    response.raise_for_status()

    df = pd.DataFrame(response.json())
    if df.empty:
        return df

    for column in _DATETIME_COLUMNS:
        df[column] = pd.to_datetime(df[column], errors="coerce", utc=True)

    df["published_date"] = df["published_at"].dt.date
    df["evaluated_date"] = df["evaluated_at"].dt.date
    return df


@st.cache_data(ttl=300)
def load_monitoring_summary() -> dict:
    """Fetch the monitoring summary from the API and return the decoded JSON.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(f"{API_BASE_URL}/monitoring/summary", timeout=30)
    response.raise_for_status()
    return response.json()


def _sorted_options(df: pd.DataFrame, column: str) -> list:
    """Return the sorted distinct non-null values of *column* ([] if empty)."""
    if df.empty:
        return []
    return sorted(df[column].dropna().unique().tolist())


def _value_count_frame(values: pd.Series, axis_name: str) -> pd.DataFrame:
    """Return a two-column frame (*axis_name*, ``count``) of value counts."""
    return (
        values
        .value_counts()
        .rename_axis(axis_name)
        .reset_index(name="count")
    )


def _format_timestamp(value) -> str:
    """Format a timestamp for display, or ``"Unknown"`` when it is missing."""
    if pd.notnull(value):
        return value.strftime(_TIMESTAMP_FORMAT)
    return "Unknown"


def _render_article_case(row: pd.Series) -> None:
    """Render one monitored article as a collapsible expander."""
    published_str = _format_timestamp(row["published_at"])
    evaluated_str = _format_timestamp(row["evaluated_at"])

    with st.expander(f"{row['title']}"):
        m1, m2, m3, m4 = st.columns(4)
        m1.markdown(f"**Predicted label:** {row['predicted_label']}")
        m2.markdown(f"**Overall status:** {row['overall_status']}")
        m3.markdown(f"**Source:** {row['source']}")
        m4.markdown(f"**Published:** {published_str}")

        st.markdown("**Description**")
        st.write(
            row["description"]
            if pd.notnull(row["description"])
            else "No description"
        )

        st.markdown("**Judge output**")
        st.markdown(
            f"**Label quality:** {row['label_judgment']} "
            f"({row['label_confidence']})"
        )
        st.write(row["label_explanation"])

        st.markdown("**Metadata**")
        st.caption(f"Article ID: {row['article_id']}")
        st.caption(f"Evaluated at: {evaluated_str}")

        # Only offer a link when the URL is present and not just whitespace.
        if pd.notnull(row["url"]) and str(row["url"]).strip():
            st.markdown(f"[Open article]({row['url']})")


def _render_latest_cases(cases: pd.DataFrame, max_rows: int) -> None:
    """Render the *max_rows* most recently evaluated rows of *cases*."""
    latest = cases.sort_values("evaluated_at", ascending=False).head(max_rows)
    for _, row in latest.iterrows():
        _render_article_case(row)


def apply_filters(df: pd.DataFrame) -> pd.DataFrame:
    """Draw the sidebar filter widgets and return the rows that pass them.

    Filters: label judgment (all selected by default), predicted label,
    source, a "needs review" checkbox, a published-date range and a free-text
    search over title and description. An empty multiselect means "no
    restriction" for that field.

    Args:
        df: Monitoring results as produced by ``load_monitoring_results``.

    Returns:
        A filtered copy of *df*; the input frame is not modified.
    """
    st.sidebar.header("Monitoring Filters")

    label_judgment_options = _sorted_options(df, "label_judgment")
    predicted_label_options = _sorted_options(df, "predicted_label")
    source_options = _sorted_options(df, "source")

    selected_label_judgment = st.sidebar.multiselect(
        "Label judgment",
        label_judgment_options,
        default=label_judgment_options,
    )
    selected_predicted_labels = st.sidebar.multiselect(
        "Predicted labels",
        predicted_label_options,
        default=[],
    )
    selected_sources = st.sidebar.multiselect(
        "Sources",
        source_options,
        default=[],
    )
    review_only = st.sidebar.checkbox(
        "Only show articles needing review",
        value=False,
    )

    # Rows whose published_at could not be parsed carry NaT here; drop them
    # before taking min/max so the bounds are always real dates.
    date_range = None
    if not df.empty:
        known_dates = df["published_date"].dropna()
        if not known_dates.empty:
            min_date = known_dates.min()
            max_date = known_dates.max()
            date_range = st.sidebar.date_input(
                "Published date range",
                value=(min_date, max_date),
                min_value=min_date,
                max_value=max_date,
            )

    search_term = st.sidebar.text_input("Search title or description")

    filtered = df.copy()

    if selected_label_judgment:
        filtered = filtered[
            filtered["label_judgment"].isin(selected_label_judgment)
        ]

    if selected_predicted_labels:
        filtered = filtered[
            filtered["predicted_label"].isin(selected_predicted_labels)
        ]

    if selected_sources:
        filtered = filtered[filtered["source"].isin(selected_sources)]

    if review_only:
        filtered = filtered[filtered["requires_human_review"] == 1]

    # While the user is still picking the second date the widget yields fewer
    # than two values; the date filter is skipped until the range is complete.
    if isinstance(date_range, (tuple, list)) and len(date_range) == 2:
        start_date, end_date = date_range
        # Compare as datetimes so missing dates (NaT) evaluate to False
        # instead of being ordered against datetime.date objects.
        published = pd.to_datetime(filtered["published_date"], errors="coerce")
        in_range = published.between(
            pd.Timestamp(start_date),
            pd.Timestamp(end_date),
        )
        filtered = filtered[in_range]

    search_term = search_term.lower().strip()
    if search_term:
        # regex=False: the search box takes literal text, so characters such
        # as "(", "+" or "?" must not be interpreted as a pattern.
        in_title = (
            filtered["title"]
            .fillna("")
            .str.lower()
            .str.contains(search_term, na=False, regex=False)
        )
        in_description = (
            filtered["description"]
            .fillna("")
            .str.lower()
            .str.contains(search_term, na=False, regex=False)
        )
        filtered = filtered[in_title | in_description]

    return filtered


def render_summary(summary: dict, df: pd.DataFrame) -> None:
    """Render headline metrics and the label-judgment distribution chart.

    Args:
        summary: Decoded summary payload; ``total_monitored`` and
            ``needs_review`` are read from it (defaulting to 0).
        df: The filtered monitoring results.
    """
    st.subheader("Monitoring Overview")

    if len(df):
        problem_count = int((df["overall_status"] != "ok").sum())
        problem_rate = f"{(problem_count / len(df) * 100):.1f}%"
    else:
        problem_rate = "0.0%"

    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Total monitored", summary.get("total_monitored", 0))
    c2.metric("Needs review", summary.get("needs_review", 0))
    c3.metric("Shown after filters", len(df))
    c4.metric("Problem rate", problem_rate)

    if df.empty:
        st.info("No monitoring results match the current filters.")
        return

    st.markdown("#### Label judgment distribution")
    label_df = _value_count_frame(df["label_judgment"], "label_judgment")
    st.bar_chart(label_df.set_index("label_judgment"))


def render_problem_patterns(df: pd.DataFrame) -> None:
    """Show which predicted labels and sources occur most among problem rows.

    A problem row is one whose ``overall_status`` is anything other than
    ``"ok"``.
    """
    st.subheader("Problem Patterns")

    if df.empty:
        st.info("No data available.")
        return

    issues = df[df["overall_status"] != "ok"]
    if issues.empty:
        st.success("No current problem cases in the filtered selection.")
        return

    st.markdown("#### Most problematic predicted labels")
    bad_labels = _value_count_frame(issues["predicted_label"], "predicted_label")
    st.dataframe(bad_labels, use_container_width=True, hide_index=True)

    st.markdown("#### Most problematic sources")
    bad_sources = _value_count_frame(issues["source"], "source")
    st.dataframe(bad_sources, use_container_width=True, hide_index=True)


def render_review_queue(df: pd.DataFrame) -> None:
    """List articles with ``requires_human_review == 1``, newest evaluation first."""
    st.subheader("Review Queue")

    if df.empty:
        st.info("No monitoring results available.")
        return

    queue_df = df[df["requires_human_review"] == 1].copy()
    if queue_df.empty:
        st.success(
            "No articles currently flagged for review in the filtered selection."
        )
        return

    max_rows = st.slider("Number of review cases to display", 5, 100, 20)
    _render_latest_cases(queue_df, max_rows)


def render_correct_cases(df: pd.DataFrame) -> None:
    """List articles judged ``"correct"``, newest evaluation first."""
    st.subheader("Correct Classification Examples")

    if df.empty:
        st.info("No monitoring results available.")
        return

    correct_df = df[df["label_judgment"] == "correct"].copy()
    if correct_df.empty:
        st.info("No correct classifications available.")
        return

    max_rows = st.slider(
        "Number of correct examples to display",
        5,
        100,
        20,
        key="correct_slider",
    )
    _render_latest_cases(correct_df, max_rows)


def render_full_table(df: pd.DataFrame) -> None:
    """Render a flat table of the key columns of the filtered results."""
    st.subheader("Monitoring Table")

    if df.empty:
        st.info("No rows to display.")
        return

    table_df = df[
        [
            "published_at",
            "source",
            "predicted_label",
            "label_judgment",
            "label_confidence",
            "requires_human_review",
            "title",
        ]
    ].copy()

    table_df["published_at"] = table_df["published_at"].dt.strftime(
        "%Y-%m-%d %H:%M"
    )
    st.dataframe(table_df, use_container_width=True, hide_index=True)


def main() -> None:
    """Load the monitoring data, apply sidebar filters and render all tabs."""
    st.title("🛠️ Monitoring Dashboard")
    st.write(
        "This dashboard helps inspect LLM-as-a-judge monitoring output in order to identify "
        "label accuracy issues and low-confidence cases that may require pipeline improvements."
    )

    # Top-level boundary: any load failure is reported in the UI rather than
    # surfacing as a raw traceback.
    try:
        summary = load_monitoring_summary()
        df = load_monitoring_results()
    except Exception as e:
        st.error(f"Failed to load monitoring data from API: {e}")
        return

    if df.empty:
        st.warning("No monitoring results found yet.")
        return

    filtered_df = apply_filters(df)

    tab1, tab2, tab3, tab4, tab5 = st.tabs(
        [
            "Overview",
            "Problem Patterns",
            "Correct Classifications",
            "Review Queue",
            "Table",
        ]
    )

    with tab1:
        render_summary(summary, filtered_df)

    with tab2:
        render_problem_patterns(filtered_df)

    with tab3:
        render_correct_cases(filtered_df)

    with tab4:
        render_review_queue(filtered_df)

    with tab5:
        render_full_table(filtered_df)


if __name__ == "__main__":
    main()