Green-Energy-News-Monitoring / monitoring_app.py
Signe22's picture
Update monitoring_app.py
b00263d verified
import pandas as pd
import requests
import streamlit as st
# Page-level Streamlit settings: browser tab title/icon and the wide layout.
st.set_page_config(layout="wide", page_icon="🛠️", page_title="Monitoring Dashboard")

# Root URL of the HTTP API that the loader functions in this file query.
API_BASE_URL = "https://Signe22-Article-Data-API.hf.space"
@st.cache_data(ttl=300)
def load_monitoring_results() -> pd.DataFrame:
    """Fetch the monitoring results from the API as a DataFrame.

    Issues a GET to ``{API_BASE_URL}/monitoring/results`` with ``limit=500``
    as a query parameter and a 30 second timeout, then builds a DataFrame
    from the JSON payload. The call is wrapped in ``st.cache_data(ttl=300)``.

    Returns:
        The frame as-is when it is empty. Otherwise the same frame with
        ``published_at``, ``classified_at`` and ``evaluated_at`` parsed as UTC
        datetimes (unparseable values are coerced rather than raising) plus
        two derived columns, ``published_date`` and ``evaluated_date``.

    Raises:
        Whatever ``requests.get`` / ``raise_for_status`` raise for a failed
        request; nothing is caught here.
    """
    endpoint = f"{API_BASE_URL}/monitoring/results"
    reply = requests.get(endpoint, params={"limit": 500}, timeout=30)
    reply.raise_for_status()

    results = pd.DataFrame(reply.json())
    if results.empty:
        # Nothing to parse; the timestamp columns may not even exist.
        return results

    for timestamp_column in ("published_at", "classified_at", "evaluated_at"):
        results[timestamp_column] = pd.to_datetime(
            results[timestamp_column], errors="coerce", utc=True
        )

    # Calendar-date companions of the parsed timestamps.
    for timestamp_column, date_column in (
        ("published_at", "published_date"),
        ("evaluated_at", "evaluated_date"),
    ):
        results[date_column] = results[timestamp_column].dt.date

    return results
@st.cache_data(ttl=300)
def load_monitoring_summary() -> dict:
    """Fetch the monitoring summary from the API.

    Issues a GET to ``{API_BASE_URL}/monitoring/summary`` with a 30 second
    timeout and returns the decoded JSON body. The call is wrapped in
    ``st.cache_data(ttl=300)``.

    Raises:
        Whatever ``requests.get`` / ``raise_for_status`` raise for a failed
        request; nothing is caught here.
    """
    endpoint = f"{API_BASE_URL}/monitoring/summary"
    reply = requests.get(endpoint, timeout=30)
    reply.raise_for_status()
    summary = reply.json()
    return summary
def _sorted_unique_values(df: pd.DataFrame, column: str) -> list:
    """Return the sorted distinct non-null values of ``column``, or [] for an empty frame."""
    if df.empty:
        # Guard first so an empty frame is never indexed by column name.
        return []
    return sorted(df[column].dropna().unique().tolist())


def apply_filters(df: pd.DataFrame) -> pd.DataFrame:
    """Render the sidebar filter widgets and return the rows matching them.

    Widgets drawn in the sidebar: multiselects for ``label_judgment``
    (defaults to every option), ``predicted_label`` and ``source`` (default to
    no selection), a "needs review" checkbox, a published-date range picker
    and a free-text search over ``title`` / ``description``.

    Args:
        df: Monitoring results. Reads the columns ``label_judgment``,
            ``predicted_label``, ``source``, ``published_date``,
            ``requires_human_review``, ``title`` and ``description``.

    Returns:
        A filtered copy of ``df``; the input frame is not modified. A
        multiselect with nothing selected applies no filter for that column.
    """
    st.sidebar.header("Monitoring Filters")

    label_judgment_options = _sorted_unique_values(df, "label_judgment")
    predicted_label_options = _sorted_unique_values(df, "predicted_label")
    source_options = _sorted_unique_values(df, "source")

    selected_label_judgment = st.sidebar.multiselect(
        "Label judgment", label_judgment_options, default=label_judgment_options
    )
    selected_predicted_labels = st.sidebar.multiselect(
        "Predicted labels", predicted_label_options, default=[]
    )
    selected_sources = st.sidebar.multiselect("Sources", source_options, default=[])
    review_only = st.sidebar.checkbox("Only show articles needing review", value=False)

    min_date = df["published_date"].min() if not df.empty else None
    max_date = df["published_date"].max() if not df.empty else None

    date_range = None
    # Explicit null check rather than truthiness: a missing minimum/maximum
    # (NaN/NaT) is truthy in Python and would otherwise be handed to the
    # date widget as a bound.
    if pd.notna(min_date) and pd.notna(max_date):
        date_range = st.sidebar.date_input(
            "Published date range",
            value=(min_date, max_date),
            min_value=min_date,
            max_value=max_date,
        )

    search_term = st.sidebar.text_input("Search title or description")

    filtered = df.copy()

    if selected_label_judgment:
        filtered = filtered[filtered["label_judgment"].isin(selected_label_judgment)]

    if selected_predicted_labels:
        filtered = filtered[filtered["predicted_label"].isin(selected_predicted_labels)]

    if selected_sources:
        filtered = filtered[filtered["source"].isin(selected_sources)]

    if review_only:
        filtered = filtered[filtered["requires_human_review"] == 1]

    # Only filter once both ends of the range are present (the length check
    # skips the filter when the widget returns fewer than two dates).
    if date_range and len(date_range) == 2:
        start_date, end_date = date_range
        filtered = filtered[
            (filtered["published_date"] >= start_date)
            & (filtered["published_date"] <= end_date)
        ]

    if search_term:
        needle = search_term.lower().strip()
        # regex=False: the term is user-typed text, so match it literally.
        # With the default regex matching, input such as "c++" or "(" raises
        # re.error and characters like "." or "|" match unintended rows.
        title_hit = (
            filtered["title"].fillna("").str.lower().str.contains(needle, regex=False, na=False)
        )
        description_hit = (
            filtered["description"]
            .fillna("")
            .str.lower()
            .str.contains(needle, regex=False, na=False)
        )
        filtered = filtered[title_hit | description_hit]

    return filtered
def render_summary(summary: dict, df: pd.DataFrame) -> None:
    """Draw the overview tab: four headline metrics and a label-judgment chart.

    Args:
        summary: Summary mapping; ``total_monitored`` and ``needs_review`` are
            read with a default of 0.
        df: The filtered monitoring results. Its row count feeds the
            "Shown after filters" metric, and the share of rows whose
            ``overall_status`` is not ``"ok"`` feeds "Problem rate".
    """
    st.subheader("Monitoring Overview")

    total_col, review_col, shown_col, rate_col = st.columns(4)
    total_col.metric("Total monitored", summary.get("total_monitored", 0))
    review_col.metric("Needs review", summary.get("needs_review", 0))

    shown_rows = len(df)
    shown_col.metric("Shown after filters", shown_rows)

    if shown_rows:
        problem_rows = len(df[df["overall_status"] != "ok"])
        problem_rate = f"{(problem_rows / shown_rows * 100):.1f}%"
    else:
        # No rows: avoid the division and report a flat zero.
        problem_rate = "0.0%"
    rate_col.metric("Problem rate", problem_rate)

    if df.empty:
        st.info("No monitoring results match the current filters.")
        return

    st.markdown("#### Label judgment distribution")
    judgment_counts = df["label_judgment"].value_counts()
    judgment_table = judgment_counts.rename_axis("label_judgment").reset_index(name="count")
    st.bar_chart(judgment_table.set_index("label_judgment"))
def render_problem_patterns(df: pd.DataFrame) -> None:
    """Draw the problem-patterns tab.

    Restricts ``df`` to rows whose ``overall_status`` is not ``"ok"`` and
    shows, for those rows, one count table per predicted label and one per
    source. Shows an info or success message instead when there is no data or
    no problem rows.
    """
    st.subheader("Problem Patterns")

    if df.empty:
        st.info("No data available.")
        return

    problem_rows = df.loc[df["overall_status"] != "ok"]
    if problem_rows.empty:
        st.success("No current problem cases in the filtered selection.")
        return

    # (heading, column) pairs; each becomes a heading plus a value-count table.
    breakdowns = (
        ("#### Most problematic predicted labels", "predicted_label"),
        ("#### Most problematic sources", "source"),
    )
    for heading, column in breakdowns:
        st.markdown(heading)
        counts = problem_rows[column].value_counts()
        count_table = counts.rename_axis(column).reset_index(name="count")
        st.dataframe(count_table, use_container_width=True, hide_index=True)
def _format_timestamp(value) -> str:
    """Format a timestamp as ``YYYY-MM-DD HH:MM UTC``, or "Unknown" when it is null."""
    return value.strftime("%Y-%m-%d %H:%M UTC") if pd.notnull(value) else "Unknown"


def _render_case(row: pd.Series) -> None:
    """Draw one monitored article as an expander titled with the article title.

    Reads ``title``, ``predicted_label``, ``overall_status``, ``source``,
    ``published_at``, ``description``, ``label_judgment``,
    ``label_confidence``, ``label_explanation``, ``article_id``,
    ``evaluated_at`` and ``url`` from ``row``.
    """
    published_str = _format_timestamp(row["published_at"])
    evaluated_str = _format_timestamp(row["evaluated_at"])

    with st.expander(f"{row['title']}"):
        m1, m2, m3, m4 = st.columns(4)
        m1.markdown(f"**Predicted label:** {row['predicted_label']}")
        m2.markdown(f"**Overall status:** {row['overall_status']}")
        m3.markdown(f"**Source:** {row['source']}")
        m4.markdown(f"**Published:** {published_str}")

        st.markdown("**Description**")
        st.write(row["description"] if pd.notnull(row["description"]) else "No description")

        st.markdown("**Judge output**")
        st.markdown(f"**Label quality:** {row['label_judgment']} ({row['label_confidence']})")
        st.write(row["label_explanation"])

        st.markdown("**Metadata**")
        st.caption(f"Article ID: {row['article_id']}")
        st.caption(f"Evaluated at: {evaluated_str}")

        # Only offer a link when the URL is present and not blank.
        if pd.notnull(row["url"]) and str(row["url"]).strip():
            st.markdown(f"[Open article]({row['url']})")


def _render_case_list(cases: pd.DataFrame, slider_label: str, slider_key=None) -> None:
    """Draw a row-count slider (5-100, default 20), then the most recently evaluated cases."""
    max_rows = st.slider(slider_label, 5, 100, 20, key=slider_key)
    newest_first = cases.sort_values("evaluated_at", ascending=False).head(max_rows)
    for _, row in newest_first.iterrows():
        _render_case(row)


def render_review_queue(df: pd.DataFrame) -> None:
    """Draw the review-queue tab: rows whose ``requires_human_review`` equals 1.

    Shows an info message when ``df`` is empty and a success message when no
    row is flagged; otherwise lists the flagged rows, newest evaluation first.
    """
    st.subheader("Review Queue")

    if df.empty:
        st.info("No monitoring results available.")
        return

    queue_df = df[df["requires_human_review"] == 1]
    if queue_df.empty:
        st.success("No articles currently flagged for review in the filtered selection.")
        return

    _render_case_list(queue_df, "Number of review cases to display")


def render_correct_cases(df: pd.DataFrame) -> None:
    """Draw the correct-classifications tab: rows whose ``label_judgment`` is "correct".

    Shows an info message when ``df`` is empty or holds no such row; otherwise
    lists the matching rows, newest evaluation first.
    """
    st.subheader("Correct Classification Examples")

    if df.empty:
        st.info("No monitoring results available.")
        return

    correct_df = df[df["label_judgment"] == "correct"]
    if correct_df.empty:
        st.info("No correct classifications available.")
        return

    _render_case_list(
        correct_df,
        "Number of correct examples to display",
        slider_key="correct_slider",
    )
def render_full_table(df: pd.DataFrame) -> None:
    """Draw the table tab: a fixed column subset of ``df`` as a dataframe.

    ``published_at`` is shown as ``YYYY-MM-DD HH:MM`` text. An info message is
    shown instead when ``df`` is empty.
    """
    st.subheader("Monitoring Table")

    if df.empty:
        st.info("No rows to display.")
        return

    visible_columns = [
        "published_at",
        "source",
        "predicted_label",
        "label_judgment",
        "label_confidence",
        "requires_human_review",
        "title",
    ]
    # assign() returns a new frame, so the caller's data is left untouched.
    display_table = df[visible_columns].assign(
        published_at=df["published_at"].dt.strftime("%Y-%m-%d %H:%M")
    )
    st.dataframe(display_table, use_container_width=True, hide_index=True)
def main() -> None:
    """Build the dashboard page: intro text, data load, sidebar filters and five tabs.

    Any exception raised while loading the summary or the results is shown
    as an error message and ends the run; an empty result set shows a warning
    and ends the run as well.
    """
    st.title("🛠️ Monitoring Dashboard")
    st.write(
        "This dashboard helps inspect LLM-as-a-judge monitoring output in order to identify "
        "label accuracy issues and low-confidence cases that may require pipeline improvements."
    )

    try:
        summary = load_monitoring_summary()
        df = load_monitoring_results()
    except Exception as e:
        # Page-level boundary: report the failure instead of crashing the app.
        st.error(f"Failed to load monitoring data from API: {e}")
        return

    if df.empty:
        st.warning("No monitoring results found yet.")
        return

    filtered_df = apply_filters(df)

    tab_titles = [
        "Overview",
        "Problem Patterns",
        "Correct Classifications",
        "Review Queue",
        "Table",
    ]
    overview_tab, *detail_tabs = st.tabs(tab_titles)

    # The overview is the only tab that also needs the summary payload.
    with overview_tab:
        render_summary(summary, filtered_df)

    # Remaining tabs, in the same order as their titles above.
    detail_renderers = (
        render_problem_patterns,
        render_correct_cases,
        render_review_queue,
        render_full_table,
    )
    for tab, render in zip(detail_tabs, detail_renderers):
        with tab:
            render(filtered_df)


if __name__ == "__main__":
    main()