Spaces:

RoyAalekh
/

hackathon_code4change

Sleeping

File size: 31,379 Bytes

"""Analytics & Reports page - Compare simulation runs and analyze performance.

Features:
1. Simulation Comparison - Compare multiple simulation runs side-by-side
2. Performance Trends - Analyze metrics over time
3. Fairness Analysis - Evaluate equity and distribution
4. Report Generation - Export comprehensive analysis
"""

from __future__ import annotations

from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st

# Page configuration
st.set_page_config(
    page_title="Analytics & Reports",
    page_icon="chart",
    layout="wide",
)

st.title("Analytics & Reports")
st.markdown("Compare simulation runs and analyze system performance")

st.markdown("---")

# Main tabs
tab1, tab2, tab3, tab4 = st.tabs(
    [
        "Simulation Comparison",
        "Performance Trends",
        "Fairness Analysis",
        "Report Generation",
    ]
)

# TAB 1: Simulation Comparison
with tab1:
    st.markdown("### Simulation Comparison")
    st.markdown(
        "Compare multiple simulation runs to evaluate different policies and parameters."
    )

    # Check for available simulation runs (centralized base)
    from src.config.paths import get_runs_base

    runs_dir = get_runs_base()

    if not runs_dir.exists():
        st.warning(
            "No simulation outputs found. Run simulations first to generate data."
        )
    else:
        # Collect all run directories that actually contain a metrics.csv file.
        # Some runs may be nested (version folder inside timestamp). We treat every
        # directory that has metrics.csv as a runnable result.
        metric_files = list(runs_dir.rglob("metrics.csv"))
        run_paths = sorted({p.parent for p in metric_files})

        # Build label -> path map; label is relative path inside simulation_runs
        run_map = {str(p.relative_to(runs_dir)): p for p in run_paths}

        if len(run_map) < 2:
            st.info(
                "At least 2 simulation runs needed for comparison. Run more simulations to enable comparison."
            )
        else:
            st.markdown(f"**{len(run_map)} simulation run(s) available**")

            # Select runs to compare
            col1, col2 = st.columns(2)

            labels = sorted(run_map.keys())

            with col1:
                run1_label = st.selectbox(
                    "First simulation run", options=labels, key="compare_run1"
                )

            with col2:
                run2_options = [lbl for lbl in labels if lbl != run1_label]
                run2_label = st.selectbox(
                    "Second simulation run",
                    options=run2_options,
                    key="compare_run2",
                )

            if st.button("Compare Runs", type="primary"):
                # Load metrics from both runs
                run1_metrics_path = run_map[run1_label] / "metrics.csv"
                run2_metrics_path = run_map[run2_label] / "metrics.csv"

                if not run1_metrics_path.exists() or not run2_metrics_path.exists():
                    st.error("Metrics files not found for one or both runs.")
                else:
                    try:
                        df1 = pd.read_csv(run1_metrics_path)
                        df2 = pd.read_csv(run2_metrics_path)

                        st.success("Loaded metrics successfully")

                        # Show Key Insights from report.txt for both runs
                        st.markdown("#### Key Insights (from report.txt)")
                        col_ins_1, col_ins_2 = st.columns(2)

                        report1_path = run_map[run1_label] / "report.txt"
                        report2_path = run_map[run2_label] / "report.txt"

                        with col_ins_1:
                            st.markdown(f"**{run1_label}**")
                            if report1_path.exists():
                                st.code(
                                    report1_path.read_text(encoding="utf-8"),
                                    language="text",
                                )
                            else:
                                st.info("No report.txt found for this run.")

                        with col_ins_2:
                            st.markdown(f"**{run2_label}**")
                            if report2_path.exists():
                                st.code(
                                    report2_path.read_text(encoding="utf-8"),
                                    language="text",
                                )
                            else:
                                st.info("No report.txt found for this run.")

                        # Summary comparison
                        st.markdown("#### Summary Comparison")

                        col1, col2, col3 = st.columns(3)

                        with col1:
                            st.markdown(f"**{run1_label}**")
                            if "disposal_rate" in df1.columns:
                                avg_disposal1 = df1["disposal_rate"].mean()
                                st.metric("Avg. Disposal Rate", f"{avg_disposal1:.2%}")
                            if "utilization" in df1.columns:
                                avg_util1 = df1["utilization"].mean()
                                st.metric("Avg. Utilization", f"{avg_util1:.2%}")

                        with col2:
                            st.markdown(f"**{run2_label}**")
                            if "disposal_rate" in df2.columns:
                                avg_disposal2 = df2["disposal_rate"].mean()
                                st.metric("Avg. Disposal Rate", f"{avg_disposal2:.2%}")
                            if "utilization" in df2.columns:
                                avg_util2 = df2["utilization"].mean()
                                st.metric("Avg. Utilization", f"{avg_util2:.2%}")

                        with col3:
                            st.markdown("**Difference**")
                            if (
                                "disposal_rate" in df1.columns
                                and "disposal_rate" in df2.columns
                            ):
                                diff_disposal = avg_disposal2 - avg_disposal1
                                st.metric("Disposal Rate Δ", f"{diff_disposal:+.2%}")
                            if (
                                "utilization" in df1.columns
                                and "utilization" in df2.columns
                            ):
                                diff_util = avg_util2 - avg_util1
                                st.metric("Utilization Δ", f"{diff_util:+.2%}")

                        st.markdown("---")

                        # Time series comparison
                        st.markdown("#### Performance Over Time")

                        if (
                            "disposal_rate" in df1.columns
                            and "disposal_rate" in df2.columns
                        ):
                            fig = go.Figure()

                            fig.add_trace(
                                go.Scatter(
                                    x=df1.index,
                                    y=df1["disposal_rate"],
                                    mode="lines",
                                    name=run1_label,
                                    line=dict(color="blue"),
                                )
                            )

                            fig.add_trace(
                                go.Scatter(
                                    x=df2.index,
                                    y=df2["disposal_rate"],
                                    mode="lines",
                                    name=run2_label,
                                    line=dict(color="red"),
                                )
                            )

                            fig.update_layout(
                                title="Disposal Rate Comparison",
                                xaxis_title="Day",
                                yaxis_title="Disposal Rate",
                                height=400,
                            )

                            st.plotly_chart(fig, use_container_width=True)

                        if (
                            "utilization" in df1.columns
                            and "utilization" in df2.columns
                        ):
                            fig = go.Figure()

                            fig.add_trace(
                                go.Scatter(
                                    x=df1.index,
                                    y=df1["utilization"],
                                    mode="lines",
                                    name=run1_label,
                                    line=dict(color="blue"),
                                )
                            )

                            fig.add_trace(
                                go.Scatter(
                                    x=df2.index,
                                    y=df2["utilization"],
                                    mode="lines",
                                    name=run2_label,
                                    line=dict(color="red"),
                                )
                            )

                            fig.update_layout(
                                title="Utilization Comparison",
                                xaxis_title="Day",
                                yaxis_title="Utilization",
                                height=400,
                            )

                            st.plotly_chart(fig, use_container_width=True)

                    except Exception as e:
                        st.error(f"Error comparing runs: {e}")

# TAB 2: Performance Trends
with tab2:
    st.markdown("### Performance Trends")
    st.markdown("Analyze performance metrics across all simulation runs.")

    # Use centralized runs directory recursively
    from src.config.paths import get_runs_base

    runs_dir = get_runs_base()

    if not runs_dir.exists():
        st.warning("No simulation outputs found.")
    else:
        metric_files = list(runs_dir.rglob("metrics.csv"))
        run_paths = sorted({p.parent for p in metric_files})

        if not run_paths:
            st.info("No simulation runs found.")
        else:
            # Aggregate metrics from all runs
            all_metrics = []

            for run_dir in run_paths:
                metrics_path = run_dir / "metrics.csv"
                try:
                    df = pd.read_csv(metrics_path)
                    # Use relative label for clarity across nested structures
                    try:
                        df["run"] = str(run_dir.relative_to(runs_dir))
                    except ValueError:
                        # Fallback to folder name if not under base (shouldn't happen)
                        df["run"] = run_dir.name
                    all_metrics.append(df)
                except Exception:
                    pass  # Skip invalid metrics files

            if not all_metrics:
                st.warning("No valid metrics files found.")
            else:
                combined_df = pd.concat(all_metrics, ignore_index=True)

                st.markdown(f"**Loaded metrics from {len(all_metrics)} run(s)**")

                # Aggregate statistics
                st.markdown("#### Aggregate Statistics")

                col1, col2, col3 = st.columns(3)

                with col1:
                    if "disposal_rate" in combined_df.columns:
                        overall_avg = combined_df["disposal_rate"].mean()
                        st.metric("Overall Avg. Disposal Rate", f"{overall_avg:.2%}")

                with col2:
                    if "utilization" in combined_df.columns:
                        overall_util = combined_df["utilization"].mean()
                        st.metric("Overall Avg. Utilization", f"{overall_util:.2%}")

                with col3:
                    st.metric("Total Simulation Days", len(combined_df))

                st.markdown("---")

                # Distribution plots
                st.markdown("#### Metric Distributions")

                if "disposal_rate" in combined_df.columns:
                    fig = px.box(
                        combined_df,
                        x="run",
                        y="disposal_rate",
                        title="Disposal Rate Distribution by Run",
                        labels={
                            "disposal_rate": "Disposal Rate",
                            "run": "Simulation Run",
                        },
                    )
                    fig.update_layout(height=400)
                    st.plotly_chart(fig, use_container_width=True)

                if "utilization" in combined_df.columns:
                    fig = px.box(
                        combined_df,
                        x="run",
                        y="utilization",
                        title="Utilization Distribution by Run",
                        labels={"utilization": "Utilization", "run": "Simulation Run"},
                    )
                    fig.update_layout(height=400)
                    st.plotly_chart(fig, use_container_width=True)

# TAB 3: Fairness Analysis
with tab3:
    st.markdown("### Fairness Analysis")
    st.markdown("Evaluate equity and distribution of case handling across the system.")

    st.markdown("""
    Fairness metrics evaluate whether the scheduling system treats all cases equitably:
    - **Gini Coefficient**: Measures inequality in disposal times (0 = perfect equality, 1 = maximum inequality)
    - **Age Distribution**: Shows how long cases wait before disposal
    - **Case Type Balance**: Ensures no case type is systematically disadvantaged
    """)

    from src.config.paths import get_runs_base

    runs_dir = get_runs_base()

    if not runs_dir.exists():
        st.warning("No simulation outputs found.")
    else:
        event_files = list(runs_dir.rglob("events.csv"))
        run_event_paths = sorted({p.parent for p in event_files})

        if not run_event_paths:
            st.info("No simulation runs found.")
        else:
            # Select run for fairness analysis
            labels = [str(p.relative_to(runs_dir)) for p in run_event_paths]
            label_to_path = {str(p.relative_to(runs_dir)): p for p in run_event_paths}

            selected_run = st.selectbox(
                "Select simulation run for fairness analysis",
                options=labels,
                key="fairness_run",
            )

            # Look for events file (contains case-level data)
            events_path = label_to_path[selected_run] / "events.csv"

            if not events_path.exists():
                st.warning(
                    "Events file not found. Fairness analysis requires detailed event logs."
                )
            else:
                try:
                    events_df = pd.read_csv(events_path)

                    st.success("Loaded event data")

                    # Case age analysis
                    if "case_id" in events_df.columns and "date" in events_df.columns:
                        st.markdown("#### Case Age Distribution")

                        # Calculate case ages (simplified - would need filed_date for accurate calculation)
                        case_dates = events_df.groupby("case_id")["date"].agg(
                            ["min", "max"]
                        )
                        case_dates["age_days"] = (
                            pd.to_datetime(case_dates["max"])
                            - pd.to_datetime(case_dates["min"])
                        ).dt.days

                        fig = px.histogram(
                            case_dates,
                            x="age_days",
                            nbins=30,
                            title="Distribution of Case Ages",
                            labels={
                                "age_days": "Age (days)",
                                "count": "Number of Cases",
                            },
                        )
                        fig.update_layout(height=400)
                        st.plotly_chart(fig, use_container_width=True)

                        # Summary statistics
                        col1, col2, col3 = st.columns(3)

                        with col1:
                            st.metric(
                                "Median Age",
                                f"{case_dates['age_days'].median():.0f} days",
                            )
                        with col2:
                            st.metric(
                                "Mean Age", f"{case_dates['age_days'].mean():.0f} days"
                            )
                        with col3:
                            st.metric(
                                "Max Age", f"{case_dates['age_days'].max():.0f} days"
                            )

                        # Additional Fairness Metrics: Gini and Lorenz Curve
                        st.markdown("#### Inequality Metrics (Fairness)")

                        def _gini(values: np.ndarray) -> float:
                            v = np.asarray(values, dtype=float)
                            v = v[np.isfinite(v)]
                            v = v[v >= 0]
                            if v.size == 0:
                                return float("nan")
                            if np.all(v == 0):
                                return 0.0
                            v_sorted = np.sort(v)
                            n = v_sorted.size
                            cumulative = np.cumsum(v_sorted)
                            # Gini based on cumulative shares
                            gini = (n + 1 - 2 * np.sum(cumulative) / cumulative[-1]) / n
                            return float(gini)

                        ages = case_dates["age_days"].to_numpy()
                        gini_age = _gini(ages)

                        col_a, col_b = st.columns(2)
                        with col_a:
                            if np.isfinite(gini_age):
                                st.metric("Gini (Age Inequality)", f"{gini_age:.3f}")
                            else:
                                st.info("Gini (Age) not available")

                        # Lorenz curve for ages
                        with col_b:
                            try:
                                ages_clean = ages[np.isfinite(ages)]
                                ages_clean = ages_clean[ages_clean >= 0]
                                if ages_clean.size > 0:
                                    ages_sorted = np.sort(ages_clean)
                                    cum_ages = np.cumsum(ages_sorted)
                                    cum_ages = np.insert(cum_ages, 0, 0)
                                    cum_pop = np.linspace(0, 1, num=cum_ages.size)
                                    lorenz = cum_ages / cum_ages[-1]
                                    fig_lorenz = go.Figure()
                                    fig_lorenz.add_trace(
                                        go.Scatter(
                                            x=cum_pop,
                                            y=lorenz,
                                            mode="lines",
                                            name="Lorenz",
                                        )
                                    )
                                    fig_lorenz.add_trace(
                                        go.Scatter(
                                            x=[0, 1],
                                            y=[0, 1],
                                            mode="lines",
                                            name="Equality",
                                            line=dict(dash="dash"),
                                        )
                                    )
                                    fig_lorenz.update_layout(
                                        title="Lorenz Curve of Case Ages",
                                        xaxis_title="Cumulative share of cases",
                                        yaxis_title="Cumulative share of total age",
                                        height=350,
                                    )
                                    st.plotly_chart(
                                        fig_lorenz, use_container_width=True
                                    )
                                else:
                                    st.info("Not enough data to plot Lorenz curve")
                            except Exception:
                                st.info(
                                    "Unable to compute Lorenz curve for current data"
                                )

                    # Case type fairness
                    if "case_type" in events_df.columns:
                        st.markdown("---")
                        st.markdown("#### Case Type Balance")

                        case_type_counts = (
                            events_df["case_type"].value_counts().reset_index()
                        )
                        case_type_counts.columns = ["case_type", "count"]

                        fig = px.bar(
                            case_type_counts.head(10),
                            x="case_type",
                            y="count",
                            title="Top 10 Case Types by Hearing Count",
                            labels={
                                "case_type": "Case Type",
                                "count": "Number of Hearings",
                            },
                        )
                        fig.update_layout(height=400, xaxis_tickangle=-45)
                        st.plotly_chart(fig, use_container_width=True)

                        # Age distribution by case type (top N by cases)
                        st.markdown("#### Age Distribution by Case Type (Top 8)")
                        try:
                            # Map each case_id to a case_type (take the first occurrence)
                            cid_to_type = (
                                events_df.sort_values("date")
                                .groupby("case_id")["case_type"]
                                .first()
                            )
                            age_with_type = (
                                case_dates[["age_days"]]
                                .join(cid_to_type, how="left")
                                .dropna(
                                    subset=["case_type"]
                                )  # keep only cases with type
                            )
                            top_types = (
                                age_with_type["case_type"]
                                .value_counts()
                                .head(8)
                                .index.tolist()
                            )
                            filt = age_with_type["case_type"].isin(top_types)
                            fig_box = px.box(
                                age_with_type[filt],
                                x="case_type",
                                y="age_days",
                                points="outliers",
                                title="Case Age by Case Type (Top 8)",
                                labels={
                                    "case_type": "Case Type",
                                    "age_days": "Age (days)",
                                },
                            )
                            fig_box.update_layout(height=420, xaxis_tickangle=-45)
                            st.plotly_chart(fig_box, use_container_width=True)

                            # Gini by case type (Top 8)
                            st.markdown("#### Inequality by Case Type (Gini)")
                            gini_rows = []
                            for ctype in top_types:
                                vals = age_with_type.loc[
                                    age_with_type["case_type"] == ctype, "age_days"
                                ].to_numpy()
                                g = _gini(vals)
                                gini_rows.append({"case_type": ctype, "gini": g})
                            gini_df = pd.DataFrame(gini_rows).dropna()
                            if not gini_df.empty:
                                fig_gini = px.bar(
                                    gini_df,
                                    x="case_type",
                                    y="gini",
                                    title="Gini Coefficient by Case Type (Top 8)",
                                    labels={"case_type": "Case Type", "gini": "Gini"},
                                )
                                fig_gini.update_layout(
                                    height=380, xaxis_tickangle=-45, yaxis_range=[0, 1]
                                )
                                st.plotly_chart(fig_gini, use_container_width=True)
                            else:
                                st.info("Insufficient data to compute per-type Gini")
                        except Exception as _:
                            st.info(
                                "Unable to compute per-type age distributions for current data"
                            )

                except Exception as e:
                    st.error(f"Error loading events data: {e}")

# TAB 4: Report Generation
with tab4:
    st.markdown("### Report Generation")
    st.markdown(
        "Generate comprehensive reports summarizing system performance and analysis."
    )

    outputs_dir = Path("outputs")
    runs_dir = outputs_dir / "simulation_runs"

    if not runs_dir.exists():
        st.warning("No simulation outputs found.")
    else:
        metric_files = list(runs_dir.rglob("metrics.csv"))
        run_paths = sorted({p.parent for p in metric_files})

        if not run_paths:
            st.info("No simulation runs found.")
        else:
            st.markdown("#### Select Data for Report")

            # Multi-select runs
            labels = [str(p.relative_to(runs_dir)) for p in run_paths]
            label_to_path = {str(p.relative_to(runs_dir)): p for p in run_paths}

            selected_runs = st.multiselect(
                "Include simulation runs",
                options=labels,
                default=[labels[0]] if labels else [],
                key="report_runs",
            )

            # Report options
            include_metrics = st.checkbox("Include performance metrics", value=True)
            include_fairness = st.checkbox("Include fairness analysis", value=True)
            include_comparison = st.checkbox(
                "Include run comparisons", value=len(selected_runs) > 1
            )

            if st.button("Generate Report", type="primary", use_container_width=True):
                if not selected_runs:
                    st.error("Select at least one simulation run")
                else:
                    with st.spinner("Generating report..."):
                        # Create report content
                        report_sections = []

                        # Header
                        report_sections.append(
                            "# Court Scheduling System - Performance Report"
                        )
                        report_sections.append(
                            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
                        )
                        report_sections.append(
                            f"Runs included: {', '.join(selected_runs)}"
                        )
                        report_sections.append("")

                        # Performance metrics
                        if include_metrics:
                            report_sections.append("## Performance Metrics")

                            for run_name in selected_runs:
                                metrics_path = label_to_path[run_name] / "metrics.csv"
                                if metrics_path.exists():
                                    df = pd.read_csv(metrics_path)

                                    report_sections.append(f"### {run_name}")

                                    if "disposal_rate" in df.columns:
                                        avg_disposal = df["disposal_rate"].mean()
                                        report_sections.append(
                                            f"- Average Disposal Rate: {avg_disposal:.2%}"
                                        )

                                    if "utilization" in df.columns:
                                        avg_util = df["utilization"].mean()
                                        report_sections.append(
                                            f"- Average Utilization: {avg_util:.2%}"
                                        )

                                    report_sections.append(
                                        f"- Simulation Days: {len(df)}"
                                    )
                                    report_sections.append("")

                        # Comparison
                        if include_comparison and len(selected_runs) > 1:
                            report_sections.append("## Comparison Analysis")
                            report_sections.append(
                                f"Comparing: {selected_runs[0]} vs {selected_runs[1]}"
                            )
                            report_sections.append("")

                        # Fairness
                        if include_fairness:
                            report_sections.append("## Fairness Analysis")
                            report_sections.append(
                                "Fairness metrics evaluate equitable treatment of all cases."
                            )
                            report_sections.append("")

                        # Footer
                        report_sections.append("---")
                        report_sections.append(
                            "Report generated by Court Scheduling System Analytics"
                        )

                        report_content = "\n".join(report_sections)

                        # Display report
                        st.markdown("#### Report Preview")
                        st.markdown(report_content)

                        # Download button
                        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

                        st.download_button(
                            label="Download Report (Markdown)",
                            data=report_content,
                            file_name=f"scheduling_report_{timestamp}.md",
                            mime="text/markdown",
                        )

# Footer
st.markdown("---")
st.caption("Analytics & Reports - Performance analysis and comparative evaluation")