fmegahed commited on
Commit
0191ae7
·
0 Parent(s):

Initial deploy: Time Series Visualizer v0.1.0

Browse files
Dockerfile ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ RUN apt-get update && \
4
+ apt-get install -y --no-install-recommends gcc g++ && \
5
+ rm -rf /var/lib/apt/lists/*
6
+
7
+ RUN useradd -m -u 1000 user
8
+ USER user
9
+ ENV PATH="/home/user/.local/bin:$PATH"
10
+
11
+ WORKDIR /app
12
+
13
+ COPY --chown=user:user requirements.txt .
14
+ RUN pip install --no-cache-dir --upgrade pip && \
15
+ pip install --no-cache-dir -r requirements.txt
16
+
17
+ COPY --chown=user:user . .
18
+
19
+ EXPOSE 7860
20
+
21
+ CMD ["streamlit", "run", "app.py", \
22
+ "--server.port=7860", \
23
+ "--server.address=0.0.0.0", \
24
+ "--browser.gatherUsageStats=false"]
README.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Time Series Visualizer
3
+ emoji: 📈
4
+ colorFrom: red
5
+ colorTo: gray
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ ---
10
+
11
+ # Time Series Visualizer + AI Chart Interpreter
12
+
13
+ A Streamlit app for Miami University Business Analytics students to upload CSV
14
+ time-series data, create publication-quality charts, and get AI-powered chart
15
+ interpretation.
16
+
17
+ ## Features
18
+
19
+ - **Upload & Clean** — auto-detect delimiters, date columns, and numeric formats
20
+ - **9+ Chart Types** — line, seasonal, subseries, ACF/PACF, decomposition, rolling, YoY, lag, spaghetti
21
+ - **Multi-Series Support** — panel (small-multiples) and spaghetti plots for comparing series
22
+ - **AI Interpretation** — GPT-5.2 vision analyzes chart images and returns structured insights
23
+ - **QueryChat** — natural-language data filtering powered by DuckDB
24
+
25
+ ## Privacy
26
+
27
+ All data processing happens in-memory. No data is persisted to disk.
28
+ Only chart PNG images (never raw data) are sent to the AI when you click "Interpret."
app.py ADDED
@@ -0,0 +1,698 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Time Series Visualizer + AI Chart Interpreter
3
+ =============================================
4
+ Main Streamlit application. Run with:
5
+
6
+ streamlit run app.py --server.port=7860
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import hashlib
12
+ from pathlib import Path
13
+
14
+ from dotenv import load_dotenv
15
+ load_dotenv()
16
+
17
+ import matplotlib
18
+ matplotlib.use("Agg")
19
+
20
+ import pandas as pd
21
+ import streamlit as st
22
+
23
+ from src.ui_theme import (
24
+ apply_miami_theme,
25
+ get_miami_mpl_style,
26
+ get_palette_colors,
27
+ render_palette_preview,
28
+ )
29
+ from src.cleaning import (
30
+ read_csv_upload,
31
+ suggest_date_columns,
32
+ suggest_numeric_columns,
33
+ clean_dataframe,
34
+ detect_frequency,
35
+ add_time_features,
36
+ CleaningReport,
37
+ FrequencyInfo,
38
+ )
39
+ from src.diagnostics import (
40
+ compute_summary_stats,
41
+ compute_acf_pacf,
42
+ compute_decomposition,
43
+ compute_rolling_stats,
44
+ compute_yoy_change,
45
+ compute_multi_series_summary,
46
+ )
47
+ from src.plotting import (
48
+ fig_to_png_bytes,
49
+ plot_line_with_markers,
50
+ plot_line_colored_markers,
51
+ plot_seasonal,
52
+ plot_seasonal_subseries,
53
+ plot_acf_pacf,
54
+ plot_decomposition,
55
+ plot_rolling_overlay,
56
+ plot_yoy_change,
57
+ plot_lag,
58
+ plot_panel,
59
+ plot_spaghetti,
60
+ )
61
+ from src.ai_interpretation import (
62
+ check_api_key_available,
63
+ interpret_chart,
64
+ render_interpretation,
65
+ )
66
+ from src.querychat_helpers import (
67
+ check_querychat_available,
68
+ create_querychat,
69
+ get_filtered_pandas_df,
70
+ )
71
+
72
+ # ---------------------------------------------------------------------------
73
+ # Constants
74
+ # ---------------------------------------------------------------------------
75
+ _DATA_DIR = Path(__file__).parent / "data"
76
+ _DEMO_FILES = {
77
+ "Monthly Retail Sales (single)": _DATA_DIR / "demo_single.csv",
78
+ "Quarterly Revenue by Region (wide)": _DATA_DIR / "demo_multi_wide.csv",
79
+ "Daily Stock Prices – 20 Tickers (long)": _DATA_DIR / "demo_multi_long.csv",
80
+ }
81
+
82
+ _CHART_TYPES = [
83
+ "Line with Markers",
84
+ "Line – Colored Markers",
85
+ "Seasonal Plot",
86
+ "Seasonal Sub-series",
87
+ "ACF / PACF",
88
+ "Decomposition",
89
+ "Rolling Mean Overlay",
90
+ "Year-over-Year Change",
91
+ "Lag Plot",
92
+ ]
93
+
94
+ _PALETTE_NAMES = ["Set2", "Dark2", "Set1", "Paired", "Pastel1", "Pastel2", "Accent"]
95
+
96
+
97
+ # ---------------------------------------------------------------------------
98
+ # Helpers
99
+ # ---------------------------------------------------------------------------
100
+
101
+ def _df_hash(df: pd.DataFrame) -> str:
102
+ """Fast hash of a DataFrame for cache-key / change-detection."""
103
+ return hashlib.md5(
104
+ pd.util.hash_pandas_object(df).values.tobytes()
105
+ ).hexdigest()
106
+
107
+
108
+ def _load_demo(path: Path) -> pd.DataFrame:
109
+ return pd.read_csv(path)
110
+
111
+
112
def _render_cleaning_report(report: CleaningReport) -> None:
    """Render the cleaning report: row/duplicate metrics plus optional
    expanders for missing-value counts and parsing warnings."""
    top = st.columns(3)
    top[0].metric("Rows before", f"{report.rows_before:,}")
    top[1].metric("Rows after", f"{report.rows_after:,}")
    top[2].metric("Duplicates found", f"{report.duplicates_found:,}")

    if report.missing_before:
        with st.expander("Missing values"):
            affected = list(report.missing_before.keys())
            before_col, after_col = st.columns(2)
            with before_col:
                st.write("**Before cleaning**")
                for name in affected:
                    st.write(f"- {name}: {report.missing_before[name]}")
            with after_col:
                st.write("**After cleaning**")
                for name in affected:
                    # .get(): a column may have been fully repaired/removed
                    st.write(f"- {name}: {report.missing_after.get(name, 0)}")

    if report.parsing_warnings:
        with st.expander("Parsing warnings"):
            for warning in report.parsing_warnings:
                st.warning(warning)
136
+
137
+
138
def _render_summary_stats(stats) -> None:
    """Display summary statistics as two rows of metric cards plus a
    Trend & Stationarity expander (OLS slope and ADF test results)."""
    top_row = st.columns(4)
    top_row[0].metric("Count", f"{stats.count:,}")
    top_row[1].metric("Missing", f"{stats.missing_count} ({stats.missing_pct:.1f}%)")
    top_row[2].metric("Mean", f"{stats.mean_val:,.2f}")
    top_row[3].metric("Std Dev", f"{stats.std_val:,.2f}")

    bottom_row = st.columns(4)
    bottom_row[0].metric("Min", f"{stats.min_val:,.2f}")
    bottom_row[1].metric("25th %ile", f"{stats.p25:,.2f}")
    bottom_row[2].metric("Median", f"{stats.median_val:,.2f}")
    bottom_row[3].metric("75th %ile / Max", f"{stats.p75:,.2f} / {stats.max_val:,.2f}")

    with st.expander("Trend & Stationarity"):
        slope_col, trend_p_col = st.columns(2)
        # pd.notna guards: these stats may be NaN for short/degenerate series
        slope_text = (
            f"{stats.trend_slope:,.4f}" if pd.notna(stats.trend_slope) else "N/A"
        )
        slope_col.metric(
            "Trend slope (per period)",
            slope_text,
            help="Slope from OLS on a numeric index.",
        )
        trend_p_text = (
            f"{stats.trend_pvalue:.4f}" if pd.notna(stats.trend_pvalue) else "N/A"
        )
        trend_p_col.metric("Trend p-value", trend_p_text)

        adf_stat_col, adf_p_col = st.columns(2)
        adf_stat_text = (
            f"{stats.adf_statistic:.4f}" if pd.notna(stats.adf_statistic) else "N/A"
        )
        adf_stat_col.metric(
            "ADF statistic",
            adf_stat_text,
            help="Augmented Dickey-Fuller test statistic.",
        )
        adf_p_text = (
            f"{stats.adf_pvalue:.4f}" if pd.notna(stats.adf_pvalue) else "N/A"
        )
        adf_p_col.metric(
            "ADF p-value",
            adf_p_text,
            help="p < 0.05 suggests the series is stationary.",
        )
        st.caption(
            f"Date range: {stats.date_start.date()} to {stats.date_end.date()} "
            f"({stats.date_span_days:,} days)"
        )
178
+
179
+
180
+ # ---------------------------------------------------------------------------
181
+ # Page config
182
+ # ---------------------------------------------------------------------------
183
+ st.set_page_config(
184
+ page_title="Time Series Visualizer",
185
+ page_icon="\U0001f4c8",
186
+ layout="wide",
187
+ )
188
+ apply_miami_theme()
189
+ style_dict = get_miami_mpl_style()
190
+
191
+ # ---------------------------------------------------------------------------
192
+ # Session state initialisation
193
+ # ---------------------------------------------------------------------------
194
+ for key in [
195
+ "raw_df", "cleaned_df", "cleaning_report", "freq_info",
196
+ "date_col", "y_cols", "qc", "qc_hash",
197
+ ]:
198
+ if key not in st.session_state:
199
+ st.session_state[key] = None
200
+
201
+ # ---------------------------------------------------------------------------
202
+ # Sidebar — Data input
203
+ # ---------------------------------------------------------------------------
204
+ with st.sidebar:
205
+ st.markdown(
206
+ """
207
+ <div style="text-align:center; margin-bottom:0.5rem;">
208
+ <span style="font-size:1.6rem; font-weight:800; color:#C41230;">
209
+ Time Series Visualizer
210
+ </span><br>
211
+ <span style="font-size:0.82rem; color:#000;">
212
+ ISA 444 &middot; Miami University
213
+ </span>
214
+ </div>
215
+ """,
216
+ unsafe_allow_html=True,
217
+ )
218
+ st.divider()
219
+ st.header("Data Input")
220
+
221
+ uploaded = st.file_uploader("Upload a CSV file", type=["csv", "tsv", "txt"])
222
+
223
+ demo_choice = st.selectbox(
224
+ "Or load a demo dataset",
225
+ ["(none)"] + list(_DEMO_FILES.keys()),
226
+ )
227
+
228
+ # Load data
229
+ if uploaded is not None:
230
+ df_raw, delim = read_csv_upload(uploaded)
231
+ st.caption(f"Detected delimiter: `{repr(delim)}`")
232
+ st.session_state.raw_df = df_raw
233
+ elif demo_choice != "(none)":
234
+ st.session_state.raw_df = _load_demo(_DEMO_FILES[demo_choice])
235
+ # else: keep whatever was already in session state
236
+
237
+ raw_df: pd.DataFrame | None = st.session_state.raw_df
238
+
239
+ if raw_df is not None:
240
+ st.divider()
241
+ st.subheader("Column Selection")
242
+
243
+ # Auto-suggest
244
+ date_suggestions = suggest_date_columns(raw_df)
245
+ numeric_suggestions = suggest_numeric_columns(raw_df)
246
+
247
+ all_cols = list(raw_df.columns)
248
+ default_date_idx = all_cols.index(date_suggestions[0]) if date_suggestions else 0
249
+
250
+ date_col = st.selectbox("Date column", all_cols, index=default_date_idx)
251
+
252
+ remaining = [c for c in all_cols if c != date_col]
253
+ default_y = [c for c in numeric_suggestions if c != date_col]
254
+ y_cols = st.multiselect(
255
+ "Value column(s)",
256
+ remaining,
257
+ default=default_y[:4] if default_y else [],
258
+ )
259
+
260
+ st.session_state.date_col = date_col
261
+ st.session_state.y_cols = y_cols
262
+
263
+ st.divider()
264
+ st.subheader("Cleaning Options")
265
+ dup_action = st.selectbox(
266
+ "Duplicate dates",
267
+ ["keep_last", "keep_first", "drop_all"],
268
+ )
269
+ missing_action = st.selectbox(
270
+ "Missing values",
271
+ ["interpolate", "ffill", "drop"],
272
+ )
273
+
274
+ # Clean
275
+ if y_cols:
276
+ cleaned_df, report = clean_dataframe(
277
+ raw_df, date_col, y_cols,
278
+ dup_action=dup_action,
279
+ missing_action=missing_action,
280
+ )
281
+ freq_info = detect_frequency(cleaned_df, date_col)
282
+ cleaned_df = add_time_features(cleaned_df, date_col)
283
+
284
+ st.session_state.cleaned_df = cleaned_df
285
+ st.session_state.cleaning_report = report
286
+ st.session_state.freq_info = freq_info
287
+
288
+ st.caption(f"Frequency: **{freq_info.label}** "
289
+ f"({'regular' if freq_info.is_regular else 'irregular'})")
290
+
291
+ # Frequency override
292
+ freq_override = st.text_input(
293
+ "Override frequency label (optional)",
294
+ value="",
295
+ help="e.g. Daily, Weekly, Monthly, Quarterly, Yearly",
296
+ )
297
+ if freq_override.strip():
298
+ st.session_state.freq_info = FrequencyInfo(
299
+ label=freq_override.strip(),
300
+ median_delta=freq_info.median_delta,
301
+ is_regular=freq_info.is_regular,
302
+ )
303
+
304
+ # ------ QueryChat ------
305
+ if check_querychat_available():
306
+ current_hash = _df_hash(cleaned_df) + str(y_cols)
307
+ if st.session_state.qc_hash != current_hash:
308
+ st.session_state.qc = create_querychat(
309
+ cleaned_df,
310
+ name="uploaded data",
311
+ date_col=date_col,
312
+ y_cols=y_cols,
313
+ freq_label=st.session_state.freq_info.label,
314
+ )
315
+ st.session_state.qc_hash = current_hash
316
+ st.divider()
317
+ st.subheader("QueryChat")
318
+ st.session_state.qc.ui()
319
+ else:
320
+ st.divider()
321
+ st.info(
322
+ "Set `OPENAI_API_KEY` to enable QueryChat "
323
+ "(natural-language data filtering)."
324
+ )
325
+
326
+ # Reset button
327
+ st.divider()
328
+ if st.button("Reset all"):
329
+ for k in list(st.session_state.keys()):
330
+ del st.session_state[k]
331
+ st.rerun()
332
+
333
+ st.divider()
334
+ st.markdown(
335
+ """
336
+ <div style="text-align:center; padding:0.5rem 0;">
337
+ <span style="font-size:0.75rem; color:#000;">
338
+ Developed by <strong>Fadel M. Megahed</strong><br>
339
+ for <strong>ISA 444</strong> &middot; Miami University<br>
340
+ Version <strong>0.1.0</strong>
341
+ </span>
342
+ </div>
343
+ """,
344
+ unsafe_allow_html=True,
345
+ )
346
+ st.caption(
347
+ "**Privacy:** All processing is in-memory. "
348
+ "Only chart images (never raw data) are sent to the AI when you click Interpret."
349
+ )
350
+
351
+ # ---------------------------------------------------------------------------
352
+ # Main area — guard
353
+ # ---------------------------------------------------------------------------
354
+ cleaned_df: pd.DataFrame | None = st.session_state.cleaned_df
355
+ date_col: str | None = st.session_state.date_col
356
+ y_cols: list[str] | None = st.session_state.y_cols
357
+ freq_info: FrequencyInfo | None = st.session_state.freq_info
358
+ report: CleaningReport | None = st.session_state.cleaning_report
359
+
360
+ if cleaned_df is None or not y_cols:
361
+ st.title("Time Series Visualizer")
362
+ st.write(
363
+ "Upload a CSV or choose a demo dataset from the sidebar to get started."
364
+ )
365
+ st.stop()
366
+
367
+ # If QueryChat is active, use its filtered df
368
+ if st.session_state.qc is not None:
369
+ working_df = get_filtered_pandas_df(st.session_state.qc)
370
+ if working_df.empty:
371
+ working_df = cleaned_df
372
+ else:
373
+ working_df = cleaned_df
374
+
375
+ # Data quality report
376
+ if report is not None:
377
+ with st.expander("Data Quality Report", expanded=False):
378
+ _render_cleaning_report(report)
379
+
380
+ # ---------------------------------------------------------------------------
381
+ # Tabs
382
+ # ---------------------------------------------------------------------------
383
+ tab_single, tab_few, tab_many = st.tabs([
384
+ "Single Series",
385
+ "Few Series (Panel)",
386
+ "Many Series (Spaghetti)",
387
+ ])
388
+
389
+ # ===================================================================
390
+ # Tab A — Single Series
391
+ # ===================================================================
392
+ with tab_single:
393
+ if len(y_cols) == 1:
394
+ active_y = y_cols[0]
395
+ else:
396
+ active_y = st.selectbox("Select value column", y_cols, key="tab_a_y")
397
+
398
+ # ---- Date range filter ------------------------------------------------
399
+ dr_mode = st.radio(
400
+ "Date range",
401
+ ["All", "Last N years", "Custom"],
402
+ horizontal=True,
403
+ key="dr_mode",
404
+ )
405
+ df_plot = working_df.copy()
406
+ if dr_mode == "Last N years":
407
+ n_years = st.slider("Years", 1, 20, 5, key="dr_n")
408
+ cutoff = df_plot[date_col].max() - pd.DateOffset(years=n_years)
409
+ df_plot = df_plot[df_plot[date_col] >= cutoff]
410
+ elif dr_mode == "Custom":
411
+ d_min = df_plot[date_col].min().date()
412
+ d_max = df_plot[date_col].max().date()
413
+ sel = st.slider("Date range", d_min, d_max, (d_min, d_max), key="dr_custom")
414
+ df_plot = df_plot[
415
+ (df_plot[date_col].dt.date >= sel[0])
416
+ & (df_plot[date_col].dt.date <= sel[1])
417
+ ]
418
+
419
+ if df_plot.empty:
420
+ st.warning("No data in selected range.")
421
+ st.stop()
422
+
423
+ # ---- Chart controls ---------------------------------------------------
424
+ col_chart, col_opts = st.columns([2, 1])
425
+ with col_opts:
426
+ chart_type = st.selectbox("Chart type", _CHART_TYPES, key="chart_type_a")
427
+
428
+ palette_name = st.selectbox("Color palette", _PALETTE_NAMES, key="pal_a")
429
+ n_colors = max(12, len(y_cols))
430
+ palette_colors = get_palette_colors(palette_name, n_colors)
431
+ swatch_fig = render_palette_preview(palette_colors[:8])
432
+ st.pyplot(swatch_fig, width="stretch")
433
+
434
+ # Chart-specific controls
435
+ period_label = "month"
436
+ window_size = 12
437
+ lag_val = 1
438
+ decomp_model = "additive"
439
+
440
+ if chart_type in ("Seasonal Plot", "Seasonal Sub-series"):
441
+ period_label = st.selectbox("Period", ["month", "quarter"], key="period_a")
442
+
443
+ if chart_type == "Rolling Mean Overlay":
444
+ window_size = st.slider("Window", 2, 52, 12, key="window_a")
445
+
446
+ if chart_type == "Lag Plot":
447
+ lag_val = st.slider("Lag", 1, 52, 1, key="lag_a")
448
+
449
+ if chart_type == "Decomposition":
450
+ decomp_model = st.selectbox("Model", ["additive", "multiplicative"], key="decomp_a")
451
+
452
+ # ---- Render chart -----------------------------------------------------
453
+ with col_chart:
454
+ fig = None
455
+ try:
456
+ if chart_type == "Line with Markers":
457
+ fig = plot_line_with_markers(
458
+ df_plot, date_col, active_y,
459
+ title=f"{active_y} over Time",
460
+ style_dict=style_dict, palette_colors=palette_colors,
461
+ )
462
+
463
+ elif chart_type == "Line – Colored Markers":
464
+ if "month" in df_plot.columns:
465
+ color_by = st.selectbox(
466
+ "Color by",
467
+ ["month", "quarter", "year", "day_of_week"],
468
+ key="color_by_a",
469
+ )
470
+ else:
471
+ color_by = st.selectbox("Color by", [c for c in df_plot.columns if c not in (date_col, active_y)][:5], key="color_by_a")
472
+ fig = plot_line_colored_markers(
473
+ df_plot, date_col, active_y,
474
+ color_by=color_by, palette_colors=palette_colors,
475
+ title=f"{active_y} colored by {color_by}",
476
+ style_dict=style_dict,
477
+ )
478
+
479
+ elif chart_type == "Seasonal Plot":
480
+ fig = plot_seasonal(
481
+ df_plot, date_col, active_y,
482
+ period=period_label,
483
+ palette_name_colors=palette_colors,
484
+ title=f"Seasonal Plot – {active_y}",
485
+ style_dict=style_dict,
486
+ )
487
+
488
+ elif chart_type == "Seasonal Sub-series":
489
+ fig = plot_seasonal_subseries(
490
+ df_plot, date_col, active_y,
491
+ period=period_label,
492
+ title=f"Seasonal Sub-series – {active_y}",
493
+ style_dict=style_dict, palette_colors=palette_colors,
494
+ )
495
+
496
+ elif chart_type == "ACF / PACF":
497
+ series = df_plot[active_y].dropna()
498
+ acf_vals, acf_ci, pacf_vals, pacf_ci = compute_acf_pacf(series)
499
+ fig = plot_acf_pacf(
500
+ acf_vals, acf_ci, pacf_vals, pacf_ci,
501
+ title=f"ACF / PACF – {active_y}",
502
+ style_dict=style_dict,
503
+ )
504
+
505
+ elif chart_type == "Decomposition":
506
+ period_int = None
507
+ if freq_info and freq_info.label == "Monthly":
508
+ period_int = 12
509
+ elif freq_info and freq_info.label == "Quarterly":
510
+ period_int = 4
511
+ elif freq_info and freq_info.label == "Weekly":
512
+ period_int = 52
513
+ elif freq_info and freq_info.label == "Daily":
514
+ period_int = 365
515
+
516
+ result = compute_decomposition(
517
+ df_plot, date_col, active_y,
518
+ model=decomp_model, period=period_int,
519
+ )
520
+ fig = plot_decomposition(
521
+ result,
522
+ title=f"Decomposition – {active_y} ({decomp_model})",
523
+ style_dict=style_dict,
524
+ )
525
+
526
+ elif chart_type == "Rolling Mean Overlay":
527
+ fig = plot_rolling_overlay(
528
+ df_plot, date_col, active_y,
529
+ window=window_size,
530
+ title=f"Rolling {window_size}-pt Mean – {active_y}",
531
+ style_dict=style_dict, palette_colors=palette_colors,
532
+ )
533
+
534
+ elif chart_type == "Year-over-Year Change":
535
+ yoy_result = compute_yoy_change(df_plot, date_col, active_y)
536
+ yoy_df = pd.DataFrame({
537
+ "date": yoy_result[date_col],
538
+ "abs_change": yoy_result["yoy_abs_change"],
539
+ "pct_change": yoy_result["yoy_pct_change"],
540
+ }).dropna()
541
+ fig = plot_yoy_change(
542
+ df_plot, date_col, active_y, yoy_df,
543
+ title=f"Year-over-Year Change – {active_y}",
544
+ style_dict=style_dict,
545
+ )
546
+
547
+ elif chart_type == "Lag Plot":
548
+ fig = plot_lag(
549
+ df_plot[active_y],
550
+ lag=lag_val,
551
+ title=f"Lag-{lag_val} Plot – {active_y}",
552
+ style_dict=style_dict,
553
+ )
554
+
555
+ except Exception as exc:
556
+ st.error(f"Chart error: {exc}")
557
+
558
+ if fig is not None:
559
+ st.pyplot(fig, width="stretch")
560
+
561
+ # ---- Summary stats expander -------------------------------------------
562
+ with st.expander("Summary Statistics", expanded=False):
563
+ stats = compute_summary_stats(df_plot, date_col, active_y)
564
+ _render_summary_stats(stats)
565
+
566
+ # ---- AI Interpretation ------------------------------------------------
567
+ with st.expander("AI Chart Interpretation", expanded=False):
568
+ st.caption(
569
+ "The chart image (PNG) and metadata are sent to OpenAI. "
570
+ "No raw data leaves this app."
571
+ )
572
+ if not check_api_key_available():
573
+ st.warning("Set `OPENAI_API_KEY` to enable AI interpretation.")
574
+ elif fig is not None:
575
+ if st.button("Interpret Chart with AI", key="interpret_a"):
576
+ with st.spinner("Analyzing chart..."):
577
+ png = fig_to_png_bytes(fig)
578
+ date_range_str = (
579
+ f"{df_plot[date_col].min().date()} to "
580
+ f"{df_plot[date_col].max().date()}"
581
+ )
582
+ metadata = {
583
+ "chart_type": chart_type,
584
+ "frequency_label": freq_info.label if freq_info else "Unknown",
585
+ "date_range": date_range_str,
586
+ "y_column": active_y,
587
+ }
588
+ interp = interpret_chart(png, metadata)
589
+ render_interpretation(interp)
590
+
591
+ # ===================================================================
592
+ # Tab B — Few Series (Panel)
593
+ # ===================================================================
594
+ with tab_few:
595
+ if len(y_cols) < 2:
596
+ st.info("Select 2+ value columns in the sidebar to use panel plots.")
597
+ else:
598
+ st.subheader("Panel Plot (Small Multiples)")
599
+
600
+ panel_cols = st.multiselect(
601
+ "Columns to plot",
602
+ y_cols,
603
+ default=y_cols[:4],
604
+ key="panel_cols",
605
+ )
606
+
607
+ if panel_cols:
608
+ pc1, pc2 = st.columns(2)
609
+ with pc1:
610
+ panel_chart = st.selectbox(
611
+ "Chart type", ["line", "bar"], key="panel_chart"
612
+ )
613
+ with pc2:
614
+ shared_y = st.checkbox("Shared Y axis", value=True, key="panel_shared")
615
+
616
+ palette_name_b = st.selectbox("Color palette", _PALETTE_NAMES, key="pal_b")
617
+ palette_b = get_palette_colors(palette_name_b, len(panel_cols))
618
+
619
+ try:
620
+ fig_panel = plot_panel(
621
+ working_df, date_col, panel_cols,
622
+ chart_type=panel_chart,
623
+ shared_y=shared_y,
624
+ title="Panel Comparison",
625
+ style_dict=style_dict,
626
+ palette_colors=palette_b,
627
+ )
628
+ st.pyplot(fig_panel, width="stretch")
629
+ except Exception as exc:
630
+ st.error(f"Panel chart error: {exc}")
631
+
632
+ # Per-series summary table
633
+ with st.expander("Per-series Summary", expanded=False):
634
+ summary_df = compute_multi_series_summary(
635
+ working_df, date_col, panel_cols,
636
+ )
637
+ st.dataframe(
638
+ summary_df.style.format({
639
+ "mean": "{:,.2f}",
640
+ "std": "{:,.2f}",
641
+ "min": "{:,.2f}",
642
+ "max": "{:,.2f}",
643
+ "trend_slope": "{:,.4f}",
644
+ "adf_pvalue": "{:.4f}",
645
+ }),
646
+ width="stretch",
647
+ )
648
+
649
+ # ===================================================================
650
+ # Tab C — Many Series (Spaghetti)
651
+ # ===================================================================
652
+ with tab_many:
653
+ if len(y_cols) < 2:
654
+ st.info("Select 2+ value columns in the sidebar to use spaghetti plots.")
655
+ else:
656
+ st.subheader("Spaghetti Plot")
657
+
658
+ spag_cols = st.multiselect(
659
+ "Columns to include",
660
+ y_cols,
661
+ default=y_cols,
662
+ key="spag_cols",
663
+ )
664
+
665
+ if spag_cols:
666
+ sc1, sc2, sc3 = st.columns(3)
667
+ with sc1:
668
+ alpha_val = st.slider("Alpha", 0.05, 1.0, 0.15, 0.05, key="spag_alpha")
669
+ with sc2:
670
+ top_n = st.number_input("Highlight top N", 0, len(spag_cols), 0, key="spag_topn")
671
+ top_n = top_n if top_n > 0 else None
672
+ with sc3:
673
+ highlight = st.selectbox(
674
+ "Highlight series",
675
+ ["(none)"] + spag_cols,
676
+ key="spag_highlight",
677
+ )
678
+ highlight_col = highlight if highlight != "(none)" else None
679
+
680
+ show_median = st.checkbox("Show Median + IQR band", value=False, key="spag_median")
681
+
682
+ palette_name_c = st.selectbox("Color palette", _PALETTE_NAMES, key="pal_c")
683
+ palette_c = get_palette_colors(palette_name_c, len(spag_cols))
684
+
685
+ try:
686
+ fig_spag = plot_spaghetti(
687
+ working_df, date_col, spag_cols,
688
+ alpha=alpha_val,
689
+ highlight_col=highlight_col,
690
+ top_n=top_n,
691
+ show_median_band=show_median,
692
+ title="Spaghetti Plot",
693
+ style_dict=style_dict,
694
+ palette_colors=palette_c,
695
+ )
696
+ st.pyplot(fig_spag, width="stretch")
697
+ except Exception as exc:
698
+ st.error(f"Spaghetti chart error: {exc}")
data/demo_multi_long.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/demo_multi_wide.csv ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ date,North,South,East,West
2
+ 2017-01-01,102373.1,81565.82,120039.01,85866.99
3
+ 2017-04-01,103071.84,86690.95,130160.6,92986.52
4
+ 2017-07-01,105808.38,82351.48,120806.03,93145.11
5
+ 2017-10-01,93194.45,78439.34,125560.51,88941.36
6
+ 2018-01-01,104960.57,81159.93,125077.0,94745.14
7
+ 2018-04-01,115571.37,89696.76,126428.53,110558.19
8
+ 2018-07-01,101828.39,85679.22,121587.32,96512.67
9
+ 2018-10-01,98901.11,78456.95,122047.42,94006.7
10
+ 2019-01-01,106698.95,91997.32,125729.61,99262.01
11
+ 2019-04-01,110689.57,93621.5,134342.0,104154.17
12
+ 2019-07-01,103348.01,84426.09,129419.71,97054.19
13
+ 2019-10-01,104005.69,85769.66,123581.51,96076.91
14
+ 2020-01-01,106413.09,86675.95,127059.62,97281.52
15
+ 2020-04-01,116820.78,97761.25,130855.46,104689.54
16
+ 2020-07-01,108441.73,94675.79,129860.46,99743.91
17
+ 2020-10-01,111649.8,84537.95,129569.2,97245.62
18
+ 2021-01-01,110450.24,95690.13,133442.28,109743.98
19
+ 2021-04-01,117633.82,99838.34,134862.78,102998.2
20
+ 2021-07-01,116840.55,96866.18,134919.54,106458.78
21
+ 2021-10-01,106507.41,95890.38,131355.95,95361.85
22
+ 2022-01-01,116682.38,95263.84,133348.43,104584.2
23
+ 2022-04-01,125721.43,99538.79,142261.18,115066.85
24
+ 2022-07-01,112777.55,94931.46,137774.63,107792.84
25
+ 2022-10-01,113953.9,90952.57,129971.09,100166.77
26
+ 2023-01-01,119979.65,98968.69,140273.36,107054.09
27
+ 2023-04-01,127345.47,106023.46,146682.35,117038.79
28
+ 2023-07-01,117089.15,101630.07,144049.15,108608.9
29
+ 2023-10-01,112638.63,99081.55,139761.41,107249.38
data/demo_single.csv ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ date,sales
2
+ 2014-01-01,44065.23
3
+ 2014-02-01,45923.47
4
+ 2014-03-01,51695.38
5
+ 2014-04-01,57646.06
6
+ 2014-05-01,57259.9
7
+ 2014-06-01,58531.73
8
+ 2014-07-01,61286.63
9
+ 2014-08-01,56934.87
10
+ 2014-09-01,50661.05
11
+ 2014-10-01,48885.12
12
+ 2014-11-01,44144.96
13
+ 2014-12-01,43268.54
14
+ 2015-01-01,45955.72
15
+ 2015-02-01,44773.44
16
+ 2015-03-01,49350.16
17
+ 2015-04-01,55875.42
18
+ 2015-05-01,58102.54
19
+ 2015-06-01,62028.49
20
+ 2015-07-01,58712.16
21
+ 2015-08-01,54975.39
22
+ 2015-09-01,56931.3
23
+ 2015-10-01,49748.45
24
+ 2015-11-01,47606.85
25
+ 2015-12-01,43750.5
26
+ 2016-01-01,46783.03
27
+ 2016-02-01,51221.85
28
+ 2016-03-01,52898.01
29
+ 2016-04-01,60151.4
30
+ 2016-05-01,61326.93
31
+ 2016-06-01,63216.61
32
+ 2016-07-01,61724.79
33
+ 2016-08-01,63904.56
34
+ 2016-09-01,56373.01
35
+ 2016-10-01,50484.58
36
+ 2016-11-01,51516.89
37
+ 2016-12-01,46558.31
38
+ 2017-01-01,65689.52
39
+ 2017-02-01,49480.66
40
+ 2017-03-01,54943.63
41
+ 2017-04-01,62193.72
42
+ 2017-05-01,66405.14
43
+ 2017-06-01,66542.74
44
+ 2017-07-01,65096.91
45
+ 2017-08-01,61997.79
46
+ 2017-09-01,55842.96
47
+ 2017-10-01,53560.31
48
+ 2017-11-01,51350.52
49
+ 2017-12-01,53514.24
50
+ 2018-01-01,53359.03
51
+ 2018-02-01,52273.92
52
+ 2018-03-01,60648.17
53
+ 2018-04-01,63429.84
54
+ 2018-05-01,65974.36
55
+ 2018-06-01,69823.35
56
+ 2018-07-01,69790.2
57
+ 2018-08-01,66862.56
58
+ 2018-09-01,59521.56
59
+ 2018-10-01,56781.58
60
+ 2018-11-01,55334.32
61
+ 2018-12-01,55751.09
62
+ 2019-01-01,54113.45
63
+ 2019-02-01,57828.68
64
+ 2019-03-01,60187.33
65
+ 2019-04-01,64207.59
66
+ 2019-05-01,71353.25
67
+ 2019-06-01,73712.48
68
+ 2019-07-01,69984.18
69
+ 2019-08-01,69407.07
70
+ 2019-09-01,64323.27
71
+ 2019-10-01,58509.76
72
+ 2019-11-01,57794.59
73
+ 2019-12-01,59276.07
74
+ 2020-01-01,72400.14
75
+ 2020-02-01,63729.29
76
+ 2020-03-01,59560.51
77
+ 2020-04-01,70643.81
78
+ 2020-05-01,72302.3
79
+ 2020-06-01,72801.99
80
+ 2020-07-01,72711.72
81
+ 2020-08-01,65824.86
82
+ 2020-09-01,65560.66
83
+ 2020-10-01,62914.23
84
+ 2020-11-01,62427.58
85
+ 2020-12-01,57563.46
86
+ 2021-01-01,58254.81
87
+ 2021-02-01,61996.49
88
+ 2021-03-01,69030.8
89
+ 2021-04-01,72057.5
90
+ 2021-05-01,73468.68
91
+ 2021-06-01,76826.53
92
+ 2021-07-01,75122.36
93
+ 2021-08-01,74137.29
94
+ 2021-09-01,66995.89
95
+ 2021-10-01,63944.68
96
+ 2021-11-01,61087.58
97
+ 2021-12-01,58072.97
98
+ 2022-01-01,62864.04
99
+ 2022-02-01,65922.11
100
+ 2022-03-01,69610.23
101
+ 2022-04-01,73330.83
102
+ 2022-05-01,89097.46
103
+ 2022-06-01,77358.71
104
+ 2022-07-01,76642.77
105
+ 2022-08-01,72995.45
106
+ 2022-09-01,70477.43
107
+ 2022-10-01,67808.1
108
+ 2022-11-01,68044.17
109
+ 2022-12-01,63749.16
110
+ 2023-01-01,65186.9
111
+ 2023-02-01,67651.11
112
+ 2023-03-01,68162.46
113
+ 2023-04-01,76146.97
114
+ 2023-05-01,79448.66
115
+ 2023-06-01,85526.48
116
+ 2023-07-01,79343.48
117
+ 2023-08-01,77603.09
118
+ 2023-09-01,73130.58
119
+ 2023-10-01,67062.64
120
+ 2023-11-01,68957.44
121
+ 2023-12-01,67303.87
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==1.54.0
2
+ pandas==2.3.3
3
+ numpy==2.4.2
4
+ matplotlib==3.10.8
5
+ statsmodels==0.14.6
6
+ scipy==1.17.0
7
+ openai==2.2.0
8
+ querychat[streamlit]==0.5.1
9
+ duckdb==1.4.4
10
+ palettable==3.3.3
11
+ pydantic==2.12.5
12
+ python-dotenv==1.1.0
scripts/generate_demo_data.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate demo CSV datasets for the time-series visualization app."""
2
+
3
+ import os
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ # Reproducibility
10
+ np.random.seed(42)
11
+
12
+ # Resolve paths relative to the project root (parent of scripts/)
13
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
14
+ DATA_DIR = PROJECT_ROOT / "data"
15
+ DATA_DIR.mkdir(parents=True, exist_ok=True)
16
+
17
+
18
+ # ---------------------------------------------------------------------------
19
+ # 1. data/demo_single.csv -- Monthly retail sales (Jan 2014 - Dec 2023)
20
+ # ---------------------------------------------------------------------------
21
def generate_single_series() -> pd.DataFrame:
    """Generate a monthly retail-sales series (Jan 2014 - Dec 2023).

    The series combines an upward linear trend, a December-peaking
    seasonal sine wave, Gaussian noise, and three injected anomaly
    spikes so downstream diagnostics have something to find.

    Returns
    -------
    pd.DataFrame
        Columns ``date`` (month starts) and ``sales`` (rounded to 2 dp).
    """
    n = 120  # 10 years * 12 months
    dates = pd.date_range(start="2014-01-01", periods=n, freq="MS")

    months = np.arange(n)

    # Upward trend: start ~50 000, grow ~200 per month
    trend = 50_000 + 200 * months

    # Seasonal component: sin wave peaking in December (month index 11).
    # sin peaks at pi/2, i.e. when (month_of_year - 8) / 12 == 1/4, which
    # holds exactly for month_of_year == 11 (December).
    # BUG FIX: the previous shift of -2 put the peak at month 5 (June),
    # contradicting the stated December peak.
    month_of_year = months % 12
    seasonal = 8_000 * np.sin(2 * np.pi * (month_of_year - 8) / 12)

    # Random noise
    noise = np.random.normal(0, 2_000, size=n)

    sales = trend + seasonal + noise

    # Inject 2-3 anomaly spikes
    for idx in [36, 72, 100]:
        sales[idx] += 15_000

    df = pd.DataFrame({"date": dates, "sales": np.round(sales, 2)})
    return df
47
+
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # 2. data/demo_multi_wide.csv -- Quarterly revenue by region (Q1 2017 - Q4 2023)
51
+ # ---------------------------------------------------------------------------
52
def generate_multi_wide() -> pd.DataFrame:
    """Build a wide-format quarterly revenue table, one column per region.

    Covers Q1 2017 through Q4 2023; each region has its own base level
    plus a shared linear trend, quarterly seasonality, and noise.
    """
    n_quarters = 28  # 7 years * 4 quarters
    quarter_index = np.arange(n_quarters)
    q_of_year = quarter_index % 4  # 0=Q1 .. 3=Q4

    columns: dict[str, object] = {
        "date": pd.date_range(start="2017-01-01", periods=n_quarters, freq="QS")
    }

    region_bases = {
        "North": 100_000,
        "South": 80_000,
        "East": 120_000,
        "West": 90_000,
    }

    # Seasonal shape is identical for all regions, so compute it once.
    seasonal = 5_000 * np.sin(2 * np.pi * q_of_year / 4)

    for region, base_level in region_bases.items():
        level = base_level + 800 * quarter_index + seasonal
        level = level + np.random.normal(0, 3_000, size=n_quarters)
        columns[region] = np.round(level, 2)

    return pd.DataFrame(columns)
75
+
76
+
77
+ # ---------------------------------------------------------------------------
78
+ # 3. data/demo_multi_long.csv -- Daily stock prices for 20 tickers
79
+ # (2022-01-03 to 2023-12-29, business days only)
80
+ # ---------------------------------------------------------------------------
81
def generate_multi_long() -> pd.DataFrame:
    """Build a long-format daily price panel for 20 synthetic tickers.

    Prices follow geometric Brownian motion over business days from
    2022-01-03 to 2023-12-29; tickers are AAAA, BBBB, ..., TTTT.
    """
    dates = pd.bdate_range(start="2022-01-03", end="2023-12-29")
    n_days = len(dates)

    # 20 simple four-letter tickers: AAAA, BBBB, ..., TTTT
    symbols = [chr(ord("A") + i) * 4 for i in range(20)]

    drift = 0.0002
    vol = 0.02

    pieces: list[pd.DataFrame] = []

    for symbol in symbols:
        start_price = np.random.uniform(50, 500)

        # GBM: S_t = S_0 * exp(cumsum(log returns))
        rets = np.random.normal(drift - 0.5 * vol**2, vol, size=n_days)
        rets[0] = 0  # first day opens exactly at start_price
        path = start_price * np.exp(np.cumsum(rets))

        pieces.append(
            pd.DataFrame(
                {
                    "date": dates,
                    "ticker": symbol,
                    "price": np.round(path, 2),
                }
            )
        )

    return pd.concat(pieces, ignore_index=True)
114
+
115
+
116
+ # ---------------------------------------------------------------------------
117
+ # Main
118
+ # ---------------------------------------------------------------------------
119
def main() -> None:
    """Generate all three demo CSVs into the data/ directory."""
    jobs = [
        ("demo_single.csv", generate_single_series),
        ("demo_multi_wide.csv", generate_multi_wide),
        ("demo_multi_long.csv", generate_multi_long),
    ]
    for filename, builder in jobs:
        frame = builder()
        target = DATA_DIR / filename
        frame.to_csv(target, index=False)
        print(f"Wrote {len(frame)} rows -> {target}")
131
+
132
+
133
# Allow direct execution: python scripts/generate_demo_data.py
if __name__ == "__main__":
    main()
src/__init__.py ADDED
File without changes
src/ai_interpretation.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ai_interpretation.py
3
+ --------------------
4
+ AI-powered chart interpretation using OpenAI GPT-5.2 vision with
5
+ Pydantic structured output.
6
+
7
+ Provides:
8
+ - Pydantic models for structured chart analysis results
9
+ - Vision-based chart interpretation via OpenAI's GPT-5.2 model
10
+ - Streamlit rendering of interpretation results
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import base64
16
+ import json
17
+ import os
18
+ from typing import Literal
19
+
20
+ import openai
21
+ from pydantic import BaseModel, ConfigDict
22
+ import streamlit as st
23
+
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Pydantic models
27
+ # ---------------------------------------------------------------------------
28
+
29
class TrendInfo(BaseModel):
    """Describes the overall trend detected in the chart."""

    # Forbid unknown keys so the generated JSON schema is strict
    # (OpenAI structured outputs require additionalProperties: false).
    model_config = ConfigDict(extra="forbid")

    # Coarse classification of the trend.
    direction: Literal["upward", "downward", "flat", "mixed"]
    # Plain-language explanation of the trend.
    description: str
36
+
37
+
38
class SeasonalityInfo(BaseModel):
    """Describes any seasonality detected in the chart."""

    # Strict schema for OpenAI structured outputs.
    model_config = ConfigDict(extra="forbid")

    # True when a repeating pattern is visible.
    detected: bool
    # Human-readable cycle length (e.g. "12 months"); None when unknown.
    period: str | None
    description: str
46
+
47
+
48
class StationarityInfo(BaseModel):
    """Describes whether the series appears stationary."""

    # Strict schema for OpenAI structured outputs.
    model_config = ConfigDict(extra="forbid")

    # Visual assessment only — not a statistical test result.
    likely_stationary: bool
    description: str
55
+
56
+
57
class AnomalyItem(BaseModel):
    """A single anomaly or outlier observation."""

    # Strict schema for OpenAI structured outputs.
    model_config = ConfigDict(extra="forbid")

    # Where on the time axis the anomaly sits (free-form, e.g. "mid-2022").
    approximate_location: str
    description: str
    # Used by render_interpretation() to pick a display color.
    severity: Literal["low", "medium", "high"]
65
+
66
+
67
class ChartInterpretation(BaseModel):
    """Complete structured interpretation of a time-series chart."""

    # Strict schema for OpenAI structured outputs.
    model_config = ConfigDict(extra="forbid")

    # What kind of chart the model believes it is looking at.
    chart_type_detected: str
    # Structured sub-analyses.
    trend: TrendInfo
    seasonality: SeasonalityInfo
    stationarity: StationarityInfo
    anomalies: list[AnomalyItem]
    # Free-form bullet points, one per observation.
    key_observations: list[str]
    # One-paragraph overall summary shown first in the UI.
    summary: str
    # Suggested follow-up analyses for the student.
    recommendations: list[str]
80
+
81
+
82
+ # ---------------------------------------------------------------------------
83
+ # API key check
84
+ # ---------------------------------------------------------------------------
85
+
86
def check_api_key_available() -> bool:
    """Return ``True`` when a non-blank ``OPENAI_API_KEY`` is configured."""
    return os.environ.get("OPENAI_API_KEY", "").strip() != ""
91
+
92
+
93
+ # ---------------------------------------------------------------------------
94
+ # Chart interpretation
95
+ # ---------------------------------------------------------------------------
96
+
97
# System prompt shared by every interpretation request; keeps the model in
# a teaching-assistant register aimed at business analytics students.
_SYSTEM_PROMPT = (
    "You are a careful time-series analyst helping business analytics "
    "students. Analyze the chart image and provide a structured "
    "interpretation. Be precise about what the data shows; flag anything "
    "noteworthy. Use plain language suitable for students."
)
103
+
104
+
105
def interpret_chart(
    png_bytes: bytes,
    metadata: dict,
) -> ChartInterpretation:
    """Send a chart image to GPT-5.2 vision and return a structured
    interpretation.

    Never raises: any failure (missing/invalid API key, network error,
    unparsable response) is converted into a placeholder
    :class:`ChartInterpretation` whose ``summary`` carries the error text.

    Parameters
    ----------
    png_bytes:
        Raw PNG image bytes of the chart to analyse.
    metadata:
        Context about the chart. Expected keys:

        * ``chart_type`` -- e.g. ``"line"``, ``"bar"``, ``"decomposition"``
        * ``frequency_label`` -- e.g. ``"Monthly"``, ``"Daily"``
        * ``date_range`` -- human-readable date range string
        * ``y_column`` -- name of the value column being plotted
    """
    try:
        # Client picks up OPENAI_API_KEY from the environment.
        client = openai.OpenAI()

        # Encode the PNG as a base64 data URI
        b64 = base64.b64encode(png_bytes).decode("utf-8")
        image_data_uri = f"data:image/png;base64,{b64}"

        chart_type = metadata.get("chart_type", "time-series")
        # default=str makes timestamps and other objects JSON-safe.
        metadata_str = json.dumps(metadata, default=str)

        # NOTE(review): newer openai SDKs expose this as
        # client.chat.completions.parse (non-beta); confirm before upgrading.
        response = client.beta.chat.completions.parse(
            model="gpt-5.2-2025-12-11",
            response_format=ChartInterpretation,
            messages=[
                {"role": "system", "content": _SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": image_data_uri},
                        },
                        {
                            "type": "text",
                            "text": (
                                f"Analyze this {chart_type} chart. "
                                f"Metadata: {metadata_str}"
                            ),
                        },
                    ],
                },
            ],
        )

        # Prefer the parsed structured output
        parsed = response.choices[0].message.parsed
        if parsed is not None:
            return parsed

        # Fallback: try to manually parse the raw content
        raw_content = response.choices[0].message.content or ""
        data = json.loads(raw_content)
        return ChartInterpretation(**data)

    except Exception as exc:  # noqa: BLE001
        # Return a minimal interpretation that surfaces the error
        return ChartInterpretation(
            chart_type_detected="unknown",
            trend=TrendInfo(direction="mixed", description="Unable to determine."),
            seasonality=SeasonalityInfo(
                detected=False, period=None, description="Unable to determine."
            ),
            stationarity=StationarityInfo(
                likely_stationary=False, description="Unable to determine."
            ),
            anomalies=[],
            key_observations=["AI interpretation failed; see summary for details."],
            summary=f"Error during AI interpretation: {exc}",
            recommendations=["Check that your OPENAI_API_KEY is set and valid."],
        )
184
+
185
+
186
+ # ---------------------------------------------------------------------------
187
+ # Streamlit rendering
188
+ # ---------------------------------------------------------------------------
189
+
190
# Presentation lookup tables used by render_interpretation().
_DIRECTION_EMOJI = {
    "upward": "\u2197\ufe0f",  # arrow upper-right
    "downward": "\u2198\ufe0f",  # arrow lower-right
    "flat": "\u27a1\ufe0f",  # arrow right
    "mixed": "\u2194\ufe0f",  # left-right arrow
}

# Streamlit markdown color names keyed by anomaly severity.
_SEVERITY_COLOR = {
    "low": "green",
    "medium": "orange",
    "high": "red",
}
202
+
203
+
204
def render_interpretation(interp: ChartInterpretation) -> None:
    """Render a :class:`ChartInterpretation` as a styled Streamlit card.

    Sections: detected chart type, summary, key observations (expanded by
    default), trend, seasonality, stationarity, anomalies, and recommended
    next steps. Purely presentational; returns nothing.
    """

    st.markdown("### AI Chart Interpretation")
    st.markdown(
        f"**Detected chart type:** {interp.chart_type_detected}"
    )

    # ---- Summary ----------------------------------------------------------
    st.markdown("---")
    st.markdown(f"**Summary:** {interp.summary}")

    # ---- Key observations -------------------------------------------------
    with st.expander("Key Observations", expanded=True):
        for obs in interp.key_observations:
            st.markdown(f"- {obs}")

    # ---- Trend ------------------------------------------------------------
    with st.expander("Trend Analysis"):
        arrow = _DIRECTION_EMOJI.get(interp.trend.direction, "")
        st.markdown(
            f"**Direction:** {interp.trend.direction.capitalize()} {arrow}"
        )
        st.markdown(interp.trend.description)

    # ---- Seasonality ------------------------------------------------------
    with st.expander("Seasonality"):
        status = "Detected" if interp.seasonality.detected else "Not detected"
        st.markdown(f"**Status:** {status}")
        if interp.seasonality.period:
            st.markdown(f"**Period:** {interp.seasonality.period}")
        st.markdown(interp.seasonality.description)

    # ---- Stationarity -----------------------------------------------------
    with st.expander("Stationarity"):
        label = (
            "Likely stationary"
            if interp.stationarity.likely_stationary
            else "Likely non-stationary"
        )
        st.markdown(f"**Assessment:** {label}")
        st.markdown(interp.stationarity.description)

    # ---- Anomalies --------------------------------------------------------
    with st.expander("Anomalies"):
        if not interp.anomalies:
            st.markdown("No anomalies detected.")
        else:
            for anomaly in interp.anomalies:
                # Unknown severities fall back to gray rather than erroring.
                color = _SEVERITY_COLOR.get(anomaly.severity, "gray")
                st.markdown(
                    f"- **[{anomaly.approximate_location}]** "
                    f":{color}[{anomaly.severity.upper()}] "
                    f"-- {anomaly.description}"
                )

    # ---- Recommendations --------------------------------------------------
    with st.expander("Recommended Next Steps"):
        # BUG FIX: each st.markdown call renders as an independent Markdown
        # document, so a hard-coded "1." prefix displayed every item as
        # "1.". Number the items explicitly instead.
        for i, rec in enumerate(interp.recommendations, start=1):
            st.markdown(f"{i}. {rec}")
src/cleaning.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CSV ingest and auto-clean pipeline for time-series data.
3
+
4
+ Provides delimiter detection, date/numeric column suggestion,
5
+ numeric cleaning (currency, commas, percentages, parenthesised negatives),
6
+ duplicate and missing-value handling, frequency detection, and
7
+ calendar-feature extraction.
8
+ """
9
+
10
+ import csv
11
+ import io
12
+ import re
13
+ from dataclasses import dataclass, field
14
+ from datetime import timedelta
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+
19
+
20
+ # ---------------------------------------------------------------------------
21
+ # Dataclasses
22
+ # ---------------------------------------------------------------------------
23
+
24
@dataclass
class CleaningReport:
    """Summary produced by :func:`clean_dataframe`."""

    rows_before: int = 0          # row count before any cleaning
    rows_after: int = 0           # row count after the full pipeline
    duplicates_found: int = 0     # rows sharing a date with another row
    duplicates_action: str = ""   # which dup_action was applied
    # Per-column NaN counts: measured after numeric coercion but before
    # imputation, and again after imputation.
    missing_before: dict = field(default_factory=dict)
    missing_after: dict = field(default_factory=dict)
    parsing_warnings: list = field(default_factory=list)  # human-readable notes
35
+
36
+
37
@dataclass
class FrequencyInfo:
    """Result of :func:`detect_frequency`."""

    label: str = "Unknown"                  # e.g. "Daily", "Monthly", "Irregular"
    median_delta: timedelta = timedelta(0)  # median gap between observations
    is_regular: bool = False                # std of gaps < 20% of the median
44
+
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # Delimiter detection
48
+ # ---------------------------------------------------------------------------
49
+
50
def detect_delimiter(file_bytes: bytes) -> str:
    """Guess the delimiter of a CSV payload, defaulting to a comma.

    Only the first 8 KB are inspected; undecodable bytes are replaced so
    decoding never fails, and :class:`csv.Sniffer` does the actual work.
    """
    head = file_bytes[:8192].decode("utf-8", errors="replace")
    try:
        return csv.Sniffer().sniff(head).delimiter
    except csv.Error:
        # Sniffer could not decide (e.g. empty or single-column input).
        return ","
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Reading uploads
66
+ # ---------------------------------------------------------------------------
67
+
68
def read_csv_upload(uploaded_file) -> tuple[pd.DataFrame, str]:
    """Parse an uploaded CSV file object into ``(dataframe, delimiter)``.

    The delimiter is auto-detected from the raw bytes, and the file
    position is reset afterwards so callers may read the object again.
    """
    payload = uploaded_file.getvalue()
    sep = detect_delimiter(payload)
    text = payload.decode("utf-8", errors="replace")
    frame = pd.read_csv(io.StringIO(text), sep=sep)
    # Rewind in case the caller wants to read again
    uploaded_file.seek(0)
    return frame, sep
81
+
82
+
83
+ # ---------------------------------------------------------------------------
84
+ # Column suggestion helpers
85
+ # ---------------------------------------------------------------------------
86
+
87
# Case-insensitive keywords suggesting a column holds date-like values.
_DATE_NAME_TOKENS = re.compile(r"(date|time|year|month|day|period)", re.IGNORECASE)
88
+
89
+
90
def suggest_date_columns(df: pd.DataFrame) -> list[str]:
    """Return column names that are likely to contain date/time values.

    Checks are applied in order:

    1. Column already has a datetime dtype.
    2. :func:`pd.to_datetime` succeeds on the first non-null values.
       Numeric columns are excluded from this check: ``to_datetime``
       happily interprets raw numbers as nanosecond epochs, which would
       falsely flag every numeric column as a date.
    3. The column *name* contains a date-related keyword (this still
       applies to numeric columns, e.g. an integer ``year`` column).
    """
    candidates: list[str] = []

    def _add(col) -> None:
        # Preserve first-hit ordering without duplicates.
        if col not in candidates:
            candidates.append(col)

    for col in df.columns:
        # 1. Already datetime
        if pd.api.types.is_datetime64_any_dtype(df[col]):
            _add(col)
            continue

        # 2. Parseable as datetime (check up to first 5 non-null values).
        #    BUG FIX: skip numeric dtypes here — pd.to_datetime accepts
        #    plain numbers (epoch nanoseconds) without error.
        if not pd.api.types.is_numeric_dtype(df[col]):
            sample = df[col].dropna().head(5)
            if not sample.empty:
                try:
                    pd.to_datetime(sample)
                    _add(col)
                    continue
                except (ValueError, TypeError, OverflowError):
                    pass

        # 3. Column name heuristic
        if _DATE_NAME_TOKENS.search(str(col)):
            _add(col)

    return candidates
125
+
126
+
127
def suggest_numeric_columns(df: pd.DataFrame) -> list[str]:
    """Return columns that are numeric or could be cleaned to numeric.

    A non-numeric column qualifies when, after stripping common formatting
    (currency symbols, thousands commas, ``%``, accounting parentheses),
    at least half of a 50-row non-null sample converts to numbers.
    """
    numeric_like: list[str] = []

    for name in df.columns:
        column = df[name]
        if pd.api.types.is_numeric_dtype(column):
            numeric_like.append(name)
            continue

        # Attempt lightweight cleaning on a sample of string values.
        sample = column.dropna().head(50).astype(str)
        if sample.empty:
            continue

        stripped = sample.str.replace(r"[\$\u20ac\u00a3,% ]", "", regex=True)
        stripped = stripped.str.replace(r"^\((.+)\)$", r"-\1", regex=True)
        converted = pd.to_numeric(stripped, errors="coerce")
        if converted.notna().sum() >= max(1, len(sample) * 0.5):
            numeric_like.append(name)

    return numeric_like
156
+
157
+
158
+ # ---------------------------------------------------------------------------
159
+ # Numeric cleaning
160
+ # ---------------------------------------------------------------------------
161
+
162
def clean_numeric_series(series: pd.Series) -> pd.Series:
    """Coerce formatted values into floats; unparseable entries become NaN.

    Handles dollar/euro/pound signs, thousands commas, percent marks,
    embedded whitespace, and accounting-style parenthesised negatives
    (``(123.45)`` becomes ``-123.45``).
    """
    text = series.astype(str)

    # One pass drops currency symbols, commas, percent signs, whitespace.
    text = text.str.replace(r"[\$\u20ac\u00a3,%\s]", "", regex=True)

    # Accounting negatives: (123.45) -> -123.45
    text = text.str.replace(r"^\((.+)\)$", r"-\1", regex=True)

    return pd.to_numeric(text, errors="coerce")
180
+
181
+
182
+ # ---------------------------------------------------------------------------
183
+ # Full cleaning pipeline
184
+ # ---------------------------------------------------------------------------
185
+
186
def clean_dataframe(
    df: pd.DataFrame,
    date_col: str,
    y_cols: list[str],
    dup_action: str = "keep_last",
    missing_action: str = "interpolate",
) -> tuple[pd.DataFrame, CleaningReport]:
    """Run the full cleaning pipeline and return ``(cleaned_df, report)``.

    Steps, in order: parse dates (dropping unparseable rows), coerce the
    value columns to numeric, resolve duplicate dates, sort by date, then
    impute or drop missing values. The input dataframe is never mutated.

    Parameters
    ----------
    df:
        Input dataframe (copied internally).
    date_col:
        Name of the column to parse as dates.
    y_cols:
        Names of the value columns to clean to numeric.
    dup_action:
        ``"keep_first"``, ``"keep_last"``, or ``"drop_all"``.
    missing_action:
        ``"interpolate"``, ``"ffill"``, or ``"drop"``.
    """
    work = df.copy()
    report = CleaningReport(rows_before=len(work))

    # Parse dates; on a wholesale failure, retry element-wise so partial
    # failures become NaT instead of aborting.
    try:
        work[date_col] = pd.to_datetime(work[date_col])
    except Exception as exc:  # noqa: BLE001
        report.parsing_warnings.append(
            f"Date parsing issue in column '{date_col}': {exc}"
        )
        work[date_col] = pd.to_datetime(work[date_col], errors="coerce")

    unparsed = int(work[date_col].isna().sum())
    if unparsed > 0:
        report.parsing_warnings.append(
            f"{unparsed} value(s) in '{date_col}' could not be parsed as dates."
        )
        work = work.dropna(subset=[date_col])

    # Coerce value columns that are not already numeric.
    for col in y_cols:
        if not pd.api.types.is_numeric_dtype(work[col]):
            work[col] = clean_numeric_series(work[col])

    # Missing-value snapshot *before* imputation (but after coercion).
    report.missing_before = {col: int(work[col].isna().sum()) for col in y_cols}

    # Duplicate dates: keep=False marks every member of a duplicate group.
    dup_mask = work.duplicated(subset=[date_col], keep=False)
    report.duplicates_found = int(dup_mask.sum())
    report.duplicates_action = dup_action

    if report.duplicates_found > 0:
        if dup_action == "keep_first":
            work = work.drop_duplicates(subset=[date_col], keep="first")
        elif dup_action == "keep_last":
            work = work.drop_duplicates(subset=[date_col], keep="last")
        elif dup_action == "drop_all":
            work = work[~dup_mask]

    # Chronological order with a fresh index.
    work = work.sort_values(date_col).reset_index(drop=True)

    # Missing-value handling.
    if missing_action == "interpolate":
        work[y_cols] = work[y_cols].interpolate(method="linear", limit_direction="both")
    elif missing_action == "ffill":
        # Forward-fill, then back-fill so leading gaps are covered too.
        work[y_cols] = work[y_cols].ffill().bfill()
    elif missing_action == "drop":
        work = work.dropna(subset=y_cols)

    report.missing_after = {col: int(work[col].isna().sum()) for col in y_cols}
    report.rows_after = len(work)

    return work, report
271
+
272
+
273
+ # ---------------------------------------------------------------------------
274
+ # Frequency detection
275
+ # ---------------------------------------------------------------------------
276
+
277
def detect_frequency(df: pd.DataFrame, date_col: str) -> FrequencyInfo:
    """Classify the sampling frequency from the median gap between dates.

    The series counts as *regular* when the standard deviation of the
    gaps stays below 20% of the median gap. Fewer than two usable dates
    yields the "Unknown" label.
    """
    dates = df[date_col].dropna().sort_values()
    if len(dates) < 2:
        return FrequencyInfo(label="Unknown", median_delta=timedelta(0), is_regular=False)

    gaps = dates.diff().dropna()
    med = gaps.median()

    # Regularity: std < 20% of median (guard against a zero median).
    regular = bool(gaps.std() <= med * 0.2) if med > timedelta(0) else False

    # Map the median gap (whole days) onto a coarse frequency bucket.
    day_count = med.days
    if day_count <= 1:
        bucket = "Daily"
    elif 5 <= day_count <= 9:
        bucket = "Weekly"
    elif 25 <= day_count <= 35:
        bucket = "Monthly"
    elif 85 <= day_count <= 100:
        bucket = "Quarterly"
    elif 350 <= day_count <= 380:
        bucket = "Yearly"
    else:
        bucket = "Irregular"

    return FrequencyInfo(label=bucket, median_delta=med, is_regular=regular)
312
+
313
+
314
+ # ---------------------------------------------------------------------------
315
+ # Calendar feature extraction
316
+ # ---------------------------------------------------------------------------
317
+
318
def add_time_features(df: pd.DataFrame, date_col: str) -> pd.DataFrame:
    """Append ``year``, ``quarter``, ``month``, ``day_of_week`` columns.

    NOTE: the input dataframe is modified in place and also returned
    (no copy is made).
    """
    stamps = df[date_col].dt
    for feature, values in (
        ("year", stamps.year),
        ("quarter", stamps.quarter),
        ("month", stamps.month),
        ("day_of_week", stamps.dayofweek),
    ):
        df[feature] = values
    return df
src/diagnostics.py ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Time-series diagnostics utilities.
2
+
3
+ Provides summary statistics, stationarity tests, trend estimation,
4
+ autocorrelation analysis, seasonal decomposition, rolling statistics,
5
+ year-over-year change computation, and multi-series summaries.
6
+ """
7
+
8
+ from dataclasses import dataclass
9
+ from typing import Optional
10
+
11
+ import numpy as np
12
+ import pandas as pd
13
+ from numpy.typing import NDArray
14
+ from scipy import stats
15
+ from statsmodels.tsa.stattools import adfuller, acf, pacf
16
+ from statsmodels.tsa.seasonal import seasonal_decompose, DecomposeResult
17
+
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Data classes
21
+ # ---------------------------------------------------------------------------
22
+
23
@dataclass
class SummaryStats:
    """Container for univariate time-series summary statistics."""

    # Observation counts
    count: int           # non-null observations
    missing_count: int   # null observations
    missing_pct: float   # missing_count as a percentage of all rows
    # Distribution of the value column
    min_val: float
    max_val: float
    mean_val: float
    median_val: float
    std_val: float
    p25: float           # 25th percentile
    p75: float           # 75th percentile
    # Date coverage
    date_start: pd.Timestamp
    date_end: pd.Timestamp
    date_span_days: int
    # Linear trend fit (OLS against observation index)
    trend_slope: float
    trend_pvalue: float
    # Augmented Dickey-Fuller stationarity test
    adf_statistic: float
    adf_pvalue: float
44
+
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # Core helper functions
48
+ # ---------------------------------------------------------------------------
49
+
50
def compute_adf_test(series: pd.Series) -> tuple[float, float]:
    """Run the Augmented Dickey-Fuller stationarity test.

    NaNs are dropped first. Returns ``(statistic, p_value)``, or
    ``(nan, nan)`` when the test cannot run (too few observations,
    constant data, or any other statsmodels failure).
    """
    values = series.dropna()
    if len(values) < 2:
        return np.nan, np.nan
    try:
        stat, pvalue = adfuller(values, autolag="AIC")[:2]
    except Exception:
        return np.nan, np.nan
    return float(stat), float(pvalue)
72
+
73
+
74
def compute_trend_slope(
    df: pd.DataFrame,
    date_col: str,
    y_col: str,
) -> tuple[float, float]:
    """Fit a straight line (OLS) against observation order.

    Rows with a missing date or value are dropped first. Returns
    ``(slope, p_value)`` per observation step, or ``(nan, nan)`` when
    fewer than two complete rows remain or the fit fails.
    """
    rows = df[[date_col, y_col]].dropna()
    if len(rows) < 2:
        return np.nan, np.nan
    try:
        fit = stats.linregress(
            np.arange(len(rows), dtype=float),
            rows[y_col].astype(float).values,
        )
        return float(fit.slope), float(fit.pvalue)
    except Exception:
        return np.nan, np.nan
106
+
107
+
108
+ # ---------------------------------------------------------------------------
109
+ # Summary statistics
110
+ # ---------------------------------------------------------------------------
111
+
112
def compute_summary_stats(
    df: pd.DataFrame,
    date_col: str,
    y_col: str,
) -> SummaryStats:
    """Bundle descriptive stats, date coverage, trend fit, and ADF results.

    Parameters
    ----------
    df : pd.DataFrame
        Source data.
    date_col : str
        Name of the datetime column.
    y_col : str
        Name of the numeric value column.

    Returns
    -------
    SummaryStats
        Dataclass with descriptive statistics, date-range information,
        OLS trend slope / p-value, and ADF test results.
    """
    values = df[y_col]
    stamps = pd.to_datetime(df[date_col])

    n_total = len(values)
    n_missing = int(values.isna().sum())

    first = stamps.min()
    last = stamps.max()

    slope, slope_p = compute_trend_slope(df, date_col, y_col)
    adf_stat, adf_p = compute_adf_test(values)

    return SummaryStats(
        count=int(values.notna().sum()),
        missing_count=n_missing,
        missing_pct=(n_missing / n_total * 100.0) if n_total > 0 else 0.0,
        min_val=float(values.min()),
        max_val=float(values.max()),
        mean_val=float(values.mean()),
        median_val=float(values.median()),
        std_val=float(values.std()),
        p25=float(values.quantile(0.25)),
        p75=float(values.quantile(0.75)),
        date_start=first,
        date_end=last,
        date_span_days=int((last - first).days),
        trend_slope=slope,
        trend_pvalue=slope_p,
        adf_statistic=adf_stat,
        adf_pvalue=adf_p,
    )
176
+
177
+
178
+ # ---------------------------------------------------------------------------
179
+ # Autocorrelation / partial autocorrelation
180
+ # ---------------------------------------------------------------------------
181
+
182
def compute_acf_pacf(
    series: pd.Series,
    nlags: int = 40,
) -> tuple[NDArray, NDArray, NDArray, NDArray]:
    """Compute ACF and PACF with confidence intervals.

    Parameters
    ----------
    series : pd.Series
        The time-series values (NaNs are dropped automatically).
    nlags : int, optional
        Maximum number of lags (default 40).  Automatically reduced to just
        under half the (non-NaN) sample size, which is the largest lag the
        PACF estimator supports.

    Returns
    -------
    tuple[ndarray, ndarray, ndarray, ndarray]
        ``(acf_values, acf_confint, pacf_values, pacf_confint)``

        * ``acf_values`` -- shape ``(nlags + 1,)``
        * ``acf_confint`` -- shape ``(nlags + 1, 2)``
        * ``pacf_values`` -- shape ``(nlags + 1,)``
        * ``pacf_confint`` -- shape ``(nlags + 1, 2)``

    Raises
    ------
    ValueError
        If fewer than 4 non-NaN observations are available (the minimum for
        which at least one PACF lag can be estimated).
    """
    clean = series.dropna().values.astype(float)

    # statsmodels' pacf raises when nlags >= nobs // 2 ("lags up to 50% of
    # the sample size").  The old cap of nobs - 1 did not prevent that, so
    # e.g. a 30-point series with the default nlags=40 crashed.  Use the
    # same (stricter) cap for the ACF too so the returned arrays stay
    # aligned lag-for-lag.
    max_lag = len(clean) // 2 - 1
    if max_lag < 1:
        raise ValueError(
            "Series has fewer than 4 non-NaN observations; "
            "cannot compute ACF/PACF."
        )
    nlags = min(nlags, max_lag)

    acf_values, acf_confint = acf(clean, nlags=nlags, alpha=0.05)
    pacf_values, pacf_confint = pacf(clean, nlags=nlags, alpha=0.05)

    return acf_values, acf_confint, pacf_values, pacf_confint
221
+
222
+
223
+ # ---------------------------------------------------------------------------
224
+ # Seasonal decomposition
225
+ # ---------------------------------------------------------------------------
226
+
227
+ def _infer_period(df: pd.DataFrame, date_col: str) -> int:
228
+ """Best-effort period inference from the date column's frequency.
229
+
230
+ Returns a sensible integer period or raises ``ValueError`` when the
231
+ frequency cannot be determined.
232
+ """
233
+ dates = pd.to_datetime(df[date_col])
234
+ freq = pd.infer_freq(dates)
235
+ if freq is None:
236
+ raise ValueError(
237
+ "Cannot infer a regular frequency from the date column. "
238
+ "Please supply an explicit 'period' argument or resample the "
239
+ "data to a regular frequency before calling compute_decomposition."
240
+ )
241
+
242
+ # Map common frequency strings to typical seasonal periods.
243
+ freq_upper = freq.upper()
244
+ period_map: dict[str, int] = {
245
+ "D": 365,
246
+ "B": 252, # business days in a year
247
+ "W": 52,
248
+ "SM": 24, # semi-monthly
249
+ "BMS": 12,
250
+ "BM": 12,
251
+ "MS": 12,
252
+ "M": 12, # calendar month end
253
+ "ME": 12, # month-end (pandas >= 2.2)
254
+ "QS": 4,
255
+ "Q": 4,
256
+ "QE": 4,
257
+ "BQ": 4,
258
+ "AS": 1,
259
+ "A": 1,
260
+ "YS": 1,
261
+ "Y": 1,
262
+ "YE": 1,
263
+ "H": 24,
264
+ "T": 60,
265
+ "MIN": 60,
266
+ "S": 60,
267
+ }
268
+
269
+ # Strip leading digits (e.g. "2W" -> "W") to normalise anchored offsets.
270
+ stripped = freq_upper.lstrip("0123456789")
271
+ # Also strip any anchor suffix like "W-SUN" -> "W".
272
+ base = stripped.split("-")[0]
273
+
274
+ if base in period_map:
275
+ return period_map[base]
276
+
277
+ raise ValueError(
278
+ f"Unable to map inferred frequency '{freq}' to a seasonal period. "
279
+ "Please provide an explicit 'period' argument."
280
+ )
281
+
282
+
283
def compute_decomposition(
    df: pd.DataFrame,
    date_col: str,
    y_col: str,
    model: str = "additive",
    period: Optional[int] = None,
) -> DecomposeResult:
    """Decompose a time series into trend, seasonal, and residual components.

    Parameters
    ----------
    df : pd.DataFrame
        Source data.
    date_col : str
        Datetime column name.
    y_col : str
        Numeric value column name.
    model : str, optional
        ``"additive"`` (default) or ``"multiplicative"``.
    period : int or None, optional
        Seasonal period.  When *None* the period is inferred from the date
        column's frequency.

    Returns
    -------
    statsmodels.tsa.seasonal.DecomposeResult

    Raises
    ------
    ValueError
        If a regular frequency cannot be inferred and *period* is not given.
    """
    frame = df[[date_col, y_col]].copy().set_index(date_col).sort_index()
    frame.index = pd.to_datetime(frame.index)

    # Fill small interior gaps so decomposition does not fail on a handful
    # of NaNs.
    frame[y_col] = frame[y_col].ffill().bfill()

    if period is None:
        period = _infer_period(df, date_col)

    # Give the index an explicit frequency when one can be inferred;
    # seasonal_decompose prefers it.  The explicit `period` still acts as
    # the fallback when inference fails.
    if frame.index.freq is None:
        freq_guess = pd.infer_freq(frame.index)
        if freq_guess is not None:
            frame = frame.asfreq(freq_guess)
            frame[y_col] = frame[y_col].ffill().bfill()

    return seasonal_decompose(frame[y_col], model=model, period=period)
339
+
340
+
341
+ # ---------------------------------------------------------------------------
342
+ # Rolling statistics
343
+ # ---------------------------------------------------------------------------
344
+
345
def compute_rolling_stats(
    df: pd.DataFrame,
    y_col: str,
    window: int = 12,
) -> pd.DataFrame:
    """Return a copy of *df* with rolling mean / std columns appended.

    Parameters
    ----------
    df : pd.DataFrame
        Source data (not mutated).
    y_col : str
        Column over which rolling statistics are calculated.
    window : int, optional
        Rolling window size (default 12).

    Returns
    -------
    pd.DataFrame
        Copy of *df* with two extra columns: ``rolling_mean`` and
        ``rolling_std``.
    """
    result = df.copy()
    # min_periods=1 keeps the leading edge populated instead of NaN-padded.
    roller = result[y_col].rolling(window=window, min_periods=1)
    result["rolling_mean"] = roller.mean()
    result["rolling_std"] = roller.std()
    return result
371
+
372
+
373
+ # ---------------------------------------------------------------------------
374
+ # Year-over-year change
375
+ # ---------------------------------------------------------------------------
376
+
377
+ def _offset_for_frequency(df: pd.DataFrame, date_col: str) -> pd.DateOffset:
378
+ """Return a 1-year ``DateOffset`` appropriate to the series frequency."""
379
+ dates = pd.to_datetime(df[date_col])
380
+ freq = pd.infer_freq(dates)
381
+
382
+ if freq is not None:
383
+ freq_upper = freq.upper().lstrip("0123456789").split("-")[0]
384
+ # For sub-monthly frequencies we shift by 365 days / 52 weeks etc.
385
+ if freq_upper in {"D", "B"}:
386
+ return pd.DateOffset(days=365)
387
+ if freq_upper in {"W"}:
388
+ return pd.DateOffset(weeks=52)
389
+ if freq_upper in {"H", "T", "MIN", "S"}:
390
+ return pd.DateOffset(days=365)
391
+
392
+ # Default: shift by 12 months (works for M, Q, and annual data).
393
+ return pd.DateOffset(months=12)
394
+
395
+
396
def compute_yoy_change(
    df: pd.DataFrame,
    date_col: str,
    y_col: str,
) -> pd.DataFrame:
    """Compute year-over-year absolute and percentage change.

    The number of periods to shift is determined from the inferred frequency
    of the date column.

    Parameters
    ----------
    df : pd.DataFrame
        Source data (not mutated).
    date_col : str
        Datetime column name.
    y_col : str
        Numeric value column name.

    Returns
    -------
    pd.DataFrame
        Copy of *df* sorted by *date_col* with additional columns
        ``yoy_abs_change`` and ``yoy_pct_change``.
    """
    result = df.copy().sort_values(date_col).reset_index(drop=True)
    result[date_col] = pd.to_datetime(result[date_col])

    # Rows corresponding to ~1 year for each recognised frequency code.
    rows_per_year: dict[str, int] = {
        "D": 365,
        "B": 252,
        "W": 52,
        "SM": 24,
        "BMS": 12,
        "BM": 12,
        "MS": 12,
        "M": 12,
        "ME": 12,
        "QS": 4,
        "Q": 4,
        "QE": 4,
        "BQ": 4,
        "AS": 1,
        "A": 1,
        "YS": 1,
        "Y": 1,
        "YE": 1,
        "H": 8760,
        "T": 525600,
        "MIN": 525600,
        "S": 31536000,
    }

    inferred = pd.infer_freq(result[date_col])
    if inferred is None:
        # Irregular spacing: fall back to a monthly assumption.
        shift_by = 12
    else:
        code = inferred.upper().lstrip("0123456789").split("-")[0]
        shift_by = rows_per_year.get(code, 12)

    prior = result[y_col].shift(shift_by)
    result["yoy_abs_change"] = result[y_col] - prior
    # Guard against division by zero: a zero prior value yields NaN percent.
    result["yoy_pct_change"] = (
        result["yoy_abs_change"] / prior.abs().replace(0, np.nan) * 100.0
    )

    return result
463
+
464
+
465
+ # ---------------------------------------------------------------------------
466
+ # Multi-series summary
467
+ # ---------------------------------------------------------------------------
468
+
469
def compute_multi_series_summary(
    df: pd.DataFrame,
    date_col: str,
    y_cols: list[str],
) -> pd.DataFrame:
    """Produce a summary DataFrame with one row per value column.

    Parameters
    ----------
    df : pd.DataFrame
        Source data.
    date_col : str
        Datetime column name.
    y_cols : list[str]
        List of numeric column names to summarise.

    Returns
    -------
    pd.DataFrame
        Columns: ``variable``, ``count``, ``mean``, ``std``, ``min``,
        ``max``, ``trend_slope``, ``adf_pvalue``.
    """
    records: list[dict] = []
    for name in y_cols:
        values = df[name]
        slope, _pvalue = compute_trend_slope(df, date_col, name)
        _stat, adf_pvalue = compute_adf_test(values)
        records.append(
            {
                "variable": name,
                "count": int(values.notna().sum()),
                "mean": float(values.mean()),
                "std": float(values.std()),
                "min": float(values.min()),
                "max": float(values.max()),
                "trend_slope": slope,
                "adf_pvalue": adf_pvalue,
            }
        )

    return pd.DataFrame(records)
src/plotting.py ADDED
@@ -0,0 +1,671 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ plotting.py
3
+ -----------
4
+ Chart-generation functions for time-series visualisation.
5
+
6
+ Every public function returns a :class:`matplotlib.figure.Figure` object.
7
+ Callers (e.g. Streamlit pages) can pass the figure to ``st.pyplot(fig)``
8
+ or convert it to PNG bytes via :func:`fig_to_png_bytes`.
9
+
10
+ All functions accept an optional *style_dict* (typically from
11
+ :func:`ui_theme.get_miami_mpl_style`) and an optional *palette_colors*
12
+ list so that colours stay consistent across the application.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import io
18
+ import math
19
+ from typing import Dict, List, Optional, Sequence
20
+
21
+ # CRITICAL: set the non-interactive backend before any other mpl import.
22
+ import matplotlib
23
+ matplotlib.use("Agg")
24
+
25
+ import matplotlib.pyplot as plt # noqa: E402
26
+ import matplotlib.dates as mdates # noqa: E402
27
+ import numpy as np # noqa: E402
28
+ import pandas as pd # noqa: E402
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Brand defaults (mirrors ui_theme.py)
32
+ # ---------------------------------------------------------------------------
33
+ MIAMI_RED: str = "#C41230"
34
+ _DEFAULT_FIG_SIZE = (10, 6)
35
+
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Utility
39
+ # ---------------------------------------------------------------------------
40
+
41
def fig_to_png_bytes(fig: matplotlib.figure.Figure, dpi: int = 150) -> bytes:
    """Render *fig* to an in-memory PNG and return the raw bytes."""
    with io.BytesIO() as buffer:
        fig.savefig(buffer, format="png", dpi=dpi, bbox_inches="tight")
        return buffer.getvalue()
47
+
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Internal helpers
51
+ # ---------------------------------------------------------------------------
52
+
53
+ class _StyleContext:
54
+ """Context manager that temporarily applies *style_dict* to rcParams.
55
+
56
+ On exit the previous values are restored so that other figures are not
57
+ affected.
58
+ """
59
+
60
+ def __init__(self, style_dict: Optional[Dict[str, object]]):
61
+ self._style = style_dict
62
+ self._saved: Dict[str, object] = {}
63
+
64
+ def __enter__(self) -> "_StyleContext":
65
+ if self._style:
66
+ for key, value in self._style.items():
67
+ self._saved[key] = plt.rcParams.get(key)
68
+ try:
69
+ plt.rcParams[key] = value
70
+ except (KeyError, ValueError):
71
+ pass
72
+ return self
73
+
74
+ def __exit__(self, *exc_info: object) -> None:
75
+ for key, value in self._saved.items():
76
+ try:
77
+ plt.rcParams[key] = value
78
+ except (KeyError, ValueError):
79
+ pass
80
+
81
+
82
+ def _default_color(palette_colors: Optional[List[str]], idx: int = 0) -> str:
83
+ """Pick a colour from *palette_colors* or fall back to MIAMI_RED."""
84
+ if palette_colors and len(palette_colors) > idx:
85
+ return palette_colors[idx % len(palette_colors)]
86
+ return MIAMI_RED
87
+
88
+
89
def _finish_figure(fig: matplotlib.figure.Figure) -> matplotlib.figure.Figure:
    """Apply common finishing touches and return the figure.

    Currently this only runs ``tight_layout`` so labels and titles do not
    overlap; returning *fig* lets plot functions end with
    ``return _finish_figure(fig)``.
    """
    fig.tight_layout()
    return fig
93
+
94
+
95
def _auto_date_axis(ax: plt.Axes) -> None:
    """Auto-format and rotate date tick labels on the x-axis of *ax*."""
    locator = mdates.AutoDateLocator()
    ax.xaxis.set_major_formatter(mdates.AutoDateFormatter(locator))
    # Slanted, right-anchored labels avoid overlap on dense date axes.
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(30)
        tick_label.set_ha("right")
101
+
102
+
103
+ def _grid_dims(n: int) -> tuple[int, int]:
104
+ """Return (nrows, ncols) for a compact grid of *n* panels."""
105
+ ncols = min(n, 3)
106
+ nrows = math.ceil(n / ncols)
107
+ return nrows, ncols
108
+
109
+
110
+ # ===================================================================
111
+ # 1. Line with markers
112
+ # ===================================================================
113
+
114
def plot_line_with_markers(
    df: pd.DataFrame,
    date_col: str,
    y_col: str,
    title: Optional[str] = None,
    style_dict: Optional[Dict[str, object]] = None,
    palette_colors: Optional[List[str]] = None,
) -> matplotlib.figure.Figure:
    """Simple line plot with small circle markers.

    Uses the first palette colour or *MIAMI_RED* as the default.
    """
    with _StyleContext(style_dict):
        fig, ax = plt.subplots(figsize=_DEFAULT_FIG_SIZE)
        ax.plot(
            df[date_col],
            df[y_col],
            marker="o",
            markersize=4,
            linewidth=1.5,
            color=_default_color(palette_colors, 0),
            label=y_col,
        )
        ax.set_xlabel(date_col)
        ax.set_ylabel(y_col)
        if title:
            ax.set_title(title)
        _auto_date_axis(ax)
        ax.legend(loc="best")
        return _finish_figure(fig)
141
+
142
+
143
+ # ===================================================================
144
+ # 2. Line with coloured markers
145
+ # ===================================================================
146
+
147
def plot_line_colored_markers(
    df: pd.DataFrame,
    date_col: str,
    y_col: str,
    color_by: str,
    palette_colors: List[str],
    title: Optional[str] = None,
    style_dict: Optional[Dict[str, object]] = None,
) -> matplotlib.figure.Figure:
    """Line plot where marker colour varies by a categorical column.

    A legend is added mapping each unique value of *color_by* to its
    colour.
    """
    with _StyleContext(style_dict):
        fig, ax = plt.subplots(figsize=_DEFAULT_FIG_SIZE)

        # Neutral grey connecting line drawn underneath the coloured markers.
        ax.plot(df[date_col], df[y_col], linewidth=1.0, color="#AAAAAA", zorder=1)

        categories = df[color_by].unique()
        n_cats = len(categories)
        # Wrap around the palette (modulo) when there are more categories
        # than colours — same assignment as cycling the palette to length.
        color_map = {
            cat: palette_colors[i % len(palette_colors)]
            for i, cat in enumerate(categories)
        }

        for cat in categories:
            selected = df[color_by] == cat
            ax.scatter(
                df.loc[selected, date_col],
                df.loc[selected, y_col],
                c=color_map[cat],
                label=str(cat),
                s=30,
                zorder=2,
                edgecolors="white",
                linewidths=0.3,
            )

        ax.set_xlabel(date_col)
        ax.set_ylabel(y_col)
        if title:
            ax.set_title(title)
        _auto_date_axis(ax)
        ax.legend(title=color_by, loc="best", fontsize=8, ncol=max(1, n_cats // 8))
        return _finish_figure(fig)
197
+
198
+
199
+ # ===================================================================
200
+ # 3. Seasonal plot
201
+ # ===================================================================
202
+
203
def plot_seasonal(
    df: pd.DataFrame,
    date_col: str,
    y_col: str,
    period: str,
    palette_name_colors: List[str],
    title: Optional[str] = None,
    style_dict: Optional[Dict[str, object]] = None,
) -> matplotlib.figure.Figure:
    """Seasonal plot: one line per year/cycle, x-axis is within-period position.

    Parameters
    ----------
    period:
        ``"month"`` (x = month 1-12) or ``"quarter"`` (x = quarter 1-4).
    palette_name_colors:
        List of hex colours; one per cycle/year.
    """
    with _StyleContext(style_dict):
        # NOTE(review): uses the .dt accessor — assumes date_col is already
        # datetime dtype; confirm upstream conversion.
        data = df[[date_col, y_col]].copy()
        data["_year"] = data[date_col].dt.year

        if period.lower().startswith("q"):
            data["_period_pos"] = data[date_col].dt.quarter
            x_label = "Quarter"
        else:
            data["_period_pos"] = data[date_col].dt.month
            x_label = "Month"

        years = sorted(data["_year"].unique())
        n_years = len(years)

        fig, ax = plt.subplots(figsize=_DEFAULT_FIG_SIZE)
        for i, year in enumerate(years):
            one_year = data[data["_year"] == year].sort_values("_period_pos")
            ax.plot(
                one_year["_period_pos"],
                one_year[y_col],
                marker="o",
                markersize=4,
                linewidth=1.4,
                # Wrap the palette when there are more years than colours.
                color=palette_name_colors[i % len(palette_name_colors)],
                label=str(year),
            )

        ax.set_xlabel(x_label)
        ax.set_ylabel(y_col)
        if title:
            ax.set_title(title)
        ax.legend(title="Year", loc="best", fontsize=8, ncol=max(1, n_years // 6))
        return _finish_figure(fig)
255
+
256
+
257
+ # ===================================================================
258
+ # 4. Seasonal sub-series
259
+ # ===================================================================
260
+
261
def plot_seasonal_subseries(
    df: pd.DataFrame,
    date_col: str,
    y_col: str,
    period: str,
    title: Optional[str] = None,
    style_dict: Optional[Dict[str, object]] = None,
    palette_colors: Optional[List[str]] = None,
) -> matplotlib.figure.Figure:
    """Subseries plot with vertical panels for each season and horizontal mean lines.

    One panel per season (month or quarter); within a panel, observations
    are plotted in chronological order and a dashed red line marks that
    season's mean.

    Parameters
    ----------
    period:
        ``"month"`` or ``"quarter"``.
    palette_colors:
        Optional palette; the first colour is used for the data lines.
    """
    with _StyleContext(style_dict):
        # NOTE(review): uses the .dt accessor — assumes date_col is already
        # datetime dtype; confirm upstream conversion.
        tmp = df[[date_col, y_col]].copy()

        if period.lower().startswith("q"):
            tmp["_season"] = tmp[date_col].dt.quarter
            labels = {1: "Q1", 2: "Q2", 3: "Q3", 4: "Q4"}
        else:
            tmp["_season"] = tmp[date_col].dt.month
            labels = {
                1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr",
                5: "May", 6: "Jun", 7: "Jul", 8: "Aug",
                9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec",
            }

        seasons = sorted(tmp["_season"].unique())
        n = len(seasons)
        # Widen the figure with the number of panels, never below 10 in.
        fig_w = max(10, n * 1.3)
        fig, axes = plt.subplots(1, n, figsize=(fig_w, 5), sharey=True)
        if n == 1:
            # plt.subplots returns a bare Axes (not an array) when n == 1.
            axes = [axes]

        color = _default_color(palette_colors, 0)

        for idx, season in enumerate(seasons):
            ax = axes[idx]
            sub = tmp[tmp["_season"] == season].sort_values(date_col)
            # Positional x-axis within the panel: one slot per observation,
            # in chronological order.
            x_positions = range(len(sub))
            ax.plot(x_positions, sub[y_col].values, marker="o", markersize=3,
                    linewidth=1.2, color=color)

            # Dashed horizontal line at this season's mean.
            mean_val = sub[y_col].mean()
            ax.axhline(mean_val, color=MIAMI_RED, linewidth=1.8, linestyle="--", alpha=0.8)

            ax.set_title(labels.get(season, str(season)), fontsize=10)
            ax.set_xticks([])
            ax.tick_params(axis="y", labelsize=8)
            # y-label only on the leftmost panel (shared y-axis).
            if idx == 0:
                ax.set_ylabel(y_col)

        if title:
            fig.suptitle(title, fontsize=14, fontweight="bold", y=1.02)
        return _finish_figure(fig)
319
+
320
+
321
+ # ===================================================================
322
+ # 5. ACF / PACF
323
+ # ===================================================================
324
+
325
def plot_acf_pacf(
    acf_vals: np.ndarray,
    acf_ci: np.ndarray,
    pacf_vals: np.ndarray,
    pacf_ci: np.ndarray,
    title: Optional[str] = None,
    style_dict: Optional[Dict[str, object]] = None,
) -> matplotlib.figure.Figure:
    """Side-by-side ACF and PACF bar plots with confidence-interval bands.

    Parameters
    ----------
    acf_vals, pacf_vals:
        1-D arrays of autocorrelation values (lag 0, 1, ...).
    acf_ci, pacf_ci:
        Arrays of shape ``(n_lags, 2)`` giving the lower and upper CI bounds.
    """
    with _StyleContext(style_dict):
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        panel_specs = zip(
            axes,
            (acf_vals, pacf_vals),
            (acf_ci, pacf_ci),
            ("ACF", "PACF"),
        )
        for ax, vals, ci, panel_title in panel_specs:
            lags = np.arange(len(vals))
            ax.bar(lags, vals, width=0.3, color=MIAMI_RED, alpha=0.85, zorder=2)
            # Shaded confidence band behind the bars.
            ax.fill_between(lags, ci[:, 0], ci[:, 1], color="#C41230", alpha=0.12, zorder=1)
            ax.axhline(0, color="black", linewidth=0.8)
            ax.set_xlabel("Lag")
            ax.set_ylabel("Correlation")
            ax.set_title(panel_title)

        if title:
            fig.suptitle(title, fontsize=14, fontweight="bold", y=1.02)
        return _finish_figure(fig)
365
+
366
+
367
+ # ===================================================================
368
+ # 6. Decomposition
369
+ # ===================================================================
370
+
371
def plot_decomposition(
    decomposition_result,
    title: Optional[str] = None,
    style_dict: Optional[Dict[str, object]] = None,
) -> matplotlib.figure.Figure:
    """4-panel plot: observed, trend, seasonal, residual.

    Parameters
    ----------
    decomposition_result:
        An object with ``.observed``, ``.trend``, ``.seasonal``, and
        ``.resid`` attributes (e.g. from ``statsmodels.tsa.seasonal_decompose``).
    """
    with _StyleContext(style_dict):
        fig, axes = plt.subplots(4, 1, figsize=(10, 10), sharex=True)

        panels = (
            ("Observed", decomposition_result.observed),
            ("Trend", decomposition_result.trend),
            ("Seasonal", decomposition_result.seasonal),
            ("Residual", decomposition_result.resid),
        )
        for ax, (name, component) in zip(axes, panels):
            ax.plot(component.index, component.values, linewidth=1.2, color=MIAMI_RED)
            ax.set_ylabel(name, fontsize=10)
            ax.tick_params(axis="both", labelsize=9)

        # With sharex=True only the bottom panel carries tick labels.
        _auto_date_axis(axes[-1])

        if title:
            fig.suptitle(title, fontsize=14, fontweight="bold", y=1.01)
        return _finish_figure(fig)
404
+
405
+
406
+ # ===================================================================
407
+ # 7. Rolling overlay
408
+ # ===================================================================
409
+
410
def plot_rolling_overlay(
    df: pd.DataFrame,
    date_col: str,
    y_col: str,
    window: int,
    title: Optional[str] = None,
    style_dict: Optional[Dict[str, object]] = None,
    palette_colors: Optional[List[str]] = None,
) -> matplotlib.figure.Figure:
    """Original series (light) with rolling-mean overlay (bold) and ±1 std band."""
    with _StyleContext(style_dict):
        fig, ax = plt.subplots(figsize=_DEFAULT_FIG_SIZE)

        raw_color = _default_color(palette_colors, 0)
        # Second palette colour for the overlay when available, else grey.
        if palette_colors and len(palette_colors) > 1:
            mean_color = _default_color(palette_colors, 1)
        else:
            mean_color = "#333333"

        x = df[date_col]
        y = df[y_col]
        roller = y.rolling(window=window, center=True)
        mean_line = roller.mean()
        std_line = roller.std()

        # Raw series at low opacity underneath the overlay.
        ax.plot(x, y, linewidth=0.8, alpha=0.4, color=raw_color, label="Original")
        ax.plot(x, mean_line, linewidth=2.2, color=mean_color,
                label=f"{window}-pt Rolling Mean")
        ax.fill_between(
            x,
            mean_line - std_line,
            mean_line + std_line,
            alpha=0.15,
            color=mean_color,
            label="\u00b11 Std Dev",
        )

        ax.set_xlabel(date_col)
        ax.set_ylabel(y_col)
        if title:
            ax.set_title(title)
        _auto_date_axis(ax)
        ax.legend(loc="best")
        return _finish_figure(fig)
453
+
454
+
455
+ # ===================================================================
456
+ # 8. Year-over-Year change
457
+ # ===================================================================
458
+
459
def plot_yoy_change(
    df: pd.DataFrame,
    date_col: str,
    y_col: str,
    yoy_df: pd.DataFrame,
    title: Optional[str] = None,
    style_dict: Optional[Dict[str, object]] = None,
) -> matplotlib.figure.Figure:
    """Two-subplot bar chart: absolute YoY change (top) and percentage YoY change (bottom).

    Parameters
    ----------
    df, y_col:
        Unused by the plot; kept for signature compatibility with the other
        plot functions.
    date_col:
        Name of the date column to look for in *yoy_df* when it has no
        ``"date"`` column.
    yoy_df:
        DataFrame with a date column plus change columns.  Accepts either the
        generic names (``"date"``, ``"abs_change"``, ``"pct_change"``) or the
        names produced by ``compute_yoy_change`` in this project
        (*date_col*, ``"yoy_abs_change"``, ``"yoy_pct_change"``).

    Raises
    ------
    KeyError
        If none of the accepted column names are present in *yoy_df*.
    """

    def _pick(frame: pd.DataFrame, *candidates: str) -> str:
        """Return the first candidate column name present in *frame*."""
        for name in candidates:
            if name in frame.columns:
                return name
        raise KeyError(
            f"yoy_df must contain one of {candidates}; got {list(frame.columns)}"
        )

    with _StyleContext(style_dict):
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)

        # Resolve columns flexibly: the docstring names and the names
        # emitted by compute_yoy_change were previously inconsistent.
        dates = yoy_df[_pick(yoy_df, "date", date_col)]
        abs_change = yoy_df[_pick(yoy_df, "abs_change", "yoy_abs_change")]
        pct_change = yoy_df[_pick(yoy_df, "pct_change", "yoy_pct_change")]

        # Green bars for gains, red for losses (NaN compares False -> red,
        # but NaN-height bars are not drawn anyway).
        abs_colors = ["#2ca02c" if v >= 0 else "#d62728" for v in abs_change]
        pct_colors = ["#2ca02c" if v >= 0 else "#d62728" for v in pct_change]

        ax1.bar(dates, abs_change, color=abs_colors, width=20, edgecolor="white", linewidth=0.3)
        ax1.axhline(0, color="black", linewidth=0.6)
        ax1.set_ylabel("Absolute Change")
        ax1.set_title("Year-over-Year Absolute Change")

        ax2.bar(dates, pct_change, color=pct_colors, width=20, edgecolor="white", linewidth=0.3)
        ax2.axhline(0, color="black", linewidth=0.6)
        ax2.set_ylabel("% Change")
        ax2.set_title("Year-over-Year Percentage Change")

        _auto_date_axis(ax2)

        if title:
            fig.suptitle(title, fontsize=14, fontweight="bold", y=1.02)
        return _finish_figure(fig)
500
+
501
+
502
+ # ===================================================================
503
+ # 9. Lag plot
504
+ # ===================================================================
505
+
506
def plot_lag(
    series: pd.Series,
    lag: int = 1,
    title: Optional[str] = None,
    style_dict: Optional[Dict[str, object]] = None,
) -> matplotlib.figure.Figure:
    """Scatter plot of y(t) vs y(t-lag) with correlation-coefficient annotation.

    Parameters
    ----------
    series:
        The time-series values; NaNs are dropped before pairing.
    lag:
        Positive lag (default 1).

    Raises
    ------
    ValueError
        If *lag* is not positive, or the series has too few non-NaN
        observations to form at least two ``(y(t), y(t-lag))`` pairs.
    """
    with _StyleContext(style_dict):
        # Guard against degenerate inputs: lag=0 would make y[:-0] an empty
        # slice, and fewer than two pairs makes np.corrcoef meaningless.
        if lag < 1:
            raise ValueError(f"lag must be a positive integer, got {lag}")
        y = series.dropna().values
        if len(y) < lag + 2:
            raise ValueError(
                f"Need at least {lag + 2} non-NaN observations for a "
                f"lag-{lag} plot; got {len(y)}."
            )

        y_t = y[lag:]
        y_lag = y[:-lag]
        corr = np.corrcoef(y_t, y_lag)[0, 1]

        fig, ax = plt.subplots(figsize=(7, 7))
        ax.scatter(y_lag, y_t, alpha=0.5, s=20, color=MIAMI_RED, edgecolors="white", linewidths=0.3)

        # Correlation annotation pinned to the top-left corner.
        ax.annotate(
            f"r = {corr:.3f}",
            xy=(0.05, 0.95), xycoords="axes fraction",
            fontsize=12, fontweight="bold",
            bbox=dict(boxstyle="round,pad=0.3", facecolor="white", edgecolor="#CCCCCC", alpha=0.9),
            verticalalignment="top",
        )

        ax.set_xlabel(f"y(t\u2212{lag})")
        ax.set_ylabel("y(t)")
        if title:
            ax.set_title(title)
        else:
            ax.set_title(f"Lag-{lag} Plot")
        return _finish_figure(fig)
539
+
540
+
541
+ # ===================================================================
542
+ # 10. Panel (small multiples)
543
+ # ===================================================================
544
+
545
def plot_panel(
    df: pd.DataFrame,
    date_col: str,
    y_cols: List[str],
    chart_type: str = "line",
    shared_y: bool = True,
    title: Optional[str] = None,
    style_dict: Optional[Dict[str, object]] = None,
    palette_colors: Optional[List[str]] = None,
) -> matplotlib.figure.Figure:
    """Small multiples: one subplot per *y_col* arranged in a grid.

    Parameters
    ----------
    chart_type:
        ``"line"`` or ``"bar"``.
    shared_y:
        If ``True`` all panels share the same y-axis limits.
    """
    with _StyleContext(style_dict):
        n_panels = len(y_cols)
        nrows, ncols = _grid_dims(n_panels)
        fig, axes = plt.subplots(
            nrows,
            ncols,
            figsize=(max(8, ncols * 4.5), max(4, nrows * 3.5)),
            sharey=shared_y,
            squeeze=False,
        )
        panels = axes.flatten()

        for i, (ax, col) in enumerate(zip(panels, y_cols)):
            color = _default_color(palette_colors, i)
            if chart_type == "bar":
                ax.bar(df[date_col], df[col], color=color, width=2,
                       edgecolor="white", linewidth=0.3)
            else:
                ax.plot(df[date_col], df[col], linewidth=1.3, color=color)
            ax.set_title(col, fontsize=10)
            _auto_date_axis(ax)

        # Grid cells beyond the last series stay blank.
        for spare in panels[n_panels:]:
            spare.set_visible(False)

        if title:
            fig.suptitle(title, fontsize=14, fontweight="bold", y=1.02)
        return _finish_figure(fig)
594
+
595
+
596
+ # ===================================================================
597
+ # 11. Spaghetti plot
598
+ # ===================================================================
599
+
600
def plot_spaghetti(
    df: pd.DataFrame,
    date_col: str,
    y_cols: List[str],
    alpha: float = 0.15,
    highlight_col: Optional[str] = None,
    top_n: Optional[int] = None,
    show_median_band: bool = False,
    title: Optional[str] = None,
    style_dict: Optional[Dict[str, object]] = None,
    palette_colors: Optional[List[str]] = None,
) -> matplotlib.figure.Figure:
    """All series on one plot at low opacity, with optional highlighting.

    Parameters
    ----------
    alpha:
        Opacity for the background spaghetti lines.
    highlight_col:
        Column name to draw with full opacity and thicker line.
    top_n:
        If set, highlight the *top_n* series by maximum value.
    show_median_band:
        If ``True``, overlay the median line and shade the IQR.
    """
    with _StyleContext(style_dict):
        fig, ax = plt.subplots(figsize=_DEFAULT_FIG_SIZE)

        dates = df[date_col]

        # Determine which columns to highlight; highlight_col and top_n
        # are combined (union), not mutually exclusive.
        highlight_set: set[str] = set()
        if highlight_col and highlight_col in y_cols:
            highlight_set.add(highlight_col)
        if top_n:
            # Rank series by their maximum value; ties keep dict order.
            max_vals = {col: df[col].max() for col in y_cols}
            sorted_cols = sorted(max_vals, key=max_vals.get, reverse=True)  # type: ignore[arg-type]
            highlight_set.update(sorted_cols[:top_n])

        # Draw all series: highlighted ones get a label (and hence a legend
        # entry) and a higher zorder; background ones stay unlabelled.
        for i, col in enumerate(y_cols):
            color = _default_color(palette_colors, i)
            if col in highlight_set:
                ax.plot(dates, df[col], linewidth=2.0, alpha=0.9,
                        color=color, label=col, zorder=3)
            else:
                ax.plot(dates, df[col], linewidth=0.8, alpha=alpha,
                        color=color, zorder=1)

        # Median + IQR band computed row-wise across all columns in y_cols.
        if show_median_band:
            numeric_data = df[y_cols]
            median_line = numeric_data.median(axis=1)
            q1 = numeric_data.quantile(0.25, axis=1)
            q3 = numeric_data.quantile(0.75, axis=1)

            ax.plot(dates, median_line, linewidth=2.2, color="#333333",
                    label="Median", zorder=4)
            ax.fill_between(dates, q1, q3, alpha=0.2, color="#333333",
                            label="IQR", zorder=2)

        ax.set_xlabel(date_col)
        ax.set_ylabel("Value")
        if title:
            ax.set_title(title)
        _auto_date_axis(ax)

        # Only add a legend when something was labelled (highlights or the
        # median/IQR overlay) — avoids an empty-legend warning.
        handles, labels = ax.get_legend_handles_labels()
        if labels:
            ax.legend(loc="best", fontsize=8)
        return _finish_figure(fig)
src/querychat_helpers.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ QueryChat initialization and filtered DataFrame helpers.
3
+
4
+ Provides convenience wrappers around the ``querychat`` library for
5
+ natural-language filtering of time-series DataFrames inside a Streamlit
6
+ app. All functions degrade gracefully when the package or an API key
7
+ is unavailable.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ from typing import List, Optional
14
+
15
+ import pandas as pd
16
+ import streamlit as st
17
+
18
+ try:
19
+ from querychat.streamlit import QueryChat as _QueryChat
20
+
21
+ _QUERYCHAT_AVAILABLE = True
22
+ except ImportError: # pragma: no cover
23
+ _QUERYCHAT_AVAILABLE = False
24
+
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Availability check
28
+ # ---------------------------------------------------------------------------
29
+
30
def check_querychat_available() -> bool:
    """Report whether the QueryChat feature can be used right now.

    Two conditions must both hold: the ``querychat`` package imported
    successfully at module load time, and an ``OPENAI_API_KEY`` is present
    in the environment. Callers use this boolean to gate chat UI elements.
    """
    has_api_key = bool(os.environ.get("OPENAI_API_KEY"))
    return _QUERYCHAT_AVAILABLE and has_api_key
39
+
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # QueryChat factory
43
+ # ---------------------------------------------------------------------------
44
+
45
def create_querychat(
    df: pd.DataFrame,
    name: str = "dataset",
    date_col: str = "date",
    y_cols: Optional[List[str]] = None,
    freq_label: str = "",
):
    """Build and return a QueryChat instance bound to *df*.

    Parameters
    ----------
    df:
        The pandas DataFrame to expose to the chat interface.
    name:
        Human-readable dataset name; spaces are replaced with underscores
        to form the SQL table name.
    date_col:
        Name of the date/time column (mentioned in the data description).
    y_cols:
        Numeric value columns. ``None`` (or an empty list) is described
        to the model as "none specified".
    freq_label:
        Optional frequency label such as ``"Monthly"`` or ``"Daily"``.

    Returns
    -------
    QueryChat instance
        The object returned by ``QueryChat()``.

    Raises
    ------
    RuntimeError
        If the querychat package is not importable.
    """
    if not _QUERYCHAT_AVAILABLE:
        raise RuntimeError(
            "The 'querychat' package is not installed. "
            "Install it with: pip install 'querychat[streamlit]'"
        )

    value_columns = list(y_cols) if y_cols else []
    joined_cols = ", ".join(value_columns) if value_columns else "none specified"
    freq_sentence = f" Frequency: {freq_label}." if freq_label else ""

    # Assemble the dataset description sentence by sentence.
    description = "".join(
        [
            f"This dataset is named '{name}'. ",
            f"It contains {len(df):,} rows. ",
            f"The date column is '{date_col}'. ",
            f"Value columns: {joined_cols}.",
            freq_sentence,
        ]
    )

    welcome = (
        f"Hi! I can help you filter and explore the **{name}** dataset. "
        "Try asking me something like:\n"
        '- "Show only 2023 data"\n'
        '- "Filter where sales > 60000"\n'
        '- "Show rows from January to March"'
    )

    return _QueryChat(
        data_source=df,
        table_name=name.replace(" ", "_"),
        client="openai/gpt-5.2-2025-12-11",
        data_description=description,
        greeting=welcome,
    )
115
+
116
+
117
+ # ---------------------------------------------------------------------------
118
+ # Filtered DataFrame extraction
119
+ # ---------------------------------------------------------------------------
120
+
121
def get_filtered_pandas_df(qc) -> pd.DataFrame:
    """Extract the currently filtered DataFrame from a QueryChat instance.

    The underlying ``qc.df()`` may return a *narwhals* (or polars-style)
    DataFrame rather than a pandas one; this helper converts when needed.
    On any failure an empty DataFrame is returned so the calling app can
    continue to function.

    Parameters
    ----------
    qc:
        A QueryChat instance previously created via
        :func:`create_querychat` (anything exposing a ``df()`` method).

    Returns
    -------
    pd.DataFrame
        The filtered data as a pandas DataFrame, or an empty frame when
        retrieval or conversion fails.
    """
    # Fetch exactly once. Re-calling qc.df() inside an error path (as a
    # fallback) would almost certainly raise again and could duplicate
    # side effects of the query execution.
    try:
        result = qc.df()
    except Exception:  # noqa: BLE001
        return pd.DataFrame()

    # Already a pandas DataFrame -- nothing to convert.
    if isinstance(result, pd.DataFrame):
        return result

    # narwhals / polars DataFrames expose .to_pandas().
    if hasattr(result, "to_pandas"):
        try:
            return result.to_pandas()
        except Exception:  # noqa: BLE001
            return pd.DataFrame()

    # Unknown type -- attempt a direct conversion as a last resort.
    try:
        return pd.DataFrame(result)
    except Exception:  # noqa: BLE001
        return pd.DataFrame()
src/ui_theme.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ui_theme.py
3
+ -----------
4
+ Miami University branded theme and styling utilities for Streamlit apps.
5
+
6
+ Provides:
7
+ - CSS injection for Streamlit components (buttons, sidebar, metrics, cards)
8
+ - Matplotlib rcParams styled with Miami branding
9
+ - ColorBrewer palette loading via palettable with graceful fallback
10
+ - Color-swatch preview figure generation
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import itertools
16
+ from typing import Dict, List, Optional
17
+
18
+ import matplotlib.figure
19
+ import matplotlib.pyplot as plt
20
+ import streamlit as st
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Brand constants — Miami University (Ohio) official palette
24
+ # ---------------------------------------------------------------------------
25
+ MIAMI_RED: str = "#C41230"
26
+ MIAMI_BLACK: str = "#000000"
27
+ MIAMI_WHITE: str = "#FFFFFF"
28
+
29
+ # Secondary palette tokens used only inside the CSS below.
30
+ _WHITE = "#FFFFFF"
31
+ _BLACK = "#000000"
32
+ _LIGHT_GRAY = "#F5F5F5"
33
+ _BORDER_GRAY = "#E0E0E0"
34
+ _DARK_TEXT = "#000000"
35
+ _HOVER_RED = "#9E0E26"
36
+
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Streamlit CSS injection
40
+ # ---------------------------------------------------------------------------
41
def apply_miami_theme() -> None:
    """Inject Miami-branded CSS into the active Streamlit page.

    Emits a single ``<style>`` block via ``st.markdown`` with
    ``unsafe_allow_html=True``, so it must be called on every script
    rerun (Streamlit does not persist injected CSS across runs).

    Styles affected:
    * Primary buttons -- Miami Red background, white text, darker-red
      hover/active states
    * Card containers -- subtle border and rounded corners on expanders
      and horizontal-block children
    * Sidebar -- Miami Red top accent bar and red h1/h2/h3 headings
    * Metric cards -- light background with a left red accent bar
    """
    # NOTE: the selectors target Streamlit's data-testid attributes,
    # which are not a stable public API -- verify after Streamlit upgrades.
    css = f"""
    <style>
    /* ---- Primary buttons ---- */
    .stButton > button[kind="primary"],
    .stButton > button {{
        background-color: {MIAMI_RED};
        color: {_WHITE};
        border: none;
        border-radius: 6px;
        padding: 0.5rem 1.25rem;
        font-weight: 600;
        transition: background-color 0.2s ease;
    }}
    .stButton > button:hover {{
        background-color: {_HOVER_RED};
        color: {_WHITE};
        border: none;
    }}
    .stButton > button:active,
    .stButton > button:focus {{
        background-color: {_HOVER_RED};
        color: {_WHITE};
        box-shadow: none;
    }}

    /* ---- Card borders ---- */
    div[data-testid="stExpander"],
    div[data-testid="stHorizontalBlock"] > div {{
        border: 1px solid {_BORDER_GRAY};
        border-radius: 8px;
        padding: 0.75rem;
    }}

    /* ---- Sidebar header accent ---- */
    section[data-testid="stSidebar"] > div:first-child {{
        border-top: 4px solid {MIAMI_RED};
    }}
    section[data-testid="stSidebar"] h1,
    section[data-testid="stSidebar"] h2,
    section[data-testid="stSidebar"] h3 {{
        color: {MIAMI_RED};
    }}

    /* ---- Metric cards ---- */
    div[data-testid="stMetric"] {{
        background-color: {_LIGHT_GRAY};
        border-left: 4px solid {MIAMI_RED};
        border-radius: 6px;
        padding: 0.75rem 1rem;
    }}
    div[data-testid="stMetric"] label {{
        color: {_BLACK};
        font-size: 0.85rem;
    }}
    div[data-testid="stMetric"] div[data-testid="stMetricValue"] {{
        color: {_BLACK};
        font-weight: 700;
    }}
    </style>
    """
    st.markdown(css, unsafe_allow_html=True)
111
+
112
+
113
+ # ---------------------------------------------------------------------------
114
+ # Matplotlib style dictionary
115
+ # ---------------------------------------------------------------------------
116
def get_miami_mpl_style() -> Dict[str, object]:
    """Build matplotlib rcParams implementing the Miami brand look.

    Apply globally::

        import matplotlib as mpl
        mpl.rcParams.update(get_miami_mpl_style())

    or scoped to a single figure::

        with mpl.rc_context(get_miami_mpl_style()):
            fig, ax = plt.subplots()
            ...

    Returns
    -------
    dict[str, object]
        rcParams suitable for ``mpl.rcParams.update`` or ``mpl.rc_context``.
    """
    # Series colour cycle: Miami Red and black first, then neutral accents.
    series_cycle = plt.cycler(
        color=[MIAMI_RED, _BLACK, "#4E79A7", "#F28E2B", "#76B7B2"]
    )

    sections = (
        {  # figure canvas
            "figure.facecolor": _WHITE,
            "figure.edgecolor": _WHITE,
            "figure.figsize": (10, 5),
            "figure.dpi": 100,
        },
        {  # axes frame, labels, and title
            "axes.facecolor": _WHITE,
            "axes.edgecolor": _BLACK,
            "axes.labelcolor": _BLACK,
            "axes.titlecolor": MIAMI_RED,
            "axes.labelsize": 12,
            "axes.titlesize": 14,
            "axes.titleweight": "bold",
            "axes.prop_cycle": series_cycle,
        },
        {  # dashed light-gray grid
            "axes.grid": True,
            "grid.color": _BORDER_GRAY,
            "grid.linestyle": "--",
            "grid.linewidth": 0.6,
            "grid.alpha": 0.7,
        },
        {  # tick colours and label sizes
            "xtick.color": _BLACK,
            "ytick.color": _BLACK,
            "xtick.labelsize": 10,
            "ytick.labelsize": 10,
        },
        {  # legend framing
            "legend.fontsize": 10,
            "legend.frameon": True,
            "legend.framealpha": 0.9,
            "legend.edgecolor": _BORDER_GRAY,
        },
        {  # typography
            "font.size": 11,
            "font.family": "sans-serif",
        },
        {  # export defaults
            "savefig.dpi": 150,
            "savefig.bbox": "tight",
        },
    )

    style: Dict[str, object] = {}
    for section in sections:
        style.update(section)
    return style
170
+
171
+
172
+ # ---------------------------------------------------------------------------
173
+ # ColorBrewer palette loading
174
+ # ---------------------------------------------------------------------------
175
+
176
+ # Mapping of short friendly names to palettable module paths.
177
+ _PALETTE_MAP: Dict[str, str] = {
178
+ "Set1": "colorbrewer.qualitative.Set1",
179
+ "Set2": "colorbrewer.qualitative.Set2",
180
+ "Set3": "colorbrewer.qualitative.Set3",
181
+ "Dark2": "colorbrewer.qualitative.Dark2",
182
+ "Paired": "colorbrewer.qualitative.Paired",
183
+ "Pastel1": "colorbrewer.qualitative.Pastel1",
184
+ "Pastel2": "colorbrewer.qualitative.Pastel2",
185
+ "Accent": "colorbrewer.qualitative.Accent",
186
+ "Tab10": "colorbrewer.qualitative.Set1", # fallback alias
187
+ }
188
+
189
+ _FALLBACK_COLORS: List[str] = [
190
+ MIAMI_RED,
191
+ MIAMI_BLACK,
192
+ "#4E79A7",
193
+ "#F28E2B",
194
+ "#76B7B2",
195
+ "#E15759",
196
+ "#59A14F",
197
+ "#EDC948",
198
+ ]
199
+
200
+
201
def _resolve_palette(name: str) -> Optional[List[str]]:
    """Dynamically import a palettable ColorBrewer palette by *name*.

    Palettable organises palettes by maximum number of classes: each size
    variant lives as a module attribute named ``<Base>_<N>``, e.g.
    ``colorbrewer.qualitative.Set2`` exposes ``Set2_3`` ... ``Set2_8``.
    This helper imports the module mapped from *name* and returns the
    variant with the most colours, as ``#RRGGBB`` hex strings.

    Returns ``None`` when palettable is not installed, the palette module
    cannot be found, or no size variant exists -- callers fall back to a
    default colour list.
    """
    import importlib

    module_path = _PALETTE_MAP.get(name)
    if module_path is None:
        # Unknown friendly name: guess colorbrewer.qualitative.<Name>.
        module_path = f"colorbrewer.qualitative.{name}"

    try:
        mod = importlib.import_module(f"palettable.{module_path}")
    except ImportError:  # palettable missing or palette module absent
        return None

    # BUGFIX: the attribute prefix must come from the *resolved* module
    # path, not from the caller-supplied name. Aliases such as "Tab10"
    # map to the Set1 module, whose size variants are named Set1_3 ...
    # Set1_9 -- searching for "Tab10_*" there would never match and the
    # alias silently failed.
    base = module_path.rsplit(".", 1)[-1]

    # Discover the size variant with the most colours.
    best_attr: Optional[str] = None
    best_n = 0
    for attr_name in dir(mod):
        if not attr_name.startswith(base + "_"):
            continue
        try:
            size = int(attr_name.rsplit("_", 1)[-1])
        except ValueError:
            continue
        if size > best_n:
            best_n = size
            best_attr = attr_name

    if best_attr is None:
        return None

    palette_obj = getattr(mod, best_attr, None)
    if palette_obj is None:
        return None

    return [
        "#{:02X}{:02X}{:02X}".format(*rgb) for rgb in palette_obj.colors
    ]
246
+
247
+
248
def get_palette_colors(name: str = "Set2", n: int = 8) -> List[str]:
    """Return *n* hex colours from the named ColorBrewer palette.

    Parameters
    ----------
    name:
        Friendly palette name such as ``"Set2"``, ``"Dark2"``, ``"Paired"``.
    n:
        How many colours to return; values below 1 are treated as 1.

    Returns
    -------
    list[str]
        Exactly *n* hex colour strings (e.g. ``["#66C2A5", ...]``). When
        *n* exceeds the palette size the palette repeats; when the
        palette cannot be resolved a built-in fallback list is used, so
        calling code never receives an empty list.
    """
    count = max(1, n)

    source = _resolve_palette(name)
    if source is None:
        source = _FALLBACK_COLORS

    # Repeat the palette as needed so callers always get *count* colours.
    repeated = itertools.cycle(source)
    return [colour for colour in itertools.islice(repeated, count)]
277
+
278
+
279
+ # ---------------------------------------------------------------------------
280
+ # Palette preview swatch
281
+ # ---------------------------------------------------------------------------
282
def render_palette_preview(
    colors: List[str],
    swatch_width: float = 1.0,
    swatch_height: float = 0.4,
) -> matplotlib.figure.Figure:
    """Build a horizontal strip of colour swatches as a matplotlib figure.

    Parameters
    ----------
    colors:
        Hex colour strings; one unit-square swatch is drawn per entry,
        left to right.
    swatch_width:
        Width allotted per swatch, in inches (total width never drops
        below 2 inches).
    swatch_height:
        Height of the swatch strip, in inches.

    Returns
    -------
    matplotlib.figure.Figure
        A closed Figure ready for ``st.pyplot()`` or ``savefig``.
    """
    count = len(colors)
    total_width = max(swatch_width * count, 2.0)
    fig, ax = plt.subplots(
        figsize=(total_width, swatch_height + 0.3), dpi=100
    )

    # One unit-square patch per colour, separated by thin white edges.
    for position, hex_code in enumerate(colors):
        patch = plt.Rectangle(
            (position, 0),
            width=1,
            height=1,
            facecolor=hex_code,
            edgecolor=_WHITE,
            linewidth=1.5,
        )
        ax.add_patch(patch)

    ax.set_xlim(0, count)
    ax.set_ylim(0, 1)
    ax.set_aspect("equal")
    ax.axis("off")
    fig.subplots_adjust(left=0, right=1, top=1, bottom=0)

    # Close so interactive backends do not auto-display; the Figure
    # object itself stays fully usable by the caller.
    plt.close(fig)
    return fig