Teoman21 committed
Commit a48d292 · 1 Parent(s): 3ee1658

Add project files with LFS for large CSV

Files changed (14)
  1. .gitattributes +1 -0
  2. .gitignore +2 -0
  3. README.md +59 -13
  4. app.py +747 -0
  5. data/features.csv +0 -0
  6. data/stores.csv +1 -0
  7. data/test.csv +0 -0
  8. data/train.csv +3 -0
  9. data_processor.py +333 -0
  10. filtered_htzxc454.csv +0 -0
  11. insights.py +83 -0
  12. requirements.txt +6 -0
  13. utils.py +119 -0
  14. visualizations.py +172 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/train.csv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+.DS_Store
+__pycache__/
README.md CHANGED
@@ -1,13 +1,59 @@
----
-title: BID
-emoji: 🐢
-colorFrom: green
-colorTo: indigo
-sdk: gradio
-sdk_version: 5.49.1
-app_file: app.py
-pinned: false
-short_description: Final Project space
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+## Business Intelligence Dashboard
+
+Interactive Gradio application for exploring business datasets, generating insights, and exporting filtered results.
+
+### Features
+- Upload CSV or Excel files with automated validation and previews.
+- Comprehensive statistics: numeric and categorical summaries, missing-value report, correlation matrix.
+- Dynamic filtering by numeric ranges, categorical selections, and date ranges with live row counts.
+- Visualizations: time series, distributions, category comparisons, scatter plots, correlation heatmap.
+- Automated insights: top/bottom performers, trend detection, anomaly identification.
+- Export filtered data as CSV and download charts as PNG (requires `kaleido`).
+
+### Project Structure
+```
+BID/
+├── app.py               # Gradio UI wiring
+├── data_processor.py    # Data loading, cleaning, filtering utilities
+├── visualizations.py    # Plotly chart generators
+├── insights.py          # Insight extraction helpers
+├── utils.py             # Shared helpers/constants
+├── data/                # Bundled sample datasets
+│   ├── train.csv        # Weekly Walmart sales (tracked with Git LFS)
+│   ├── test.csv
+│   ├── features.csv
+│   └── stores.csv
+├── requirements.txt     # Python dependencies
+└── README.md            # Project overview (this file)
+```
+
+### Sample Datasets
+- Kaggle Walmart store-sales data: `train.csv` (weekly sales with markdowns and holidays, tracked with Git LFS) and the companion `test.csv` without sales labels.
+- Lookup tables: `features.csv` (store-level markdowns, CPI, unemployment) and `stores.csv` (store type and size).
+
+Use the **Load Sample** controls on the *Data Upload* tab to bootstrap analysis with these datasets. Any other CSV or Excel file placed in `data/` (for example the UCI *Online Retail* CSV) is detected automatically and offered in the same dropdown; each file loads as its own dataset.
+
+### Getting Started
+1. **Install dependencies**
+   ```bash
+   pip install -r requirements.txt
+   ```
+   PNG exports require the optional `kaleido` dependency included above.
+
+2. **Launch the dashboard**
+   ```bash
+   python app.py
+   ```
+
+3. **Load data**
+   - Upload your own CSV/Excel file **or** pick one of the bundled datasets via the *Load Sample* dropdown.
+   - Ensure the raw CSV files reside in `data/` so the sample loader can detect them.
+
+4. **Explore**
+   - Apply filters, switch between visualizations, inspect automated insights, and download filtered results or charts.
+
+### Notes
+- The app infers column types automatically; ensure date columns are parseable for time-series plots and trend insights.
+- Large datasets may need additional preprocessing before upload to stay within local resource limits.
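The first Notes bullet matters in practice: `utils.coerce_datetime_columns` (shown later in this commit) only promotes an object column to datetime when at least 60% of its values parse. A quick pre-upload sanity check — a sketch, assuming a `Date` column as in the bundled `train.csv`:

```python
import pandas as pd

df = pd.read_csv("data/train.csv")
parsed = pd.to_datetime(df["Date"], errors="coerce")  # "Date" is an assumed column name

# Mirrors the 0.6 success-ratio threshold used by utils.coerce_datetime_columns.
print(f"parse success ratio: {parsed.notna().mean():.2%}")
```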
app.py ADDED
@@ -0,0 +1,747 @@
+"""Gradio application wiring for the Business Intelligence dashboard."""
+
+from __future__ import annotations
+
+import tempfile
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+import gradio as gr
+import pandas as pd
+import plotly.graph_objects as go
+
+from data_processor import (
+    DatasetBundle,
+    dataset_overview,
+    dataset_preview,
+    filter_dataframe,
+    filter_metadata,
+    load_dataset,
+    load_sample_dataset,
+    missing_value_report,
+    numeric_summary,
+    categorical_summary,
+    correlation_matrix,
+    sample_dataset_options,
+)
+from insights import (
+    detect_anomalies,
+    detect_trend,
+    get_default_insight_columns,
+    top_bottom_performers,
+)
+from visualizations import (
+    create_category_plot,
+    create_correlation_heatmap,
+    create_distribution_plot,
+    create_scatter_plot,
+    create_time_series_plot,
+    figure_to_png_bytes,
+)
+
+
+DatasetState = Dict[str, Any]
+
+
+def _format_overview_text(info: Dict[str, Any], source_name: str) -> str:
+    """Render dataset information as Markdown."""
+    lines = [
+        f"**Source:** {source_name}",
+        f"- Rows: {info['Rows']}",
+        f"- Columns: {info['Columns']}",
+        f"- Memory Usage: {info['Memory Usage (MB)']} MB",
+    ]
+    return "\n".join(lines)
+
+
+def _empty_dataframe(message: str = "No data available") -> pd.DataFrame:
+    """Return a placeholder DataFrame for empty displays."""
+    return pd.DataFrame({"status": [message]})
+
+
+def _ensure_state(state: Optional[DatasetState]) -> DatasetState:
+    """Guarantee a dictionary-based state object."""
+    return state or {}
+
+
+def _current_dataframe(state: DatasetState, filtered: bool = True) -> pd.DataFrame:
+    """Return the filtered or raw dataframe from state."""
+    key = "filtered_df" if filtered else "dataframe"
+    df = state.get(key)
+    if isinstance(df, pd.DataFrame):
+        return df
+    raise ValueError("Please upload a dataset before performing this action.")
+
+
+def _finalize_dataset_load(bundle: DatasetBundle, state: DatasetState) -> Tuple[DatasetState, str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
+    """Populate shared outputs after a dataset is loaded."""
+    df = bundle.dataframe
+    state = {
+        "dataframe": df,
+        "filtered_df": df,
+        "column_types": bundle.column_types,
+        "filter_meta": filter_metadata(df, bundle.column_types),
+        "source_name": bundle.source_name,
+    }
+
+    overview = dataset_overview(df)
+    preview = dataset_preview(df)
+
+    status = f"✅ Loaded '{bundle.source_name}' with {df.shape[0]} rows and {df.shape[1]} columns."
+    info_text = _format_overview_text(overview["info"], bundle.source_name)
+    dtypes_df = overview["dtypes"]
+    head_df = preview["head"]
+    tail_df = preview["tail"]
+    filter_preview = head_df
+    row_count = f"Rows displayed: {len(df)}"
+
+    return state, status, info_text, dtypes_df, head_df, tail_df, filter_preview, row_count
+
+
+def _handle_file_upload(file, state: Optional[DatasetState]) -> Tuple[DatasetState, str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
+    """Load a dataset from the uploaded file."""
+    state = _ensure_state(state)
+    try:
+        bundle: DatasetBundle = load_dataset(file)
+    except ValueError as exc:
+        return (
+            state,
+            f"❌ {exc}",
+            "No dataset loaded.",
+            _empty_dataframe(),
+            _empty_dataframe(),
+            _empty_dataframe(),
+            _empty_dataframe(),
+            "Rows displayed: 0",
+        )
+
+    return _finalize_dataset_load(bundle, state)
+
+
+def _handle_sample_dataset(selection: Optional[str], state: Optional[DatasetState]) -> Tuple[DatasetState, str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
+    """Load one of the bundled sample datasets."""
+    state = _ensure_state(state)
+    if not selection:
+        message = "Please choose a sample dataset before loading."
+        empty = _empty_dataframe(message)
+        return state, f"⚠️ {message}", "No dataset loaded.", empty, empty, empty, empty, "Rows displayed: 0"
+
+    try:
+        bundle = load_sample_dataset(selection)
+    except ValueError as exc:
+        empty = _empty_dataframe(str(exc))
+        return state, f"❌ {exc}", "No dataset loaded.", empty, empty, empty, empty, "Rows displayed: 0"
+
+    return _finalize_dataset_load(bundle, state)
+
+
+def _populate_column_options(
+    state: Optional[DatasetState],
+) -> Tuple[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]:
+    """Populate dropdown choices based on the uploaded dataset."""
+    state = _ensure_state(state)
+    column_types = state.get("column_types")
+    if not column_types:
+        empty_dropdown = gr.update(choices=[], value=None, interactive=False, visible=True)
+        hidden_checkbox = gr.update(choices=[], value=[], visible=False, interactive=False)
+        return (
+            empty_dropdown,
+            empty_dropdown,
+            hidden_checkbox,
+            empty_dropdown,
+            empty_dropdown,
+            empty_dropdown,
+            empty_dropdown,
+            empty_dropdown,
+            empty_dropdown,
+            empty_dropdown,
+            empty_dropdown,
+            empty_dropdown,
+            empty_dropdown,
+            empty_dropdown,
+            empty_dropdown,
+            empty_dropdown,
+        )
+
+    numeric = list(column_types.numeric)
+    categorical = list(column_types.categorical)
+    datetime_cols = list(column_types.datetime)
+    all_columns = list(state["dataframe"].columns)
+    defaults = get_default_insight_columns(column_types)
+
+    def dropdown(values: Iterable[str], default: Optional[str] = None):
+        choices = list(values)
+        value = default if default in choices else None
+        return gr.update(
+            choices=choices,
+            value=value,
+            interactive=bool(choices),
+            visible=True,
+        )
+
+    return (
+        dropdown(numeric),  # numeric filter column
+        dropdown(datetime_cols),  # date filter column
+        gr.update(choices=[], value=[], visible=False, interactive=False),  # categorical values reset
+        dropdown(categorical),  # categorical filter column
+        dropdown(datetime_cols, defaults.get("datetime")),  # time series date
+        dropdown(numeric, defaults.get("numeric")),  # time series value
+        dropdown(numeric),  # distribution numeric
+        dropdown(categorical),  # category column
+        dropdown(numeric),  # category value
+        dropdown(numeric),  # scatter x
+        dropdown(numeric),  # scatter y
+        gr.update(choices=all_columns, value=None, interactive=bool(all_columns), visible=True),  # scatter color
+        dropdown(numeric, defaults.get("numeric")),  # insight numeric
+        dropdown(datetime_cols, defaults.get("datetime")),  # insight datetime
+        dropdown(numeric, defaults.get("numeric")),  # trend value
+        dropdown(numeric, defaults.get("numeric")),  # anomaly column
+    )
+
+
+def _update_numeric_inputs(column: Optional[str], state: Optional[DatasetState]) -> Tuple[Any, Any]:
+    """Update numeric min/max inputs when a column is selected."""
+    state = _ensure_state(state)
+    hidden = gr.update(visible=False, value=None)
+    if not column or "filter_meta" not in state:
+        return hidden, hidden
+    meta = state["filter_meta"]["numeric"].get(column)
+    if not meta:
+        return hidden, hidden
+    minimum = float(meta["min"])
+    maximum = float(meta["max"])
+    return (
+        gr.update(value=minimum, visible=True, interactive=True, label=f"Min ({column})"),
+        gr.update(value=maximum, visible=True, interactive=True, label=f"Max ({column})"),
+    )
+
+
+def _update_categorical_values(column: Optional[str], state: Optional[DatasetState]):
+    """Populate categorical value options for filtering."""
+    state = _ensure_state(state)
+    if not column or "filter_meta" not in state:
+        return gr.update(visible=False)
+    values = state["filter_meta"]["categorical"].get(column, [])
+    return gr.update(
+        choices=values,
+        value=values[: min(10, len(values))],
+        visible=bool(values),
+        interactive=bool(values),
+        label=f"Values to include ({column})",
+    )
+
+
+def _update_date_bounds(column: Optional[str], state: Optional[DatasetState]) -> Tuple[Any, Any]:
+    """Populate date inputs when a date column is selected."""
+    state = _ensure_state(state)
+    if not column or "filter_meta" not in state:
+        hidden = gr.update(visible=False, value=None)
+        return hidden, hidden
+    meta = state["filter_meta"]["datetime"].get(column)
+    if not meta:
+        hidden = gr.update(visible=False, value=None)
+        return hidden, hidden
+    start = str(meta["min"])
+    end = str(meta["max"])
+    return (
+        gr.update(value=start, visible=True, label=f"Start date ({column})"),
+        gr.update(value=end, visible=True, label=f"End date ({column})"),
+    )
+
+
+def _apply_filters(
+    state: Optional[DatasetState],
+    numeric_column: Optional[str],
+    numeric_min: Optional[float],
+    numeric_max: Optional[float],
+    categorical_column: Optional[str],
+    categorical_values: Optional[List[str]],
+    date_column: Optional[str],
+    start_date: Optional[str],
+    end_date: Optional[str],
+) -> Tuple[DatasetState, pd.DataFrame, str]:
+    """Filter the dataset according to user selections."""
+    state = _ensure_state(state)
+    df = _current_dataframe(state, filtered=False)
+
+    numeric_filters: Dict[str, Tuple[Optional[float], Optional[float]]] = {}
+    categorical_filters: Dict[str, List[str]] = {}
+    date_filters: Dict[str, Tuple[Optional[str], Optional[str]]] = {}
+
+    if numeric_column and (numeric_min is not None or numeric_max is not None):
+        lower = numeric_min
+        upper = numeric_max
+        if lower is not None and upper is not None and lower > upper:
+            lower, upper = upper, lower
+        numeric_filters[numeric_column] = (lower, upper)
+
+    if categorical_column and categorical_values:
+        categorical_filters[categorical_column] = categorical_values
+
+    if date_column and (start_date or end_date):
+        date_filters[date_column] = (start_date, end_date)
+
+    filtered_df = filter_dataframe(df, numeric_filters, categorical_filters, date_filters)
+    state["filtered_df"] = filtered_df
+
+    row_count = f"Rows displayed: {len(filtered_df)}"
+    preview = filtered_df.head(5) if not filtered_df.empty else _empty_dataframe("No rows match the filters.")
+    return state, preview, row_count
+
+
+def _generate_statistics(state: Optional[DatasetState]) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
+    """Produce summary statistics for the Statistics tab."""
+    state = _ensure_state(state)
+    try:
+        df = _current_dataframe(state, filtered=False)
+    except ValueError as exc:
+        message = str(exc)
+        empty = _empty_dataframe(message)
+        return empty, empty, empty, empty, f"⚠️ {message}"
+
+    num_summary = numeric_summary(df)
+    cat_summary = categorical_summary(df)
+    missing = missing_value_report(df)
+    corr = correlation_matrix(df)
+    message = "Statistics generated successfully."
+    return (
+        num_summary if not num_summary.empty else _empty_dataframe("No numeric columns available."),
+        cat_summary if not cat_summary.empty else _empty_dataframe("No categorical columns available."),
+        missing if not missing.empty else _empty_dataframe("No missing values detected."),
+        corr if not corr.empty else _empty_dataframe("Not enough numeric columns for correlation."),
+        message,
+    )
+
+
+def _generate_chart(
+    state: Optional[DatasetState],
+    chart_type: str,
+    ts_date: Optional[str],
+    ts_value: Optional[str],
+    ts_agg: str,
+    dist_column: Optional[str],
+    dist_type: str,
+    cat_column: Optional[str],
+    cat_value: Optional[str],
+    cat_chart_type: str,
+    cat_agg: str,
+    scatter_x: Optional[str],
+    scatter_y: Optional[str],
+    scatter_color: Optional[str],
+) -> Tuple[Optional[go.Figure], Optional[go.Figure], str]:
+    """Create a visualization based on user selections."""
+    state = _ensure_state(state)
+    try:
+        df = _current_dataframe(state, filtered=True)
+    except ValueError as exc:
+        return None, None, f"⚠️ {exc}"
+
+    try:
+        if chart_type == "Time Series":
+            if not ts_date or not ts_value:
+                raise ValueError("Select both a date and value column.")
+            fig = create_time_series_plot(df, ts_date, ts_value, aggregation=ts_agg)
+        elif chart_type == "Distribution":
+            if not dist_column:
+                raise ValueError("Select a numeric column for the distribution plot.")
+            fig = create_distribution_plot(df, dist_column, plot_type=dist_type)
+        elif chart_type == "Category":
+            if not cat_column or not cat_value:
+                raise ValueError("Select both category and value columns.")
+            fig = create_category_plot(df, cat_column, cat_value, aggregation=cat_agg, chart_type=cat_chart_type.lower())
+        elif chart_type == "Scatter":
+            if not scatter_x or not scatter_y:
+                raise ValueError("Select x and y columns for the scatter plot.")
+            fig = create_scatter_plot(df, scatter_x, scatter_y, color_column=scatter_color)
+        elif chart_type == "Correlation Heatmap":
+            fig = create_correlation_heatmap(df)
+        else:
+            raise ValueError("Unsupported chart type.")
+    except ValueError as exc:
+        return None, None, f"⚠️ {exc}"
+
+    return fig, fig, "Visualization generated."
+
+
+def _download_filtered(state: Optional[DatasetState]) -> str:
+    """Export the filtered dataset to a temporary CSV file."""
+    state = _ensure_state(state)
+    df = _current_dataframe(state, filtered=True)
+    if df.empty:
+        raise ValueError("There are no rows to export. Adjust your filters and try again.")
+
+    temp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv", prefix="filtered_", dir=".")
+    df.to_csv(temp.name, index=False)
+    temp.close()
+    return temp.name
+
+
+def _download_chart(fig: Optional[go.Figure]) -> str:
+    """Export the most recent chart to PNG."""
+    if fig is None:
+        raise ValueError("Generate a visualization before exporting.")
+    buffer = figure_to_png_bytes(fig)
+    temp = tempfile.NamedTemporaryFile(delete=False, suffix=".png", prefix="chart_", dir=".")
+    with open(temp.name, "wb") as fp:
+        fp.write(buffer.read())
+    return temp.name
+
+
+def _generate_insights(
+    state: Optional[DatasetState],
+    numeric_column: Optional[str],
+    trend_date_column: Optional[str],
+    trend_value_column: Optional[str],
+    anomaly_column: Optional[str],
+) -> Tuple[pd.DataFrame, pd.DataFrame, str, pd.DataFrame, str]:
+    """Generate top/bottom performers, trends, and anomalies."""
+    state = _ensure_state(state)
+    try:
+        df = _current_dataframe(state, filtered=True)
+    except ValueError as exc:
+        empty = _empty_dataframe(str(exc))
+        return empty, empty, f"⚠️ {exc}", empty, f"⚠️ {exc}"
+
+    status_messages: List[str] = []
+
+    top_df = bottom_df = _empty_dataframe("Select a numeric column for insights.")
+    if numeric_column:
+        try:
+            performers = top_bottom_performers(df, numeric_column)
+            top_df = performers["top"]
+            bottom_df = performers["bottom"]
+            status_messages.append(f"Top/bottom performers calculated for {numeric_column}.")
+        except ValueError as exc:
+            top_df = bottom_df = _empty_dataframe(str(exc))
+            status_messages.append(f"⚠️ {exc}")
+
+    trend_text = "Select a date and value column to evaluate trend."
+    if trend_date_column and trend_value_column:
+        try:
+            trend_text = detect_trend(df, trend_date_column, trend_value_column)
+        except ValueError as exc:
+            trend_text = f"⚠️ {exc}"
+
+    anomaly_df = _empty_dataframe("Select a numeric column to detect anomalies.")
+    if anomaly_column:
+        anomalies = detect_anomalies(df, anomaly_column)
+        anomaly_df = anomalies if not anomalies.empty else _empty_dataframe("No significant anomalies detected.")
+
+    combined_status = "\n".join(status_messages) if status_messages else "Insights generated."
+    return top_df, bottom_df, trend_text, anomaly_df, combined_status
+
+
+def _describe_sample_dataset(selection: Optional[str]) -> str:
+    """Return a user-friendly description for the selected sample dataset."""
+    if not selection:
+        return "Select a sample dataset to view its description."
+    descriptions = sample_dataset_options()
+    description = descriptions.get(selection)
+    if not description:
+        return "Sample dataset description unavailable. Ensure the file exists in the `data/` directory."
+    return f"**{selection}**\n\n{description}"
+
+
+def create_dashboard():
+    with gr.Blocks(theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# Business Intelligence Dashboard")
+        dataset_state = gr.State({})
+        last_figure_state = gr.State(None)
+
+        sample_choices = list(sample_dataset_options().keys())
+
+        with gr.Tab("Data Upload"):
+            with gr.Row():
+                file_input = gr.File(label="Upload CSV or Excel", file_types=[".csv", ".xlsx", ".xls"])
+                load_button = gr.Button("Load Data", variant="primary")
+            gr.Markdown("Or load one of the curated datasets bundled with the project:")
+            with gr.Row():
+                sample_dropdown = gr.Dropdown(label="Sample Dataset", choices=sample_choices, value=None, interactive=bool(sample_choices))
+                load_sample_button = gr.Button("Load Sample", variant="secondary", interactive=bool(sample_choices))
+            if sample_choices:
+                sample_description = gr.Markdown("Select a sample dataset to view its description.")
+            else:
+                sample_description = gr.Markdown("⚠️ No sample datasets detected in the `data/` folder.")
+            upload_status = gr.Markdown("No dataset loaded.")
+            dataset_info = gr.Markdown()
+            dtypes_table = gr.Dataframe(label="Column Types", interactive=False)
+            with gr.Row():
+                head_table = gr.Dataframe(label="Preview (Head)", interactive=False)
+                tail_table = gr.Dataframe(label="Preview (Tail)", interactive=False)
+
+        with gr.Tab("Statistics"):
+            stats_status = gr.Markdown()
+            numeric_table = gr.Dataframe(label="Numeric Summary", interactive=False)
+            categorical_table = gr.Dataframe(label="Categorical Summary", interactive=False)
+            missing_table = gr.Dataframe(label="Missing Value Report", interactive=False)
+            correlation_table = gr.Dataframe(label="Correlation Matrix", interactive=False)
+            generate_stats_button = gr.Button("Generate Statistics", variant="secondary")
+
+        with gr.Tab("Filter & Explore"):
+            filter_status = gr.Markdown("Rows displayed: 0")
+            with gr.Accordion("Numeric Filter", open=False):
+                numeric_column_dropdown = gr.Dropdown(label="Numeric Column", choices=[])
+                numeric_min_input = gr.Number(label="Minimum Value", visible=False)
+                numeric_max_input = gr.Number(label="Maximum Value", visible=False)
+            with gr.Accordion("Categorical Filter", open=False):
+                categorical_column_dropdown = gr.Dropdown(label="Category Column", choices=[])
+                categorical_values = gr.CheckboxGroup(label="Values", choices=[], visible=False)
+            with gr.Accordion("Date Filter", open=False):
+                date_column_dropdown = gr.Dropdown(label="Date Column", choices=[])
+                start_date_picker = gr.Textbox(label="Start Date (YYYY-MM-DD)", visible=False)
+                end_date_picker = gr.Textbox(label="End Date (YYYY-MM-DD)", visible=False)
+            apply_filters_button = gr.Button("Apply Filters", variant="primary")
+            filter_preview_table = gr.Dataframe(label="Filtered Preview", interactive=False)
+            export_filtered_button = gr.Button("Download Filtered Data", variant="secondary")
+            export_filtered_file = gr.File(label="Filtered CSV", interactive=False)
+
+        with gr.Tab("Visualizations"):
+            viz_status = gr.Markdown()
+            chart_type = gr.Radio(
+                label="Chart Type",
+                choices=["Time Series", "Distribution", "Category", "Scatter", "Correlation Heatmap"],
+                value="Time Series",
+            )
+            with gr.Column(visible=True) as time_series_controls:
+                ts_date_column = gr.Dropdown(label="Date Column", choices=[])
+                ts_value_column = gr.Dropdown(label="Value Column", choices=[])
+                ts_aggregation = gr.Dropdown(label="Aggregation", choices=["sum", "mean", "median", "count"], value="sum")
+            with gr.Column(visible=False) as distribution_controls:
+                dist_column = gr.Dropdown(label="Numeric Column", choices=[])
+                dist_type = gr.Radio(label="Distribution Type", choices=["histogram", "box"], value="histogram")
+            with gr.Column(visible=False) as category_controls:
+                category_column = gr.Dropdown(label="Category Column", choices=[])
+                category_value_column = gr.Dropdown(label="Value Column", choices=[])
+                category_chart_type = gr.Radio(label="Chart Style", choices=["Bar", "Pie"], value="Bar")
+                category_aggregation = gr.Dropdown(label="Aggregation", choices=["sum", "mean", "median", "count"], value="sum")
+            with gr.Column(visible=False) as scatter_controls:
+                scatter_x_column = gr.Dropdown(label="X Axis", choices=[])
+                scatter_y_column = gr.Dropdown(label="Y Axis", choices=[])
+                scatter_color_column = gr.Dropdown(label="Color (optional)", choices=[])
+
+            generate_chart_button = gr.Button("Generate Visualization", variant="primary")
+            chart_output = gr.Plot(label="Visualization")
+            download_chart_button = gr.Button("Download Chart as PNG", variant="secondary")
+            chart_file_output = gr.File(label="Chart PNG", interactive=False)
+
+        with gr.Tab("Insights"):
+            insights_status = gr.Markdown()
+            insight_numeric_column = gr.Dropdown(label="Numeric Column", choices=[])
+            trend_date_column = gr.Dropdown(label="Date Column", choices=[])
+            trend_value_column = gr.Dropdown(label="Value Column", choices=[])
+            anomaly_column = gr.Dropdown(label="Column for Anomaly Detection", choices=[])
+            generate_insights_button = gr.Button("Generate Insights", variant="primary")
+            top_table = gr.Dataframe(label="Top Performers", interactive=False)
+            bottom_table = gr.Dataframe(label="Bottom Performers", interactive=False)
+            trend_output = gr.Markdown()
+            anomaly_table = gr.Dataframe(label="Potential Anomalies", interactive=False)
+
+        # Interactions
+        load_button.click(
+            fn=_handle_file_upload,
+            inputs=[file_input, dataset_state],
+            outputs=[
+                dataset_state,
+                upload_status,
+                dataset_info,
+                dtypes_table,
+                head_table,
+                tail_table,
+                filter_preview_table,
+                filter_status,
+            ],
+        ).then(
+            fn=_populate_column_options,
+            inputs=[dataset_state],
+            outputs=[
+                numeric_column_dropdown,
+                date_column_dropdown,
+                categorical_values,
+                categorical_column_dropdown,
+                ts_date_column,
+                ts_value_column,
+                dist_column,
+                category_column,
+                category_value_column,
+                scatter_x_column,
+                scatter_y_column,
+                scatter_color_column,
+                insight_numeric_column,
+                trend_date_column,
+                trend_value_column,
+                anomaly_column,
+            ],
+        ).then(
+            fn=_generate_statistics,
+            inputs=[dataset_state],
+            outputs=[
+                numeric_table,
+                categorical_table,
+                missing_table,
+                correlation_table,
+                stats_status,
+            ],
+        )
+
+        load_sample_button.click(
+            fn=_handle_sample_dataset,
+            inputs=[sample_dropdown, dataset_state],
+            outputs=[
+                dataset_state,
+                upload_status,
+                dataset_info,
+                dtypes_table,
+                head_table,
+                tail_table,
+                filter_preview_table,
+                filter_status,
+            ],
+        ).then(
+            fn=_populate_column_options,
+            inputs=[dataset_state],
+            outputs=[
+                numeric_column_dropdown,
+                date_column_dropdown,
+                categorical_values,
+                categorical_column_dropdown,
+                ts_date_column,
+                ts_value_column,
+                dist_column,
+                category_column,
+                category_value_column,
+                scatter_x_column,
+                scatter_y_column,
+                scatter_color_column,
+                insight_numeric_column,
+                trend_date_column,
+                trend_value_column,
+                anomaly_column,
+            ],
+        ).then(
+            fn=_generate_statistics,
+            inputs=[dataset_state],
+            outputs=[
+                numeric_table,
+                categorical_table,
+                missing_table,
+                correlation_table,
+                stats_status,
+            ],
+        )
+
+        sample_dropdown.change(
+            fn=_describe_sample_dataset,
+            inputs=[sample_dropdown],
+            outputs=[sample_description],
+        )
+
+        numeric_column_dropdown.change(
+            fn=_update_numeric_inputs,
+            inputs=[numeric_column_dropdown, dataset_state],
+            outputs=[numeric_min_input, numeric_max_input],
+        )
+
+        categorical_column_dropdown.change(
+            fn=_update_categorical_values,
+            inputs=[categorical_column_dropdown, dataset_state],
+            outputs=[categorical_values],
+        )
+
+        date_column_dropdown.change(
+            fn=_update_date_bounds,
+            inputs=[date_column_dropdown, dataset_state],
+            outputs=[start_date_picker, end_date_picker],
+        )
+
+        generate_stats_button.click(
+            fn=_generate_statistics,
+            inputs=[dataset_state],
+            outputs=[numeric_table, categorical_table, missing_table, correlation_table, stats_status],
+        )
+
+        apply_filters_button.click(
+            fn=_apply_filters,
+            inputs=[
+                dataset_state,
+                numeric_column_dropdown,
+                numeric_min_input,
+                numeric_max_input,
+                categorical_column_dropdown,
+                categorical_values,
+                date_column_dropdown,
+                start_date_picker,
+                end_date_picker,
+            ],
+            outputs=[dataset_state, filter_preview_table, filter_status],
+        )
+
+        export_filtered_button.click(
+            fn=_download_filtered,
+            inputs=[dataset_state],
+            outputs=[export_filtered_file],
+        )
+
+        def _toggle_controls(selected: str) -> Tuple[Any, Any, Any, Any]:
+            return (
+                gr.update(visible=selected == "Time Series"),
+                gr.update(visible=selected == "Distribution"),
+                gr.update(visible=selected == "Category"),
+                gr.update(visible=selected == "Scatter"),
+            )
+
+        chart_type.change(
+            fn=_toggle_controls,
+            inputs=[chart_type],
+            outputs=[time_series_controls, distribution_controls, category_controls, scatter_controls],
+        )
+
+        generate_chart_button.click(
+            fn=_generate_chart,
+            inputs=[
+                dataset_state,
+                chart_type,
+                ts_date_column,
+                ts_value_column,
+                ts_aggregation,
+                dist_column,
+                dist_type,
+                category_column,
+                category_value_column,
+                category_chart_type,
+                category_aggregation,
+                scatter_x_column,
+                scatter_y_column,
+                scatter_color_column,
+            ],
+            outputs=[last_figure_state, chart_output, viz_status],
+        )
+
+        download_chart_button.click(
+            fn=_download_chart,
+            inputs=[last_figure_state],
+            outputs=[chart_file_output],
+        )
+
+        generate_insights_button.click(
+            fn=_generate_insights,
+            inputs=[
+                dataset_state,
+                insight_numeric_column,
+                trend_date_column,
+                trend_value_column,
+                anomaly_column,
+            ],
+            outputs=[
+                top_table,
+                bottom_table,
+                trend_output,
+                anomaly_table,
+                insights_status,
+            ],
+        )
+
+    return demo
+
+
+if __name__ == "__main__":
+    dashboard = create_dashboard()
+    dashboard.launch()
data/features.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/stores.csv ADDED
@@ -0,0 +1 @@
+Store,Type,Size
data/test.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/train.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65834f65a2ddccc85d8f4bf6544f73625be553f8ca5f8fdee976b9d0c900e95d
+size 12842546
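These three lines are a Git LFS pointer rather than the CSV itself; the `size` field reports the real payload (about 12.8 MB). A clone made without LFS support keeps only this stub — running `git lfs install` once and then `git lfs pull` fetches the actual file, matching the `data/train.csv` rule added to `.gitattributes` above.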
data_processor.py ADDED
@@ -0,0 +1,333 @@
+"""Data loading, cleaning, and filtering helpers for the BI dashboard."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from io import BytesIO
+from pathlib import Path
+from typing import Dict, Iterable, List, Mapping, Optional, Tuple
+
+import pandas as pd
+
+from utils import (
+    ColumnTypes,
+    PREVIEW_ROWS,
+    coerce_datetime_columns,
+    ensure_unique_columns,
+    infer_column_types,
+    is_supported_file,
+)
+
+SAMPLE_DATA_DIR = Path(__file__).resolve().parent / "data"
+SAMPLE_DESCRIPTIONS = {
+    "train.csv": "Weekly Walmart sales with markdowns and holidays (training set).",
+    "test.csv": "Companion test set without weekly sales labels.",
+    "features.csv": "Store-level features such as markdowns, CPI, unemployment.",
+    "stores.csv": "Store metadata including type and size.",
+}
+
+
+@dataclass(frozen=True)
+class DatasetBundle:
+    """Container storing the dataset and metadata required by the UI."""
+
+    dataframe: pd.DataFrame
+    column_types: ColumnTypes
+    source_name: str
+
+
+def load_dataset(file_obj) -> DatasetBundle:
+    """Load the provided uploaded file into a pandas DataFrame.
+
+    Parameters
+    ----------
+    file_obj:
+        File-like object produced by the Gradio upload widget.
+
+    Returns
+    -------
+    DatasetBundle
+        Loaded dataset alongside inferred column metadata.
+
+    Raises
+    ------
+    ValueError
+        If the file cannot be read or uses an unsupported format.
+    """
+    if file_obj is None:
+        raise ValueError("Please upload a CSV or Excel file.")
+
+    file_name = getattr(file_obj, "name", None)
+    original_name = getattr(file_obj, "orig_name", file_name)
+
+    if not original_name or not is_supported_file(original_name):
+        raise ValueError("Unsupported file type. Please upload a CSV or Excel file.")
+
+    path_candidate = Path(str(file_name)) if file_name else None
+    dataframe: Optional[pd.DataFrame] = None
+
+    try:
+        if path_candidate and path_candidate.exists():
+            dataframe = _read_from_path(path_candidate, original_name)
+        else:
+            dataframe = _read_from_buffer(file_obj, original_name)
+    except Exception as exc:  # pragma: no cover - defensive conversion
+        raise ValueError(f"Unable to load dataset: {exc}") from exc
+
+    if dataframe is None:
+        raise ValueError("Failed to load dataset. The file may be empty or corrupted.")
+
+    dataframe = ensure_unique_columns(dataframe)
+    dataframe, datetime_cols = coerce_datetime_columns(dataframe)
+    column_types = infer_column_types(dataframe)
+
+    # Ensure newly detected datetime columns are included in metadata
+    column_types = ColumnTypes(
+        numeric=column_types.numeric,
+        categorical=column_types.categorical,
+        datetime=tuple(sorted(set(column_types.datetime + tuple(datetime_cols)))),
+    )
+
+    return DatasetBundle(
+        dataframe=dataframe,
+        column_types=column_types,
+        source_name=Path(original_name).name,
+    )
+
+
+def _read_from_path(path: Path, original_name: str) -> pd.DataFrame:
+    """Read a dataset from disk."""
+    suffix = path.suffix.lower()
+    if suffix == ".csv":
+        return pd.read_csv(path)
+    if suffix in {".xlsx", ".xls"}:
+        return pd.read_excel(path)
+    raise ValueError(f"Unsupported file extension in {original_name}.")
+
+
+def _read_from_buffer(file_obj, original_name: str) -> pd.DataFrame:
+    """Read a dataset from an in-memory buffer."""
+    bytes_data = getattr(file_obj, "read", lambda: b"")()
+    if not bytes_data:
+        raise ValueError(f"The uploaded file '{original_name}' is empty.")
+
+    buffer = BytesIO(bytes_data)
+    lowered = original_name.lower()
+    if lowered.endswith(".csv"):
+        return pd.read_csv(buffer)
+    if lowered.endswith((".xlsx", ".xls")):
+        return pd.read_excel(buffer)
+
+    raise ValueError("Only CSV and Excel files are supported.")
+
+
+def dataset_overview(df: pd.DataFrame) -> Dict[str, object]:
+    """Return basic information about the dataset."""
+    info = {
+        "Rows": int(df.shape[0]),
+        "Columns": int(df.shape[1]),
+        "Memory Usage (MB)": round(df.memory_usage(deep=True).sum() / (1024**2), 2),
+    }
+    dtypes = pd.DataFrame({"Column": df.columns, "Type": df.dtypes.astype(str)})
+    return {"info": info, "dtypes": dtypes}
+
+
+def dataset_preview(df: pd.DataFrame, rows: int = PREVIEW_ROWS) -> Dict[str, pd.DataFrame]:
+    """Return head and tail previews of the dataset."""
+    return {
+        "head": df.head(rows),
+        "tail": df.tail(rows),
+    }
+
+
+def numeric_summary(df: pd.DataFrame) -> pd.DataFrame:
+    """Compute descriptive statistics for numeric columns."""
+    numeric_df = df.select_dtypes(include=["number"])
+    if numeric_df.empty:
+        return pd.DataFrame()
+
+    summary = pd.DataFrame(
+        {
+            "count": numeric_df.count(),
+            "mean": numeric_df.mean(),
+            "median": numeric_df.median(),
+            "std": numeric_df.std(),
+            "min": numeric_df.min(),
+            "25%": numeric_df.quantile(0.25),
+            "75%": numeric_df.quantile(0.75),
+            "max": numeric_df.max(),
+        }
+    )
+
+    summary.index.name = "column"
+    return summary.round(3)
+
+
+def categorical_summary(df: pd.DataFrame, top_values: int = 5) -> pd.DataFrame:
+    """Compute summary statistics for categorical columns."""
+    categorical_cols = df.select_dtypes(exclude=["number", "datetime64[ns]", "datetime64[ns, UTC]"])
+    if categorical_cols.empty:
+        return pd.DataFrame()
+
+    rows: List[Dict[str, object]] = []
+    for column in categorical_cols:
+        series = categorical_cols[column]
+        mode_series = series.mode(dropna=True)
+        mode_value = mode_series.iloc[0] if not mode_series.empty else None
+        counts = series.value_counts(dropna=True).head(top_values)
+        top_repr = ", ".join(f"{idx} ({count})" for idx, count in counts.items())
+        rows.append(
+            {
+                "column": column,
+                "unique_values": int(series.nunique(dropna=True)),
+                "mode": mode_value,
+                "mode_count": int(counts.iloc[0]) if not counts.empty else 0,
+                f"top_{top_values}": top_repr,
+            }
+        )
+
+    return pd.DataFrame(rows)
+
+
+def missing_value_report(df: pd.DataFrame) -> pd.DataFrame:
+    """Return the count and percentage of missing values per column."""
+    missing_counts = df.isna().sum()
+    if missing_counts.sum() == 0:
+        return pd.DataFrame(columns=["column", "missing_count", "missing_pct"])
+
+    missing_pct = (missing_counts / len(df)) * 100
+    report = pd.DataFrame(
+        {
+            "column": missing_counts.index,
+            "missing_count": missing_counts.values,
+            "missing_pct": missing_pct.values,
+        }
+    )
+    return report.sort_values(by="missing_pct", ascending=False).reset_index(drop=True).round({"missing_pct": 2})
+
+
+def correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
+    """Compute the correlation matrix for numeric columns."""
+    numeric_df = df.select_dtypes(include=["number"])
+    if numeric_df.empty or numeric_df.shape[1] < 2:
+        return pd.DataFrame()
+    corr = numeric_df.corr()
+    return corr.round(3)
+
+
+def filter_dataframe(
+    df: pd.DataFrame,
+    numeric_filters: Mapping[str, Tuple[Optional[float], Optional[float]]],
+    categorical_filters: Mapping[str, Iterable[str]],
+    date_filters: Mapping[str, Tuple[Optional[str], Optional[str]]],
+) -> pd.DataFrame:
+    """Filter the dataset according to the provided filter definitions."""
+    filtered = df.copy()
+
+    for column, bounds in numeric_filters.items():
+        if column not in filtered.columns or bounds is None:
+            continue
+        lower, upper = bounds
+        series = filtered[column]
+        # Build one boolean mask so both bounds stay aligned with the current index.
+        mask = pd.Series(True, index=filtered.index)
+        if lower is not None:
+            mask &= series >= lower
+        if upper is not None:
+            mask &= series <= upper
+        filtered = filtered[mask]
+
+    for column, values in categorical_filters.items():
+        if column not in filtered.columns:
+            continue
+        values = list(values)
+        if not values:
+            continue
+        filtered = filtered[filtered[column].isin(values)]
+
+    for column, bounds in date_filters.items():
+        if column not in filtered.columns or bounds is None:
+            continue
+        start, end = bounds
+        series = pd.to_datetime(filtered[column], errors="coerce")
+        mask = pd.Series(True, index=filtered.index)
+        if start:
+            mask &= series >= pd.to_datetime(start)
+        if end:
+            mask &= series <= pd.to_datetime(end)
+        filtered = filtered[mask]
+
+    return filtered
+
+
+def filter_metadata(df: pd.DataFrame, column_types: ColumnTypes, categorical_limit: int = 200) -> Dict[str, object]:
+    """Pre-compute useful metadata for rendering filter controls."""
+    metadata: Dict[str, object] = {"numeric": {}, "categorical": {}, "datetime": {}}
+
+    for column in column_types.numeric:
+        series = df[column].dropna()
+        if series.empty:
+            continue
+        metadata["numeric"][column] = {
+            "min": float(series.min()),
+            "max": float(series.max()),
+        }
+
+    for column in column_types.categorical:
+        series = df[column].dropna().astype(str)
+        unique_values = series.unique().tolist()
+        if len(unique_values) > categorical_limit:
+            unique_values = unique_values[:categorical_limit]
+        metadata["categorical"][column] = unique_values
+
+    for column in column_types.datetime:
+        series = pd.to_datetime(df[column], errors="coerce")
+        series = series.dropna()
+        if series.empty:
+            continue
+        metadata["datetime"][column] = {
+            "min": series.min().date(),
+            "max": series.max().date(),
+        }
+
+    return metadata
+
+
+def sample_dataset_options() -> Dict[str, str]:
+    """Return available bundled datasets and their descriptions."""
+    options: Dict[str, str] = {}
+    if not SAMPLE_DATA_DIR.exists():
+        return options
+
+    for path in sorted(SAMPLE_DATA_DIR.iterdir()):
+        if not path.is_file():
+            continue
+        if path.suffix.lower() not in {".csv", ".xlsx", ".xls"}:
+            continue
+        description = SAMPLE_DESCRIPTIONS.get(path.name, f"Sample dataset sourced from '{path.name}'.")
+        options[path.name] = description
+    return options
+
+
+def load_sample_dataset(selection: str) -> DatasetBundle:
+    """Load a dataset bundled inside the local data directory."""
+    if not selection:
+        raise ValueError("Please select a sample dataset from the dropdown.")
+
+    path = SAMPLE_DATA_DIR / selection
+    if not path.exists():
+        raise ValueError(
+            f"Sample dataset '{selection}' was not found in the 'data/' directory. "
+            "Ensure the file exists and try again."
+        )
+
+    dataframe = _read_from_path(path, selection)
+    dataframe = ensure_unique_columns(dataframe)
+    dataframe, datetime_cols = coerce_datetime_columns(dataframe)
+    column_types = infer_column_types(dataframe)
+    column_types = ColumnTypes(
+        numeric=column_types.numeric,
+        categorical=column_types.categorical,
+        datetime=tuple(sorted(set(column_types.datetime + tuple(datetime_cols)))),
+    )
+
+    return DatasetBundle(
+        dataframe=dataframe,
+        column_types=column_types,
+        source_name=selection,
+    )
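To illustrate the filtering contract above — a minimal sketch, not part of the commit, with invented column names and values:

```python
import pandas as pd

from data_processor import filter_dataframe

df = pd.DataFrame(
    {
        "Weekly_Sales": [100.0, 250.0, 75.0, 400.0],
        "Type": ["A", "B", "A", "C"],
        "Date": ["2012-01-06", "2012-01-13", "2012-01-20", "2012-01-27"],
    }
)

# Keep rows with Weekly_Sales in [100, 300], Type in {A, B}, and Date up to 2012-01-20.
filtered = filter_dataframe(
    df,
    numeric_filters={"Weekly_Sales": (100.0, 300.0)},
    categorical_filters={"Type": ["A", "B"]},
    date_filters={"Date": (None, "2012-01-20")},
)
print(filtered)  # rows 0 and 1 survive all three filters
```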
filtered_htzxc454.csv ADDED
The diff for this file is too large to render. See raw diff
 
insights.py ADDED
@@ -0,0 +1,83 @@
+"""Insight generation utilities for the BI dashboard."""
+
+from __future__ import annotations
+
+from typing import Dict, Optional
+
+import numpy as np
+import pandas as pd
+
+from utils import ColumnTypes
+
+
+def top_bottom_performers(df: pd.DataFrame, column: str, n: int = 5) -> Dict[str, pd.DataFrame]:
+    """Return the top and bottom performers for a numeric column."""
+    if column not in df.columns:
+        raise ValueError(f"Column '{column}' not found in dataset.")
+
+    numeric_series = pd.to_numeric(df[column], errors="coerce").dropna()
+    if numeric_series.empty:
+        raise ValueError(f"Column '{column}' does not contain numeric data.")
+
+    top = numeric_series.nlargest(n)
+    bottom = numeric_series.nsmallest(n)
+    return {
+        "top": top.reset_index(),
+        "bottom": bottom.reset_index(),
+    }
+
+
+def detect_trend(df: pd.DataFrame, date_column: str, value_column: str) -> str:
+    """Analyze basic trend between the first and last data points."""
+    if date_column not in df.columns or value_column not in df.columns:
+        raise ValueError("Selected columns are not present in the dataset.")
+
+    working = df[[date_column, value_column]].dropna()
+    working[date_column] = pd.to_datetime(working[date_column], errors="coerce")
+    working = working.dropna()
+
+    if working.empty or working[date_column].nunique() < 2:
+        return "Not enough data to evaluate a trend."
+
+    working = working.sort_values(by=date_column)
+    first_date = working[date_column].iloc[0]
+    last_date = working[date_column].iloc[-1]
+    first_value = working[value_column].iloc[0]
+    last_value = working[value_column].iloc[-1]
+
+    change = last_value - first_value
+    pct_change = (change / first_value * 100) if first_value != 0 else np.nan
+
+    if np.isnan(pct_change):
+        direction = "changed"
+    elif pct_change > 0:
+        direction = "increased"
+    elif pct_change < 0:
+        direction = "decreased"
+    else:
+        direction = "remained stable"
+
+    pct_text = f" ({pct_change:.2f}%)" if not np.isnan(pct_change) else ""
+    return (
+        f"Between {first_date.date()} and {last_date.date()}, "
+        f"{value_column} {direction} by {change:.2f}{pct_text}."
+    )
+
+
+def detect_anomalies(df: pd.DataFrame, column: str, z_threshold: float = 3.0, limit: int = 5) -> pd.DataFrame:
+    """Identify potential outliers using a simple z-score approach."""
+    if column not in df.columns:
+        raise ValueError(f"Column '{column}' not found in dataset.")
+
+    series = pd.to_numeric(df[column], errors="coerce")
+    z_scores = ((series - series.mean()) / series.std()).abs()
+    anomalies = df.loc[z_scores > z_threshold, [column]].copy()
+    anomalies["z_score"] = z_scores[z_scores > z_threshold]
+    return anomalies.sort_values(by="z_score", ascending=False).head(limit)
+
+
+def get_default_insight_columns(column_types: ColumnTypes) -> Dict[str, Optional[str]]:
+    """Determine default columns to use when auto-generating insights."""
+    numeric_col = column_types.numeric[0] if column_types.numeric else None
+    date_col = column_types.datetime[0] if column_types.datetime else None
+    return {"numeric": numeric_col, "datetime": date_col}
requirements.txt ADDED
@@ -0,0 +1,6 @@
+gradio>=4.0,<5.0
+pandas>=2.0,<3.0
+plotly>=5.18
+kaleido>=0.2.1
+numpy>=1.24
+openpyxl>=3.1
utils.py ADDED
@@ -0,0 +1,119 @@
+"""Utility helpers for the Business Intelligence dashboard."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Tuple
+
+import pandas as pd
+
+
+SUPPORTED_FILE_TYPES: Tuple[str, ...] = (".csv", ".xlsx", ".xls")
+"""Allowed file extensions for uploads."""
+
+PREVIEW_ROWS: int = 5
+"""Default number of rows to display in dataset previews."""
+
+
+@dataclass(frozen=True)
+class ColumnTypes:
+    """Container describing inferred column groupings."""
+
+    numeric: Tuple[str, ...]
+    categorical: Tuple[str, ...]
+    datetime: Tuple[str, ...]
+
+
+def is_supported_file(filename: str | None) -> bool:
+    """Return True when the provided filename uses a supported extension."""
+    if not filename:
+        return False
+    lowered = filename.lower()
+    return any(lowered.endswith(ext) for ext in SUPPORTED_FILE_TYPES)
+
+
+def coerce_datetime_columns(df: pd.DataFrame, threshold: float = 0.6) -> Tuple[pd.DataFrame, Tuple[str, ...]]:
+    """Attempt to parse object columns as datetimes when enough values can be converted.
+
+    Parameters
+    ----------
+    df:
+        Input DataFrame to mutate in-place.
+    threshold:
+        Minimum fraction of non-null values that must successfully convert
+        for the column to be promoted to datetime.
+
+    Returns
+    -------
+    tuple
+        Mutated DataFrame and the tuple of datetime column names.
+    """
+    datetime_cols: List[str] = list(
+        df.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]).columns
+    )
+
+    object_cols = df.select_dtypes(include=["object"]).columns
+    for col in object_cols:
+        series = df[col]
+        non_null_ratio = series.notna().mean()
+        if non_null_ratio == 0 or non_null_ratio < threshold:
+            continue
+        # Format inference is the default in pandas >= 2.0, so the deprecated
+        # infer_datetime_format flag is no longer passed.
+        converted = pd.to_datetime(series, errors="coerce", utc=False)
+        success_ratio = converted.notna().mean()
+        if success_ratio >= threshold:
+            df[col] = converted
+            datetime_cols.append(col)
+
+    return df, tuple(sorted(set(datetime_cols)))
+
+
+def infer_column_types(df: pd.DataFrame) -> ColumnTypes:
+    """Infer high-level data types for the provided DataFrame's columns."""
+    numeric_cols = tuple(df.select_dtypes(include=["number"]).columns)
+    datetime_cols = tuple(df.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]).columns)
+    categorical_cols: List[str] = []
+
+    for col in df.columns:
+        if col in numeric_cols or col in datetime_cols:
+            continue
+        categorical_cols.append(col)
+
+    return ColumnTypes(numeric=numeric_cols, categorical=tuple(categorical_cols), datetime=datetime_cols)
+
+
+def clamp_numeric(value: float, minimum: float, maximum: float) -> float:
+    """Clamp *value* into the closed range [minimum, maximum]."""
+    return max(minimum, min(maximum, value))
+
+
+def ensure_unique_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """Rename duplicate columns to maintain uniqueness."""
+    if df.columns.is_unique:
+        return df
+
+    new_columns: List[str] = []
+    seen: Dict[str, int] = {}
+    for col in df.columns:
+        count = seen.get(col, 0)
+        if count == 0:
+            new_columns.append(col)
+        else:
+            new_columns.append(f"{col}_{count}")
+        seen[col] = count + 1
+
+    df = df.copy()
+    df.columns = new_columns
+    return df
+
+
+def shorten_text(value: str, max_length: int = 80) -> str:
+    """Truncate long text values for cleaner display."""
+    if len(value) <= max_length:
+        return value
+    return f"{value[: max_length - 3]}..."
+
+
+def safe_column_subset(columns: Iterable[str], allowed: Iterable[str]) -> List[str]:
+    """Return a list of *columns* that exist inside *allowed*."""
+    allowed_set = set(allowed)
+    return [col for col in columns if col in allowed_set]
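A small sketch (toy data, not in the commit) of how these helpers behave together:

```python
import pandas as pd

from utils import coerce_datetime_columns, ensure_unique_columns, infer_column_types

df = pd.DataFrame(
    [[1, "2012-01-06", "x"], [2, "2012-01-13", "y"]],
    columns=["id", "when", "id"],  # deliberate duplicate column name
)

df = ensure_unique_columns(df)                   # second "id" becomes "id_1"
df, datetime_cols = coerce_datetime_columns(df)  # "when" parses 100% >= the 0.6 threshold
print(datetime_cols)      # ('when',)
print(infer_column_types(df))
# ColumnTypes(numeric=('id',), categorical=('id_1',), datetime=('when',))
```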
visualizations.py ADDED
@@ -0,0 +1,172 @@
+"""Visualization utilities leveraging the Strategy Pattern for the BI dashboard."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from io import BytesIO
+from typing import Any, Iterable
+
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+
+AGGREGATIONS = {
+    "sum": "sum",
+    "mean": "mean",
+    "median": "median",
+    "count": "count",
+}
+
+
+class VisualizationStrategy(ABC):
+    """Abstract base class for visualization strategies."""
+
+    @abstractmethod
+    def generate(self, df: pd.DataFrame, **kwargs: Any) -> go.Figure:
+        """Generate a Plotly figure from the provided dataframe and arguments."""
+        pass
+
+    def validate_columns(self, df: pd.DataFrame, columns: Iterable[str]) -> None:
+        """Ensure every column exists inside the DataFrame."""
+        missing = [col for col in columns if col not in df.columns]
+        if missing:
+            raise ValueError(f"Column(s) not found in dataset: {', '.join(missing)}")
+
+
+class TimeSeriesStrategy(VisualizationStrategy):
+    """Strategy for generating time-series plots."""
+
+    def generate(self, df: pd.DataFrame, **kwargs: Any) -> go.Figure:
+        date_column = kwargs.get("date_column")
+        value_column = kwargs.get("value_column")
+        aggregation = kwargs.get("aggregation", "sum")
+
+        if not date_column or not value_column:
+            raise ValueError("Date and value columns are required for Time Series.")
+
+        self.validate_columns(df, [date_column, value_column])
+
+        if aggregation not in AGGREGATIONS:
+            raise ValueError("Unsupported aggregation method.")
+
+        date_series = pd.to_datetime(df[date_column], errors="coerce")
+        subset = df.loc[date_series.notna(), [date_column, value_column]].copy()
+        subset[date_column] = pd.to_datetime(subset[date_column])
+        grouped = subset.groupby(subset[date_column].dt.date)[value_column].agg(aggregation).reset_index()
+
+        fig = px.line(
+            grouped,
+            x=date_column,
+            y=value_column,
+            title=f"{value_column} over time ({aggregation})",
+        )
+        fig.update_layout(xaxis_title=date_column, yaxis_title=value_column)
+        return fig
+
+
+class DistributionStrategy(VisualizationStrategy):
+    """Strategy for generating distribution plots (histogram/box)."""
+
+    def generate(self, df: pd.DataFrame, **kwargs: Any) -> go.Figure:
+        column = kwargs.get("column")
+        plot_type = kwargs.get("plot_type", "histogram")
+
+        if not column:
+            raise ValueError("Numeric column is required for Distribution plot.")
+
+        self.validate_columns(df, [column])
+        numeric_series = pd.to_numeric(df[column], errors="coerce").dropna()
+        if numeric_series.empty:
+            raise ValueError("Selected column does not contain numeric data.")
+
+        if plot_type == "box":
+            fig = px.box(numeric_series, y=column, points="suspectedoutliers", title=f"Distribution of {column}")
+        else:
+            fig = px.histogram(
+                numeric_series,
+                nbins=30,
+                title=f"Distribution of {column}",
+            )
+        fig.update_layout(xaxis_title=column, yaxis_title="Frequency")
+        return fig
+
+
+class CategoryStrategy(VisualizationStrategy):
+    """Strategy for generating categorical charts (bar/pie)."""
+
+    def generate(self, df: pd.DataFrame, **kwargs: Any) -> go.Figure:
+        category_column = kwargs.get("category_column")
+        value_column = kwargs.get("value_column")
+        aggregation = kwargs.get("aggregation", "sum")
+        chart_type = kwargs.get("chart_type", "bar").lower()
+
+        if not category_column or not value_column:
+            raise ValueError("Category and value columns are required for Category plot.")
+
+        self.validate_columns(df, [category_column, value_column])
+        if aggregation not in AGGREGATIONS:
+            raise ValueError("Unsupported aggregation method.")
+
+        grouped = (
+            df.groupby(category_column)[value_column]
+            .agg(aggregation)
+            .reset_index()
+            .sort_values(by=value_column, ascending=False)
+        )
+
+        if chart_type == "pie":
+            fig = px.pie(grouped, names=category_column, values=value_column, title=f"{value_column} by {category_column}")
+        else:
+            fig = px.bar(grouped, x=category_column, y=value_column, title=f"{value_column} by {category_column}")
+            fig.update_layout(xaxis_title=category_column, yaxis_title=f"{aggregation} of {value_column}")
+
+        return fig
+
+
+class ScatterStrategy(VisualizationStrategy):
+    """Strategy for generating scatter plots."""
+
+    def generate(self, df: pd.DataFrame, **kwargs: Any) -> go.Figure:
+        x_column = kwargs.get("x_column")
+        y_column = kwargs.get("y_column")
+        color_column = kwargs.get("color_column")
+
+        if not x_column or not y_column:
+            raise ValueError("X and Y columns are required for Scatter plot.")
+
+        columns = [x_column, y_column]
+        if color_column:
+            columns.append(color_column)
+        self.validate_columns(df, columns)
+
+        fig = px.scatter(df, x=x_column, y=y_column, color=color_column, title=f"{y_column} vs {x_column}")
+        fig.update_layout(xaxis_title=x_column, yaxis_title=y_column)
+        return fig
+
+
+class CorrelationHeatmapStrategy(VisualizationStrategy):
+    """Strategy for generating correlation heatmaps."""
+
+    def generate(self, df: pd.DataFrame, **kwargs: Any) -> go.Figure:
+        numeric_df = df.select_dtypes(include=["number"])
+        if numeric_df.shape[1] < 2:
+            raise ValueError("At least two numeric columns are required for a correlation heatmap.")
+
+        corr = numeric_df.corr()
+        fig = px.imshow(
+            corr,
+            text_auto=True,
+            title="Correlation Heatmap",
+            color_continuous_scale="RdBu",
+            aspect="auto",
+        )
+        return fig
+
+
+def figure_to_png_bytes(fig: go.Figure) -> BytesIO:
+    """Export the figure to an in-memory PNG buffer."""
+    try:
+        image_bytes = fig.to_image(format="png")
+    except ValueError as exc:  # pragma: no cover - fallback for environments without kaleido
+        raise ValueError("PNG export requires the 'kaleido' package. Please install it to enable downloads.") from exc
+    return BytesIO(image_bytes)
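One gap worth flagging: `app.py` imports `create_time_series_plot`, `create_distribution_plot`, `create_category_plot`, `create_scatter_plot`, and `create_correlation_heatmap` from this module, but the 172-line file above only defines the strategy classes and `figure_to_png_bytes`, so the import would fail at startup. A minimal sketch of the missing module-level wrappers — not in the commit; the signatures are inferred from the call sites in `app.py`, assuming each wrapper simply delegates to its strategy:

```python
# Hypothetical completion for visualizations.py (append after figure_to_png_bytes).
from typing import Optional


def create_time_series_plot(df: pd.DataFrame, date_column: str, value_column: str, aggregation: str = "sum") -> go.Figure:
    return TimeSeriesStrategy().generate(df, date_column=date_column, value_column=value_column, aggregation=aggregation)


def create_distribution_plot(df: pd.DataFrame, column: str, plot_type: str = "histogram") -> go.Figure:
    return DistributionStrategy().generate(df, column=column, plot_type=plot_type)


def create_category_plot(df: pd.DataFrame, category_column: str, value_column: str, aggregation: str = "sum", chart_type: str = "bar") -> go.Figure:
    return CategoryStrategy().generate(df, category_column=category_column, value_column=value_column, aggregation=aggregation, chart_type=chart_type)


def create_scatter_plot(df: pd.DataFrame, x_column: str, y_column: str, color_column: Optional[str] = None) -> go.Figure:
    return ScatterStrategy().generate(df, x_column=x_column, y_column=y_column, color_column=color_column)


def create_correlation_heatmap(df: pd.DataFrame) -> go.Figure:
    return CorrelationHeatmapStrategy().generate(df)
```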