Add project files with LFS for large CSV
- .gitattributes +1 -0
- .gitignore +2 -0
- README.md +59 -13
- app.py +747 -0
- data/features.csv +0 -0
- data/stores.csv +1 -0
- data/test.csv +0 -0
- data/train.csv +3 -0
- data_processor.py +333 -0
- filtered_htzxc454.csv +0 -0
- insights.py +83 -0
- requirements.txt +6 -0
- utils.py +119 -0
- visualizations.py +172 -0
.gitattributes
CHANGED
```diff
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/train.csv filter=lfs diff=lfs merge=lfs -text
```
.gitignore
ADDED
```diff
@@ -0,0 +1,2 @@
+.DS_Store
+.pycache
```
README.md
CHANGED
````diff
@@ -1,13 +1,59 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
+## Business Intelligence Dashboard
+
+Interactive Gradio application for exploring business datasets, generating insights, and exporting filtered results.
+
+### Features
+- Upload CSV or Excel files with automated validation and previews.
+- Comprehensive statistics: numeric and categorical summaries, missing value report, correlation matrix.
+- Dynamic filtering by numeric ranges, categorical selections, and date ranges with live row counts.
+- Visualizations: time series, distribution, category comparisons, scatter plots, correlation heatmap.
+- Automated insights: top/bottom performers, trend detection, anomaly identification.
+- Export filtered data as CSV and download charts as PNG (requires `kaleido`).
+
+### Project Structure
+```
+BID/
+├── app.py             # Gradio UI wiring
+├── data_processor.py  # Data loading, cleaning, filtering utilities
+├── visualizations.py  # Plotly chart generators
+├── insights.py        # Insight extraction helpers
+├── utils.py           # Shared helpers/constants
+├── data/              # Curated datasets from Kaggle & UCI
+│   ├── sales_train.csv
+│   ├── items.csv
+│   ├── item_categories.csv
+│   ├── shops.csv
+│   ├── test.csv
+│   └── online_retail.csv  # add this file from the UCI dataset
+├── requirements.txt   # Python dependencies
+└── README.md          # Project overview (this file)
+```
+
+### Sample Datasets
+- Kaggle *Predict Future Sales* (`sales_train.csv` plus lookup tables `items.csv`, `item_categories.csv`, `shops.csv`).
+- UCI *Online Retail* (`online_retail.csv` — place the downloaded CSV in `data/`).
+
+Use the **Load Sample** controls on the *Data Upload* tab to bootstrap analysis with these datasets. The app augments the Kaggle sales data by joining the lookup tables automatically.
+
+### Getting Started
+1. **Install dependencies**
+   ```bash
+   pip install -r requirements.txt
+   ```
+   PNG exports require the optional `kaleido` dependency included above.
+
+2. **Launch the dashboard**
+   ```bash
+   python app.py
+   ```
+
+3. **Load data**
+   - Upload your own CSV/Excel file **or** pick one of the bundled datasets via the *Load Sample* dropdown.
+   - Ensure the raw Kaggle/UCI CSV files reside in `data/` so the sample loader can detect them.
+
+4. **Explore**
+   - Apply filters, switch between visualizations, inspect automated insights, and download filtered results or charts.
+
+### Notes
+- The app infers column types automatically; ensure date columns are parseable for time-series plots and trend insights.
+- Large datasets may need additional preprocessing before upload to stay within local resource limits.
````
app.py
ADDED
@@ -0,0 +1,747 @@
```python
"""Gradio application wiring for the Business Intelligence dashboard."""

from __future__ import annotations

import tempfile
from typing import Any, Dict, Iterable, List, Optional, Tuple

import gradio as gr
import pandas as pd
import plotly.graph_objects as go

from data_processor import (
    DatasetBundle,
    dataset_overview,
    dataset_preview,
    filter_dataframe,
    filter_metadata,
    load_dataset,
    load_sample_dataset,
    missing_value_report,
    numeric_summary,
    categorical_summary,
    correlation_matrix,
    sample_dataset_options,
)
from insights import (
    detect_anomalies,
    detect_trend,
    get_default_insight_columns,
    top_bottom_performers,
)
from visualizations import (
    create_category_plot,
    create_correlation_heatmap,
    create_distribution_plot,
    create_scatter_plot,
    create_time_series_plot,
    figure_to_png_bytes,
)


DatasetState = Dict[str, Any]


def _format_overview_text(info: Dict[str, Any], source_name: str) -> str:
    """Render dataset information as Markdown."""
    lines = [
        f"**Source:** {source_name}",
        f"- Rows: {info['Rows']}",
        f"- Columns: {info['Columns']}",
        f"- Memory Usage: {info['Memory Usage (MB)']} MB",
    ]
    return "\n".join(lines)


def _empty_dataframe(message: str = "No data available") -> pd.DataFrame:
    """Return a placeholder DataFrame for empty displays."""
    return pd.DataFrame({"status": [message]})


def _ensure_state(state: Optional[DatasetState]) -> DatasetState:
    """Guarantee a dictionary-based state object."""
    return state or {}


def _current_dataframe(state: DatasetState, filtered: bool = True) -> pd.DataFrame:
    """Return the filtered or raw dataframe from state."""
    key = "filtered_df" if filtered else "dataframe"
    df = state.get(key)
    if isinstance(df, pd.DataFrame):
        return df
    raise ValueError("Please upload a dataset before performing this action.")


def _finalize_dataset_load(bundle: DatasetBundle, state: DatasetState) -> Tuple[DatasetState, str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Populate shared outputs after a dataset is loaded."""
    df = bundle.dataframe
    state = {
        "dataframe": df,
        "filtered_df": df,
        "column_types": bundle.column_types,
        "filter_meta": filter_metadata(df, bundle.column_types),
        "source_name": bundle.source_name,
    }

    overview = dataset_overview(df)
    preview = dataset_preview(df)

    status = f"✅ Loaded '{bundle.source_name}' with {df.shape[0]} rows and {df.shape[1]} columns."
    info_text = _format_overview_text(overview["info"], bundle.source_name)
    dtypes_df = overview["dtypes"]
    head_df = preview["head"]
    tail_df = preview["tail"]
    filter_preview = head_df
    row_count = f"Rows displayed: {len(df)}"

    return state, status, info_text, dtypes_df, head_df, tail_df, filter_preview, row_count


def _handle_file_upload(file, state: Optional[DatasetState]) -> Tuple[DatasetState, str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Load a dataset from the uploaded file."""
    state = _ensure_state(state)
    try:
        bundle: DatasetBundle = load_dataset(file)
    except ValueError as exc:
        return (
            state,
            f"❌ {exc}",
            "No dataset loaded.",
            _empty_dataframe(),
            _empty_dataframe(),
            _empty_dataframe(),
            _empty_dataframe(),
            "Rows displayed: 0",
        )

    return _finalize_dataset_load(bundle, state)


def _handle_sample_dataset(selection: Optional[str], state: Optional[DatasetState]) -> Tuple[DatasetState, str, str, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Load one of the bundled sample datasets."""
    state = _ensure_state(state)
    if not selection:
        message = "Please choose a sample dataset before loading."
        empty = _empty_dataframe(message)
        return state, f"⚠️ {message}", "No dataset loaded.", empty, empty, empty, empty, "Rows displayed: 0"

    try:
        bundle = load_sample_dataset(selection)
    except ValueError as exc:
        empty = _empty_dataframe(str(exc))
        return state, f"❌ {exc}", "No dataset loaded.", empty, empty, empty, empty, "Rows displayed: 0"

    return _finalize_dataset_load(bundle, state)


def _populate_column_options(
    state: Optional[DatasetState],
) -> Tuple[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]:
    """Populate dropdown choices based on the uploaded dataset."""
    state = _ensure_state(state)
    column_types = state.get("column_types")
    if not column_types:
        empty_dropdown = gr.update(choices=[], value=None, interactive=False, visible=True)
        hidden_checkbox = gr.update(choices=[], value=[], visible=False, interactive=False)
        return (
            empty_dropdown,
            empty_dropdown,
            hidden_checkbox,
            empty_dropdown,
            empty_dropdown,
            empty_dropdown,
            empty_dropdown,
            empty_dropdown,
            empty_dropdown,
            empty_dropdown,
            empty_dropdown,
            empty_dropdown,
            empty_dropdown,
            empty_dropdown,
            empty_dropdown,
            empty_dropdown,
        )

    numeric = list(column_types.numeric)
    categorical = list(column_types.categorical)
    datetime_cols = list(column_types.datetime)
    all_columns = list(state["dataframe"].columns)
    defaults = get_default_insight_columns(column_types)

    def dropdown(values: Iterable[str], default: Optional[str] = None):
        choices = list(values)
        value = default if default in choices else None
        return gr.update(
            choices=choices,
            value=value,
            interactive=bool(choices),
            visible=True,
        )

    return (
        dropdown(numeric),  # numeric filter column
        dropdown(datetime_cols),  # date filter column
        gr.update(choices=[], value=[], visible=False, interactive=False),  # categorical values reset
        dropdown(categorical),  # categorical filter column
        dropdown(datetime_cols, defaults.get("datetime")),  # time series date
        dropdown(numeric, defaults.get("numeric")),  # time series value
        dropdown(numeric),  # distribution numeric
        dropdown(categorical),  # category column
        dropdown(numeric),  # category value
        dropdown(numeric),  # scatter x
        dropdown(numeric),  # scatter y
        gr.update(choices=all_columns, value=None, interactive=bool(all_columns), visible=True),  # scatter color
        dropdown(numeric, defaults.get("numeric")),  # insight numeric
        dropdown(datetime_cols, defaults.get("datetime")),  # insight datetime
        dropdown(numeric, defaults.get("numeric")),  # trend value
        dropdown(numeric, defaults.get("numeric")),  # anomaly column
    )


def _update_numeric_inputs(column: Optional[str], state: Optional[DatasetState]) -> Tuple[Any, Any]:
    """Update numeric min/max inputs when a column is selected."""
    state = _ensure_state(state)
    hidden = gr.update(visible=False, value=None)
    if not column or "filter_meta" not in state:
        return hidden, hidden
    meta = state["filter_meta"]["numeric"].get(column)
    if not meta:
        return hidden, hidden
    minimum = float(meta["min"])
    maximum = float(meta["max"])
    return (
        gr.update(value=minimum, visible=True, interactive=True, label=f"Min ({column})"),
        gr.update(value=maximum, visible=True, interactive=True, label=f"Max ({column})"),
    )


def _update_categorical_values(column: Optional[str], state: Optional[DatasetState]):
    """Populate categorical value options for filtering."""
    state = _ensure_state(state)
    if not column or "filter_meta" not in state:
        return gr.update(visible=False)
    values = state["filter_meta"]["categorical"].get(column, [])
    return gr.update(
        choices=values,
        value=values[: min(10, len(values))],
        visible=bool(values),
        interactive=bool(values),
        label=f"Values to include ({column})",
    )


def _update_date_bounds(column: Optional[str], state: Optional[DatasetState]) -> Tuple[Any, Any]:
    """Populate date inputs when a date column is selected."""
    state = _ensure_state(state)
    if not column or "filter_meta" not in state:
        hidden = gr.update(visible=False, value=None)
        return hidden, hidden
    meta = state["filter_meta"]["datetime"].get(column)
    if not meta:
        hidden = gr.update(visible=False, value=None)
        return hidden, hidden
    start = str(meta["min"])
    end = str(meta["max"])
    return (
        gr.update(value=start, visible=True, label=f"Start date ({column})"),
        gr.update(value=end, visible=True, label=f"End date ({column})"),
    )


def _apply_filters(
    state: Optional[DatasetState],
    numeric_column: Optional[str],
    numeric_min: Optional[float],
    numeric_max: Optional[float],
    categorical_column: Optional[str],
    categorical_values: Optional[List[str]],
    date_column: Optional[str],
    start_date: Optional[str],
    end_date: Optional[str],
) -> Tuple[DatasetState, pd.DataFrame, str]:
    """Filter the dataset according to user selections."""
    state = _ensure_state(state)
    df = _current_dataframe(state, filtered=False)

    numeric_filters: Dict[str, Tuple[Optional[float], Optional[float]]] = {}
    categorical_filters: Dict[str, List[str]] = {}
    date_filters: Dict[str, Tuple[Optional[str], Optional[str]]] = {}

    if numeric_column and (numeric_min is not None or numeric_max is not None):
        lower = numeric_min
        upper = numeric_max
        if lower is not None and upper is not None and lower > upper:
            lower, upper = upper, lower
        numeric_filters[numeric_column] = (lower, upper)

    if categorical_column and categorical_values:
        categorical_filters[categorical_column] = categorical_values

    if date_column and (start_date or end_date):
        date_filters[date_column] = (start_date, end_date)

    filtered_df = filter_dataframe(df, numeric_filters, categorical_filters, date_filters)
    state["filtered_df"] = filtered_df

    row_count = f"Rows displayed: {len(filtered_df)}"
    preview = filtered_df.head(5) if not filtered_df.empty else _empty_dataframe("No rows match the filters.")
    return state, preview, row_count


def _generate_statistics(state: Optional[DatasetState]) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Produce summary statistics for the Statistics tab."""
    state = _ensure_state(state)
    try:
        df = _current_dataframe(state, filtered=False)
    except ValueError as exc:
        message = str(exc)
        empty = _empty_dataframe(message)
        return empty, empty, empty, empty, f"⚠️ {message}"

    num_summary = numeric_summary(df)
    cat_summary = categorical_summary(df)
    missing = missing_value_report(df)
    corr = correlation_matrix(df)
    message = "Statistics generated successfully."
    return (
        num_summary if not num_summary.empty else _empty_dataframe("No numeric columns available."),
        cat_summary if not cat_summary.empty else _empty_dataframe("No categorical columns available."),
        missing if not missing.empty else _empty_dataframe("No missing values detected."),
        corr if not corr.empty else _empty_dataframe("Not enough numeric columns for correlation."),
        message,
    )


def _generate_chart(
    state: Optional[DatasetState],
    chart_type: str,
    ts_date: Optional[str],
    ts_value: Optional[str],
    ts_agg: str,
    dist_column: Optional[str],
    dist_type: str,
    cat_column: Optional[str],
    cat_value: Optional[str],
    cat_chart_type: str,
    cat_agg: str,
    scatter_x: Optional[str],
    scatter_y: Optional[str],
    scatter_color: Optional[str],
) -> Tuple[Optional[go.Figure], Optional[go.Figure], str]:
    """Create a visualization based on user selections."""
    state = _ensure_state(state)
    try:
        df = _current_dataframe(state, filtered=True)
    except ValueError as exc:
        return None, None, f"⚠️ {exc}"

    try:
        if chart_type == "Time Series":
            if not ts_date or not ts_value:
                raise ValueError("Select both a date and value column.")
            fig = create_time_series_plot(df, ts_date, ts_value, aggregation=ts_agg)
        elif chart_type == "Distribution":
            if not dist_column:
                raise ValueError("Select a numeric column for the distribution plot.")
            fig = create_distribution_plot(df, dist_column, plot_type=dist_type)
        elif chart_type == "Category":
            if not cat_column or not cat_value:
                raise ValueError("Select both category and value columns.")
            fig = create_category_plot(df, cat_column, cat_value, aggregation=cat_agg, chart_type=cat_chart_type.lower())
        elif chart_type == "Scatter":
            if not scatter_x or not scatter_y:
                raise ValueError("Select x and y columns for the scatter plot.")
            fig = create_scatter_plot(df, scatter_x, scatter_y, color_column=scatter_color)
        elif chart_type == "Correlation Heatmap":
            fig = create_correlation_heatmap(df)
        else:
            raise ValueError("Unsupported chart type.")
    except ValueError as exc:
        return None, None, f"⚠️ {exc}"

    return fig, fig, "Visualization generated."


def _download_filtered(state: Optional[DatasetState]) -> str:
    """Export the filtered dataset to a temporary CSV file."""
    state = _ensure_state(state)
    df = _current_dataframe(state, filtered=True)
    if df.empty:
        raise ValueError("There are no rows to export. Adjust your filters and try again.")

    temp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv", prefix="filtered_", dir=".")
    df.to_csv(temp.name, index=False)
    temp.close()
    return temp.name


def _download_chart(fig: Optional[go.Figure]) -> str:
    """Export the most recent chart to PNG."""
    if fig is None:
        raise ValueError("Generate a visualization before exporting.")
    buffer = figure_to_png_bytes(fig)
    temp = tempfile.NamedTemporaryFile(delete=False, suffix=".png", prefix="chart_", dir=".")
    with open(temp.name, "wb") as fp:
        fp.write(buffer.read())
    return temp.name


def _generate_insights(
    state: Optional[DatasetState],
    numeric_column: Optional[str],
    trend_date_column: Optional[str],
    trend_value_column: Optional[str],
    anomaly_column: Optional[str],
) -> Tuple[pd.DataFrame, pd.DataFrame, str, pd.DataFrame, str]:
    """Generate top/bottom performers, trends, and anomalies."""
    state = _ensure_state(state)
    try:
        df = _current_dataframe(state, filtered=True)
    except ValueError as exc:
        empty = _empty_dataframe(str(exc))
        return empty, empty, f"⚠️ {exc}", empty, f"⚠️ {exc}"

    status_messages: List[str] = []

    top_df = bottom_df = _empty_dataframe("Select a numeric column for insights.")
    if numeric_column:
        try:
            performers = top_bottom_performers(df, numeric_column)
            top_df = performers["top"]
            bottom_df = performers["bottom"]
            status_messages.append(f"Top/bottom performers calculated for {numeric_column}.")
        except ValueError as exc:
            top_df = bottom_df = _empty_dataframe(str(exc))
            status_messages.append(f"⚠️ {exc}")

    trend_text = "Select a date and value column to evaluate trend."
    if trend_date_column and trend_value_column:
        try:
            trend_text = detect_trend(df, trend_date_column, trend_value_column)
        except ValueError as exc:
            trend_text = f"⚠️ {exc}"

    anomaly_df = _empty_dataframe("Select a numeric column to detect anomalies.")
    if anomaly_column:
        anomalies = detect_anomalies(df, anomaly_column)
        anomaly_df = anomalies if not anomalies.empty else _empty_dataframe("No significant anomalies detected.")

    combined_status = "\n".join(status_messages) if status_messages else "Insights generated."
    return top_df, bottom_df, trend_text, anomaly_df, combined_status


def _describe_sample_dataset(selection: Optional[str]) -> str:
    """Return a user-friendly description for the selected sample dataset."""
    if not selection:
        return "Select a sample dataset to view its description."
    descriptions = sample_dataset_options()
    description = descriptions.get(selection)
    if not description:
        return "Sample dataset description unavailable. Ensure the file exists in the `data/` directory."
    return f"**{selection}**\n\n{description}"


def create_dashboard():
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Business Intelligence Dashboard")
        dataset_state = gr.State({})
        last_figure_state = gr.State(None)

        sample_choices = list(sample_dataset_options().keys())

        with gr.Tab("Data Upload"):
            with gr.Row():
                file_input = gr.File(label="Upload CSV or Excel", file_types=[".csv", ".xlsx", ".xls"])
                load_button = gr.Button("Load Data", variant="primary")
            gr.Markdown("Or load one of the curated datasets bundled with the project:")
            with gr.Row():
                sample_dropdown = gr.Dropdown(label="Sample Dataset", choices=sample_choices, value=None, interactive=bool(sample_choices))
                load_sample_button = gr.Button("Load Sample", variant="secondary", interactive=bool(sample_choices))
            if sample_choices:
                sample_description = gr.Markdown("Select a sample dataset to view its description.")
            else:
                sample_description = gr.Markdown("⚠️ No sample datasets detected in the `data/` folder.")
            upload_status = gr.Markdown("No dataset loaded.")
            dataset_info = gr.Markdown()
            dtypes_table = gr.Dataframe(label="Column Types", interactive=False)
            with gr.Row():
                head_table = gr.Dataframe(label="Preview (Head)", interactive=False)
                tail_table = gr.Dataframe(label="Preview (Tail)", interactive=False)

        with gr.Tab("Statistics"):
            stats_status = gr.Markdown()
            numeric_table = gr.Dataframe(label="Numeric Summary", interactive=False)
            categorical_table = gr.Dataframe(label="Categorical Summary", interactive=False)
            missing_table = gr.Dataframe(label="Missing Value Report", interactive=False)
            correlation_table = gr.Dataframe(label="Correlation Matrix", interactive=False)
            generate_stats_button = gr.Button("Generate Statistics", variant="secondary")

        with gr.Tab("Filter & Explore"):
            filter_status = gr.Markdown("Rows displayed: 0")
            with gr.Accordion("Numeric Filter", open=False):
                numeric_column_dropdown = gr.Dropdown(label="Numeric Column", choices=[])
                numeric_min_input = gr.Number(label="Minimum Value", visible=False)
                numeric_max_input = gr.Number(label="Maximum Value", visible=False)
            with gr.Accordion("Categorical Filter", open=False):
                categorical_column_dropdown = gr.Dropdown(label="Category Column", choices=[])
                categorical_values = gr.CheckboxGroup(label="Values", choices=[], visible=False)
            with gr.Accordion("Date Filter", open=False):
                date_column_dropdown = gr.Dropdown(label="Date Column", choices=[])
                start_date_picker = gr.Textbox(label="Start Date (YYYY-MM-DD)", visible=False)
                end_date_picker = gr.Textbox(label="End Date (YYYY-MM-DD)", visible=False)
            apply_filters_button = gr.Button("Apply Filters", variant="primary")
            filter_preview_table = gr.Dataframe(label="Filtered Preview", interactive=False)
            export_filtered_button = gr.Button("Download Filtered Data", variant="secondary")
            export_filtered_file = gr.File(label="Filtered CSV", interactive=False)

        with gr.Tab("Visualizations"):
            viz_status = gr.Markdown()
            chart_type = gr.Radio(
                label="Chart Type",
                choices=["Time Series", "Distribution", "Category", "Scatter", "Correlation Heatmap"],
                value="Time Series",
            )
            with gr.Column(visible=True) as time_series_controls:
                ts_date_column = gr.Dropdown(label="Date Column", choices=[])
                ts_value_column = gr.Dropdown(label="Value Column", choices=[])
                ts_aggregation = gr.Dropdown(label="Aggregation", choices=["sum", "mean", "median", "count"], value="sum")
            with gr.Column(visible=False) as distribution_controls:
                dist_column = gr.Dropdown(label="Numeric Column", choices=[])
                dist_type = gr.Radio(label="Distribution Type", choices=["histogram", "box"], value="histogram")
            with gr.Column(visible=False) as category_controls:
                category_column = gr.Dropdown(label="Category Column", choices=[])
                category_value_column = gr.Dropdown(label="Value Column", choices=[])
                category_chart_type = gr.Radio(label="Chart Style", choices=["Bar", "Pie"], value="Bar")
                category_aggregation = gr.Dropdown(label="Aggregation", choices=["sum", "mean", "median", "count"], value="sum")
            with gr.Column(visible=False) as scatter_controls:
                scatter_x_column = gr.Dropdown(label="X Axis", choices=[])
                scatter_y_column = gr.Dropdown(label="Y Axis", choices=[])
                scatter_color_column = gr.Dropdown(label="Color (optional)", choices=[])

            generate_chart_button = gr.Button("Generate Visualization", variant="primary")
            chart_output = gr.Plot(label="Visualization")
            download_chart_button = gr.Button("Download Chart as PNG", variant="secondary")
            chart_file_output = gr.File(label="Chart PNG", interactive=False)

        with gr.Tab("Insights"):
            insights_status = gr.Markdown()
            insight_numeric_column = gr.Dropdown(label="Numeric Column", choices=[])
            trend_date_column = gr.Dropdown(label="Date Column", choices=[])
            trend_value_column = gr.Dropdown(label="Value Column", choices=[])
            anomaly_column = gr.Dropdown(label="Column for Anomaly Detection", choices=[])
            generate_insights_button = gr.Button("Generate Insights", variant="primary")
            top_table = gr.Dataframe(label="Top Performers", interactive=False)
            bottom_table = gr.Dataframe(label="Bottom Performers", interactive=False)
            trend_output = gr.Markdown()
            anomaly_table = gr.Dataframe(label="Potential Anomalies", interactive=False)

        # Interactions
        load_button.click(
            fn=_handle_file_upload,
            inputs=[file_input, dataset_state],
            outputs=[
                dataset_state,
                upload_status,
                dataset_info,
                dtypes_table,
                head_table,
                tail_table,
                filter_preview_table,
                filter_status,
            ],
        ).then(
            fn=_populate_column_options,
            inputs=[dataset_state],
            outputs=[
                numeric_column_dropdown,
                date_column_dropdown,
                categorical_values,
                categorical_column_dropdown,
                ts_date_column,
                ts_value_column,
                dist_column,
                category_column,
                category_value_column,
                scatter_x_column,
                scatter_y_column,
                scatter_color_column,
                insight_numeric_column,
                trend_date_column,
                trend_value_column,
                anomaly_column,
            ],
        ).then(
            fn=_generate_statistics,
            inputs=[dataset_state],
            outputs=[
                numeric_table,
                categorical_table,
                missing_table,
                correlation_table,
                stats_status,
            ],
        )

        load_sample_button.click(
            fn=_handle_sample_dataset,
            inputs=[sample_dropdown, dataset_state],
            outputs=[
                dataset_state,
                upload_status,
                dataset_info,
                dtypes_table,
                head_table,
                tail_table,
                filter_preview_table,
                filter_status,
            ],
        ).then(
            fn=_populate_column_options,
            inputs=[dataset_state],
            outputs=[
                numeric_column_dropdown,
                date_column_dropdown,
                categorical_values,
                categorical_column_dropdown,
                ts_date_column,
                ts_value_column,
                dist_column,
                category_column,
                category_value_column,
                scatter_x_column,
                scatter_y_column,
                scatter_color_column,
                insight_numeric_column,
                trend_date_column,
                trend_value_column,
                anomaly_column,
            ],
        ).then(
            fn=_generate_statistics,
            inputs=[dataset_state],
            outputs=[
                numeric_table,
                categorical_table,
                missing_table,
                correlation_table,
                stats_status,
            ],
        )

        sample_dropdown.change(
            fn=_describe_sample_dataset,
            inputs=[sample_dropdown],
            outputs=[sample_description],
        )

        numeric_column_dropdown.change(
            fn=_update_numeric_inputs,
            inputs=[numeric_column_dropdown, dataset_state],
            outputs=[numeric_min_input, numeric_max_input],
        )

        categorical_column_dropdown.change(
            fn=_update_categorical_values,
            inputs=[categorical_column_dropdown, dataset_state],
            outputs=[categorical_values],
        )

        date_column_dropdown.change(
            fn=_update_date_bounds,
            inputs=[date_column_dropdown, dataset_state],
            outputs=[start_date_picker, end_date_picker],
        )

        generate_stats_button.click(
            fn=_generate_statistics,
            inputs=[dataset_state],
            outputs=[numeric_table, categorical_table, missing_table, correlation_table, stats_status],
        )

        apply_filters_button.click(
            fn=_apply_filters,
            inputs=[
                dataset_state,
                numeric_column_dropdown,
                numeric_min_input,
                numeric_max_input,
                categorical_column_dropdown,
                categorical_values,
                date_column_dropdown,
                start_date_picker,
                end_date_picker,
            ],
            outputs=[dataset_state, filter_preview_table, filter_status],
        )

        export_filtered_button.click(
            fn=_download_filtered,
            inputs=[dataset_state],
            outputs=[export_filtered_file],
        )

        def _toggle_controls(selected: str) -> Tuple[Any, Any, Any, Any]:
            return (
                gr.update(visible=selected == "Time Series"),
                gr.update(visible=selected == "Distribution"),
                gr.update(visible=selected == "Category"),
                gr.update(visible=selected == "Scatter"),
            )

        chart_type.change(
            fn=_toggle_controls,
            inputs=[chart_type],
            outputs=[time_series_controls, distribution_controls, category_controls, scatter_controls],
        )

        generate_chart_button.click(
            fn=_generate_chart,
            inputs=[
                dataset_state,
                chart_type,
                ts_date_column,
                ts_value_column,
                ts_aggregation,
                dist_column,
                dist_type,
                category_column,
                category_value_column,
                category_chart_type,
                category_aggregation,
                scatter_x_column,
                scatter_y_column,
                scatter_color_column,
            ],
            outputs=[last_figure_state, chart_output, viz_status],
        )

        download_chart_button.click(
            fn=_download_chart,
            inputs=[last_figure_state],
            outputs=[chart_file_output],
        )

        generate_insights_button.click(
            fn=_generate_insights,
            inputs=[
                dataset_state,
                insight_numeric_column,
                trend_date_column,
                trend_value_column,
                anomaly_column,
            ],
            outputs=[
                top_table,
                bottom_table,
                trend_output,
                anomaly_table,
                insights_status,
            ],
        )

    return demo


if __name__ == "__main__":
    dashboard = create_dashboard()
    dashboard.launch()
```
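Because every event handler above takes the plain state dictionary rather than live UI objects, the logic can be exercised without launching Gradio. A minimal sketch with toy data (the frame and values are hypothetical; the state mirrors the shape `_finalize_dataset_load` builds):

```python
import pandas as pd
from app import _apply_filters, _generate_statistics

# Toy dataset and the minimal state keys these helpers read.
df = pd.DataFrame({"sales": [10.0, 25.0, 7.5], "region": ["N", "S", "N"]})
state = {"dataframe": df, "filtered_df": df}

*_, status = _generate_statistics(state)
print(status)  # Statistics generated successfully.

# Numeric filter only: keep rows with sales >= 8.0.
state, preview, row_count = _apply_filters(
    state, "sales", 8.0, None, None, None, None, None, None
)
print(row_count)  # Rows displayed: 2
```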
data/features.csv
ADDED
The diff for this file is too large to render. See raw diff.
data/stores.csv
ADDED
```diff
@@ -0,0 +1 @@
+Store,Type,Size
```
data/test.csv
ADDED
The diff for this file is too large to render. See raw diff.
data/train.csv
ADDED
```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65834f65a2ddccc85d8f4bf6544f73625be553f8ca5f8fdee976b9d0c900e95d
+size 12842546
```
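The three lines above are a Git LFS pointer, not the CSV itself: `version` names the pointer spec, `oid` is the SHA-256 of the real content, and `size` is its byte count (about 12.8 MB here); `git lfs pull` replaces the pointer with the data. A minimal sketch of reading such a pointer, assuming only the `key value` line format shown:

```python
from pathlib import Path

def parse_lfs_pointer(path: str) -> dict:
    """Split the 'key value' lines of a Git LFS pointer into a dict."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = parse_lfs_pointer("data/train.csv")  # before `git lfs pull`
print(pointer["oid"], int(pointer["size"]))    # sha256:6583..., 12842546
```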
data_processor.py
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data loading, cleaning, and filtering helpers for the BI dashboard."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from io import BytesIO
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Dict, Iterable, List, Mapping, Optional, Tuple
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
|
| 12 |
+
from utils import (
|
| 13 |
+
ColumnTypes,
|
| 14 |
+
PREVIEW_ROWS,
|
| 15 |
+
coerce_datetime_columns,
|
| 16 |
+
ensure_unique_columns,
|
| 17 |
+
infer_column_types,
|
| 18 |
+
is_supported_file,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
SAMPLE_DATA_DIR = Path(__file__).resolve().parent / "data"
|
| 22 |
+
SAMPLE_DESCRIPTIONS = {
|
| 23 |
+
"train.csv": "Weekly Walmart sales with markdowns and holidays (training set).",
|
| 24 |
+
"test.csv": "Companion test set without weekly sales labels.",
|
| 25 |
+
"features.csv": "Store-level features such as markdowns, CPI, unemployment.",
|
| 26 |
+
"stores.csv": "Store metadata including type and size.",
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass(frozen=True)
|
| 31 |
+
class DatasetBundle:
|
| 32 |
+
"""Container storing the dataset and metadata required by the UI."""
|
| 33 |
+
|
| 34 |
+
dataframe: pd.DataFrame
|
| 35 |
+
column_types: ColumnTypes
|
| 36 |
+
source_name: str
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def load_dataset(file_obj) -> DatasetBundle:
|
| 40 |
+
"""Load the provided uploaded file into a pandas DataFrame.
|
| 41 |
+
|
| 42 |
+
Parameters
|
| 43 |
+
----------
|
| 44 |
+
file_obj:
|
| 45 |
+
File-like object produced by the Gradio upload widget.
|
| 46 |
+
|
| 47 |
+
Returns
|
| 48 |
+
-------
|
| 49 |
+
DatasetBundle
|
| 50 |
+
Loaded dataset alongside inferred column metadata.
|
| 51 |
+
|
| 52 |
+
Raises
|
| 53 |
+
------
|
| 54 |
+
ValueError
|
| 55 |
+
If the file cannot be read or uses an unsupported format.
|
| 56 |
+
"""
|
| 57 |
+
if file_obj is None:
|
| 58 |
+
raise ValueError("Please upload a CSV or Excel file.")
|
| 59 |
+
|
| 60 |
+
file_name = getattr(file_obj, "name", None)
|
| 61 |
+
original_name = getattr(file_obj, "orig_name", file_name)
|
| 62 |
+
|
| 63 |
+
if not original_name or not is_supported_file(original_name):
|
| 64 |
+
raise ValueError("Unsupported file type. Please upload a CSV or Excel file.")
|
| 65 |
+
|
| 66 |
+
path_candidate = Path(str(file_name)) if file_name else None
|
| 67 |
+
dataframe: Optional[pd.DataFrame] = None
|
| 68 |
+
|
| 69 |
+
try:
|
| 70 |
+
if path_candidate and path_candidate.exists():
|
| 71 |
+
dataframe = _read_from_path(path_candidate, original_name)
|
| 72 |
+
else:
|
| 73 |
+
dataframe = _read_from_buffer(file_obj, original_name)
|
| 74 |
+
except Exception as exc: # pragma: no cover - defensive conversion
|
| 75 |
+
raise ValueError(f"Unable to load dataset: {exc}") from exc
|
| 76 |
+
|
| 77 |
+
if dataframe is None:
|
| 78 |
+
raise ValueError("Failed to load dataset. The file may be empty or corrupted.")
|
| 79 |
+
|
| 80 |
+
dataframe = ensure_unique_columns(dataframe)
|
| 81 |
+
dataframe, datetime_cols = coerce_datetime_columns(dataframe)
|
| 82 |
+
column_types = infer_column_types(dataframe)
|
| 83 |
+
|
| 84 |
+
# Ensure newly detected datetime columns are included in metadata
|
| 85 |
+
column_types = ColumnTypes(
|
| 86 |
+
numeric=column_types.numeric,
|
| 87 |
+
categorical=column_types.categorical,
|
| 88 |
+
datetime=tuple(sorted(set(column_types.datetime + tuple(datetime_cols)))),
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
return DatasetBundle(
|
| 92 |
+
dataframe=dataframe,
|
| 93 |
+
column_types=column_types,
|
| 94 |
+
source_name=Path(original_name).name,
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _read_from_path(path: Path, original_name: str) -> pd.DataFrame:
|
| 99 |
+
"""Read a dataset from disk."""
|
| 100 |
+
suffix = path.suffix.lower()
|
| 101 |
+
if suffix == ".csv":
|
| 102 |
+
return pd.read_csv(path)
|
| 103 |
+
if suffix in {".xlsx", ".xls"}:
|
| 104 |
+
return pd.read_excel(path)
|
| 105 |
+
raise ValueError(f"Unsupported file extension in {original_name}.")
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _read_from_buffer(file_obj, original_name: str) -> pd.DataFrame:
|
| 109 |
+
"""Read a dataset from an in-memory buffer."""
|
| 110 |
+
bytes_data = getattr(file_obj, "read", lambda: b"")()
|
| 111 |
+
if not bytes_data:
|
| 112 |
+
raise ValueError(f"The uploaded file '{original_name}' is empty.")
|
| 113 |
+
|
| 114 |
+
buffer = BytesIO(bytes_data)
|
| 115 |
+
lowered = original_name.lower()
|
| 116 |
+
if lowered.endswith(".csv"):
|
| 117 |
+
return pd.read_csv(buffer)
|
| 118 |
+
if lowered.endswith((".xlsx", ".xls")):
|
| 119 |
+
return pd.read_excel(buffer)
|
| 120 |
+
|
| 121 |
+
raise ValueError("Only CSV and Excel files are supported.")
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def dataset_overview(df: pd.DataFrame) -> Dict[str, object]:
|
| 125 |
+
"""Return basic information about the dataset."""
|
| 126 |
+
info = {
|
| 127 |
+
"Rows": int(df.shape[0]),
|
| 128 |
+
"Columns": int(df.shape[1]),
|
| 129 |
+
"Memory Usage (MB)": round(df.memory_usage(deep=True).sum() / (1024**2), 2),
|
| 130 |
+
}
|
| 131 |
+
dtypes = pd.DataFrame({"Column": df.columns, "Type": df.dtypes.astype(str)})
|
| 132 |
+
return {"info": info, "dtypes": dtypes}
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def dataset_preview(df: pd.DataFrame, rows: int = PREVIEW_ROWS) -> Dict[str, pd.DataFrame]:
|
| 136 |
+
"""Return head and tail previews of the dataset."""
|
| 137 |
+
return {
|
| 138 |
+
"head": df.head(rows),
|
| 139 |
+
"tail": df.tail(rows),
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def numeric_summary(df: pd.DataFrame) -> pd.DataFrame:
|
| 144 |
+
"""Compute descriptive statistics for numeric columns."""
|
| 145 |
+
numeric_df = df.select_dtypes(include=["number"])
|
| 146 |
+
if numeric_df.empty:
|
| 147 |
+
return pd.DataFrame()
|
| 148 |
+
|
| 149 |
+
summary = pd.DataFrame(
|
| 150 |
+
{
|
| 151 |
+
"count": numeric_df.count(),
|
| 152 |
+
"mean": numeric_df.mean(),
|
| 153 |
+
"median": numeric_df.median(),
|
| 154 |
+
"std": numeric_df.std(),
|
| 155 |
+
"min": numeric_df.min(),
|
| 156 |
+
"25%": numeric_df.quantile(0.25),
|
| 157 |
+
"75%": numeric_df.quantile(0.75),
|
| 158 |
+
"max": numeric_df.max(),
|
| 159 |
+
}
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
summary.index.name = "column"
|
| 163 |
+
return summary.round(3)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def categorical_summary(df: pd.DataFrame, top_values: int = 5) -> pd.DataFrame:
|
| 167 |
+
"""Compute summary statistics for categorical columns."""
|
| 168 |
+
categorical_cols = df.select_dtypes(exclude=["number", "datetime64[ns]", "datetime64[ns, UTC]"])
|
| 169 |
+
if categorical_cols.empty:
|
| 170 |
+
return pd.DataFrame()
|
| 171 |
+
|
| 172 |
+
rows: List[Dict[str, object]] = []
|
| 173 |
+
for column in categorical_cols:
|
| 174 |
+
series = categorical_cols[column]
|
| 175 |
+
mode_series = series.mode(dropna=True)
|
| 176 |
+
mode_value = mode_series.iloc[0] if not mode_series.empty else None
|
| 177 |
+
counts = series.value_counts(dropna=True).head(top_values)
|
| 178 |
+
top_repr = ", ".join(f"{idx} ({count})" for idx, count in counts.items())
|
| 179 |
+
rows.append(
|
| 180 |
+
{
|
| 181 |
+
"column": column,
|
| 182 |
+
"unique_values": int(series.nunique(dropna=True)),
|
| 183 |
+
"mode": mode_value,
|
| 184 |
+
"mode_count": int(counts.iloc[0]) if not counts.empty else 0,
|
| 185 |
+
f"top_{top_values}": top_repr,
|
| 186 |
+
}
|
| 187 |
+
)
|
| 188 |
+
|
| 189 |
+
return pd.DataFrame(rows)
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def missing_value_report(df: pd.DataFrame) -> pd.DataFrame:
|
| 193 |
+
"""Return the count and percentage of missing values per column."""
|
| 194 |
+
missing_counts = df.isna().sum()
|
| 195 |
+
if missing_counts.sum() == 0:
|
| 196 |
+
return pd.DataFrame(columns=["column", "missing_count", "missing_pct"])
|
| 197 |
+
|
| 198 |
+
missing_pct = (missing_counts / len(df)) * 100
|
| 199 |
+
report = pd.DataFrame(
|
| 200 |
+
{
|
| 201 |
+
"column": missing_counts.index,
|
| 202 |
+
"missing_count": missing_counts.values,
|
| 203 |
+
"missing_pct": missing_pct.values,
|
| 204 |
+
}
|
| 205 |
+
)
|
| 206 |
+
return report.sort_values(by="missing_pct", ascending=False).reset_index(drop=True).round({"missing_pct": 2})
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
|
| 210 |
+
"""Compute the correlation matrix for numeric columns."""
|
| 211 |
+
numeric_df = df.select_dtypes(include=["number"])
|
| 212 |
+
if numeric_df.empty or numeric_df.shape[1] < 2:
|
| 213 |
+
return pd.DataFrame()
|
| 214 |
+
corr = numeric_df.corr()
|
| 215 |
+
return corr.round(3)
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def filter_dataframe(
|
| 219 |
+
df: pd.DataFrame,
|
| 220 |
+
numeric_filters: Mapping[str, Tuple[Optional[float], Optional[float]]],
|
| 221 |
+
categorical_filters: Mapping[str, Iterable[str]],
|
| 222 |
+
date_filters: Mapping[str, Tuple[Optional[str], Optional[str]]],
|
| 223 |
+
) -> pd.DataFrame:
|
| 224 |
+
"""Filter the dataset according to the provided filter definitions."""
|
| 225 |
+
filtered = df.copy()
|
| 226 |
+
|
| 227 |
+
for column, bounds in numeric_filters.items():
|
| 228 |
+
if column not in filtered.columns or bounds is None:
|
| 229 |
+
continue
|
| 230 |
+
lower, upper = bounds
|
| 231 |
+
series = filtered[column]
|
| 232 |
+
if lower is not None:
|
| 233 |
+
filtered = filtered[series >= lower]
|
| 234 |
+
if upper is not None:
|
| 235 |
+
filtered = filtered[series <= upper]
|
| 236 |
+
|
| 237 |
+
for column, values in categorical_filters.items():
|
| 238 |
+
if column not in filtered.columns:
|
| 239 |
+
continue
|
| 240 |
+
values = list(values)
|
| 241 |
+
if not values:
|
| 242 |
+
continue
|
| 243 |
+
filtered = filtered[filtered[column].isin(values)]
|
| 244 |
+
|
| 245 |
+
for column, bounds in date_filters.items():
|
| 246 |
+
if column not in filtered.columns or bounds is None:
|
| 247 |
+
continue
|
| 248 |
+
start, end = bounds
|
| 249 |
+
series = pd.to_datetime(filtered[column], errors="coerce")
|
| 250 |
+
if start:
|
| 251 |
+
filtered = filtered[series >= pd.to_datetime(start)]
|
| 252 |
+
if end:
|
| 253 |
+
filtered = filtered[series <= pd.to_datetime(end)]
|
| 254 |
+
|
| 255 |
+
return filtered
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def filter_metadata(df: pd.DataFrame, column_types: ColumnTypes, categorical_limit: int = 200) -> Dict[str, object]:
|
| 259 |
+
"""Pre-compute useful metadata for rendering filter controls."""
|
| 260 |
+
metadata: Dict[str, object] = {"numeric": {}, "categorical": {}, "datetime": {}}
|
| 261 |
+
|
| 262 |
+
for column in column_types.numeric:
|
| 263 |
+
series = df[column].dropna()
|
| 264 |
+
if series.empty:
|
| 265 |
+
continue
|
| 266 |
+
metadata["numeric"][column] = {
|
| 267 |
+
"min": float(series.min()),
|
| 268 |
+
"max": float(series.max()),
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
for column in column_types.categorical:
|
| 272 |
+
series = df[column].dropna().astype(str)
|
| 273 |
+
unique_values = series.unique().tolist()
|
| 274 |
+
if len(unique_values) > categorical_limit:
|
| 275 |
+
unique_values = unique_values[:categorical_limit]
|
| 276 |
+
metadata["categorical"][column] = unique_values
|
| 277 |
+
|
| 278 |
+
for column in column_types.datetime:
|
| 279 |
+
series = pd.to_datetime(df[column], errors="coerce")
|
| 280 |
+
series = series.dropna()
|
| 281 |
+
if series.empty:
|
| 282 |
+
continue
|
| 283 |
+
metadata["datetime"][column] = {
|
| 284 |
+
"min": series.min().date(),
|
| 285 |
+
"max": series.max().date(),
|
| 286 |
+
}
|
| 287 |
+
|
| 288 |
+
return metadata
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def sample_dataset_options() -> Dict[str, str]:
    """Return available bundled datasets and their descriptions."""
    options: Dict[str, str] = {}
    if not SAMPLE_DATA_DIR.exists():
        return options

    for path in sorted(SAMPLE_DATA_DIR.iterdir()):
        if not path.is_file():
            continue
        if path.suffix.lower() not in {".csv", ".xlsx", ".xls"}:
            continue
        description = SAMPLE_DESCRIPTIONS.get(path.name, f"Sample dataset sourced from '{path.name}'.")
        options[path.name] = description
    return options


def load_sample_dataset(selection: str) -> DatasetBundle:
    """Load a dataset bundled inside the local data directory."""
    if not selection:
        raise ValueError("Please select a sample dataset from the dropdown.")

    path = SAMPLE_DATA_DIR / selection
    if not path.exists():
        raise ValueError(
            f"Sample dataset '{selection}' was not found in the 'data/' directory. "
            "Ensure the file exists and try again."
        )

    dataframe = _read_from_path(path, selection)
    dataframe = ensure_unique_columns(dataframe)
    dataframe, datetime_cols = coerce_datetime_columns(dataframe)
    column_types = infer_column_types(dataframe)
    column_types = ColumnTypes(
        numeric=column_types.numeric,
        categorical=column_types.categorical,
        datetime=tuple(sorted(set(column_types.datetime + tuple(datetime_cols)))),
    )

    return DatasetBundle(
        dataframe=dataframe,
        column_types=column_types,
        source_name=selection,
    )
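A minimal sketch of the discovery-then-load flow these two functions enable; nothing here is hard-coded to a particular bundled file.

```python
# Minimal sketch: discover bundled datasets, then load the first one found.
from data_processor import sample_dataset_options, load_sample_dataset

options = sample_dataset_options()
if options:
    name = next(iter(options))           # e.g. "train.csv" if present in data/
    bundle = load_sample_dataset(name)
    print(bundle.source_name, bundle.dataframe.shape)
    print(bundle.column_types.numeric)   # tuple of numeric column names
```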
filtered_htzxc454.csv
ADDED

The diff for this file is too large to render. See raw diff.
insights.py
ADDED

@@ -0,0 +1,83 @@
"""Insight generation utilities for the BI dashboard."""

from __future__ import annotations

from typing import Dict, Iterable, Optional, Tuple

import numpy as np
import pandas as pd

from utils import ColumnTypes


def top_bottom_performers(df: pd.DataFrame, column: str, n: int = 5) -> Dict[str, pd.DataFrame]:
    """Return the top and bottom performers for a numeric column."""
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in dataset.")

    numeric_series = pd.to_numeric(df[column], errors="coerce").dropna()
    if numeric_series.empty:
        raise ValueError(f"Column '{column}' does not contain numeric data.")

    top = numeric_series.nlargest(n)
    bottom = numeric_series.nsmallest(n)
    return {
        "top": top.reset_index(),
        "bottom": bottom.reset_index(),
    }


def detect_trend(df: pd.DataFrame, date_column: str, value_column: str) -> str:
    """Analyze basic trend between the first and last data points."""
    if date_column not in df.columns or value_column not in df.columns:
        raise ValueError("Selected columns are not present in the dataset.")

    working = df[[date_column, value_column]].dropna()
    working[date_column] = pd.to_datetime(working[date_column], errors="coerce")
    working = working.dropna()

    if working.empty or working[date_column].nunique() < 2:
        return "Not enough data to evaluate a trend."

    working = working.sort_values(by=date_column)
    first_date = working[date_column].iloc[0]
    last_date = working[date_column].iloc[-1]
    first_value = working[value_column].iloc[0]
    last_value = working[value_column].iloc[-1]

    change = last_value - first_value
    pct_change = (change / first_value * 100) if first_value != 0 else np.nan

    if np.isnan(pct_change):
        direction = "changed"
    elif pct_change > 0:
        direction = "increased"
    elif pct_change < 0:
        direction = "decreased"
    else:
        direction = "remained stable"

    pct_text = f" ({pct_change:.2f}%)" if not np.isnan(pct_change) else ""
    return (
        f"Between {first_date.date()} and {last_date.date()}, "
        f"{value_column} {direction} by {change:.2f}{pct_text}."
    )


def detect_anomalies(df: pd.DataFrame, column: str, z_threshold: float = 3.0, limit: int = 5) -> pd.DataFrame:
    """Identify potential outliers using a simple z-score approach."""
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in dataset.")

    series = pd.to_numeric(df[column], errors="coerce")
    z_scores = ((series - series.mean()) / series.std()).abs()
    anomalies = df.loc[z_scores > z_threshold, [column]].copy()
    anomalies["z_score"] = z_scores[z_scores > z_threshold]
    return anomalies.sort_values(by="z_score", ascending=False).head(limit)


def get_default_insight_columns(column_types: ColumnTypes) -> Dict[str, Optional[str]]:
    """Determine default columns to use when auto-generating insights."""
    numeric_col = column_types.numeric[0] if column_types.numeric else None
    date_col = column_types.datetime[0] if column_types.datetime else None
    return {"numeric": numeric_col, "datetime": date_col}
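To make the behavior of these helpers concrete, a quick illustration on synthetic data (all values invented):

```python
import pandas as pd
from insights import detect_trend, detect_anomalies, top_bottom_performers

df = pd.DataFrame(
    {
        "date": pd.date_range("2024-01-01", periods=6, freq="D"),
        "sales": [100, 102, 98, 105, 500, 110],  # 500 is an injected outlier
    }
)

print(detect_trend(df, "date", "sales"))
# "Between 2024-01-01 and 2024-01-06, sales increased by 10.00 (10.00%)."

print(detect_anomalies(df, "sales", z_threshold=2.0))
# flags the 500 row along with its z-score (~2.04)

print(top_bottom_performers(df, "sales", n=2)["top"])
# the two largest sales values with their original row indices
```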
requirements.txt
ADDED

@@ -0,0 +1,6 @@
gradio>=4.0,<5.0
pandas>=2.0,<3.0
plotly>=5.18
kaleido>=0.2.1
numpy>=1.24
openpyxl>=3.1
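A fresh virtual environment plus `pip install -r requirements.txt` should cover everything above. Note that `kaleido` is only exercised by the PNG chart export path and `openpyxl` only by Excel uploads, so both could in principle be dropped for a CSV-only deployment.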
utils.py
ADDED

@@ -0,0 +1,119 @@
"""Utility helpers for the Business Intelligence dashboard."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, Iterable, List, Tuple

import pandas as pd


SUPPORTED_FILE_TYPES: Tuple[str, ...] = (".csv", ".xlsx", ".xls")
"""Allowed file extensions for uploads."""

PREVIEW_ROWS: int = 5
"""Default number of rows to display in dataset previews."""


@dataclass(frozen=True)
class ColumnTypes:
    """Container describing inferred column groupings."""

    numeric: Tuple[str, ...]
    categorical: Tuple[str, ...]
    datetime: Tuple[str, ...]


def is_supported_file(filename: str | None) -> bool:
    """Return True when the provided filename uses a supported extension."""
    if not filename:
        return False
    lowered = filename.lower()
    return any(lowered.endswith(ext) for ext in SUPPORTED_FILE_TYPES)


def coerce_datetime_columns(df: pd.DataFrame, threshold: float = 0.6) -> Tuple[pd.DataFrame, Tuple[str, ...]]:
    """Attempt to parse object columns as datetimes when enough values can be converted.

    Parameters
    ----------
    df:
        Input DataFrame to mutate in-place.
    threshold:
        Minimum fraction of non-null values that must successfully convert
        for the column to be promoted to datetime.

    Returns
    -------
    tuple
        Mutated DataFrame and the tuple of datetime column names.
    """
    datetime_cols: List[str] = list(
        df.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]).columns
    )

    object_cols = df.select_dtypes(include=["object"]).columns
    for col in object_cols:
        series = df[col]
        non_null_ratio = series.notna().mean()
        if non_null_ratio == 0 or non_null_ratio < threshold:
            continue
        converted = pd.to_datetime(series, errors="coerce", utc=False)  # infer_datetime_format is deprecated in pandas 2.x
        success_ratio = converted.notna().mean()
        if success_ratio >= threshold:
            df[col] = converted
            datetime_cols.append(col)

    return df, tuple(sorted(set(datetime_cols)))


def infer_column_types(df: pd.DataFrame) -> ColumnTypes:
    """Infer high-level data types for the provided DataFrame's columns."""
    numeric_cols = tuple(df.select_dtypes(include=["number"]).columns)
    datetime_cols = tuple(df.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]).columns)
    categorical_cols: List[str] = []

    for col in df.columns:
        if col in numeric_cols or col in datetime_cols:
            continue
        categorical_cols.append(col)

    return ColumnTypes(numeric=numeric_cols, categorical=tuple(categorical_cols), datetime=datetime_cols)


def clamp_numeric(value: float, minimum: float, maximum: float) -> float:
    """Clamp *value* into the closed range [minimum, maximum]."""
    return max(minimum, min(maximum, value))


def ensure_unique_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rename duplicate columns to maintain uniqueness."""
    if df.columns.is_unique:
        return df

    new_columns: List[str] = []
    seen: Dict[str, int] = {}
    for col in df.columns:
        count = seen.get(col, 0)
        if count == 0:
            new_columns.append(col)
        else:
            new_columns.append(f"{col}_{count}")
        seen[col] = count + 1

    df = df.copy()
    df.columns = new_columns
    return df


def shorten_text(value: str, max_length: int = 80) -> str:
    """Truncate long text values for cleaner display."""
    if len(value) <= max_length:
        return value
    return f"{value[: max_length - 3]}..."


def safe_column_subset(columns: Iterable[str], allowed: Iterable[str]) -> List[str]:
    """Return a list of *columns* that exist inside *allowed*."""
    allowed_set = set(allowed)
    return [col for col in columns if col in allowed_set]
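A small sketch of the typing pipeline these helpers implement; the toy frame is invented, and the printed results follow from the 0.6 threshold above:

```python
import pandas as pd
from utils import coerce_datetime_columns, infer_column_types

df = pd.DataFrame(
    {
        "when": ["2024-01-01", "2024-01-02", "not a date"],
        "qty": [3, 7, 1],
        "label": ["a", "b", "c"],
    }
)

df, datetime_cols = coerce_datetime_columns(df, threshold=0.6)
print(datetime_cols)  # ("when",) because 2 of 3 values parse, above the threshold
print(infer_column_types(df))
# ColumnTypes(numeric=('qty',), categorical=('label',), datetime=('when',))
```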
visualizations.py
ADDED

@@ -0,0 +1,172 @@
"""Visualization utilities leveraging the Strategy Pattern for the BI dashboard."""

from __future__ import annotations

from abc import ABC, abstractmethod
from io import BytesIO
from typing import Any, Dict, Iterable, Optional

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

AGGREGATIONS = {
    "sum": "sum",
    "mean": "mean",
    "median": "median",
    "count": "count",
}


class VisualizationStrategy(ABC):
    """Abstract base class for visualization strategies."""

    @abstractmethod
    def generate(self, df: pd.DataFrame, **kwargs: Any) -> go.Figure:
        """Generate a Plotly figure from the provided dataframe and arguments."""
        pass

    def validate_columns(self, df: pd.DataFrame, columns: Iterable[str]) -> None:
        """Ensure every column exists inside the DataFrame."""
        missing = [col for col in columns if col not in df.columns]
        if missing:
            raise ValueError(f"Column(s) not found in dataset: {', '.join(missing)}")


class TimeSeriesStrategy(VisualizationStrategy):
    """Strategy for generating time-series plots."""

    def generate(self, df: pd.DataFrame, **kwargs: Any) -> go.Figure:
        date_column = kwargs.get("date_column")
        value_column = kwargs.get("value_column")
        aggregation = kwargs.get("aggregation", "sum")

        if not date_column or not value_column:
            raise ValueError("Date and value columns are required for Time Series.")

        self.validate_columns(df, [date_column, value_column])

        if aggregation not in AGGREGATIONS:
            raise ValueError("Unsupported aggregation method.")

        date_series = pd.to_datetime(df[date_column], errors="coerce")
        subset = df.loc[date_series.notna(), [date_column, value_column]].copy()
        subset[date_column] = pd.to_datetime(subset[date_column])
        grouped = subset.groupby(subset[date_column].dt.date)[value_column].agg(aggregation).reset_index()

        fig = px.line(
            grouped,
            x=date_column,
            y=value_column,
            title=f"{value_column} over time ({aggregation})",
        )
        fig.update_layout(xaxis_title=date_column, yaxis_title=value_column)
        return fig


class DistributionStrategy(VisualizationStrategy):
    """Strategy for generating distribution plots (histogram/box)."""

    def generate(self, df: pd.DataFrame, **kwargs: Any) -> go.Figure:
        column = kwargs.get("column")
        plot_type = kwargs.get("plot_type", "histogram")

        if not column:
            raise ValueError("Numeric column is required for Distribution plot.")

        self.validate_columns(df, [column])
        numeric_series = pd.to_numeric(df[column], errors="coerce").dropna()
        if numeric_series.empty:
            raise ValueError("Selected column does not contain numeric data.")

        if plot_type == "box":
            fig = px.box(numeric_series, y=column, points="suspectedoutliers", title=f"Distribution of {column}")
        else:
            fig = px.histogram(
                numeric_series,
                nbins=30,
                title=f"Distribution of {column}",
            )
            fig.update_layout(xaxis_title=column, yaxis_title="Frequency")
        return fig


class CategoryStrategy(VisualizationStrategy):
    """Strategy for generating categorical charts (bar/pie)."""

    def generate(self, df: pd.DataFrame, **kwargs: Any) -> go.Figure:
        category_column = kwargs.get("category_column")
        value_column = kwargs.get("value_column")
        aggregation = kwargs.get("aggregation", "sum")
        chart_type = kwargs.get("chart_type", "bar").lower()

        if not category_column or not value_column:
            raise ValueError("Category and value columns are required for Category plot.")

        self.validate_columns(df, [category_column, value_column])
        if aggregation not in AGGREGATIONS:
            raise ValueError("Unsupported aggregation method.")

        grouped = (
            df.groupby(category_column)[value_column]
            .agg(aggregation)
            .reset_index()
            .sort_values(by=value_column, ascending=False)
        )

        if chart_type == "pie":
            fig = px.pie(grouped, names=category_column, values=value_column, title=f"{value_column} by {category_column}")
        else:
            fig = px.bar(grouped, x=category_column, y=value_column, title=f"{value_column} by {category_column}")
            fig.update_layout(xaxis_title=category_column, yaxis_title=f"{aggregation} of {value_column}")

        return fig


class ScatterStrategy(VisualizationStrategy):
    """Strategy for generating scatter plots."""

    def generate(self, df: pd.DataFrame, **kwargs: Any) -> go.Figure:
        x_column = kwargs.get("x_column")
        y_column = kwargs.get("y_column")
        color_column = kwargs.get("color_column")

        if not x_column or not y_column:
            raise ValueError("X and Y columns are required for Scatter plot.")

        columns = [x_column, y_column]
        if color_column:
            columns.append(color_column)
        self.validate_columns(df, columns)

        fig = px.scatter(df, x=x_column, y=y_column, color=color_column, title=f"{y_column} vs {x_column}")
        fig.update_layout(xaxis_title=x_column, yaxis_title=y_column)
        return fig


class CorrelationHeatmapStrategy(VisualizationStrategy):
    """Strategy for generating correlation heatmaps."""

    def generate(self, df: pd.DataFrame, **kwargs: Any) -> go.Figure:
        numeric_df = df.select_dtypes(include=["number"])
        if numeric_df.shape[1] < 2:
            raise ValueError("At least two numeric columns are required for a correlation heatmap.")

        corr = numeric_df.corr()
        fig = px.imshow(
            corr,
            text_auto=True,
            title="Correlation Heatmap",
            color_continuous_scale="RdBu",
            aspect="auto",
        )
        return fig


def figure_to_png_bytes(fig: go.Figure) -> BytesIO:
    """Export the figure to an in-memory PNG buffer."""
    try:
        image_bytes = fig.to_image(format="png")
    except ValueError as exc:  # pragma: no cover - fallback for environments without kaleido
        raise ValueError("PNG export requires the 'kaleido' package. Please install it to enable downloads.") from exc
    return BytesIO(image_bytes)
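One plausible way to wire the strategies into the UI is a name-to-instance registry; the `STRATEGIES` mapping below is illustrative (the real dispatch lives in `app.py`), but every class and call matches the definitions above:

```python
import pandas as pd
from visualizations import (
    CategoryStrategy,
    CorrelationHeatmapStrategy,
    DistributionStrategy,
    ScatterStrategy,
    TimeSeriesStrategy,
    figure_to_png_bytes,
)

# Illustrative dispatch table keyed by the chart name shown in the UI.
STRATEGIES = {
    "Time Series": TimeSeriesStrategy(),
    "Distribution": DistributionStrategy(),
    "Category": CategoryStrategy(),
    "Scatter": ScatterStrategy(),
    "Correlation": CorrelationHeatmapStrategy(),
}

df = pd.DataFrame({"date": pd.date_range("2024-01-01", periods=4), "sales": [10, 12, 9, 15]})
fig = STRATEGIES["Time Series"].generate(df, date_column="date", value_column="sales", aggregation="sum")
fig.show()  # or figure_to_png_bytes(fig) for a downloadable PNG (needs kaleido)
```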