|
|
|
|
|
""" |
|
|
Data loader utilities for Gradio app. |
|
|
Handles loading and preprocessing of data files. |
|
|
""" |
|
|
|
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
class DataLoader:
    """Load and cache data for the Gradio app.

    Each dataset is read from ``data_dir`` the first time it is accessed and
    the resulting DataFrame is kept on the instance, so later accesses return
    the same object without touching the disk again.
    """

    def __init__(self, data_dir="../data/processed"):
        """
        Initialize DataLoader.

        No file is read here; loading happens lazily in the properties.

        Args:
            data_dir: Path to processed data directory
        """
        self.data_dir = Path(data_dir)
        self._raw_data = None
        self._scaled_features = None
        self._original_features = None
        # Cache slot with no loader among the methods of this class.
        self._pca_data = None

    def _read_feature_csv(self, filename):
        """Read a feature CSV from ``data_dir`` using its first column as index."""
        return pd.read_csv(self.data_dir / filename, index_col=0)

    @property
    def raw_data(self) -> pd.DataFrame:
        """Load and cache cleaned UK data.

        Reads ``cleaned_uk_data.csv`` with ``InvoiceDate`` parsed as dates and
        adds ``DayOfWeek`` / ``HourOfDay`` columns derived from ``InvoiceDate``
        when the file does not already contain them.

        Returns:
            The cached DataFrame (the same object on every access).
        """
        if self._raw_data is None:
            frame = pd.read_csv(
                self.data_dir / "cleaned_uk_data.csv",
                parse_dates=["InvoiceDate"],
            )

            if "DayOfWeek" not in frame.columns:
                frame["DayOfWeek"] = frame["InvoiceDate"].dt.dayofweek
            if "HourOfDay" not in frame.columns:
                frame["HourOfDay"] = frame["InvoiceDate"].dt.hour

            # Cache only once the frame is fully prepared: if deriving the
            # columns raises, the next access retries instead of silently
            # returning a frame that lacks them.
            self._raw_data = frame
        return self._raw_data

    @property
    def scaled_features(self) -> pd.DataFrame:
        """Load and cache scaled customer features."""
        if self._scaled_features is None:
            self._scaled_features = self._read_feature_csv(
                "customer_features_scaled.csv"
            )
        return self._scaled_features

    @property
    def original_features(self) -> pd.DataFrame:
        """Load and cache original customer features."""
        if self._original_features is None:
            self._original_features = self._read_feature_csv(
                "customer_features.csv"
            )
        return self._original_features

    def get_feature_ranges(self) -> dict:
        """Get min and max values for each feature for UI sliders.

        Only numeric (including boolean) columns are summarised; a slider has
        no meaningful range for text or datetime columns, and converting their
        statistics to ``float`` would raise.

        Returns:
            Mapping of column name to a dict with ``"min"``, ``"max"``,
            ``"median"`` and ``"mean"`` as Python floats.
        """
        features = self.original_features
        feature_stats = {}

        for col in features.columns:
            series = features[col]
            if not pd.api.types.is_numeric_dtype(series):
                continue
            feature_stats[col] = {
                "min": float(series.min()),
                "max": float(series.max()),
                "median": float(series.median()),
                "mean": float(series.mean()),
            }

        return feature_stats

    def get_kpi_metrics(self) -> dict:
        """Calculate KPI metrics from raw data.

        Returns:
            Dict with the number of distinct ``CustomerID`` and ``InvoiceNo``
            values and the mean and sum of ``TotalPrice``.
        """
        df = self.raw_data

        # Cast to built-in int/float so every value is a plain Python number.
        return {
            "total_customers": int(df["CustomerID"].nunique()),
            "total_transactions": int(df["InvoiceNo"].nunique()),
            "avg_revenue": float(df["TotalPrice"].mean()),
            "total_revenue": float(df["TotalPrice"].sum()),
        }
|
|
|
|
|
|
|
|
|
|
|
# Module-wide DataLoader instance; stays None until get_data_loader() is
# first called.
_data_loader = None


def get_data_loader(data_dir="../data/processed"):
    """Return the shared DataLoader, constructing it on the first call.

    Args:
        data_dir: Directory handed to ``DataLoader`` when the shared instance
            is created. Once an instance exists this argument is not
            consulted and the existing instance is returned as-is.

    Returns:
        The module-wide ``DataLoader`` instance.
    """
    global _data_loader
    if _data_loader is not None:
        return _data_loader

    _data_loader = DataLoader(data_dir)
    return _data_loader
|
|
|