# -*- coding: utf-8 -*-
"""
Data loader utilities for Gradio app.
Handles loading and preprocessing of data files.
"""

import pandas as pd
import numpy as np
from pathlib import Path


class DataLoader:
    """Load and cache data for the Gradio app.

    Each dataset is read from ``data_dir`` the first time its property is
    accessed and then kept on the instance, so repeated accesses do not
    re-read the CSV files.
    """

    def __init__(self, data_dir="../data/processed"):
        """
        Initialize DataLoader.

        Args:
            data_dir: Path to processed data directory. A relative path is
                resolved against the current working directory when the
                files are actually read.
        """
        self.data_dir = Path(data_dir)
        # Lazily-filled caches; ``None`` means "not loaded yet".
        self._raw_data = None
        self._scaled_features = None
        self._original_features = None
        # Not populated by any method in this module; kept for compatibility.
        self._pca_data = None

    @property
    def raw_data(self) -> pd.DataFrame:
        """Load and cache cleaned UK data.

        Reads ``cleaned_uk_data.csv`` with ``InvoiceDate`` parsed as dates and
        adds ``DayOfWeek`` / ``HourOfDay`` columns when the file does not
        already provide them.

        Returns:
            The cached DataFrame (the same object on every access).
        """
        if self._raw_data is None:
            frame = pd.read_csv(
                self.data_dir / "cleaned_uk_data.csv",
                parse_dates=["InvoiceDate"],
            )
            # Create time features if not present
            if "DayOfWeek" not in frame.columns:
                frame["DayOfWeek"] = frame["InvoiceDate"].dt.dayofweek
            if "HourOfDay" not in frame.columns:
                frame["HourOfDay"] = frame["InvoiceDate"].dt.hour
            # Cache only once the frame is fully prepared, so a failure above
            # cannot leave a half-built frame behind for later accesses.
            self._raw_data = frame
        return self._raw_data

    @property
    def scaled_features(self) -> pd.DataFrame:
        """Load and cache scaled customer features.

        Reads ``customer_features_scaled.csv`` using its first column as the
        index.
        """
        if self._scaled_features is None:
            self._scaled_features = pd.read_csv(
                self.data_dir / "customer_features_scaled.csv",
                index_col=0,
            )
        return self._scaled_features

    @property
    def original_features(self) -> pd.DataFrame:
        """Load and cache original customer features.

        Reads ``customer_features.csv`` using its first column as the index.
        """
        if self._original_features is None:
            self._original_features = pd.read_csv(
                self.data_dir / "customer_features.csv",
                index_col=0,
            )
        return self._original_features

    def get_feature_ranges(self) -> dict:
        """Get min and max values for each feature for UI sliders.

        Columns that are not numeric are skipped: a slider range cannot be
        derived from them, and converting their min/max to ``float`` would
        raise.

        Returns:
            A dict mapping each numeric column name to a dict with the keys
            ``"min"``, ``"max"``, ``"median"`` and ``"mean"`` (all floats).
        """
        features = self.original_features
        feature_stats = {}
        for col in features.columns:
            series = features[col]
            if not pd.api.types.is_numeric_dtype(series):
                continue
            feature_stats[col] = {
                "min": float(series.min()),
                "max": float(series.max()),
                "median": float(series.median()),
                "mean": float(series.mean()),
            }
        return feature_stats

    def get_kpi_metrics(self) -> dict:
        """Calculate KPI metrics from raw data.

        Returns:
            A dict with the number of distinct ``CustomerID`` values
            (``"total_customers"``), the number of distinct ``InvoiceNo``
            values (``"total_transactions"``), and the mean and sum of
            ``TotalPrice`` (``"avg_revenue"``, ``"total_revenue"``).
        """
        df = self.raw_data
        kpis = {
            "total_customers": df["CustomerID"].nunique(),
            "total_transactions": df["InvoiceNo"].nunique(),
            "avg_revenue": float(df["TotalPrice"].mean()),
            "total_revenue": float(df["TotalPrice"].sum()),
        }
        return kpis


# Global instance for caching
_data_loader = None


def get_data_loader(data_dir="../data/processed"):
    """Get or create the global DataLoader instance.

    Args:
        data_dir: Path to processed data directory. Only used by the call
            that creates the instance; once the global loader exists it is
            returned as-is and this argument is ignored.

    Returns:
        The shared ``DataLoader`` instance.
    """
    global _data_loader
    if _data_loader is None:
        _data_loader = DataLoader(data_dir)
    return _data_loader