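# File-viewer residue (not part of the module): uploader avatar caption
# "xxnithicxx's picture", commit message "Init project", commit 63255af.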
# -*- coding: utf-8 -*-
"""
Data loader utilities for Gradio app.
Handles loading and preprocessing of data files.
"""
import pandas as pd
import numpy as np
from pathlib import Path
class DataLoader:
    """Load and cache the processed CSV data used by the Gradio app.

    Every dataset is read from ``data_dir`` the first time its property is
    accessed and is then kept on the instance, so later accesses return the
    same DataFrame object without touching the disk again.
    """

    def __init__(self, data_dir="../data/processed"):
        """
        Initialize DataLoader.

        Args:
            data_dir: Path to processed data directory (str or Path). No
                file is read here; loading happens on first property access.
        """
        self.data_dir = Path(data_dir)
        # Lazily filled caches; None means "not loaded yet".
        self._raw_data = None
        self._scaled_features = None
        self._original_features = None
        # Kept for compatibility; nothing in this class populates it.
        self._pca_data = None

    def _read_feature_table(self, filename):
        """Read ``filename`` from ``data_dir`` with its first column as index."""
        return pd.read_csv(self.data_dir / filename, index_col=0)

    @property
    def raw_data(self):
        """Load and cache cleaned UK data.

        Reads ``cleaned_uk_data.csv`` with ``InvoiceDate`` parsed as dates and
        adds ``DayOfWeek`` / ``HourOfDay`` columns derived from it when the
        file does not already provide them.

        Returns:
            The cached transactions DataFrame.
        """
        if self._raw_data is None:
            frame = pd.read_csv(
                self.data_dir / "cleaned_uk_data.csv",
                parse_dates=["InvoiceDate"],
            )
            # Create time features if not present
            if "DayOfWeek" not in frame.columns:
                frame["DayOfWeek"] = frame["InvoiceDate"].dt.dayofweek
            if "HourOfDay" not in frame.columns:
                frame["HourOfDay"] = frame["InvoiceDate"].dt.hour
            # Publish to the cache only once the frame is fully prepared, so
            # a failure above cannot leave a half-initialised frame behind.
            self._raw_data = frame
        return self._raw_data

    @property
    def scaled_features(self):
        """Load and cache scaled customer features.

        Returns:
            DataFrame read from ``customer_features_scaled.csv``, indexed by
            the file's first column.
        """
        if self._scaled_features is None:
            self._scaled_features = self._read_feature_table(
                "customer_features_scaled.csv"
            )
        return self._scaled_features

    @property
    def original_features(self):
        """Load and cache original customer features.

        Returns:
            DataFrame read from ``customer_features.csv``, indexed by the
            file's first column.
        """
        if self._original_features is None:
            self._original_features = self._read_feature_table(
                "customer_features.csv"
            )
        return self._original_features

    def get_feature_ranges(self):
        """Get min and max values for each feature for UI sliders.

        Only numeric (and boolean) columns are summarised: a text or
        datetime column cannot be converted with ``float()`` and would
        otherwise abort the whole computation.

        Returns:
            Dict mapping column name to a dict with float ``"min"``,
            ``"max"``, ``"median"`` and ``"mean"`` entries.
        """
        features = self.original_features
        numeric = features.select_dtypes(include=["number", "bool"])
        return {
            col: {
                "min": float(numeric[col].min()),
                "max": float(numeric[col].max()),
                "median": float(numeric[col].median()),
                "mean": float(numeric[col].mean()),
            }
            for col in numeric.columns
        }

    def get_kpi_metrics(self):
        """Calculate KPI metrics from raw data.

        Returns:
            Dict with ``"total_customers"`` (distinct ``CustomerID``),
            ``"total_transactions"`` (distinct ``InvoiceNo``),
            ``"avg_revenue"`` (mean of ``TotalPrice``) and
            ``"total_revenue"`` (sum of ``TotalPrice``), as plain Python
            ints and floats.
        """
        df = self.raw_data
        return {
            "total_customers": int(df["CustomerID"].nunique()),
            "total_transactions": int(df["InvoiceNo"].nunique()),
            "avg_revenue": float(df["TotalPrice"].mean()),
            "total_revenue": float(df["TotalPrice"].sum()),
        }
# Process-wide DataLoader slot; stays None until get_data_loader() fills it.
_data_loader = None


def get_data_loader(data_dir="../data/processed"):
    """Return the shared DataLoader, constructing it on the first call.

    Args:
        data_dir: Directory handed to ``DataLoader`` when the shared
            instance is first created. Once an instance exists this
            argument is ignored and the existing instance is returned.

    Returns:
        The module-level DataLoader instance.
    """
    global _data_loader
    if _data_loader is not None:
        return _data_loader
    _data_loader = DataLoader(data_dir)
    return _data_loader