File size: 3,173 Bytes
63255af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# -*- coding: utf-8 -*-
"""
Data loader utilities for Gradio app.
Handles loading and preprocessing of data files.
"""

import pandas as pd
import numpy as np
from pathlib import Path


class DataLoader:
    """Lazily load, derive, and cache the processed data files used by the app.

    Each dataset is read from disk at most once; subsequent accesses return
    the cached DataFrame.
    """

    def __init__(self, data_dir="../data/processed"):
        """
        Initialize DataLoader.

        Args:
            data_dir: Path to processed data directory
        """
        self.data_dir = Path(data_dir)
        # Per-dataset caches; None means "not loaded yet".
        self._raw_data = None
        self._scaled_features = None
        self._original_features = None
        self._pca_data = None

    @property
    def raw_data(self):
        """Cleaned UK transaction data with DayOfWeek/HourOfDay columns.

        Loaded from ``cleaned_uk_data.csv`` on first access, with
        ``InvoiceDate`` parsed as datetimes. The two time-derived columns
        are added only if the CSV does not already contain them.
        """
        if self._raw_data is None:
            frame = pd.read_csv(
                self.data_dir / "cleaned_uk_data.csv",
                parse_dates=["InvoiceDate"],
            )
            # Derive time features from InvoiceDate when absent from the file.
            timestamps = frame["InvoiceDate"].dt
            if "DayOfWeek" not in frame.columns:
                frame["DayOfWeek"] = timestamps.dayofweek
            if "HourOfDay" not in frame.columns:
                frame["HourOfDay"] = timestamps.hour
            self._raw_data = frame
        return self._raw_data

    @property
    def scaled_features(self):
        """Scaled customer features, indexed by the CSV's first column."""
        if self._scaled_features is None:
            path = self.data_dir / "customer_features_scaled.csv"
            self._scaled_features = pd.read_csv(path, index_col=0)
        return self._scaled_features

    @property
    def original_features(self):
        """Unscaled customer features, indexed by the CSV's first column."""
        if self._original_features is None:
            path = self.data_dir / "customer_features.csv"
            self._original_features = pd.read_csv(path, index_col=0)
        return self._original_features

    def get_feature_ranges(self):
        """Return per-feature min/max/median/mean for configuring UI sliders.

        Returns:
            dict mapping feature name -> {"min", "max", "median", "mean"},
            with every statistic coerced to a plain float.
        """
        return {
            name: {
                "min": float(column.min()),
                "max": float(column.max()),
                "median": float(column.median()),
                "mean": float(column.mean()),
            }
            for name, column in self.original_features.items()
        }

    def get_kpi_metrics(self):
        """Return headline KPI metrics computed from the raw transaction data.

        Returns:
            dict with total_customers, total_transactions (both unique
            counts), avg_revenue, and total_revenue (both floats).
        """
        df = self.raw_data
        return {
            "total_customers": df["CustomerID"].nunique(),
            "total_transactions": df["InvoiceNo"].nunique(),
            "avg_revenue": float(df["TotalPrice"].mean()),
            "total_revenue": float(df["TotalPrice"].sum()),
        }


# Module-level singleton cache; populated lazily by get_data_loader().
_data_loader = None


def get_data_loader(data_dir="../data/processed"):
    """Return the process-wide DataLoader, creating it on first use.

    Args:
        data_dir: Path to the processed data directory. Only honoured on
            the first call; later calls return the cached instance and
            ignore this argument.
    """
    global _data_loader
    # Guard clause: reuse the existing singleton when one has been built.
    if _data_loader is not None:
        return _data_loader
    _data_loader = DataLoader(data_dir)
    return _data_loader