Spaces:

PD03
/

RICA-AIRevenueIntelligenceAgent

Sleeping

App Files Files Community

PD03 committed on Aug 31, 2025

Commit

a3796f2

verified ·

1 Parent(s): d131cca

Update utils/model_trainer.py

Browse files

Files changed (1) hide show

utils/model_trainer.py +299 -153

utils/model_trainer.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """
 Embedded Model Training for HF Spaces
 Auto-trains model on first app load if not present
 """
 import pandas as pd
@@ -15,6 +16,7 @@ import json
 import streamlit as st
 from pathlib import Path
 from datetime import datetime
 class EmbeddedChurnTrainer:
     """Embedded trainer that works within HF Spaces constraints"""
@@ -32,46 +34,148 @@ class EmbeddedChurnTrainer:
     @st.cache_data
     def load_sap_data(_self):
-        """Load SAP data with Streamlit caching"""
         try:
-            conn = duckdb.connect(':memory:')
-            # Load SAP datasets with limits for HF Spaces performance
-            conn.execute("""
-                CREATE TABLE customers AS
-                SELECT * FROM 'hf://datasets/SAP/SALT/I_Customer.parquet'
-                LIMIT 5000
-            """)
-            conn.execute("""
-                CREATE TABLE sales_docs AS
-                SELECT * FROM 'hf://datasets/SAP/SALT/I_SalesDocument.parquet'
-                LIMIT 10000
-            """)
-            # Join data
-            training_data = conn.execute("""
-                SELECT
-                    c.Customer,
-                    c.CustomerName,
-                    c.Country,
-                    c.CustomerGroup,
-                    s.SalesDocument,
-                    s.CreationDate,
-                    s.SoldToParty,
-                    COUNT(s.SalesDocument) OVER (PARTITION BY c.Customer) as total_orders,
-                    MAX(s.CreationDate) OVER (PARTITION BY c.Customer) as last_order_date,
-                    MIN(s.CreationDate) OVER (PARTITION BY c.Customer) as first_order_date
-                FROM customers c
-                LEFT JOIN sales_docs s ON c.Customer = s.SoldToParty
-                WHERE c.Customer IS NOT NULL
-            """).df()
-            return training_data
         except Exception as e:
-            st.error(f"Data loading failed: {str(e)}")
-            return pd.DataFrame()
     def train_model_if_needed(self):
         """Train model if it doesn't exist, with progress updates"""
@@ -84,26 +188,34 @@ class EmbeddedChurnTrainer:
         try:
             # Step 1: Load data
-            status_text.text("Loading SAP data...")
             progress_bar.progress(20)
             data = self.load_sap_data()
             if len(data) == 0:
-                st.error("No training data available")
                 return None
             # Step 2: Feature engineering
-            status_text.text("Engineering features...")
             progress_bar.progress(40)
             features_data = self.engineer_features(data)
             # Step 3: Train model
-            status_text.text("Training ML model...")
             progress_bar.progress(60)
             metrics = self.train_model(features_data)
             # Step 4: Save model
-            status_text.text("Saving model...")
             progress_bar.progress(80)
             self.save_model_artifacts(metrics)
@@ -114,131 +226,165 @@ class EmbeddedChurnTrainer:
             return metrics
         except Exception as e:
-            st.error(f"Training failed: {str(e)}")
             return None
     def engineer_features(self, data):
         """Feature engineering for churn prediction"""
-        # Customer-level aggregation
-        customer_features = data.groupby('Customer').agg({
-            'CustomerName': 'first',
-            'Country': 'first',
-            'CustomerGroup': 'first',
-            'total_orders': 'first',
-            'last_order_date': 'first',
-            'first_order_date': 'first'
-        }).reset_index()
-        # Handle missing dates
-        reference_date = pd.to_datetime('2024-12-31')
-        customer_features['last_order_date'] = pd.to_datetime(customer_features['last_order_date'])
-        customer_features['first_order_date'] = pd.to_datetime(customer_features['first_order_date'])
-        # RFM Features
-        customer_features['Recency'] = (reference_date - customer_features['last_order_date']).dt.days
-        customer_features['Recency'] = customer_features['Recency'].fillna(365)
-        customer_features['Frequency'] = customer_features['total_orders'].fillna(0)
-        # Simulated monetary value
-        np.random.seed(42)
-        customer_features['Monetary'] = customer_features['Frequency'] * np.random.exponential(500, len(customer_features))
-        # Lifecycle features
-        customer_features['Tenure'] = (reference_date - customer_features['first_order_date']).dt.days
-        customer_features['Tenure'] = customer_features['Tenure'].fillna(0)
-        customer_features['OrderVelocity'] = customer_features['Frequency'] / (customer_features['Tenure'] / 30 + 1)
-        # Categorical encoding
-        self.label_encoders = {}
-        for col in ['Country', 'CustomerGroup']:
-            if col in customer_features.columns:
-                self.label_encoders[col] = LabelEncoder()
-                customer_features[f'{col}_encoded'] = self.label_encoders[col].fit_transform(
-                    customer_features[col].fillna('Unknown')
-                )
-        # Target variable
-        customer_features['IsChurned'] = (
-            (customer_features['Recency'] > 90) &
-            (customer_features['Frequency'] > 0)
-        ).astype(int)
-        # Select features
-        self.feature_columns = [
-            'Recency', 'Frequency', 'Monetary', 'Tenure', 'OrderVelocity',
-            'Country_encoded', 'CustomerGroup_encoded'
-        ]
-        return customer_features[self.feature_columns + ['IsChurned', 'Customer', 'CustomerName']]
     def train_model(self, data):
         """Train RandomForest model"""
-        X = data[self.feature_columns]
-        y = data['IsChurned']
-        # Train-test split
-        X_train, X_test, y_train, y_test = train_test_split(
-            X, y, test_size=0.2, random_state=42, stratify=y
-        )
-        # Train model (optimized for HF Spaces)
-        self.model = RandomForestClassifier(
-            n_estimators=50,  # Reduced for performance
-            max_depth=8,
-            min_samples_split=20,
-            class_weight='balanced',
-            random_state=42,
-            n_jobs=1  # Single thread for HF Spaces
-        )
-        self.model.fit(X_train, y_train)
-        # Evaluate
-        test_score = self.model.score(X_test, y_test)
-        y_pred = self.model.predict(X_test)
-        metrics = {
-            'test_accuracy': test_score,
-            'feature_columns': self.feature_columns,
-            'training_samples': len(X_train),
-            'churn_rate': y.mean(),
-            'feature_importance': dict(zip(self.feature_columns, self.model.feature_importances_))
-        }
-        return metrics
     def save_model_artifacts(self, metrics):
         """Save model and metadata"""
-        # Ensure models directory exists
-        Path('models').mkdir(exist_ok=True)
-        # Save model with encoders
-        model_data = {
-            'model': self.model,
-            'label_encoders': self.label_encoders,
-            'feature_columns': self.feature_columns,
-            'version': 'v1',
-            'training_date': datetime.now().isoformat()
-        }
-        joblib.dump(model_data, self.model_path)
-        # Save metadata
-        metadata = {
-            'model_name': 'churn_predictor',
-            'version': 'v1',
-            'training_date': datetime.now().isoformat(),
-            'metrics': metrics,
-            'status': 'trained'
-        }
-        with open(self.metadata_path, 'w') as f:
-            json.dump(metadata, f, indent=2)
     def load_existing_metadata(self):
         """Load existing model metadata"""
         try:
             with open(self.metadata_path, 'r') as f:
                 return json.load(f)
-        except:
             return None

 """
 Embedded Model Training for HF Spaces
 Auto-trains model on first app load if not present
+Handles SAP SALT dataset access with multiple fallback methods
 """
 import pandas as pd
 import streamlit as st
 from pathlib import Path
 from datetime import datetime
+import requests
 class EmbeddedChurnTrainer:
     """Embedded trainer that works within HF Spaces constraints"""
     @st.cache_data
     def load_sap_data(_self):
         """Load SAP SALT training data, falling back through three sources.

         Sources are tried in order:

         1. the Hugging Face ``datasets`` library (streaming, capped at 3000
            records),
         2. the dataset-viewer HTTP API via ``_load_via_hf_api``,
         3. generated data from ``_create_synthetic_data``.

         Returns:
             pandas.DataFrame: the first successfully loaded dataset.
         """
         # Method 1: Try using datasets library (preferred)
         try:
             # Imported lazily so a missing package only disables this method.
             from datasets import load_dataset
             st.info("🔄 Loading SAP SALT data using Hugging Face datasets library...")
             dataset = load_dataset("SAP/SALT", split="train", streaming=True)

             max_records = 3000  # Limit for HF Spaces performance
             all_data = []
             for count, item in enumerate(dataset):
                 if count >= max_records:
                     break
                 # Missing fields are replaced by placeholders so every record
                 # carries the same set of keys.
                 customer_id = item.get('Customer') or f'CUST_{count:06d}'
                 all_data.append({
                     'Customer': customer_id,
                     'CustomerName': item.get('CustomerName') or f'Customer {count}',
                     'Country': item.get('Country') or np.random.choice(['DE', 'US', 'FR', 'UK']),
                     'CustomerGroup': item.get('CustomerGroup') or np.random.choice(['RETAIL', 'WHOLESALE']),
                     'SalesDocument': item.get('SalesDocument') or f'SO_{count:08d}',
                     'CreationDate': item.get('CreationDate') or '2024-01-01',
                     # Keep the record's own sold-to party when it has one;
                     # only fall back to the customer id when it is absent.
                     'SoldToParty': item.get('SoldToParty') or customer_id,
                 })

             if all_data:
                 training_data = pd.DataFrame(all_data)
                 training_data = _self._add_aggregated_fields(training_data)
                 st.success(f"✅ Loaded {len(training_data)} records using HF datasets library")
                 return training_data
             # An empty stream falls through to the next method.
         except ImportError:
             st.warning("⚠️ Hugging Face datasets library not available, trying alternative method...")
         except Exception as e:
             st.warning(f"⚠️ Datasets library failed ({str(e)}), trying alternative method...")

         # Method 2: Try HF API endpoints
         try:
             st.info("🔄 Trying alternative data loading via Hugging Face API...")
             return _self._load_via_hf_api()
         except Exception as e:
             st.warning(f"⚠️ HF API method failed ({str(e)}), creating synthetic data...")

         # Method 3: Create synthetic data as fallback
         return _self._create_synthetic_data()
+    def _load_via_hf_api(self):
+        """Alternative method using HF API"""
+        try:
+            # Try the HF dataset viewer API
+            base_url = "https://datasets-server.huggingface.co/rows"
+            response = requests.get(
+                f"{base_url}?dataset=SAP/SALT&config=default&split=train&offset=0&length=1000",
+                timeout=30
+            )
+            if response.status_code == 200:
+                data = response.json()
+                if 'rows' in data:
+                    rows_data = []
+                    for row in data['rows']:
+                        if 'row' in row:
+                            rows_data.append(row['row'])
+                    if rows_data:
+                        training_data = pd.DataFrame(rows_data)
+                        training_data = self._add_aggregated_fields(training_data)
+                        st.success(f"✅ Loaded {len(training_data)} records using HF API")
+                        return training_data
+            raise Exception("No valid data returned from API")
+        except Exception as e:
+            raise Exception(f"API loading failed: {str(e)}")
+    def _create_synthetic_data(self):
+        """Create realistic synthetic SAP-like data for demonstration"""
+        st.info("🔄 Creating synthetic SAP-like data for demonstration...")
+        np.random.seed(42)
+        n_customers = 1000
+        n_sales_docs = 3000
+        # Generate realistic customer data
+        countries = ['DE', 'US', 'FR', 'UK', 'JP', 'CN', 'IN', 'BR', 'AU', 'CA']
+        customer_groups = ['RETAIL', 'WHOLESALE', 'DISTRIBUTOR', 'ENTERPRISE', 'SMB']
+        # Create base data
+        all_data = []
+        # Generate sales documents with customer data
+        for i in range(n_sales_docs):
+            customer_idx = np.random.randint(0, n_customers)
+            customer_id = f"CUST_{customer_idx:06d}"
+            # Create realistic date distribution (more recent orders more likely)
+            days_ago = max(1, int(np.random.exponential(50)))  # Average 50 days ago
+            creation_date = (datetime.now() - pd.Timedelta(days=days_ago)).strftime('%Y-%m-%d')
+            record = {
+                'Customer': customer_id,
+                'CustomerName': f'Customer {customer_idx}',
+                'Country': np.random.choice(countries),
+                'CustomerGroup': np.random.choice(customer_groups),
+                'SalesDocument': f"SO_{i:08d}",
+                'CreationDate': creation_date,
+                'SoldToParty': customer_id
+            }
+            all_data.append(record)
+        # Create DataFrame
+        training_data = pd.DataFrame(all_data)
+        training_data = self._add_aggregated_fields(training_data)
+        st.success(f"✅ Created {len(training_data)} synthetic records for demonstration")
+        st.info("📝 **Note**: Using synthetic data for demo. In production, configure proper SAP SALT access.")
+        return training_data
+    def _add_aggregated_fields(self, data):
+        """Add aggregated fields for feature engineering"""
+        # Add customer-level aggregations
+        customer_aggs = data.groupby('Customer').agg({
+            'SalesDocument': 'count',
+            'CreationDate': ['min', 'max']
+        }).reset_index()
+        # Flatten column names
+        customer_aggs.columns = ['Customer', 'total_orders', 'first_order_date', 'last_order_date']
+        # Merge back to original data
+        data = data.merge(customer_aggs, on='Customer', how='left')
+        return data
     def train_model_if_needed(self):
         """Train model if it doesn't exist, with progress updates"""
         try:
             # Step 1: Load data
+            status_text.text("📥 Loading SAP data...")
             progress_bar.progress(20)
             data = self.load_sap_data()
             if len(data) == 0:
+                st.error("❌ No training data available")
                 return None
             # Step 2: Feature engineering
+            status_text.text("🔧 Engineering features...")
             progress_bar.progress(40)
             features_data = self.engineer_features(data)
+            if len(features_data) == 0:
+                st.error("❌ Feature engineering failed")
+                return None
             # Step 3: Train model
+            status_text.text("🏋️ Training ML model...")
             progress_bar.progress(60)
             metrics = self.train_model(features_data)
+            if not metrics:
+                st.error("❌ Model training failed")
+                return None
             # Step 4: Save model
+            status_text.text("💾 Saving model...")
             progress_bar.progress(80)
             self.save_model_artifacts(metrics)
             return metrics
         except Exception as e:
+            st.error(f"❌ Training failed: {str(e)}")
             return None
     def engineer_features(self, data):
         """Build one feature row per customer for churn prediction.

         Expects the columns produced by the data loaders: ``Customer``,
         ``CustomerName``, ``Country``, ``CustomerGroup``, ``total_orders``,
         ``first_order_date`` and ``last_order_date``.

         Derived columns:
             Recency: days from the last order to the reference date
                 (365 when the date is missing).
             Frequency: ``total_orders`` (0 when missing).
             Monetary: simulated value, Frequency times a seeded exponential draw.
             Tenure: days from the first order to the reference date
                 (0 when missing).
             OrderVelocity: Frequency / (Tenure / 30 + 1).
             Country_encoded, CustomerGroup_encoded: label-encoded categories.
             IsChurned: 1 when Recency > 90 and Frequency > 0, else 0.

         Side effects:
             Sets ``self.label_encoders`` and ``self.feature_columns``, and
             reseeds NumPy's global random generator with 42.

         Returns:
             pandas.DataFrame: feature columns plus ``IsChurned``, ``Customer``
             and ``CustomerName``. An empty frame is returned (after an error
             message is shown) on any failure or when fewer than 10 customers
             remain.
         """
         try:
             # Customer-level aggregation
             customer_features = data.groupby('Customer').agg({
                 'CustomerName': 'first',
                 'Country': 'first',
                 'CustomerGroup': 'first',
                 'total_orders': 'first',
                 'last_order_date': 'first',
                 'first_order_date': 'first'
             }).reset_index()

             customer_features['last_order_date'] = pd.to_datetime(customer_features['last_order_date'])
             customer_features['first_order_date'] = pd.to_datetime(customer_features['first_order_date'])

             # The reference date must never precede the newest order. Otherwise
             # Recency and Tenure go negative, nobody is ever labelled churned,
             # and a Tenure of -30 days makes the OrderVelocity denominator zero.
             reference_date = pd.to_datetime('2024-12-31')
             latest_order = customer_features['last_order_date'].max()
             if not pd.isna(latest_order) and latest_order > reference_date:
                 reference_date = latest_order

             # RFM Features
             customer_features['Recency'] = (reference_date - customer_features['last_order_date']).dt.days
             customer_features['Recency'] = customer_features['Recency'].fillna(365)
             customer_features['Frequency'] = customer_features['total_orders'].fillna(0)

             # Simulated monetary value (consistent with seed)
             np.random.seed(42)
             customer_features['Monetary'] = customer_features['Frequency'] * np.random.exponential(500, len(customer_features))

             # Lifecycle features
             customer_features['Tenure'] = (reference_date - customer_features['first_order_date']).dt.days
             customer_features['Tenure'] = customer_features['Tenure'].fillna(0)
             customer_features['OrderVelocity'] = customer_features['Frequency'] / (customer_features['Tenure'] / 30 + 1)

             # Categorical encoding; encoders are kept on self so they can be
             # saved with the model.
             self.label_encoders = {}
             for col in ['Country', 'CustomerGroup']:
                 if col in customer_features.columns:
                     self.label_encoders[col] = LabelEncoder()
                     customer_features[f'{col}_encoded'] = self.label_encoders[col].fit_transform(
                         customer_features[col].fillna('Unknown')
                     )

             # Target variable (churn definition)
             customer_features['IsChurned'] = (
                 (customer_features['Recency'] > 90) &
                 (customer_features['Frequency'] > 0)
             ).astype(int)

             # Select features for model
             self.feature_columns = [
                 'Recency', 'Frequency', 'Monetary', 'Tenure', 'OrderVelocity',
                 'Country_encoded', 'CustomerGroup_encoded'
             ]
             final_features = customer_features[self.feature_columns + ['IsChurned', 'Customer', 'CustomerName']]

             # Validate data
             if len(final_features) < 10:
                 raise ValueError("Insufficient data for training")
             return final_features
         except Exception as e:
             # Best effort: report the problem and let the caller see an empty frame.
             st.error(f"Feature engineering failed: {str(e)}")
             return pd.DataFrame()
     def train_model(self, data):
         """Train the RandomForest churn classifier and return its metrics.

         Args:
             data: frame containing ``self.feature_columns`` and ``IsChurned``.

         Side effects:
             Stores the fitted classifier on ``self.model``.

         Returns:
             dict | None: train/test accuracy, feature columns, sample counts,
             churn rate and per-feature importance; ``None`` (after an error
             message is shown) when training fails or fewer than 20 rows are
             available.
         """
         try:
             X = data[self.feature_columns]
             y = data['IsChurned']

             # Check for sufficient data
             if len(X) < 20:
                 raise ValueError("Insufficient training data")

             class_counts = y.value_counts()
             if len(class_counts) < 2:
                 # All customers are churned, or none are.
                 st.warning("⚠️ Unbalanced target variable detected")

             # A stratified split needs at least two samples of every class;
             # otherwise train_test_split raises, so fall back to a plain split.
             can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
             X_train, X_test, y_train, y_test = train_test_split(
                 X, y,
                 test_size=0.2,  # the previous "adaptive" expression always yielded 0.2
                 random_state=42,
                 stratify=y if can_stratify else None
             )

             # Train model (optimized for HF Spaces)
             self.model = RandomForestClassifier(
                 n_estimators=50,  # Reduced for performance
                 max_depth=8,
                 min_samples_split=max(2, len(X_train) // 50),
                 min_samples_leaf=max(1, len(X_train) // 100),
                 class_weight='balanced',
                 random_state=42,
                 n_jobs=1  # Single thread for HF Spaces
             )
             self.model.fit(X_train, y_train)

             # Evaluate
             train_score = self.model.score(X_train, y_train)
             test_score = self.model.score(X_test, y_test)

             # Plain Python floats keep the metrics JSON-serialisable.
             metrics = {
                 'train_accuracy': float(train_score),
                 'test_accuracy': float(test_score),
                 'feature_columns': self.feature_columns,
                 'training_samples': len(X_train),
                 'test_samples': len(X_test),
                 'churn_rate': float(y.mean()),
                 'feature_importance': {
                     name: float(weight)
                     for name, weight in zip(self.feature_columns, self.model.feature_importances_)
                 }
             }
             return metrics
         except Exception as e:
             st.error(f"Model training failed: {str(e)}")
             return None
     def save_model_artifacts(self, metrics):
         """Persist the trained model bundle and its JSON metadata.

         Writes the model, label encoders and feature columns to
         ``self.model_path`` with joblib, and a metadata document containing
         ``metrics`` to ``self.metadata_path``. Both files carry the same
         ``training_date``.

         Args:
             metrics: metrics dictionary stored under the ``metrics`` key.

         Raises:
             Exception: any error from directory creation or writing is
                 reported through Streamlit and then re-raised unchanged.
         """
         try:
             # Ensure models directory exists, plus the parent of each actual
             # target path in case they are configured elsewhere.
             Path('models').mkdir(exist_ok=True)
             for target in (self.model_path, self.metadata_path):
                 Path(target).parent.mkdir(parents=True, exist_ok=True)

             # One timestamp so the model file and the metadata agree.
             trained_at = datetime.now().isoformat()

             # Save model with encoders and metadata
             model_data = {
                 'model': self.model,
                 'label_encoders': self.label_encoders,
                 'feature_columns': self.feature_columns,
                 'version': 'v1',
                 'training_date': trained_at
             }
             joblib.dump(model_data, self.model_path)

             # Save metadata
             metadata = {
                 'model_name': 'churn_predictor',
                 'version': 'v1',
                 'training_date': trained_at,
                 'metrics': metrics,
                 'status': 'trained'
             }
             with open(self.metadata_path, 'w') as f:
                 json.dump(metadata, f, indent=2)
         except Exception as e:
             st.error(f"Failed to save model: {str(e)}")
             raise
     def load_existing_metadata(self):
         """Return the parsed metadata JSON at ``self.metadata_path``, or None.

         Any failure to open or parse the file yields ``None`` rather than
         raising.
         """
         try:
             handle = open(self.metadata_path, 'r')
             with handle:
                 parsed = json.load(handle)
         except Exception:
             # A missing file, an unreadable file and malformed JSON all mean
             # "no usable metadata".
             return None
         return parsed