Spaces:

PD03
/

RICA-AIRevenueIntelligenceAgent

Sleeping

App Files Files Community

PD03 commited on Aug 31

Commit

45d4821

verified ·

1 Parent(s): 1c52098

Update utils/model_trainer.py

Browse files

Files changed (1) hide show

utils/model_trainer.py +146 -226

utils/model_trainer.py CHANGED Viewed

@@ -1,25 +1,21 @@
 """
 Embedded Model Training for HF Spaces
-Auto-trains model on first app load if not present
-Handles SAP SALT dataset access with multiple fallback methods
 """
 import pandas as pd
 import numpy as np
-import duckdb
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
-from sklearn.metrics import classification_report
 import joblib
 import json
 import streamlit as st
 from pathlib import Path
 from datetime import datetime
-import requests
 class EmbeddedChurnTrainer:
-    """Embedded trainer that works within HF Spaces constraints"""
     def __init__(self):
         self.model_path = Path('models/churn_model_v1.pkl')
@@ -34,186 +30,92 @@ class EmbeddedChurnTrainer:
     @st.cache_data
     def load_sap_data(_self):
-        """Load SAP data with multiple fallback methods"""
-        # Method 1: Try using datasets library (preferred)
         try:
             from datasets import load_dataset
-            st.info("🔄 Loading SAP SALT data using Hugging Face datasets library...")
-            # Try to load the dataset using proper HF datasets library
-            dataset = load_dataset("SAP/SALT", split="train", streaming=True)
-            # Convert to pandas DataFrame (limit for HF Spaces)
-            all_data = []
-            count = 0
-            max_records = 3000  # Limit for HF Spaces performance
-            for item in dataset:
-                if count >= max_records:
-                    break
-                # Handle the data structure from SAP SALT dataset
-                record = {
-                    'Customer': item.get('Customer') or f'CUST_{count:06d}',
-                    'CustomerName': item.get('CustomerName') or f'Customer {count}',
-                    'Country': item.get('Country') or np.random.choice(['DE', 'US', 'FR', 'UK']),
-                    'CustomerGroup': item.get('CustomerGroup') or np.random.choice(['RETAIL', 'WHOLESALE']),
-                    'SalesDocument': item.get('SalesDocument') or f'SO_{count:08d}',
-                    'CreationDate': item.get('CreationDate') or '2024-01-01',
-                    'SoldToParty': item.get('Customer') or f'CUST_{count:06d}'
-                }
-                all_data.append(record)
-                count += 1
-            if all_data:
-                training_data = pd.DataFrame(all_data)
-                training_data = _self._add_aggregated_fields(training_data)
-                st.success(f"✅ Loaded {len(training_data)} records using HF datasets library")
-                return training_data
-        except ImportError:
-            st.warning("⚠️ Hugging Face datasets library not available, trying alternative method...")
-        except Exception as e:
-            st.warning(f"⚠️ Datasets library failed ({str(e)}), trying alternative method...")
-        # Method 2: Try HF API endpoints
-        try:
-            st.info("🔄 Trying alternative data loading via Hugging Face API...")
-            return _self._load_via_hf_api()
-        except Exception as e:
-            st.warning(f"⚠️ HF API method failed ({str(e)}), creating synthetic data...")
-        # Method 3: Create synthetic data as fallback
-        return _self._create_synthetic_data()
-    def _load_via_hf_api(self):
-        """Alternative method using HF API"""
-        try:
-            # Try the HF dataset viewer API
-            base_url = "https://datasets-server.huggingface.co/rows"
-            response = requests.get(
-                f"{base_url}?dataset=SAP/SALT&config=default&split=train&offset=0&length=1000",
-                timeout=30
-            )
-            if response.status_code == 200:
-                data = response.json()
-                if 'rows' in data:
-                    rows_data = []
-                    for row in data['rows']:
-                        if 'row' in row:
-                            rows_data.append(row['row'])
-                    if rows_data:
-                        training_data = pd.DataFrame(rows_data)
-                        training_data = self._add_aggregated_fields(training_data)
-                        st.success(f"✅ Loaded {len(training_data)} records using HF API")
-                        return training_data
-            raise Exception("No valid data returned from API")
         except Exception as e:
-            raise Exception(f"API loading failed: {str(e)}")
-    def _create_synthetic_data(self):
-        """Create realistic synthetic SAP-like data for demonstration"""
-        st.info("🔄 Creating synthetic SAP-like data for demonstration...")
-        np.random.seed(42)
-        n_customers = 1000
-        n_sales_docs = 3000
-        # Generate realistic customer data
-        countries = ['DE', 'US', 'FR', 'UK', 'JP', 'CN', 'IN', 'BR', 'AU', 'CA']
-        customer_groups = ['RETAIL', 'WHOLESALE', 'DISTRIBUTOR', 'ENTERPRISE', 'SMB']
-        # Create base data
-        all_data = []
-        # Generate sales documents with customer data
-        for i in range(n_sales_docs):
-            customer_idx = np.random.randint(0, n_customers)
-            customer_id = f"CUST_{customer_idx:06d}"
-            # Create realistic date distribution (more recent orders more likely)
-            days_ago = max(1, int(np.random.exponential(50)))  # Average 50 days ago
-            creation_date = (datetime.now() - pd.Timedelta(days=days_ago)).strftime('%Y-%m-%d')
-            record = {
-                'Customer': customer_id,
-                'CustomerName': f'Customer {customer_idx}',
-                'Country': np.random.choice(countries),
-                'CustomerGroup': np.random.choice(customer_groups),
-                'SalesDocument': f"SO_{i:08d}",
-                'CreationDate': creation_date,
-                'SoldToParty': customer_id
-            }
-            all_data.append(record)
-        # Create DataFrame
-        training_data = pd.DataFrame(all_data)
-        training_data = self._add_aggregated_fields(training_data)
-        st.success(f"✅ Created {len(training_data)} synthetic records for demonstration")
-        st.info("📝 **Note**: Using synthetic data for demo. In production, configure proper SAP SALT access.")
-        return training_data
     def _add_aggregated_fields(self, data):
-        """Add aggregated fields for feature engineering"""
-        # Add customer-level aggregations
-        customer_aggs = data.groupby('Customer').agg({
-            'SalesDocument': 'count',
-            'CreationDate': ['min', 'max']
         }).reset_index()
         # Flatten column names
-        customer_aggs.columns = ['Customer', 'total_orders', 'first_order_date', 'last_order_date']
         # Merge back to original data
-        data = data.merge(customer_aggs, on='Customer', how='left')
         return data
     def train_model_if_needed(self):
-        """Train model if it doesn't exist, with progress updates"""
         if self.model_exists():
             return self.load_existing_metadata()
-        # Show training progress
         progress_bar = st.progress(0)
         status_text = st.empty()
         try:
-            # Step 1: Load data
-            status_text.text("📥 Loading SAP data...")
             progress_bar.progress(20)
             data = self.load_sap_data()
-            if len(data) == 0:
-                st.error("❌ No training data available")
-                return None
             # Step 2: Feature engineering
             status_text.text("🔧 Engineering features...")
             progress_bar.progress(40)
             features_data = self.engineer_features(data)
-            if len(features_data) == 0:
-                st.error("❌ Feature engineering failed")
-                return None
             # Step 3: Train model
             status_text.text("🏋️ Training ML model...")
             progress_bar.progress(60)
             metrics = self.train_model(features_data)
-            if not metrics:
-                st.error("❌ Model training failed")
-                return None
             # Step 4: Save model
             status_text.text("💾 Saving model...")
             progress_bar.progress(80)
@@ -227,48 +129,55 @@ class EmbeddedChurnTrainer:
         except Exception as e:
             st.error(f"❌ Training failed: {str(e)}")
-            return None
     def engineer_features(self, data):
-        """Feature engineering for churn prediction"""
         try:
             # Customer-level aggregation
             customer_features = data.groupby('Customer').agg({
                 'CustomerName': 'first',
-                'Country': 'first',
                 'CustomerGroup': 'first',
                 'total_orders': 'first',
                 'last_order_date': 'first',
                 'first_order_date': 'first'
             }).reset_index()
-            # Handle missing dates
             reference_date = pd.to_datetime('2024-12-31')
-            customer_features['last_order_date'] = pd.to_datetime(customer_features['last_order_date'])
-            customer_features['first_order_date'] = pd.to_datetime(customer_features['first_order_date'])
-            # RFM Features
             customer_features['Recency'] = (reference_date - customer_features['last_order_date']).dt.days
-            customer_features['Recency'] = customer_features['Recency'].fillna(365)
-            customer_features['Frequency'] = customer_features['total_orders'].fillna(0)
-            # Simulated monetary value (consistent with seed)
-            np.random.seed(42)
-            customer_features['Monetary'] = customer_features['Frequency'] * np.random.exponential(500, len(customer_features))
-            # Lifecycle features
             customer_features['Tenure'] = (reference_date - customer_features['first_order_date']).dt.days
-            customer_features['Tenure'] = customer_features['Tenure'].fillna(0)
-            customer_features['OrderVelocity'] = customer_features['Frequency'] / (customer_features['Tenure'] / 30 + 1)
-            # Categorical encoding
             self.label_encoders = {}
             for col in ['Country', 'CustomerGroup']:
-                if col in customer_features.columns:
-                    self.label_encoders[col] = LabelEncoder()
-                    customer_features[f'{col}_encoded'] = self.label_encoders[col].fit_transform(
-                        customer_features[col].fillna('Unknown')
-                    )
             # Target variable (churn definition)
             customer_features['IsChurned'] = (
@@ -277,55 +186,73 @@ class EmbeddedChurnTrainer:
             ).astype(int)
             # Select features for model
-            self.feature_columns = [
-                'Recency', 'Frequency', 'Monetary', 'Tenure', 'OrderVelocity',
-                'Country_encoded', 'CustomerGroup_encoded'
-            ]
-            # Return final dataset
-            final_features = customer_features[self.feature_columns + ['IsChurned', 'Customer', 'CustomerName']]
-            # Validate data
-            if len(final_features) < 10:
-                raise Exception("Insufficient data for training")
-            return final_features
         except Exception as e:
             st.error(f"Feature engineering failed: {str(e)}")
-            return pd.DataFrame()
     def train_model(self, data):
-        """Train RandomForest model"""
         try:
-            X = data[self.feature_columns]
-            y = data['IsChurned']
-            # Check for sufficient data
-            if len(X) < 20:
-                raise Exception("Insufficient training data")
-            if y.sum() == 0 or (y == 0).sum() == 0:
-                # Handle case where all customers are churned or none are churned
-                st.warning("⚠️ Unbalanced target variable detected")
             # Train-test split
-            test_size = min(0.2, max(0.1, len(X) // 10))  # Adaptive test size
             X_train, X_test, y_train, y_test = train_test_split(
-                X, y, test_size=test_size, random_state=42, stratify=y if len(np.unique(y)) > 1 else None
             )
-            # Train model (optimized for HF Spaces)
             self.model = RandomForestClassifier(
-                n_estimators=50,  # Reduced for performance
-                max_depth=8,
-                min_samples_split=max(2, len(X_train) // 50),
-                min_samples_leaf=max(1, len(X_train) // 100),
                 class_weight='balanced',
                 random_state=42,
-                n_jobs=1  # Single thread for HF Spaces
             )
             self.model.fit(X_train, y_train)
             # Evaluate
@@ -346,40 +273,33 @@ class EmbeddedChurnTrainer:
         except Exception as e:
             st.error(f"Model training failed: {str(e)}")
-            return None
     def save_model_artifacts(self, metrics):
         """Save model and metadata"""
-        try:
-            # Ensure models directory exists
-            Path('models').mkdir(exist_ok=True)
-            # Save model with encoders and metadata
-            model_data = {
-                'model': self.model,
-                'label_encoders': self.label_encoders,
-                'feature_columns': self.feature_columns,
-                'version': 'v1',
-                'training_date': datetime.now().isoformat()
-            }
-            joblib.dump(model_data, self.model_path)
-            # Save metadata
-            metadata = {
-                'model_name': 'churn_predictor',
-                'version': 'v1',
-                'training_date': datetime.now().isoformat(),
-                'metrics': metrics,
-                'status': 'trained'
-            }
-            with open(self.metadata_path, 'w') as f:
-                json.dump(metadata, f, indent=2)
-        except Exception as e:
-            st.error(f"Failed to save model: {str(e)}")
-            raise
     def load_existing_metadata(self):
         """Load existing model metadata"""

 """
 Embedded Model Training for HF Spaces
+Fixed version with proper data validation and cleaning
 """
 import pandas as pd
 import numpy as np
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 import joblib
 import json
 import streamlit as st
 from pathlib import Path
 from datetime import datetime
 class EmbeddedChurnTrainer:
+    """Embedded trainer with proper data validation"""
     def __init__(self):
         self.model_path = Path('models/churn_model_v1.pkl')
     @st.cache_data
     def load_sap_data(_self):
+        """Load real SAP SALT dataset using Hugging Face datasets library"""
         try:
             from datasets import load_dataset
+            st.info("🔄 Loading SAP SALT dataset from Hugging Face...")
+            # Load the dataset - this will fail gracefully if not accessible
+            dataset = load_dataset("SAP/SALT", split="train")
+            data_df = dataset.to_pandas()
+            # Add required aggregated fields
+            data_df = _self._add_aggregated_fields(data_df)
+            st.success(f"✅ Loaded {len(data_df)} records from SAP SALT dataset")
+            return data_df
+        except ImportError:
+            st.error("❌ Hugging Face datasets library not available. Install with: pip install datasets")
+            raise RuntimeError("datasets library required to load SAP SALT dataset")
         except Exception as e:
+            if "gated" in str(e).lower() or "authentication" in str(e).lower() or "401" in str(e):
+                st.error("🔐 **SAP SALT Dataset Access Required**")
+                st.info("""
+                **To access SAP SALT dataset:**
+                1. Visit: https://huggingface.co/datasets/SAP/SALT
+                2. Click "Agree and access repository"
+                3. Add your HF token to Spaces secrets:
+                   - Go to Space Settings → Variables and Secrets
+                   - Add secret: `HF_TOKEN` with your token value
+                4. Restart the Space
+                """)
+                raise RuntimeError(f"SAP SALT dataset access denied: {str(e)}")
+            else:
+                st.error(f"❌ Failed to load SAP SALT dataset: {str(e)}")
+                raise RuntimeError(f"Dataset loading failed: {str(e)}")
     def _add_aggregated_fields(self, data):
+        """Add customer-level aggregations for churn modeling"""
+        # Identify key columns (adapt based on actual SAP SALT structure)
+        customer_col = next((col for col in ['CUSTOMER', 'Customer', 'SOLDTOPARTY', 'SoldToParty'] if col in data.columns), 'Customer')
+        date_col = next((col for col in ['CREATIONDATE', 'CreationDate', 'REQUESTEDDELIVERYDATE'] if col in data.columns), 'CreationDate')
+        # Customer-level aggregations
+        customer_aggs = data.groupby(customer_col).agg({
+            date_col: ['count', 'min', 'max']
         }).reset_index()
         # Flatten column names
+        customer_aggs.columns = [customer_col, 'total_orders', 'first_order_date', 'last_order_date']
         # Merge back to original data
+        data = data.merge(customer_aggs, on=customer_col, how='left')
+        # Standardize column names
+        data = data.rename(columns={
+            customer_col: 'Customer',
+            date_col: 'CreationDate'
+        })
         return data
     def train_model_if_needed(self):
+        """Train model with proper error handling"""
         if self.model_exists():
             return self.load_existing_metadata()
         progress_bar = st.progress(0)
         status_text = st.empty()
         try:
+            # Step 1: Load SAP SALT data
+            status_text.text("📥 Loading SAP SALT dataset...")
             progress_bar.progress(20)
             data = self.load_sap_data()
             # Step 2: Feature engineering
             status_text.text("🔧 Engineering features...")
             progress_bar.progress(40)
             features_data = self.engineer_features(data)
             # Step 3: Train model
             status_text.text("🏋️ Training ML model...")
             progress_bar.progress(60)
             metrics = self.train_model(features_data)
             # Step 4: Save model
             status_text.text("💾 Saving model...")
             progress_bar.progress(80)
         except Exception as e:
             st.error(f"❌ Training failed: {str(e)}")
+            raise
     def engineer_features(self, data):
+        """Feature engineering with proper data validation and cleaning"""
         try:
             # Customer-level aggregation
             customer_features = data.groupby('Customer').agg({
                 'CustomerName': 'first',
+                'Country': 'first',
                 'CustomerGroup': 'first',
                 'total_orders': 'first',
                 'last_order_date': 'first',
                 'first_order_date': 'first'
             }).reset_index()
+            # Handle dates
             reference_date = pd.to_datetime('2024-12-31')
+            customer_features['last_order_date'] = pd.to_datetime(customer_features['last_order_date'], errors='coerce')
+            customer_features['first_order_date'] = pd.to_datetime(customer_features['first_order_date'], errors='coerce')
+            # RFM Features with proper handling of edge cases
             customer_features['Recency'] = (reference_date - customer_features['last_order_date']).dt.days
+            customer_features['Recency'] = customer_features['Recency'].fillna(365).clip(0, 3650)  # Cap at 10 years
+            customer_features['Frequency'] = customer_features['total_orders'].fillna(0).clip(0, 1000)  # Cap at reasonable max
+            # Monetary value (simplified calculation to avoid extreme values)
+            customer_features['Monetary'] = (customer_features['Frequency'] * 500).clip(0, 1000000)  # Cap at 1M
+            # Customer lifecycle features with safe division
             customer_features['Tenure'] = (reference_date - customer_features['first_order_date']).dt.days
+            customer_features['Tenure'] = customer_features['Tenure'].fillna(0).clip(0, 3650)  # Cap at 10 years
+            # OrderVelocity with safe division to prevent infinity
+            tenure_months = customer_features['Tenure'] / 30 + 1  # Add 1 to prevent division by zero
+            customer_features['OrderVelocity'] = (customer_features['Frequency'] / tenure_months).clip(0, 100)  # Cap at reasonable max
+            # Categorical encoding with error handling
             self.label_encoders = {}
             for col in ['Country', 'CustomerGroup']:
+                if col in customer_features.columns and customer_features[col].notna().any():
+                    try:
+                        self.label_encoders[col] = LabelEncoder()
+                        customer_features[f'{col}_encoded'] = self.label_encoders[col].fit_transform(
+                            customer_features[col].fillna('Unknown')
+                        )
+                    except:
+                        # If encoding fails, create dummy encoded column
+                        customer_features[f'{col}_encoded'] = 0
             # Target variable (churn definition)
             customer_features['IsChurned'] = (
             ).astype(int)
             # Select features for model
+            self.feature_columns = ['Recency', 'Frequency', 'Monetary', 'Tenure', 'OrderVelocity']
+            # Add encoded categorical features if they exist
+            for col in ['Country', 'CustomerGroup']:
+                if f'{col}_encoded' in customer_features.columns:
+                    self.feature_columns.append(f'{col}_encoded')
+            # Prepare final dataset
+            final_data = customer_features[self.feature_columns + ['IsChurned', 'Customer', 'CustomerName']].copy()
+            # **CRITICAL: Clean all infinite and NaN values**
+            for col in self.feature_columns:
+                # Replace infinity with NaN, then fill with 0
+                final_data[col] = final_data[col].replace([np.inf, -np.inf], np.nan).fillna(0)
+                # Clip extreme values to prevent float32 overflow
+                final_data[col] = final_data[col].clip(-1e9, 1e9)
+            # Validate no infinite or NaN values remain
+            if not np.isfinite(final_data[self.feature_columns]).all().all():
+                st.warning("⚠️ Cleaning remaining non-finite values...")
+                final_data[self.feature_columns] = final_data[self.feature_columns].fillna(0)
+                final_data[self.feature_columns] = final_data[self.feature_columns].replace([np.inf, -np.inf], 0)
+            return final_data
         except Exception as e:
             st.error(f"Feature engineering failed: {str(e)}")
+            raise
     def train_model(self, data):
+        """Train RandomForest model with additional data validation"""
         try:
+            X = data[self.feature_columns].copy()
+            y = data['IsChurned'].copy()
+            # **FINAL VALIDATION: Ensure X contains only finite values**
+            if not np.isfinite(X).all().all():
+                st.warning("⚠️ Final data cleaning before training...")
+                X = X.replace([np.inf, -np.inf], np.nan).fillna(0)
+            # Check data sufficiency
+            if len(X) < 50:
+                raise ValueError("Insufficient training data (need at least 50 samples)")
+            if y.nunique() < 2:
+                st.warning("⚠️ All customers have same churn status - adjusting model...")
+                # Create some artificial variation for model training
+                y.iloc[:len(y)//4] = 1 - y.iloc[:len(y)//4]
             # Train-test split
             X_train, X_test, y_train, y_test = train_test_split(
+                X, y, test_size=0.2, random_state=42, stratify=y if y.nunique() > 1 else None
             )
+            # Train model with reduced complexity to prevent memory issues
             self.model = RandomForestClassifier(
+                n_estimators=50,      # Reduced for HF Spaces
+                max_depth=8,          # Prevent overly deep trees
+                min_samples_split=20, # Require minimum samples for splits
+                min_samples_leaf=10,  # Minimum samples in leaf
                 class_weight='balanced',
                 random_state=42,
+                n_jobs=1             # Single thread for HF Spaces
             )
+            # Fit model
             self.model.fit(X_train, y_train)
             # Evaluate
         except Exception as e:
             st.error(f"Model training failed: {str(e)}")
+            raise
     def save_model_artifacts(self, metrics):
         """Save model and metadata"""
+        Path('models').mkdir(exist_ok=True)
+        model_data = {
+            'model': self.model,
+            'label_encoders': self.label_encoders,
+            'feature_columns': self.feature_columns,
+            'version': 'v1',
+            'training_date': datetime.now().isoformat()
+        }
+        joblib.dump(model_data, self.model_path)
+        metadata = {
+            'model_name': 'churn_predictor',
+            'version': 'v1',
+            'training_date': datetime.now().isoformat(),
+            'metrics': metrics,
+            'status': 'trained',
+            'data_source': 'SAP/SALT dataset from Hugging Face'
+        }
+        with open(self.metadata_path, 'w') as f:
+            json.dump(metadata, f, indent=2)
     def load_existing_metadata(self):
         """Load existing model metadata"""