Create utils/model_trainer.py
utils/model_trainer.py (ADDED: +241, -0)
"""
Embedded Model Training for HF Spaces
Auto-trains model on first app load if not present
"""

import pandas as pd
import numpy as np
import duckdb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib
import json
import streamlit as st
from pathlib import Path
from datetime import datetime


class EmbeddedChurnTrainer:
    """Embedded trainer that works within HF Spaces constraints"""

    def __init__(self):
        self.model_path = Path('models/churn_model_v1.pkl')
        self.metadata_path = Path('models/model_metadata.json')

    def model_exists(self):
        """Check if trained model exists"""
        return self.model_path.exists() and self.metadata_path.exists()

    # The leading underscore on _self tells st.cache_data not to hash the
    # instance when computing the cache key.
    @st.cache_data
    def load_sap_data(_self):
        """Load SAP data with Streamlit caching"""
        try:
            conn = duckdb.connect(':memory:')

            # Load SAP datasets; row limits keep memory use within
            # HF Spaces bounds
            conn.execute("""
                CREATE TABLE customers AS
                SELECT * FROM 'hf://datasets/SAP/SALT/I_Customer.parquet'
                LIMIT 5000
            """)

            conn.execute("""
                CREATE TABLE sales_docs AS
                SELECT * FROM 'hf://datasets/SAP/SALT/I_SalesDocument.parquet'
                LIMIT 10000
            """)

            # Join customers to their sales documents and derive
            # per-customer order statistics via window functions
            training_data = conn.execute("""
                SELECT
                    c.Customer,
                    c.CustomerName,
                    c.Country,
                    c.CustomerGroup,
                    s.SalesDocument,
                    s.CreationDate,
                    s.SoldToParty,
                    COUNT(s.SalesDocument) OVER (PARTITION BY c.Customer) as total_orders,
                    MAX(s.CreationDate) OVER (PARTITION BY c.Customer) as last_order_date,
                    MIN(s.CreationDate) OVER (PARTITION BY c.Customer) as first_order_date
                FROM customers c
                LEFT JOIN sales_docs s ON c.Customer = s.SoldToParty
                WHERE c.Customer IS NOT NULL
            """).df()

            return training_data

        except Exception as e:
            st.error(f"Data loading failed: {str(e)}")
            return pd.DataFrame()

    def train_model_if_needed(self):
        """Train model if it doesn't exist, with progress bar"""
        if self.model_exists():
            return self.load_existing_metadata()

        # Show training progress
        progress_bar = st.progress(0)
        status_text = st.empty()

        try:
            # Step 1: Load data
            status_text.text("Loading SAP data...")
            progress_bar.progress(20)
            data = self.load_sap_data()

            if len(data) == 0:
                st.error("No training data available")
                return None

            # Step 2: Feature engineering
            status_text.text("Engineering features...")
            progress_bar.progress(40)
            features_data = self.engineer_features(data)

            # Step 3: Train model
            status_text.text("Training ML model...")
            progress_bar.progress(60)
            metrics = self.train_model(features_data)

            # Step 4: Save model
            status_text.text("Saving model...")
            progress_bar.progress(80)
            self.save_model_artifacts(metrics)

            # Complete
            progress_bar.progress(100)
            status_text.text("✅ Model training complete!")

            return metrics

        except Exception as e:
            st.error(f"Training failed: {str(e)}")
            return None

    def engineer_features(self, data):
        """Streamlined feature engineering for HF Spaces"""
        # Collapse the joined rows down to one record per customer
        customer_features = data.groupby('Customer').agg({
            'CustomerName': 'first',
            'Country': 'first',
            'CustomerGroup': 'first',
            'total_orders': 'first',
            'last_order_date': 'first',
            'first_order_date': 'first'
        }).reset_index()

        # Handle missing dates
        reference_date = pd.to_datetime('2024-12-31')
        customer_features['last_order_date'] = pd.to_datetime(customer_features['last_order_date'])
        customer_features['first_order_date'] = pd.to_datetime(customer_features['first_order_date'])

        # RFM features; customers with no orders default to a 365-day recency
        customer_features['Recency'] = (reference_date - customer_features['last_order_date']).dt.days
        customer_features['Recency'] = customer_features['Recency'].fillna(365)
        customer_features['Frequency'] = customer_features['total_orders'].fillna(0)

        # Simulated monetary value (seeded so the feature is reproducible)
        np.random.seed(42)
        customer_features['Monetary'] = customer_features['Frequency'] * np.random.exponential(500, len(customer_features))

        # Lifecycle features
        customer_features['Tenure'] = (reference_date - customer_features['first_order_date']).dt.days
        customer_features['Tenure'] = customer_features['Tenure'].fillna(0)
        customer_features['OrderVelocity'] = customer_features['Frequency'] / (customer_features['Tenure'] / 30 + 1)

        # Categorical encoding
        self.label_encoders = {}
        for col in ['Country', 'CustomerGroup']:
            if col in customer_features.columns:
                self.label_encoders[col] = LabelEncoder()
                customer_features[f'{col}_encoded'] = self.label_encoders[col].fit_transform(
                    customer_features[col].fillna('Unknown')
                )

        # Target variable: churned = has ordered before, but not in the last 90 days
        customer_features['IsChurned'] = (
            (customer_features['Recency'] > 90) &
            (customer_features['Frequency'] > 0)
        ).astype(int)

        # Select features
        self.feature_columns = [
            'Recency', 'Frequency', 'Monetary', 'Tenure', 'OrderVelocity',
            'Country_encoded', 'CustomerGroup_encoded'
        ]

        return customer_features[self.feature_columns + ['IsChurned', 'Customer', 'CustomerName']]

    def train_model(self, data):
        """Train RandomForest model"""
        X = data[self.feature_columns]
        y = data['IsChurned']

        # Stratified split so both partitions keep the class balance
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Train model
        self.model = RandomForestClassifier(
            n_estimators=50,  # reduced for HF Spaces performance
            max_depth=8,
            min_samples_split=20,
            class_weight='balanced',
            random_state=42,
            n_jobs=1  # single thread for HF Spaces
        )

        self.model.fit(X_train, y_train)

        # Evaluate on the held-out split; cast numpy scalars to plain
        # Python types so the metrics dict serializes cleanly to JSON
        test_score = self.model.score(X_test, y_test)

        metrics = {
            'test_accuracy': float(test_score),
            'feature_columns': self.feature_columns,
            'training_samples': len(X_train),
            'churn_rate': float(y.mean()),
            'feature_importance': dict(zip(self.feature_columns, self.model.feature_importances_.tolist()))
        }

        return metrics

    def save_model_artifacts(self, metrics):
        """Save model and metadata"""
        # Ensure models directory exists
        Path('models').mkdir(exist_ok=True)

        # Save model together with its encoders and feature list, so
        # inference code can reproduce the training-time preprocessing
        model_data = {
            'model': self.model,
            'label_encoders': self.label_encoders,
            'feature_columns': self.feature_columns,
            'version': 'v1',
            'training_date': datetime.now().isoformat()
        }

        joblib.dump(model_data, self.model_path)

        # Save metadata
        metadata = {
            'model_name': 'churn_predictor',
            'version': 'v1',
            'training_date': datetime.now().isoformat(),
            'metrics': metrics,
            'status': 'trained'
        }

        with open(self.metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)

    def load_existing_metadata(self):
        """Load existing model metadata"""
        try:
            with open(self.metadata_path, 'r') as f:
                return json.load(f)
        except (OSError, json.JSONDecodeError):
            return None
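
For context, a minimal sketch of how this trainer would be consumed from the Space's entry point. The app.py file name and the UI calls around the trainer are illustrative assumptions; only EmbeddedChurnTrainer and its return shapes come from the module above. Note that train_model_if_needed returns the metrics dict directly after a first-run training, but returns the saved metadata (which nests metrics under a 'metrics' key) on later runs:

# app.py (hypothetical entry point): trains on first load, reuses artifacts afterwards
import streamlit as st
from utils.model_trainer import EmbeddedChurnTrainer

st.title("Customer Churn Predictor")

trainer = EmbeddedChurnTrainer()
result = trainer.train_model_if_needed()

if result is None:
    st.stop()  # errors were already surfaced via st.error

# Fresh training returns metrics directly; cached runs return metadata
# with a nested 'metrics' key, so unwrap when present
metrics = result.get('metrics', result)
st.metric("Test accuracy", f"{metrics['test_accuracy']:.1%}")
st.metric("Churn rate (training data)", f"{metrics['churn_rate']:.1%}")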
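
Because save_model_artifacts bundles the estimator with its label encoders and feature list, downstream code can reproduce the training-time preprocessing at scoring time. A sketch under that assumption: score_customers is a hypothetical helper, the artifact keys match save_model_artifacts above, and the input frame is assumed to carry the raw Country/CustomerGroup columns plus the numeric features produced by engineer_features:

# Hypothetical consumer of models/churn_model_v1.pkl
import joblib
import pandas as pd

def score_customers(customers: pd.DataFrame) -> pd.DataFrame:
    artifact = joblib.load('models/churn_model_v1.pkl')
    model = artifact['model']

    df = customers.copy()
    # Re-apply the training-time encoders to the raw categorical columns.
    # Note: LabelEncoder.transform raises on labels unseen during training.
    for col, encoder in artifact['label_encoders'].items():
        df[f'{col}_encoded'] = encoder.transform(df[col].fillna('Unknown'))

    X = df[artifact['feature_columns']]
    df['churn_probability'] = model.predict_proba(X)[:, 1]  # P(IsChurned == 1)
    return df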