# NOTE: removed non-code residue ("Spaces: Sleeping" Hugging Face status banner)
# that was captured along with this file; it is not part of the program.
| """ | |
| Machine Learning Model Training | |
| Trains XGBoost model to predict top-quartile fund performance | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.model_selection import train_test_split, cross_val_score | |
| from sklearn.preprocessing import LabelEncoder | |
| from sklearn.metrics import classification_report, roc_auc_score, roc_curve | |
| import xgboost as xgb | |
| import joblib | |
| import json | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
# Apply seaborn's 'whitegrid' style globally for any plots this module draws.
sns.set_style('whitegrid')
def load_and_prepare_data(data_path='data/mock_fund_data.csv'):
    """Load the fund dataset from CSV and print basic summary stats.

    Args:
        data_path: Path to the CSV file; must contain an
            ``is_top_quartile`` column (the binary target).

    Returns:
        pandas.DataFrame with the raw fund records.
    """
    print("Loading data...")
    funds = pd.read_csv(data_path)
    print(f"Loaded {len(funds)} funds")
    target_counts = funds['is_top_quartile'].value_counts().to_dict()
    print(f"Top quartile distribution: {target_counts}")
    return funds
def engineer_features(df):
    """Return a copy of *df* with derived modeling features added.

    Adds interaction terms, a log-scaled fund size, and three coarse
    "regime" categoricals binned from the continuous macro columns.
    The input frame is not mutated.
    """
    out = df.copy()

    # Interactions between fund size / valuation and the rate environment.
    rate = out['macro_interest_rate_at_launch']
    out['size_rate_interaction'] = out['fund_size_mil'] * rate
    out['pe_rate_interaction'] = out['public_market_pe_at_launch'] * rate

    # log1p compresses the heavy right tail of fund sizes for the model.
    out['log_fund_size'] = np.log1p(out['fund_size_mil'])

    # Bucket continuous drivers into coarse regimes (bins are pd.cut
    # defaults: right-inclusive, left-open).
    regime_specs = {
        'vintage_period': ('vintage_year',
                           [2009, 2013, 2017, 2021, 2024],
                           ['2010-2013', '2014-2017', '2018-2021', '2022-2023']),
        'rate_regime': ('macro_interest_rate_at_launch',
                        [0, 1, 3, 6],
                        ['Low', 'Medium', 'High']),
        'valuation_regime': ('public_market_pe_at_launch',
                             [0, 17, 21, 30],
                             ['Low', 'Medium', 'High']),
    }
    for new_col, (src_col, bins, labels) in regime_specs.items():
        out[new_col] = pd.cut(out[src_col], bins=bins, labels=labels)

    return out
def train_model(df, model_dir='models'):
    """Train an XGBoost top-quartile classifier and persist the artifacts.

    Fixes over the previous version:
    - creates ``model_dir`` before saving (previously the function crashed
      with FileNotFoundError unless the ``__main__`` bootstrap had run);
    - guards ``scale_pos_weight`` against a training split with zero
      positives (previously ZeroDivisionError);
    - the "Saved files" summary now lists every file actually written.

    Args:
        df: Raw fund data; must contain the columns consumed by
            ``engineer_features`` plus the binary ``is_top_quartile`` target.
        model_dir: Directory for the model, encoders, and metadata files.
            Created if missing. Defaults to ``'models'`` (backward
            compatible with the original hard-coded paths).

    Returns:
        Tuple ``(model, feature_importance, auc_score)``: the fitted
        ``xgb.XGBClassifier``, a DataFrame of feature importances sorted
        descending, and the held-out ROC-AUC as a float.
    """
    import os  # local import: leaves the module's top-level imports untouched

    print("\nEngineering features...")
    df = engineer_features(df)

    # Fit one LabelEncoder per categorical column so each fitted mapping
    # can be persisted and re-applied at inference time.
    categorical_cols = ['strategy', 'vintage_period', 'rate_regime', 'valuation_regime']
    encoders = {}
    for col in categorical_cols:
        enc = LabelEncoder()
        df[f'{col}_encoded'] = enc.fit_transform(df[col])
        encoders[col] = enc

    # Feature set: raw numerics, encoded categoricals, and interactions.
    feature_cols = [
        'strategy_encoded',
        'fund_size_mil',
        'log_fund_size',
        'macro_interest_rate_at_launch',
        'public_market_pe_at_launch',
        'vintage_year',
        'vintage_period_encoded',
        'rate_regime_encoded',
        'valuation_regime_encoded',
        'size_rate_interaction',
        'pe_rate_interaction',
    ]
    X = df[feature_cols]
    y = df['is_top_quartile']

    # Stratified split keeps the top-quartile ratio equal in both sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"\nTraining set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")

    # Class-imbalance weight = neg/pos; fall back to 1.0 if the training
    # split has no positives (the original divided by zero here).
    n_pos = int((y_train == 1).sum())
    n_neg = int((y_train == 0).sum())
    scale_pos_weight = (n_neg / n_pos) if n_pos else 1.0

    print("\nTraining XGBoost model...")
    model = xgb.XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        eval_metric='auc',
        random_state=42,
        scale_pos_weight=scale_pos_weight,
    )
    model.fit(
        X_train, y_train,
        # NOTE(review): eval_set reuses the test split — fine for monitoring,
        # but it means early metrics are not independent of the final test.
        eval_set=[(X_test, y_test)],
        verbose=False
    )

    # Evaluate on the held-out split.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    print("\n" + "=" * 50)
    print("MODEL PERFORMANCE")
    print("=" * 50)
    print(classification_report(y_test, y_pred, target_names=['Not Top Quartile', 'Top Quartile']))

    auc_score = roc_auc_score(y_test, y_pred_proba)
    print(f"\nROC-AUC Score: {auc_score:.4f}")

    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_,
    }).sort_values('importance', ascending=False)
    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10).to_string(index=False))

    # Persist artifacts; create the directory so this function works even
    # when called without the __main__ bootstrap.
    os.makedirs(model_dir, exist_ok=True)
    print("\nSaving model and encoders...")
    joblib.dump(model, os.path.join(model_dir, 'xgboost_model.pkl'))
    for col, enc in encoders.items():
        joblib.dump(enc, os.path.join(model_dir, f'{col}_encoder.pkl'))

    with open(os.path.join(model_dir, 'feature_names.json'), 'w') as f:
        json.dump(feature_cols, f)

    metadata = {
        'auc_score': float(auc_score),
        'n_training_samples': len(X_train),
        'n_test_samples': len(X_test),
        'strategies': encoders['strategy'].classes_.tolist(),
        'feature_importance': feature_importance.to_dict('records'),
    }
    with open(os.path.join(model_dir, 'model_metadata.json'), 'w') as f:
        json.dump(metadata, f, indent=2)

    print("\nModel training complete!")
    print("Saved files:")
    saved = ['xgboost_model.pkl'] + [f'{c}_encoder.pkl' for c in categorical_cols]
    saved += ['feature_names.json', 'model_metadata.json']
    for fname in saved:
        print(f"  - {model_dir}/{fname}")

    return model, feature_importance, auc_score
| if __name__ == "__main__": | |
| import os | |
| # Create models directory | |
| os.makedirs('models', exist_ok=True) | |
| # Load data | |
| df = load_and_prepare_data() | |
| # Train model | |
| model, feature_importance, auc_score = train_model(df) | |
| print(f"\n{'='*50}") | |
| print(f"✓ Model successfully trained with AUC: {auc_score:.4f}") | |
| print(f"{'='*50}") | |