"""
Machine Learning Model Training

Trains XGBoost model to predict top-quartile fund performance
"""

import json
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder

# Set style
sns.set_style('whitegrid')

# Directory that receives every artifact written by train_model().
MODEL_DIR = 'models'

# Raw categorical columns that get a LabelEncoder; each one produces a
# '<column>_encoded' feature and a '<column>_encoder.pkl' artifact.
CATEGORICAL_COLUMNS = ('strategy', 'vintage_period', 'rate_regime', 'valuation_regime')


def load_and_prepare_data(data_path='data/mock_fund_data.csv'):
    """Load the fund data set from a CSV file.

    Prints the number of rows and the value counts of the
    ``is_top_quartile`` column, so that column must exist in the file.

    Args:
        data_path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv(data_path)``.
    """
    print("Loading data...")
    df = pd.read_csv(data_path)

    print(f"Loaded {len(df)} funds")
    print(f"Top quartile distribution: {df['is_top_quartile'].value_counts().to_dict()}")

    return df


def engineer_features(df):
    """Create additional features for modeling.

    Works on a copy, so the caller's DataFrame is left untouched.

    Reads the columns ``fund_size_mil``, ``macro_interest_rate_at_launch``,
    ``public_market_pe_at_launch`` and ``vintage_year`` and adds:

    * ``size_rate_interaction`` / ``pe_rate_interaction``: products with
      the interest rate
    * ``log_fund_size``: ``log1p`` of the fund size
    * ``vintage_period``, ``rate_regime``, ``valuation_regime``:
      categorical bins of the corresponding numeric column

    Values outside the configured bin edges become NaN in the binned
    columns.

    Args:
        df: Input DataFrame containing the columns listed above.

    Returns:
        A new DataFrame with the extra feature columns.
    """
    df = df.copy()

    # Create interaction features
    df['size_rate_interaction'] = df['fund_size_mil'] * df['macro_interest_rate_at_launch']
    df['pe_rate_interaction'] = df['public_market_pe_at_launch'] * df['macro_interest_rate_at_launch']

    # Log transform of fund size (better for ML)
    df['log_fund_size'] = np.log1p(df['fund_size_mil'])

    # Vintage year bins (cycles matter)
    df['vintage_period'] = pd.cut(
        df['vintage_year'],
        bins=[2009, 2013, 2017, 2021, 2024],
        labels=['2010-2013', '2014-2017', '2018-2021', '2022-2023'],
    )

    # Interest rate regime.
    # include_lowest=True closes the first interval on the left; without it
    # a rate of exactly 0 falls outside (0, 1] and is binned as NaN.
    df['rate_regime'] = pd.cut(
        df['macro_interest_rate_at_launch'],
        bins=[0, 1, 3, 6],
        labels=['Low', 'Medium', 'High'],
        include_lowest=True,
    )

    # PE ratio regime (valuation environment)
    df['valuation_regime'] = pd.cut(
        df['public_market_pe_at_launch'],
        bins=[0, 17, 21, 30],
        labels=['Low', 'Medium', 'High'],
    )

    return df


def train_model(df):
    """Train XGBoost model and persist it together with its encoders.

    Steps: engineer features, label-encode the categorical columns, make a
    stratified 80/20 train/test split, fit an ``XGBClassifier``, print a
    classification report, ROC-AUC and feature importances, then write the
    model, the four encoders, the feature-name list and a metadata JSON
    file into ``models/`` (created if missing).

    Args:
        df: DataFrame with the raw columns needed by ``engineer_features``
            plus ``strategy`` and the binary target ``is_top_quartile``.

    Returns:
        Tuple ``(model, feature_importance, auc_score)`` where
        ``feature_importance`` is a DataFrame with ``feature`` and
        ``importance`` columns sorted by descending importance, and
        ``auc_score`` is the ROC-AUC on the test split.

    Raises:
        ValueError: If the training split contains no positive samples, so
            the class-imbalance weight cannot be computed.
    """
    print("\nEngineering features...")
    df = engineer_features(df)

    # Encode categorical variables; keep each fitted encoder so it can be
    # saved alongside the model.
    encoders = {}
    for column in CATEGORICAL_COLUMNS:
        encoder = LabelEncoder()
        df[f'{column}_encoded'] = encoder.fit_transform(df[column])
        encoders[column] = encoder

    # Select features for modeling
    feature_cols = [
        'strategy_encoded',
        'fund_size_mil',
        'log_fund_size',
        'macro_interest_rate_at_launch',
        'public_market_pe_at_launch',
        'vintage_year',
        'vintage_period_encoded',
        'rate_regime_encoded',
        'valuation_regime_encoded',
        'size_rate_interaction',
        'pe_rate_interaction',
    ]

    X = df[feature_cols]
    y = df['is_top_quartile']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"\nTraining set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")

    # Ratio of negatives to positives, used to handle class imbalance.
    n_negative = int((y_train == 0).sum())
    n_positive = int((y_train == 1).sum())
    if n_positive == 0:
        raise ValueError(
            "Training split contains no top-quartile (positive) samples; "
            "cannot compute scale_pos_weight."
        )

    # Train XGBoost model
    print("\nTraining XGBoost model...")
    model = xgb.XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        eval_metric='auc',
        random_state=42,
        scale_pos_weight=n_negative / n_positive,  # Handle class imbalance
    )

    # NOTE(review): the test split doubles as the eval_set here. No early
    # stopping is configured, so this looks like metric tracking only; if
    # early stopping is ever added, use a separate validation split.
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )

    # Evaluate
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    print("\n" + "="*50)
    print("MODEL PERFORMANCE")
    print("="*50)
    print(classification_report(y_test, y_pred, target_names=['Not Top Quartile', 'Top Quartile']))

    auc_score = roc_auc_score(y_test, y_pred_proba)
    print(f"\nROC-AUC Score: {auc_score:.4f}")

    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10).to_string(index=False))

    # Save model and encoders
    print("\nSaving model and encoders...")
    # Create the output directory here so the function also works when it
    # is called from another module rather than via the __main__ guard.
    os.makedirs(MODEL_DIR, exist_ok=True)
    saved_files = []

    model_path = f'{MODEL_DIR}/xgboost_model.pkl'
    joblib.dump(model, model_path)
    saved_files.append(model_path)

    for column, encoder in encoders.items():
        encoder_path = f'{MODEL_DIR}/{column}_encoder.pkl'
        joblib.dump(encoder, encoder_path)
        saved_files.append(encoder_path)

    # Save feature names
    feature_names_path = f'{MODEL_DIR}/feature_names.json'
    with open(feature_names_path, 'w') as f:
        json.dump(feature_cols, f)
    saved_files.append(feature_names_path)

    # Save metadata. Importances are cast to built-in float because the
    # model reports them as NumPy scalars, which json cannot always encode.
    metadata = {
        'auc_score': float(auc_score),
        'n_training_samples': len(X_train),
        'n_test_samples': len(X_test),
        'strategies': encoders['strategy'].classes_.tolist(),
        'feature_importance': [
            {'feature': str(name), 'importance': float(score)}
            for name, score in zip(
                feature_importance['feature'], feature_importance['importance']
            )
        ],
    }

    metadata_path = f'{MODEL_DIR}/model_metadata.json'
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)
    saved_files.append(metadata_path)

    print("\nModel training complete!")
    print("Saved files:")
    # Report every artifact actually written, not a hand-maintained subset.
    for path in saved_files:
        print(f" - {path}")

    return model, feature_importance, auc_score


if __name__ == "__main__":
    # Create models directory
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Load data
    df = load_and_prepare_data()

    # Train model
    model, feature_importance, auc_score = train_model(df)

    print(f"\n{'='*50}")
    print(f"✓ Model successfully trained with AUC: {auc_score:.4f}")
    print(f"{'='*50}")