# Merger-and-Acquisition / train_model.py
# Initial commit: Alpha-Index 100 Gradio app (author: AAdevloper, commit faebc8b)
"""
Machine Learning Model Training
Trains XGBoost model to predict top-quartile fund performance
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import xgboost as xgb
import joblib
import json
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' style globally to every matplotlib figure
# produced by this script.
sns.set_style('whitegrid')
def load_and_prepare_data(data_path='data/mock_fund_data.csv'):
    """Read the fund dataset from CSV and report its class balance.

    Parameters
    ----------
    data_path : str
        Path to the CSV file; defaults to the bundled mock dataset.

    Returns
    -------
    pandas.DataFrame
        The loaded fund table (must contain 'is_top_quartile').
    """
    print("Loading data...")
    funds = pd.read_csv(data_path)
    target_counts = funds['is_top_quartile'].value_counts().to_dict()
    print(f"Loaded {len(funds)} funds")
    print(f"Top quartile distribution: {target_counts}")
    return funds
def engineer_features(df):
    """Derive interaction, log-scale, and regime features for modeling.

    Returns a new DataFrame (the input is not mutated) with six added
    columns: two pairwise interaction terms, a log-transformed fund
    size, and three coarse "regime" categoricals produced by binning.
    """
    out = df.copy()

    rate = out['macro_interest_rate_at_launch']

    # Pairwise interactions between macro conditions and fund traits
    out['size_rate_interaction'] = out['fund_size_mil'] * rate
    out['pe_rate_interaction'] = out['public_market_pe_at_launch'] * rate

    # log1p compresses the long right tail of fund sizes for the model
    out['log_fund_size'] = np.log1p(out['fund_size_mil'])

    # Bucket vintage years into fund-raising cycles
    out['vintage_period'] = pd.cut(
        out['vintage_year'],
        bins=[2009, 2013, 2017, 2021, 2024],
        labels=['2010-2013', '2014-2017', '2018-2021', '2022-2023'],
    )

    # Bucket the launch-time interest-rate environment
    out['rate_regime'] = pd.cut(
        rate,
        bins=[0, 1, 3, 6],
        labels=['Low', 'Medium', 'High'],
    )

    # Bucket the launch-time public-market valuation environment
    out['valuation_regime'] = pd.cut(
        out['public_market_pe_at_launch'],
        bins=[0, 17, 21, 30],
        labels=['Low', 'Medium', 'High'],
    )
    return out
def train_model(df):
    """Train an XGBoost classifier that predicts top-quartile funds.

    Pipeline: feature engineering -> label encoding -> stratified 80/20
    split -> XGBoost fit -> evaluation -> persistence of every artifact
    needed for inference (model, encoders, feature list, metadata)
    under models/.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw fund data with the columns consumed by engineer_features()
        plus 'strategy' and the binary target 'is_top_quartile'.

    Returns
    -------
    tuple
        (fitted model, feature-importance DataFrame sorted descending,
        ROC-AUC score on the held-out test set).
    """
    print("\nEngineering features...")
    df = engineer_features(df)

    # Encode categorical variables; each encoder is persisted below so
    # inference code can reproduce the exact same integer mapping.
    le_strategy = LabelEncoder()
    le_vintage_period = LabelEncoder()
    le_rate_regime = LabelEncoder()
    le_valuation_regime = LabelEncoder()
    df['strategy_encoded'] = le_strategy.fit_transform(df['strategy'])
    df['vintage_period_encoded'] = le_vintage_period.fit_transform(df['vintage_period'])
    df['rate_regime_encoded'] = le_rate_regime.fit_transform(df['rate_regime'])
    df['valuation_regime_encoded'] = le_valuation_regime.fit_transform(df['valuation_regime'])

    # Features used for modeling; order is persisted for inference.
    feature_cols = [
        'strategy_encoded',
        'fund_size_mil',
        'log_fund_size',
        'macro_interest_rate_at_launch',
        'public_market_pe_at_launch',
        'vintage_year',
        'vintage_period_encoded',
        'rate_regime_encoded',
        'valuation_regime_encoded',
        'size_rate_interaction',
        'pe_rate_interaction'
    ]
    X = df[feature_cols]
    y = df['is_top_quartile']

    # Stratified split keeps the class ratio equal in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"\nTraining set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")

    print("\nTraining XGBoost model...")
    # scale_pos_weight = (#negatives / #positives) counteracts class
    # imbalance. Guard against a zero-positive training split (the
    # original divided unconditionally); fall back to neutral weight 1.
    n_pos = int((y_train == 1).sum())
    n_neg = int((y_train == 0).sum())
    pos_weight = (n_neg / n_pos) if n_pos > 0 else 1.0
    model = xgb.XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        eval_metric='auc',
        random_state=42,
        scale_pos_weight=pos_weight  # Handle class imbalance
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )

    # Evaluate on the held-out split.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    print("\n" + "="*50)
    print("MODEL PERFORMANCE")
    print("="*50)
    print(classification_report(y_test, y_pred, target_names=['Not Top Quartile', 'Top Quartile']))
    auc_score = roc_auc_score(y_test, y_pred_proba)
    print(f"\nROC-AUC Score: {auc_score:.4f}")

    # Rank features by the model's learned importances.
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10).to_string(index=False))

    # Persist every artifact needed at inference time.
    print("\nSaving model and encoders...")
    saved_files = [
        'models/xgboost_model.pkl',
        'models/strategy_encoder.pkl',
        'models/vintage_period_encoder.pkl',
        'models/rate_regime_encoder.pkl',
        'models/valuation_regime_encoder.pkl',
        'models/feature_names.json',
        'models/model_metadata.json',
    ]
    joblib.dump(model, 'models/xgboost_model.pkl')
    joblib.dump(le_strategy, 'models/strategy_encoder.pkl')
    joblib.dump(le_vintage_period, 'models/vintage_period_encoder.pkl')
    joblib.dump(le_rate_regime, 'models/rate_regime_encoder.pkl')
    joblib.dump(le_valuation_regime, 'models/valuation_regime_encoder.pkl')

    # Save the exact feature order the model was trained with.
    with open('models/feature_names.json', 'w') as f:
        json.dump(feature_cols, f)

    # Save metadata for display/auditing alongside the model.
    metadata = {
        'auc_score': float(auc_score),
        'n_training_samples': len(X_train),
        'n_test_samples': len(X_test),
        'strategies': le_strategy.classes_.tolist(),
        'feature_importance': feature_importance.to_dict('records')
    }
    with open('models/model_metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)

    print("\nModel training complete!")
    print("Saved files:")
    # The original listing omitted three of the saved encoder files;
    # print the complete set actually written above.
    for path in saved_files:
        print(f" - {path}")
    return model, feature_importance, auc_score
if __name__ == "__main__":
    import os

    # Make sure the artifact directory exists before training writes to it
    os.makedirs('models', exist_ok=True)

    # Load the dataset, then run the full training pipeline
    fund_df = load_and_prepare_data()
    model, feature_importance, auc_score = train_model(fund_df)

    banner = '=' * 50
    print(f"\n{banner}")
    print(f"✓ Model successfully trained with AUC: {auc_score:.4f}")
    print(f"{banner}")