# exoplanet/train_model.py
# Author: Nur Arifin Akbar
# Commit ba763c8: Convert to Gradio app with pre-trained model
"""
Train ML model for exoplanet classification
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.impute import SimpleImputer
import joblib
import json
def prepare_kepler_data(csv_path='data/kepler_koi.csv'):
    """Load and prepare the Kepler KOI dataset for binary classification.

    Builds a binary target where 1 = CONFIRMED exoplanet and
    0 = everything else (FALSE POSITIVE + CANDIDATE), then selects the
    subset of candidate feature columns actually present in the file.

    Args:
        csv_path: Path to the KOI CSV file. Defaults to the original
            hard-coded location so existing callers are unaffected.

    Returns:
        tuple: (X, y, feature_columns) where X is a feature DataFrame,
        y is an int 0/1 Series, and feature_columns is the list of
        feature names used (only those found in the CSV).
    """
    print("Loading Kepler KOI dataset...")
    df = pd.read_csv(csv_path)

    # Create binary classification: CONFIRMED vs NOT CONFIRMED
    # (FALSE POSITIVE + CANDIDATE). This is the main task - identifying
    # which objects are actual exoplanets.
    df['is_exoplanet'] = (df['koi_disposition'] == 'CONFIRMED').astype(int)

    # Candidate features for classification; filtered below to the ones
    # the CSV actually contains, so missing columns never raise KeyError.
    feature_columns = [
        'koi_period',       # Orbital period (days)
        'koi_duration',     # Transit duration (hours)
        'koi_depth',        # Transit depth (ppm)
        'koi_prad',         # Planetary radius (Earth radii)
        'koi_teq',          # Equilibrium temperature (K)
        'koi_insol',        # Insolation flux (Earth flux)
        'koi_steff',        # Stellar effective temperature (K)
        'koi_srad',         # Stellar radius (solar radii)
        'koi_smass',        # Stellar mass (solar masses)
        'koi_slogg',        # Stellar surface gravity (log10(cm/s^2))
        'koi_model_snr',    # Transit signal-to-noise
        'koi_count',        # Number of planets in system
        'koi_num_transits'  # Number of transits observed
    ]
    # Keep only columns that exist in this particular export of the catalog.
    feature_columns = [col for col in feature_columns if col in df.columns]
    print(f"Using {len(feature_columns)} features: {feature_columns}")

    X = df[feature_columns].copy()
    y = df['is_exoplanet'].copy()

    # Compute the positive count once instead of re-summing per print.
    n_pos = int(y.sum())
    n_total = len(y)
    print(f"\nDataset size: {len(X)} samples")
    print(f"Features: {len(feature_columns)}")
    print("\nClass distribution:")
    print(f"  Exoplanets: {n_pos} ({n_pos/n_total*100:.1f}%)")
    print(f"  Non-exoplanets: {n_total-n_pos} ({(n_total-n_pos)/n_total*100:.1f}%)")
    return X, y, feature_columns
def train_model(X, y, feature_columns):
    """Train, evaluate, and persist the best exoplanet classifier.

    Fits a Random Forest and a Gradient Boosting classifier on an 80/20
    stratified split (median imputation + standard scaling fitted on the
    train fold only), keeps whichever scores higher test accuracy, prints
    its top feature importances, and saves the model plus preprocessing
    objects under ``model/``.

    Args:
        X: Feature DataFrame (may contain NaNs; imputed here).
        y: Binary 0/1 target aligned with X.
        feature_columns: Feature names, in the same order as X's columns.

    Returns:
        tuple: (best_model, scaler, imputer) — the fitted estimator and
        the preprocessing objects needed to transform new samples.
    """
    print("\n" + "="*80)
    print("TRAINING MODEL")
    print("="*80)

    # Stratified split keeps the class ratio stable in both folds.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"\nTrain set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")

    # Impute with medians learned from the train fold only (no leakage).
    print("\nHandling missing values with median imputation...")
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    # Scale features; again fit on train only.
    print("Scaling features...")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_imputed)
    X_test_scaled = scaler.transform(X_test_imputed)

    # Train Random Forest
    print("\nTraining Random Forest Classifier...")
    rf_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )
    rf_model.fit(X_train_scaled, y_train)

    # Train Gradient Boosting
    print("Training Gradient Boosting Classifier...")
    gb_model = GradientBoostingClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        random_state=42
    )
    gb_model.fit(X_train_scaled, y_train)

    # Evaluate both candidates on the held-out test fold.
    print("\n" + "="*80)
    print("EVALUATION RESULTS")
    print("="*80)
    models = {
        'Random Forest': rf_model,
        'Gradient Boosting': gb_model
    }
    best_model = None
    best_score = 0
    best_name = ''
    for name, model in models.items():
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"\n{name}:")
        print(f"Accuracy: {accuracy:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred,
                                    target_names=['Non-Exoplanet', 'Exoplanet']))
        # Strict > keeps the first model on ties (dict order: RF first).
        if accuracy > best_score:
            best_score = accuracy
            best_model = model
            best_name = name

    print(f"\nBest Model: {best_name} (Accuracy: {best_score:.4f})")

    # Feature importance (both model types expose feature_importances_;
    # the hasattr guard keeps this safe if a model without it is added).
    print("\n" + "="*80)
    print("FEATURE IMPORTANCE (Top 10)")
    print("="*80)
    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        indices = np.argsort(importances)[::-1]
        for i in range(min(10, len(feature_columns))):
            idx = indices[i]
            print(f"{i+1}. {feature_columns[idx]}: {importances[idx]:.4f}")

    # Save model and preprocessing objects. Ensure the output directory
    # exists so this function also works when called outside main().
    print("\n" + "="*80)
    print("SAVING MODEL")
    print("="*80)
    import os
    os.makedirs('model', exist_ok=True)
    model_artifacts = {
        'model': best_model,
        'scaler': scaler,
        'imputer': imputer,
        'feature_columns': feature_columns,
        'model_name': best_name,
        'accuracy': best_score
    }
    joblib.dump(model_artifacts, 'model/exoplanet_classifier.pkl')
    print("✓ Model saved to model/exoplanet_classifier.pkl")

    # Save human-readable metadata alongside the pickle.
    metadata = {
        'model_name': best_name,
        'accuracy': float(best_score),
        'n_features': len(feature_columns),
        'feature_columns': feature_columns,
        'training_samples': len(X_train),
        'test_samples': len(X_test)
    }
    with open('model/metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)
    print("✓ Metadata saved to model/metadata.json")

    return best_model, scaler, imputer
def main():
    """Script entry point: prepare the data, train, and report completion."""
    import os

    # Make sure the artifact directory exists before training writes to it.
    os.makedirs('model', exist_ok=True)

    # Load features/target, then fit and persist the best classifier.
    X, y, feature_columns = prepare_kepler_data()
    model, scaler, imputer = train_model(X, y, feature_columns)

    banner = "=" * 80
    print("\n" + banner)
    print("MODEL TRAINING COMPLETE!")
    print(banner)


if __name__ == "__main__":
    main()