Spaces:
Sleeping
Sleeping
| """ | |
| Train ML model for exoplanet classification | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier | |
| from sklearn.metrics import classification_report, confusion_matrix, accuracy_score | |
| from sklearn.impute import SimpleImputer | |
| import joblib | |
| import json | |
def prepare_kepler_data(csv_path='data/kepler_koi.csv'):
    """Load the Kepler KOI dataset and build the classification matrix.

    Parameters
    ----------
    csv_path : str
        Path to the Kepler KOI CSV export. Defaults to the project's
        standard layout so existing callers are unaffected.

    Returns
    -------
    tuple
        (X, y, feature_columns) where X is a DataFrame of numeric
        features, y is a binary Series (1 = CONFIRMED exoplanet), and
        feature_columns lists the columns actually present and used.
    """
    print("Loading Kepler KOI dataset...")
    df = pd.read_csv(csv_path)

    # Binary target: CONFIRMED vs NOT CONFIRMED (FALSE POSITIVE + CANDIDATE).
    # This is the main task - identifying which objects are actual exoplanets.
    df['is_exoplanet'] = (df['koi_disposition'] == 'CONFIRMED').astype(int)

    # Candidate features; any column missing from this particular CSV
    # export is silently dropped below.
    feature_columns = [
        'koi_period',       # Orbital period (days)
        'koi_duration',     # Transit duration (hours)
        'koi_depth',        # Transit depth (ppm)
        'koi_prad',         # Planetary radius (Earth radii)
        'koi_teq',          # Equilibrium temperature (K)
        'koi_insol',        # Insolation flux (Earth flux)
        'koi_steff',        # Stellar effective temperature (K)
        'koi_srad',         # Stellar radius (solar radii)
        'koi_smass',        # Stellar mass (solar masses)
        'koi_slogg',        # Stellar surface gravity (log10(cm/s^2))
        'koi_model_snr',    # Transit signal-to-noise
        'koi_count',        # Number of planets in system
        'koi_num_transits'  # Number of transits observed
    ]

    # Keep only columns that exist in this export.
    feature_columns = [col for col in feature_columns if col in df.columns]
    print(f"Using {len(feature_columns)} features: {feature_columns}")

    X = df[feature_columns].copy()
    y = df['is_exoplanet'].copy()

    # Hoist the positive count instead of recomputing y.sum() four times.
    positives = int(y.sum())
    total = len(y)
    print(f"\nDataset size: {len(X)} samples")
    print(f"Features: {len(feature_columns)}")
    print("\nClass distribution:")
    print(f"  Exoplanets: {positives} ({positives/total*100:.1f}%)")
    print(f"  Non-exoplanets: {total-positives} ({(total-positives)/total*100:.1f}%)")
    return X, y, feature_columns
def train_model(X, y, feature_columns):
    """Train, evaluate, select, and persist an exoplanet classifier.

    Fits a Random Forest and a Gradient Boosting classifier on a
    stratified 80/20 split, keeps whichever scores higher test accuracy,
    prints a feature-importance summary, and saves the winning model
    plus its fitted preprocessing objects under model/.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; may contain NaNs (median-imputed here).
    y : pandas.Series
        Binary target (1 = confirmed exoplanet).
    feature_columns : list[str]
        Column names of X, used for the importance report and metadata.

    Returns
    -------
    tuple
        (best_model, fitted StandardScaler, fitted SimpleImputer).
    """
    print("\n" + "="*80)
    print("TRAINING MODEL")
    print("="*80)

    # Stratified split keeps the class ratio identical in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"\nTrain set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")

    # Fit imputer and scaler on the training fold only — transforming the
    # test fold with train statistics avoids test-set leakage.
    print("\nHandling missing values with median imputation...")
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    print("Scaling features...")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_imputed)
    X_test_scaled = scaler.transform(X_test_imputed)

    models = _fit_candidates(X_train_scaled, y_train)
    best_model, best_name, best_score = _select_best(models, X_test_scaled, y_test)
    print(f"\nBest Model: {best_name} (Accuracy: {best_score:.4f})")

    _report_importance(best_model, feature_columns)
    _save_artifacts(best_model, best_name, best_score, scaler, imputer,
                    feature_columns, len(X_train), len(X_test))
    return best_model, scaler, imputer


def _fit_candidates(X_train, y_train):
    """Fit the candidate classifiers; return {display name: fitted model}."""
    print("\nTraining Random Forest Classifier...")
    rf_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )
    rf_model.fit(X_train, y_train)

    print("Training Gradient Boosting Classifier...")
    gb_model = GradientBoostingClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        random_state=42
    )
    gb_model.fit(X_train, y_train)

    return {
        'Random Forest': rf_model,
        'Gradient Boosting': gb_model
    }


def _select_best(models, X_test, y_test):
    """Evaluate each model on the test fold; return (model, name, accuracy) of the best."""
    print("\n" + "="*80)
    print("EVALUATION RESULTS")
    print("="*80)

    best_model = None
    best_name = ''
    # Start below any reachable accuracy so the first model always wins;
    # the previous init of 0 could leave best_model = None (and serialize
    # None) if every candidate scored exactly 0.0.
    best_score = -1.0
    for name, model in models.items():
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"\n{name}:")
        print(f"Accuracy: {accuracy:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred,
                                    target_names=['Non-Exoplanet', 'Exoplanet']))
        if accuracy > best_score:
            best_score = accuracy
            best_model = model
            best_name = name
    return best_model, best_name, best_score


def _report_importance(model, feature_columns):
    """Print the top-10 feature importances when the model exposes them."""
    print("\n" + "="*80)
    print("FEATURE IMPORTANCE (Top 10)")
    print("="*80)
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]  # descending by importance
        for rank in range(min(10, len(feature_columns))):
            idx = indices[rank]
            print(f"{rank+1}. {feature_columns[idx]}: {importances[idx]:.4f}")


def _save_artifacts(model, name, score, scaler, imputer, feature_columns,
                    n_train, n_test):
    """Persist the model bundle (joblib) and a JSON metadata sidecar under model/."""
    print("\n" + "="*80)
    print("SAVING MODEL")
    print("="*80)

    # Bundle the preprocessing objects with the model so inference can
    # replay the exact same impute -> scale pipeline.
    model_artifacts = {
        'model': model,
        'scaler': scaler,
        'imputer': imputer,
        'feature_columns': feature_columns,
        'model_name': name,
        'accuracy': score
    }
    joblib.dump(model_artifacts, 'model/exoplanet_classifier.pkl')
    print("✓ Model saved to model/exoplanet_classifier.pkl")

    # float() coerces numpy scalars so json.dump doesn't choke.
    metadata = {
        'model_name': name,
        'accuracy': float(score),
        'n_features': len(feature_columns),
        'feature_columns': feature_columns,
        'training_samples': n_train,
        'test_samples': n_test
    }
    with open('model/metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)
    print("✓ Metadata saved to model/metadata.json")
def main():
    """Entry point: set up the output directory, prepare data, and train."""
    import os

    # Make sure the artifact directory exists before training writes to it.
    os.makedirs('model', exist_ok=True)

    X, y, feature_columns = prepare_kepler_data()
    model, scaler, imputer = train_model(X, y, feature_columns)

    banner = "=" * 80
    print("\n" + banner)
    print("MODEL TRAINING COMPLETE!")
    print(banner)


if __name__ == "__main__":
    main()