"""Classical supervised baseline for disruption classification.

Loads the shot table from ``DATA_PATH``, trains a random-forest classifier on
the cleaned feature columns, prints AUC / accuracy / confusion matrix for a
held-out stratified split, and saves the highest-importance features to
``OUTPUT_PATH`` for the downstream QGAN experiment.
"""

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

DATA_PATH = 'real_tokamak_data_v2.csv'
OUTPUT_PATH = 'qgan_data_optimized.npz'
N_REPORTED_FEATURES = 5  # how many top features to print
N_QGAN_FEATURES = 8      # small quantum circuit limit

# 1. Load Data
print("Loading data...")
df = pd.read_csv(DATA_PATH)

# 2. Cleaning
y = df['label']
X = df.drop(['label', 'shot_id'], axis=1)

# Drop dead sensors (constant columns carry no information).
X = X.loc[:, X.std() > 0]

# Fill NaNs
X = X.fillna(0)

# 3. Split Data
# Stratify ensures we have the same % of disruptions in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"Training on {len(X_train)} shots.")
print(f"Testing on {len(X_test)} shots.")

# 4. Train Random Forest (The "Workhorse" of Classical ML)
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
clf.fit(X_train, y_train)

# 5. Evaluate
y_prob = clf.predict_proba(X_test)[:, 1]  # Probability of Disruption
y_pred = clf.predict(X_test)  # computed once, reused for accuracy and matrix
auc = roc_auc_score(y_test, y_prob)
acc = accuracy_score(y_test, y_pred)

print("\n" + "=" * 30)
print("RESULTS: CLASSICAL SUPERVISED BASELINE")
print("=" * 30)
print(f"✅ AUC Score: {auc:.4f}")
print(f"✅ Accuracy: {acc:.2%}")
print("-" * 30)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("=" * 30)

# 6. Save the Best Features for QGAN
# Ask the Random Forest which input signals matter most.
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions, most important first

# Slicing (rather than range(5)) avoids an IndexError when fewer columns
# survived the cleaning step than we want to report.
reported = indices[:N_REPORTED_FEATURES]
print(f"\nTop {len(reported)} Features (Physics Signals):")
for idx in reported:
    print(f" {X.columns[idx]} (Importance: {importances[idx]:.4f})")

# Save only the top features for the QGAN (Small quantum circuit limit)
top_features = X.columns[indices[:N_QGAN_FEATURES]]

np.savez(
    OUTPUT_PATH,
    X_train=X_train[top_features].values,
    X_test=X_test[top_features].values,
    y_train=y_train.values,
    y_test=y_test.values,
)
print(f"\n💾 Saved optimized data to '{OUTPUT_PATH}'")