"""Classical supervised baseline for the tokamak shot dataset.

The script:

1. loads ``real_tokamak_data_v2.csv``;
2. uses the ``label`` column as the target and every other column except
   ``shot_id`` as a candidate feature;
3. drops constant features and fills missing values with 0;
4. trains a random-forest classifier on a stratified 80/20 split;
5. prints AUC, accuracy, the confusion matrix and the most important features;
6. saves the top-ranked feature columns of both splits to
   ``qgan_data_optimized.npz``.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  # unused here; kept so no file-level import is removed
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

# --- Configuration ---------------------------------------------------------
DATA_PATH = 'real_tokamak_data_v2.csv'
OUTPUT_PATH = 'qgan_data_optimized.npz'
TEST_SIZE = 0.2          # fraction of shots held out for testing
RANDOM_STATE = 42        # shared seed for the split and the forest
N_TOP_PRINTED = 5        # number of features listed in the report
N_TOP_SAVED = 8          # number of features exported to OUTPUT_PATH

# --- Load ------------------------------------------------------------------
print("Loading data...")
df = pd.read_csv(DATA_PATH)

# --- Target / feature separation --------------------------------------------
y = df['label']
X = df.drop(['label', 'shot_id'], axis=1)

# --- Feature cleaning --------------------------------------------------------
# std() is only defined for numeric data; a stray text column would otherwise
# make the variance filter below raise. Keep numeric/bool columns and report
# anything that was skipped so the drop is never silent.
numeric = X.select_dtypes(include=['number', 'bool'])
skipped = [col for col in X.columns if col not in numeric.columns]
if skipped:
    print(f"Skipping non-numeric columns: {skipped}")
X = numeric

# Drop constant columns. NaN > 0 is False, so columns whose standard
# deviation is undefined (e.g. entirely missing) are dropped as well.
X = X.loc[:, X.std() > 0]

# Remaining missing values are replaced with 0.
X = X.fillna(0)

# --- Train / test split -------------------------------------------------------
# Stratify on the label so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

print(f"Training on {len(X_train)} shots.")
print(f"Testing on {len(X_test)} shots.")

# --- Model -------------------------------------------------------------------
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

# --- Evaluation ----------------------------------------------------------------
# Column 1 of predict_proba corresponds to clf.classes_[1].
# NOTE(review): assumes binary labels where that class is the positive one - confirm.
y_prob = clf.predict_proba(X_test)[:, 1]
# Predict once and reuse for both the accuracy and the confusion matrix.
y_pred = clf.predict(X_test)
auc = roc_auc_score(y_test, y_prob)
acc = accuracy_score(y_test, y_pred)

print("\n" + "="*30)
print("RESULTS: CLASSICAL SUPERVISED BASELINE")
print("="*30)
print(f"✅ AUC Score: {auc:.4f}")
print(f"✅ Accuracy: {acc:.2%}")
print("-" * 30)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("="*30)

# --- Feature importance --------------------------------------------------------
importances = clf.feature_importances_
# Feature positions sorted from most to least important.
indices = np.argsort(importances)[::-1]

print(f"\nTop {N_TOP_PRINTED} Features (Physics Signals):")
# Slicing (rather than range(N)) stays safe when fewer features survive cleaning.
for idx in indices[:N_TOP_PRINTED]:
    print(f" {X.columns[idx]} (Importance: {importances[idx]:.4f})")

# --- Export --------------------------------------------------------------------
# Keep only the highest-ranked features, in importance order, for both splits.
top_features = X.columns[indices[:N_TOP_SAVED]]
np.savez(OUTPUT_PATH,
         X_train=X_train[top_features].values,
         X_test=X_test[top_features].values,
         y_train=y_train.values,
         y_test=y_test.values)
print(f"\n💾 Saved optimized data to '{OUTPUT_PATH}'")