# QGAN_Project / vG.0.1 / benchmark_supervised.py
# 1bnjmn3's picture
# Add files using upload-large-folder tool
# 0f755ec verified
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
# Classical supervised baseline: train a Random Forest on the tokamak shot
# table, report AUC / accuracy / confusion matrix on a held-out split, then
# export the most important feature columns to an .npz file for the QGAN.

# 1. Load Data
print("Loading data...")
df = pd.read_csv('real_tokamak_data_v2.csv')

# 2. Cleaning (Same as before)
# 'label' is the classification target; 'shot_id' is excluded from the
# feature matrix along with it.
y = df['label']
X = df.drop(['label', 'shot_id'], axis=1)

# Drop dead sensors: keep only columns whose standard deviation is strictly
# positive. A column whose std is NaN fails the `> 0` comparison and is
# dropped as well.
X = X.loc[:, X.std() > 0]

# Fill NaNs
X = X.fillna(0)

# 3. Split Data
# Stratify ensures we have the same % of disruptions in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print(f"Training on {len(X_train)} shots.")
print(f"Testing on {len(X_test)} shots.")

# 4. Train Random Forest (The "Workhorse" of Classical ML)
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
clf.fit(X_train, y_train)

# 5. Evaluate
y_prob = clf.predict_proba(X_test)[:, 1]  # Probability of Disruption
# Hard class predictions are computed once and reused for both the accuracy
# and the confusion matrix.
y_pred = clf.predict(X_test)
auc = roc_auc_score(y_test, y_prob)
acc = accuracy_score(y_test, y_pred)

print("\n" + "="*30)
print("RESULTS: CLASSICAL SUPERVISED BASELINE")
print("="*30)
print(f"✅ AUC Score: {auc:.4f}")
print(f"✅ Accuracy: {acc:.2%}")
print("-" * 30)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("="*30)

# 6. Save the Best Features for QGAN
# We can ask the Random Forest which signals matter most!
importances = clf.feature_importances_
# Column positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

print("\nTop 5 Features (Physics Signals):")
# Slicing (rather than indexing 0..4) stays safe when fewer than five
# feature columns survive the cleaning step.
for idx in indices[:5]:
    print(f" {X.columns[idx]} (Importance: {importances[idx]:.4f})")

# Save only the top 8 features for the QGAN (Small quantum circuit limit)
top_features = X.columns[indices[:8]]
np.savez(
    'qgan_data_optimized.npz',
    X_train=X_train[top_features].values,
    X_test=X_test[top_features].values,
    y_train=y_train.values,
    y_test=y_test.values,
)
print("\n💾 Saved optimized data to 'qgan_data_optimized.npz'")