import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

# Supervised Random Forest baseline for the tokamak shot dataset.
# Pipeline: load CSV -> clean features -> stratified split -> train -> evaluate
# -> export the most important features to an .npz file for the QGAN stage.

# 1. Load Data
print("Loading data...")
df = pd.read_csv('real_tokamak_data_v2.csv')

# 2. Cleaning
# 'label' is the classification target; 'shot_id' is dropped so it is not
# used as a feature.
y = df['label']
X = df.drop(['label', 'shot_id'], axis=1)

# Drop dead sensors: keep only columns whose standard deviation is > 0.
# Columns whose std is NaN (e.g. all-NaN columns) are dropped too, because
# NaN > 0 evaluates to False. Note the statistic is computed on the full
# dataset, i.e. before the train/test split.
X = X.loc[:, X.std() > 0]
# Replace any remaining NaNs with 0.
X = X.fillna(0)

# 3. Split Data
# Stratifying on y keeps the class proportions the same in train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

print(f"Training on {len(X_train)} shots.")
print(f"Testing on {len(X_test)} shots.")

# 4. Train Random Forest (The "Workhorse" of Classical ML)
# Fixed random_state makes the fitted forest reproducible between runs.
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
clf.fit(X_train, y_train)

# 5. Evaluate
# Column 1 of predict_proba is the probability of the second class in
# clf.classes_ (assumed to be the "disruption" label -- confirm the encoding).
y_prob = clf.predict_proba(X_test)[:, 1]
# Hard predictions are computed once and reused for accuracy and the
# confusion matrix.
y_pred = clf.predict(X_test)
auc = roc_auc_score(y_test, y_prob)
acc = accuracy_score(y_test, y_pred)

print("\n" + "="*30)
print("RESULTS: CLASSICAL SUPERVISED BASELINE")
print("="*30)
print(f"✅ AUC Score:       {auc:.4f}")
print(f"✅ Accuracy:        {acc:.2%}")
print("-" * 30)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("="*30)

# 6. Save the Best Features for QGAN
# Rank the input features by the Random Forest's importance scores,
# most important first.
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

print("\nTop 5 Features (Physics Signals):")
# Slicing (rather than indexing 0..4) avoids an IndexError when fewer than
# five feature columns survive the cleaning step.
for idx in indices[:5]:
    print(f"   {X.columns[idx]} (Importance: {importances[idx]:.4f})")

# Save only the top 8 features for the QGAN (Small quantum circuit limit)
top_features = X.columns[indices[:8]]
np.savez('qgan_data_optimized.npz', 
         X_train=X_train[top_features].values, 
         X_test=X_test[top_features].values, 
         y_train=y_train.values,
         y_test=y_test.values)
print("\n💾 Saved optimized data to 'qgan_data_optimized.npz'")