"""Build a PCA-reduced anomaly-detection dataset and score two baselines.

The script runs top to bottom:

1. Load ``real_tokamak_data.csv`` and separate ``label`` / ``shot_id`` from
   the feature columns.
2. Clean the features: +/-inf and NaN become 0, zero-variance columns are
   dropped.
3. Train on healthy shots only (label 0); test on the held-out healthy shots
   plus every disruptive shot (label 1).
4. Standardize, then reduce with PCA (at most 10 components).
5. Print the ROC AUC of an Isolation Forest and a One-Class SVM.
6. Save the PCA features to ``pca_dataset.npz`` for the later QGAN step.
"""

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

# 1. Load Data
print("Loading data...")
df = pd.read_csv('real_tokamak_data.csv')

# 2. Preprocessing & Cleaning
# Separate label and remove metadata
y = df['label']
X = df.drop(['label', 'shot_id'], axis=1)

# --- SAFETY BLOCK: CLEAN THE DATA ---
print(f"Original Shape: {X.shape}")

# A. Turn +/-inf into NaN *before* anything else. A column containing inf has
#    a NaN standard deviation, so the variance filter below would silently
#    drop it, and SimpleImputer rejects inf outright. As NaN, these readings
#    simply get the same 0 fill as every other missing value in step C.
X = X.replace([np.inf, -np.inf], np.nan)

# B. Drop columns with no spread (e.g. a sensor that reads 0.0 for every
#    shot): they carry no information. Columns whose std is NaN (all values
#    missing, or a single valid value) fail the comparison and are dropped too.
X = X.loc[:, X.std() > 0]
print(f"Shape after dropping dead sensors: {X.shape}")

# C. Fill any remaining NaNs with 0. The original index is kept so that the
#    boolean masks built from `y` below stay aligned row-for-row.
imputer = SimpleImputer(strategy='constant', fill_value=0)
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
# ------------------------------------

# Split: Train ONLY on Healthy data (Label 0)
X_healthy = X[y == 0]
X_train, X_test_healthy = train_test_split(
    X_healthy, test_size=0.2, random_state=42
)

# Add Disruptive shots to Test Set; labels follow the same concatenation
# order (held-out healthy rows first, then disruptive rows).
X_disruptive = X[y == 1]
X_test = pd.concat([X_test_healthy, X_disruptive])
y_test = np.concatenate(
    [np.zeros(len(X_test_healthy)), np.ones(len(X_disruptive))]
)

# Normalize: statistics are fitted on the healthy training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# PCA: reduce to 10 components, capped by both the number of training rows
# and the number of features, since PCA cannot extract more than either.
n_components = min(10, *X_train_scaled.shape)
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"Data Prepared. Training on {len(X_train_pca)} healthy shots.")
print(f"Testing on {len(X_test_pca)} shots ({len(X_disruptive)} disruptions).")

# --- MODEL 1: Isolation Forest ---
print("\nRunning Isolation Forest...")
iso = IsolationForest(contamination=0.1, random_state=42)
iso.fit(X_train_pca)
# score_samples is higher for normal points; negate it so that a higher
# value means "more anomalous", matching label 1 = disruptive for the AUC.
y_pred_iso = -iso.score_samples(X_test_pca)
auc_iso = roc_auc_score(y_test, y_pred_iso)
print(f" >> Isolation Forest AUC: {auc_iso:.4f}")

# --- MODEL 2: One-Class SVM ---
print("Running One-Class SVM...")
svm = OneClassSVM(nu=0.1)
svm.fit(X_train_pca)
# Same sign flip as above: higher value = more anomalous.
y_pred_svm = -svm.score_samples(X_test_pca)
auc_svm = roc_auc_score(y_test, y_pred_svm)
print(f" >> One-Class SVM AUC: {auc_svm:.4f}")

# Save the PCA data for QGAN
np.savez('pca_dataset.npz', X_train=X_train_pca, X_test=X_test_pca, y_test=y_test)
print("\n✅ PCA Data saved to 'pca_dataset.npz'.")