"""Phase I data-engineering pipeline: scale, compress and discretize tokamak data.

Running this module loads ``real_tokamak_data_v2.csv``, fits a StandardScaler
and a PCA on the healthy samples only, bins the healthy PCA projection on a
``BINS_PER_DIM`` x ``BINS_PER_DIM`` grid, and writes the resulting artifacts
(plot, ``.npz`` arrays, fitted models) to ``OUTPUT_DIR``.
"""
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# --- CONFIGURATION ---
# As defined in Sections 4.3 and 4.4 of the Action Plan.
N_DIM = 2  # Compress to 2D for the 4-qubit limit
BINS_PER_DIM = 4  # 4x4 grid = 16 discrete states
OUTPUT_DIR = "v2_pipeline_output"

# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)


def _load_dataset(candidate_paths):
    """Return the first existing CSV in *candidate_paths* as a DataFrame, else None."""
    for path in candidate_paths:
        if os.path.exists(path):
            print(f" ✅ Found data at: {path}")
            return pd.read_csv(path)
    return None


def _grid_distribution(points, bins):
    """Bin 2-D *points* on a bins x bins grid.

    Returns a tuple ``(probabilities, x_edges, y_edges)`` where
    ``probabilities`` is the flattened grid (length ``bins * bins``) of
    per-cell sample fractions, summing to 1.0.
    """
    # Raw counts divided by their total give the per-cell probability mass
    # directly; asking for a density first and renormalising is redundant
    # because every cell of this grid has the same area.
    counts, x_edges, y_edges = np.histogram2d(points[:, 0], points[:, 1], bins=bins)
    probabilities = counts.flatten() / counts.sum()
    return probabilities, x_edges, y_edges


def _save_topology_plot(healthy_pca, anomalous_pca, x_edges, y_edges, n_shots, path):
    """Scatter both classes in PCA space with the grid overlaid and save to *path*."""
    fig = plt.figure(figsize=(10, 8))
    try:
        # Healthy samples in blue, disruptive samples in red.
        plt.scatter(healthy_pca[:, 0], healthy_pca[:, 1],
                    c='blue', alpha=0.3, s=10, label='Healthy Plasma (Training)')
        plt.scatter(anomalous_pca[:, 0], anomalous_pca[:, 1],
                    c='red', alpha=0.3, s=10, label='Disruptions (Testing)')

        # Draw the discretization grid so the cells are visible on the map.
        for x in x_edges:
            plt.axvline(x, color='gray', linestyle='--', alpha=0.3)
        for y in y_edges:
            plt.axhline(y, color='gray', linestyle='--', alpha=0.3)

        plt.title(f"Real Data Topology (PCA): {n_shots} Shots")
        plt.xlabel("Principal Component 1")
        plt.ylabel("Principal Component 2")
        plt.legend()
        plt.savefig(path)
    finally:
        # Release the figure even if plotting fails; pyplot keeps figures
        # alive until they are explicitly closed.
        plt.close(fig)


def process_data():
    """Run the Phase I pipeline and write its artifacts to ``OUTPUT_DIR``.

    Steps: load the CSV, split rows into healthy (``label == 0``) and
    anomalous (``label == 1``), fit StandardScaler and PCA on the healthy
    rows only, project both groups, histogram the healthy projection into
    the target distribution, then save the topology plot,
    ``processed_data.npz``, ``scaler.pkl`` and ``pca.pkl``.

    Rows whose label is neither 0 nor 1 are used by neither group.

    Returns:
        None. If the CSV is not found in any candidate location an error is
        printed and the function returns without writing anything.

    Raises:
        KeyError: the dataset has no ``label`` column.
        ValueError: fewer than ``N_DIM`` feature columns or fewer than
            ``N_DIM`` healthy samples are available.
    """
    print("🚀 Starting Phase I: Data Engineering Pipeline...")

    # 1. LOAD DATA
    # Check the possible locations of the V2 file, in priority order.
    possible_paths = [
        'real_tokamak_data_v2.csv',        # Root
        'vG.0.1/real_tokamak_data_v2.csv', # Subfolder
        'data/real_tokamak_data_v2.csv',   # Common data folder
    ]
    df = _load_dataset(possible_paths)
    if df is None:
        print(" ❌ Error: 'real_tokamak_data_v2.csv' not found in root, vG.0.1/ or data/.")
        print(" Current Working Directory:", os.getcwd())
        return

    print(f" 📊 Shape: {df.shape}")

    # 2. SEPARATE HEALTHY VS ANOMALOUS
    # Critical constraint: the scaler must be fit ONLY on the healthy
    # training data. 'label' is the target; 'shot_id' and 'machine' are
    # treated as metadata and excluded from the features when present.
    if 'label' not in df.columns:
        raise KeyError("'label' column is required to separate healthy from anomalous samples")

    drop_cols = ['label', 'shot_id', 'machine']
    feature_cols = [c for c in df.columns if c not in drop_cols]
    if N_DIM > len(feature_cols):
        raise ValueError(
            f"Need at least {N_DIM} feature columns for PCA, found {len(feature_cols)}"
        )

    X_healthy = df[df['label'] == 0][feature_cols]
    X_anomalous = df[df['label'] == 1][feature_cols]  # For validation later

    print(f" Healthy Samples (Training): {len(X_healthy)}")
    print(f" Disruptive Samples (Validation): {len(X_anomalous)}")

    if N_DIM > len(X_healthy):
        raise ValueError(
            f"Need at least {N_DIM} healthy samples (label == 0) to fit the "
            f"scaler and PCA, found {len(X_healthy)}"
        )

    # 3. NORMALIZATION (StandardScaler)
    # Subtracts the mean and divides by the standard deviation.
    print(" ⚖️ Normalizing...")
    scaler = StandardScaler()
    scaler.fit(X_healthy)  # Fit ONLY on healthy
    X_healthy_scaled = scaler.transform(X_healthy)

    # 4. DIMENSIONALITY REDUCTION (PCA)
    # Compress to N_DIM dimensions to visualize "Islands" and fit 4 qubits.
    print(f" 📉 Compressing to {N_DIM} Dimensions (PCA)...")
    pca = PCA(n_components=N_DIM)
    pca.fit(X_healthy_scaled)  # Fit ONLY on healthy
    X_healthy_pca = pca.transform(X_healthy_scaled)

    if len(X_anomalous):
        X_anomalous_pca = pca.transform(scaler.transform(X_anomalous))
    else:
        # sklearn transformers reject zero-row input; keep the pipeline
        # usable on datasets that contain no disruptions.
        X_anomalous_pca = np.empty((0, N_DIM))

    print(f" Explained Variance Ratio: {pca.explained_variance_ratio_}")

    # 5. DISCRETIZATION (The Grid Method)
    # Overlay a BINS_PER_DIM x BINS_PER_DIM grid and compute the probability
    # mass per cell. The flattened vector is the distribution the quantum
    # generator must learn to replicate.
    print(" 🕸️ Generating Quantum Target Distribution...")
    target_distribution, x_edges, y_edges = _grid_distribution(X_healthy_pca, BINS_PER_DIM)

    print(f" Target Vector Shape: {target_distribution.shape}")
    print(f" Target Vector (First 5): {target_distribution[:5]}")

    # The module-level makedirs ran relative to the import-time working
    # directory; make sure the folder also exists where we write now.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # 6. VISUALIZATION (The "Money Plot")
    # Verify whether the real data actually has the "Islands" topology.
    plot_path = f"{OUTPUT_DIR}/real_data_topology.png"
    _save_topology_plot(X_healthy_pca, X_anomalous_pca, x_edges, y_edges, len(df), plot_path)
    print(f" 📸 Topology map saved to '{OUTPUT_DIR}/real_data_topology.png'")

    # 7. SAVE EVERYTHING
    # These arrays are needed by the QGAN training script.
    output_file = f"{OUTPUT_DIR}/processed_data.npz"
    np.savez(
        output_file,
        target_distribution=target_distribution,      # For QGAN loss
        grid_bounds=np.array([x_edges, y_edges]),      # Row 0: x edges, row 1: y edges
        X_healthy_pca=X_healthy_pca,                   # For visual validation
        X_anomalous_pca=X_anomalous_pca,               # For final testing
    )

    # Save the fitted models so new live data can be transformed later.
    joblib.dump(scaler, f"{OUTPUT_DIR}/scaler.pkl")
    joblib.dump(pca, f"{OUTPUT_DIR}/pca.pkl")

    print(f"\n✅ SUCCESS. Pipeline artifacts saved to '{OUTPUT_DIR}/'")


if __name__ == "__main__":
    process_data()