# File size: 5,292 Bytes

# 0f755ec

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import joblib
import os

# --- CONFIGURATION ---
# As defined in Sections 4.3 and 4.4 of the Action Plan
N_DIM = 2            # Compress to 2D for the 4-qubit limit [cite: 147]
BINS_PER_DIM = 4     # 4x4 Grid = 16 discrete states [cite: 157]
OUTPUT_DIR = "v2_pipeline_output"

# exist_ok=True replaces the exists()-then-makedirs() pair, which could raise
# FileExistsError if the directory appeared between the check and the create.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def process_data():
    """Run the Phase I data-engineering pipeline and write its artifacts.

    Steps performed, in order:

    1. Load ``real_tokamak_data_v2.csv`` from the first candidate path that
       exists.
    2. Split rows into healthy (``label == 0``) and anomalous (``label == 1``)
       feature matrices; ``label``, ``shot_id`` and ``machine`` are excluded
       from the features.
    3. Fit a ``StandardScaler`` on the healthy rows only and scale both sets.
    4. Fit a ``PCA`` with ``N_DIM`` components on the healthy rows only and
       project both sets.
    5. Histogram the healthy projection on a ``BINS_PER_DIM`` x
       ``BINS_PER_DIM`` grid, flatten it and normalize it to sum to 1.
    6. Save a scatter plot of both projections with the grid overlaid.
    7. Save ``processed_data.npz``, ``scaler.pkl`` and ``pca.pkl`` under
       ``OUTPUT_DIR``.

    Returns:
        None. If the CSV cannot be found, the ``label`` column is missing, or
        there are too few healthy rows / feature columns to fit the PCA, an
        error is printed and the function returns early without writing
        anything.
    """
    print("🚀 Starting Phase I: Data Engineering Pipeline...")

    # 1. LOAD DATA
    # Candidate locations for the V2 file, tried in order.
    possible_paths = [
        'real_tokamak_data_v2.csv',          # Root
        'vG.0.1/real_tokamak_data_v2.csv',   # Your subfolder
        'data/real_tokamak_data_v2.csv'      # Common data folder
    ]

    df = None
    for path in possible_paths:
        if os.path.exists(path):
            print(f"   ✅ Found data at: {path}")
            df = pd.read_csv(path)
            break

    if df is None:
        # Report every location that was actually searched.
        print("   ❌ Error: 'real_tokamak_data_v2.csv' not found. Searched: "
              + ", ".join(possible_paths))
        print("      Current Working Directory:", os.getcwd())
        return

    print(f"   📊 Shape: {df.shape}")

    # 2. SEPARATE HEALTHY VS ANOMALOUS
    # "Critical Constraint: The Scaler must be fit ONLY on the 'Healthy' training data"
    if 'label' not in df.columns:
        # Without the label there is no way to split the data; fail the same
        # way the missing-file case does instead of raising a bare KeyError.
        print("   ❌ Error: required column 'label' is missing from the dataset.")
        return

    # 'label' is the target; 'shot_id' and 'machine' are treated as metadata
    # and are skipped when present.
    drop_cols = ['label', 'shot_id', 'machine']
    feature_cols = [c for c in df.columns if c not in drop_cols]

    X_healthy = df.loc[df['label'] == 0, feature_cols]
    X_anomalous = df.loc[df['label'] == 1, feature_cols]  # For validation later

    print(f"   Healthy Samples (Training): {len(X_healthy)}")
    print(f"   Disruptive Samples (Validation): {len(X_anomalous)}")

    # PCA needs n_components <= min(n_samples, n_features); check up front so
    # the failure is explained rather than surfacing from inside scikit-learn.
    if min(len(X_healthy), len(feature_cols)) < N_DIM:
        print(f"   ❌ Error: need at least {N_DIM} healthy samples and {N_DIM} "
              f"feature columns (got {len(X_healthy)} samples, "
              f"{len(feature_cols)} features).")
        return

    # 3. NORMALIZATION (StandardScaler)
    print("   ⚖️  Normalizing...")
    scaler = StandardScaler()
    scaler.fit(X_healthy)  # Fit ONLY on healthy
    X_healthy_scaled = scaler.transform(X_healthy)

    # 4. DIMENSIONALITY REDUCTION (PCA)
    print(f"   📉 Compressing to {N_DIM} Dimensions (PCA)...")
    pca = PCA(n_components=N_DIM)
    pca.fit(X_healthy_scaled)  # Fit ONLY on healthy
    X_healthy_pca = pca.transform(X_healthy_scaled)

    if len(X_anomalous) > 0:
        X_anomalous_pca = pca.transform(scaler.transform(X_anomalous))
    else:
        # scikit-learn transformers reject 0-sample input; keep the pipeline
        # running with an empty projection of the right width.
        X_anomalous_pca = np.empty((0, N_DIM))

    print(f"   Explained Variance Ratio: {pca.explained_variance_ratio_}")

    # 5. DISCRETIZATION (The Grid Method)
    print("   🕸️  Generating Quantum Target Distribution...")

    # Histogram the healthy 2D projection into BINS_PER_DIM x BINS_PER_DIM bins.
    hist, x_edges, y_edges = np.histogram2d(
        X_healthy_pca[:, 0],
        X_healthy_pca[:, 1],
        bins=BINS_PER_DIM,
        density=True
    )

    # Flatten to a 1D vector and renormalize so the entries sum to 1.0
    # (density=True integrates to 1 over area, not over the bin count).
    target_distribution = hist.flatten()
    target_distribution = target_distribution / np.sum(target_distribution)

    print(f"   Target Vector Shape: {target_distribution.shape}")
    print(f"   Target Vector (First 5): {target_distribution[:5]}")

    # The directory is created at import time, but make sure it still exists
    # right before writing artifacts.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # 6. VISUALIZATION
    fig = plt.figure(figsize=(10, 8))

    # Healthy (blue)
    plt.scatter(X_healthy_pca[:, 0], X_healthy_pca[:, 1],
                c='blue', alpha=0.3, s=10, label='Healthy Plasma (Training)')

    # Disruptive (red)
    plt.scatter(X_anomalous_pca[:, 0], X_anomalous_pca[:, 1],
                c='red', alpha=0.3, s=10, label='Disruptions (Testing)')

    # Overlay the histogram grid lines.
    for x in x_edges:
        plt.axvline(x, color='gray', linestyle='--', alpha=0.3)
    for y in y_edges:
        plt.axhline(y, color='gray', linestyle='--', alpha=0.3)

    plt.title(f"Real Data Topology (PCA): {len(df)} Shots")
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.legend()
    plt.savefig(f"{OUTPUT_DIR}/real_data_topology.png")
    # Release the figure; pyplot keeps figures alive until explicitly closed.
    plt.close(fig)
    print(f"   📸 Topology map saved to '{OUTPUT_DIR}/real_data_topology.png'")

    # 7. SAVE EVERYTHING
    # Arrays needed by the downstream QGAN training script.
    output_file = f"{OUTPUT_DIR}/processed_data.npz"
    np.savez(output_file,
             target_distribution=target_distribution,  # For QGAN Loss
             grid_bounds=(x_edges, y_edges),           # For Discriminator Input
             X_healthy_pca=X_healthy_pca,              # For visual validation
             X_anomalous_pca=X_anomalous_pca)          # For final testing

    # Persist the fitted transformers so new data can be projected later.
    joblib.dump(scaler, f"{OUTPUT_DIR}/scaler.pkl")
    joblib.dump(pca, f"{OUTPUT_DIR}/pca.pkl")

    print(f"\n✅ SUCCESS. Pipeline artifacts saved to '{OUTPUT_DIR}/'")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    process_data()