# QGAN_Project / vGe0.1 /Fprocess_real_data_v2
# 1bnjmn3's picture
# Add files using upload-large-folder tool
# 0f755ec verified
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import joblib
import os
# --- CONFIGURATION ---
# As defined in Section 4.3 and 4.4 of the Action Plan
N_DIM = 2  # Compress to 2D for the 4-qubit limit [cite: 147]
BINS_PER_DIM = 4  # 4x4 Grid = 16 discrete states [cite: 157]
OUTPUT_DIR = "v2_pipeline_output"

# Create the artifact directory up front. exist_ok=True replaces the
# check-then-create pattern, which can raise FileExistsError if another
# process creates the directory between the check and the makedirs call.
os.makedirs(OUTPUT_DIR, exist_ok=True)
def _load_first_existing_csv(candidate_paths):
    """Read and return the first CSV in *candidate_paths* that exists, else None."""
    for path in candidate_paths:
        if os.path.exists(path):
            print(f" ✅ Found data at: {path}")
            return pd.read_csv(path)
    return None


def _save_topology_plot(healthy_pca, anomalous_pca, x_edges, y_edges,
                        n_shots, out_path):
    """Scatter both classes in PCA space, overlay the grid, and save to *out_path*."""
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        # Plot Healthy (Blue)
        ax.scatter(healthy_pca[:, 0], healthy_pca[:, 1],
                   c='blue', alpha=0.3, s=10,
                   label='Healthy Plasma (Training)')
        # Plot Disruptive (Red)
        ax.scatter(anomalous_pca[:, 0], anomalous_pca[:, 1],
                   c='red', alpha=0.3, s=10,
                   label='Disruptions (Testing)')
        # Draw the grid lines at the histogram bin edges
        for x in x_edges:
            ax.axvline(x, color='gray', linestyle='--', alpha=0.3)
        for y in y_edges:
            ax.axhline(y, color='gray', linestyle='--', alpha=0.3)
        ax.set_title(f"Real Data Topology (PCA): {n_shots} Shots")
        ax.set_xlabel("Principal Component 1")
        ax.set_ylabel("Principal Component 2")
        ax.legend()
        fig.savefig(out_path)
    finally:
        # Always release the figure; pyplot otherwise keeps it alive for the
        # lifetime of the process.
        plt.close(fig)


def process_data() -> None:
    """Run the Phase I data-engineering pipeline and write its artifacts.

    Steps:
      1. Load 'real_tokamak_data_v2.csv' from the first known location.
      2. Split rows into healthy (label == 0) and disruptive (label == 1).
      3. Fit a StandardScaler and an N_DIM-component PCA on healthy rows only,
         then project both groups.
      4. Histogram the healthy projection on a BINS_PER_DIM x BINS_PER_DIM
         grid and flatten it into a probability vector that sums to 1.
      5. Save a scatter plot, an .npz bundle (target_distribution,
         grid_bounds, X_healthy_pca, X_anomalous_pca) and the fitted
         scaler/PCA pickles under OUTPUT_DIR.

    Returns:
        None. If the CSV cannot be found, an error is printed and the
        function returns without writing anything.

    Raises:
        KeyError: the CSV has no 'label' column.
        ValueError: the CSV contains no healthy (label == 0) rows.
    """
    print("🚀 Starting Phase I: Data Engineering Pipeline...")

    # 1. LOAD DATA
    # We check possible locations for the V2 file
    possible_paths = [
        'real_tokamak_data_v2.csv',          # Root
        'vG.0.1/real_tokamak_data_v2.csv',   # Your subfolder
        'data/real_tokamak_data_v2.csv',     # Common data folder
    ]
    df = _load_first_existing_csv(possible_paths)
    if df is None:
        # Report every location that was actually searched.
        print(" ❌ Error: 'real_tokamak_data_v2.csv' not found. "
              f"Searched: {', '.join(possible_paths)}")
        print(" Current Working Directory:", os.getcwd())
        return
    print(f" 📊 Shape: {df.shape}")

    # 2. SEPARATE HEALTHY VS ANOMALOUS
    # "Critical Constraint: The Scaler must be fit ONLY on the 'Healthy'
    # training data" [cite: 144]
    if 'label' not in df.columns:
        raise KeyError(
            "input data has no 'label' column; it is required to separate "
            "healthy (0) from disruptive (1) shots"
        )
    # 'label' is the target; 'shot_id' and 'machine' are metadata (if present).
    # NOTE(review): every remaining column is treated as a numeric feature --
    # confirm the CSV has no other non-numeric columns.
    drop_cols = ['label', 'shot_id', 'machine']
    feature_cols = [c for c in df.columns if c not in drop_cols]
    X_healthy = df.loc[df['label'] == 0, feature_cols]
    X_anomalous = df.loc[df['label'] == 1, feature_cols]  # For validation later
    print(f" Healthy Samples (Training): {len(X_healthy)}")
    print(f" Disruptive Samples (Validation): {len(X_anomalous)}")
    if X_healthy.empty:
        raise ValueError(
            "no healthy samples (label == 0) found; the scaler and PCA "
            "cannot be fit"
        )

    # 3. NORMALIZATION (StandardScaler)
    # "Subtracts the mean and divides by the standard deviation" [cite: 142]
    print(" ⚖️ Normalizing...")
    scaler = StandardScaler()
    scaler.fit(X_healthy)  # Fit ONLY on healthy
    X_healthy_scaled = scaler.transform(X_healthy)

    # 4. DIMENSIONALITY REDUCTION (PCA)
    # Compressing to 2 dimensions to visualize "Islands" and fit 4 qubits
    # [cite: 147]
    print(f" 📉 Compressing to {N_DIM} Dimensions (PCA)...")
    pca = PCA(n_components=N_DIM)
    pca.fit(X_healthy_scaled)  # Fit ONLY on healthy
    X_healthy_pca = pca.transform(X_healthy_scaled)
    if len(X_anomalous):
        X_anomalous_pca = pca.transform(scaler.transform(X_anomalous))
    else:
        # scikit-learn transformers reject 0-sample input, so a dataset with
        # no disruptions gets an empty projection instead of crashing.
        X_anomalous_pca = np.empty((0, N_DIM))
    print(f" Explained Variance Ratio: {pca.explained_variance_ratio_}")

    # 5. DISCRETIZATION (The Grid Method)
    # "Overlay a 4x4 grid... calculate probability density" [cite: 157-158]
    print(" 🕸️ Generating Quantum Target Distribution...")
    hist, x_edges, y_edges = np.histogram2d(
        X_healthy_pca[:, 0],
        X_healthy_pca[:, 1],
        bins=BINS_PER_DIM,
        density=True,
    )
    # Flatten to a 1D probability vector (BINS_PER_DIM ** 2 entries).
    # This is the "DNA" the Quantum Generator must learn to replicate
    # [cite: 160]
    target_distribution = hist.flatten()
    # Densities integrate to 1 over area; renormalize so entries sum to 1.0.
    target_distribution = target_distribution / np.sum(target_distribution)
    print(f" Target Vector Shape: {target_distribution.shape}")
    print(f" Target Vector (First 5): {target_distribution[:5]}")

    # Do not rely on module import having created the directory (it may have
    # been removed since, or the working directory may have changed).
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # 6. VISUALIZATION (The "Money Plot")
    # We verify if the Real Data actually has the "Islands" topology
    # [cite: 151]
    _save_topology_plot(
        X_healthy_pca, X_anomalous_pca, x_edges, y_edges,
        n_shots=len(df),
        out_path=os.path.join(OUTPUT_DIR, "real_data_topology.png"),
    )
    print(f" 📸 Topology map saved to '{OUTPUT_DIR}/real_data_topology.png'")

    # 7. SAVE EVERYTHING
    # We need these for the QGAN training script
    np.savez(
        os.path.join(OUTPUT_DIR, "processed_data.npz"),
        target_distribution=target_distribution,      # For QGAN Loss
        # Row 0 = x edges, row 1 = y edges (each BINS_PER_DIM + 1 long).
        grid_bounds=np.stack((x_edges, y_edges)),     # For Discriminator Input
        X_healthy_pca=X_healthy_pca,                  # For visual validation
        X_anomalous_pca=X_anomalous_pca,              # For final testing
    )
    # Save the models so we can run new live data later
    joblib.dump(scaler, os.path.join(OUTPUT_DIR, "scaler.pkl"))
    joblib.dump(pca, os.path.join(OUTPUT_DIR, "pca.pkl"))
    print(f"\n✅ SUCCESS. Pipeline artifacts saved to '{OUTPUT_DIR}/'")
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    process_data()