QGAN_Project / vG.0.1 / process_real_data_v2

Add files using upload-large-folder tool

0f755ec verified 6 months ago

5.29 kB

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	from sklearn.preprocessing import StandardScaler
	from sklearn.decomposition import PCA
	import joblib
	import os

	# --- CONFIGURATION ---
	# As defined in Section 4.3 and 4.4 of the Action Plan
	N_DIM = 2  # Compress to 2D for the 4-qubit limit [cite: 147]
	BINS_PER_DIM = 4  # 4x4 Grid = 16 discrete states [cite: 157]
	OUTPUT_DIR = "v2_pipeline_output"

	# Create the artifact directory up front. exist_ok=True makes this a no-op
	# when the directory is already there and avoids the race between a
	# separate "exists?" check and the creation call.
	os.makedirs(OUTPUT_DIR, exist_ok=True)

	def _load_dataset():
	    """Return the V2 tokamak CSV as a DataFrame, or None if no candidate path exists."""
	    # Locations checked, in order, for the V2 file
	    candidate_paths = (
	        'real_tokamak_data_v2.csv',  # Root
	        'vG.0.1/real_tokamak_data_v2.csv',  # Your subfolder
	        'data/real_tokamak_data_v2.csv',  # Common data folder
	    )
	    for path in candidate_paths:
	        if os.path.exists(path):
	            print(f" ✅ Found data at: {path}")
	            return pd.read_csv(path)
	    return None


	def _build_target_distribution(points_2d):
	    """Histogram 2-D points on a BINS_PER_DIM x BINS_PER_DIM grid.

	    Returns a tuple ``(probabilities, x_edges, y_edges)`` where
	    ``probabilities`` is the flattened histogram rescaled to sum to 1.0.
	    """
	    hist, x_edges, y_edges = np.histogram2d(
	        points_2d[:, 0],
	        points_2d[:, 1],
	        bins=BINS_PER_DIM,
	        density=True,
	    )
	    # density=True yields density values, not probabilities, so the
	    # flattened grid is rescaled to sum to 1.0.
	    flat = hist.flatten()
	    return flat / np.sum(flat), x_edges, y_edges


	def _plot_topology(healthy_pca, anomalous_pca, x_edges, y_edges, n_rows):
	    """Save a scatter plot of both classes in PCA space with the grid overlaid."""
	    fig = plt.figure(figsize=(10, 8))
	    try:
	        # Plot Healthy (Blue)
	        plt.scatter(healthy_pca[:, 0], healthy_pca[:, 1],
	                    c='blue', alpha=0.3, s=10, label='Healthy Plasma (Training)')

	        # Plot Disruptive (Red)
	        plt.scatter(anomalous_pca[:, 0], anomalous_pca[:, 1],
	                    c='red', alpha=0.3, s=10, label='Disruptions (Testing)')

	        # Draw the Grid Lines
	        for x in x_edges:
	            plt.axvline(x, color='gray', linestyle='--', alpha=0.3)
	        for y in y_edges:
	            plt.axhline(y, color='gray', linestyle='--', alpha=0.3)

	        plt.title(f"Real Data Topology (PCA): {n_rows} Shots")
	        plt.xlabel("Principal Component 1")
	        plt.ylabel("Principal Component 2")
	        plt.legend()
	        plt.savefig(f"{OUTPUT_DIR}/real_data_topology.png")
	    finally:
	        # Release the figure even if plotting or saving fails.
	        plt.close(fig)


	def process_data():
	    """Run the Phase I data-engineering pipeline on the V2 tokamak CSV.

	    Steps: load the CSV, split rows by ``label`` (0 = healthy,
	    1 = anomalous), fit a StandardScaler and an N_DIM-component PCA on the
	    healthy rows only, project both classes, histogram the healthy
	    projection into the target probability vector, save a topology plot,
	    and write ``processed_data.npz``, ``scaler.pkl`` and ``pca.pkl`` into
	    OUTPUT_DIR.

	    Returns:
	        None. If the CSV cannot be found, has no ``label`` column, or has
	        no healthy rows / feature columns, an error is printed and the
	        function returns early without writing anything.
	    """
	    print("🚀 Starting Phase I: Data Engineering Pipeline...")

	    # 1. LOAD DATA
	    df = _load_dataset()
	    if df is None:
	        print(" ❌ Error: 'real_tokamak_data_v2.csv' not found in root or vG.0.1/.")
	        print(" Current Working Directory:", os.getcwd())
	        return

	    print(f" 📊 Shape: {df.shape}")

	    # 2. SEPARATE HEALTHY VS ANOMALOUS
	    # "Critical Constraint: The Scaler must be fit ONLY on the 'Healthy' training data" [cite: 144]
	    if 'label' not in df.columns:
	        # Report a clear error instead of letting the row filter raise KeyError.
	        print(" ❌ Error: required column 'label' is missing from the dataset.")
	        return

	    # Assuming 'label' is the target and 'shot_id' is metadata
	    drop_cols = ['label', 'shot_id', 'machine']  # Add 'machine' if present
	    feature_cols = [c for c in df.columns if c not in drop_cols]

	    X_healthy = df.loc[df['label'] == 0, feature_cols]
	    X_anomalous = df.loc[df['label'] == 1, feature_cols]  # For validation later

	    print(f" Healthy Samples (Training): {len(X_healthy)}")
	    print(f" Disruptive Samples (Validation): {len(X_anomalous)}")

	    if X_healthy.empty:
	        # .empty is True with zero rows or zero feature columns; neither
	        # the scaler nor PCA can be fit in that case.
	        print(" ❌ Error: no healthy samples (label == 0) or no feature columns to train on.")
	        return

	    # 3. NORMALIZATION (StandardScaler)
	    # "Subtracts the mean and divides by the standard deviation" [cite: 142]
	    print(" ⚖️ Normalizing...")
	    scaler = StandardScaler()
	    scaler.fit(X_healthy)  # Fit ONLY on healthy
	    X_healthy_scaled = scaler.transform(X_healthy)

	    # 4. DIMENSIONALITY REDUCTION (PCA)
	    # Compressing to 2 dimensions to visualize "Islands" and fit 4 qubits [cite: 147]
	    print(f" 📉 Compressing to {N_DIM} Dimensions (PCA)...")
	    pca = PCA(n_components=N_DIM)
	    pca.fit(X_healthy_scaled)  # Fit ONLY on healthy
	    X_healthy_pca = pca.transform(X_healthy_scaled)

	    if len(X_anomalous):
	        X_anomalous_pca = pca.transform(scaler.transform(X_anomalous))
	    else:
	        # scikit-learn transformers reject 0-sample input, so a dataset
	        # without disruptions gets an empty projection of the right width.
	        X_anomalous_pca = np.empty((0, N_DIM))

	    print(f" Explained Variance Ratio: {pca.explained_variance_ratio_}")

	    # 5. DISCRETIZATION (The Grid Method)
	    # "Overlay a 4x4 grid... calculate probability density" [cite: 157-158]
	    print(" 🕸️ Generating Quantum Target Distribution...")

	    # This is the "DNA" the Quantum Generator must learn to replicate [cite: 160]
	    target_distribution, x_edges, y_edges = _build_target_distribution(X_healthy_pca)

	    print(f" Target Vector Shape: {target_distribution.shape}")
	    print(f" Target Vector (First 5): {target_distribution[:5]}")

	    # Make sure the output directory exists right before writing artifacts.
	    os.makedirs(OUTPUT_DIR, exist_ok=True)

	    # 6. VISUALIZATION (The "Money Plot")
	    # We verify if the Real Data actually has the "Islands" topology [cite: 151]
	    _plot_topology(X_healthy_pca, X_anomalous_pca, x_edges, y_edges, len(df))
	    print(f" 📸 Topology map saved to '{OUTPUT_DIR}/real_data_topology.png'")

	    # 7. SAVE EVERYTHING
	    # We need these for the QGAN training script
	    output_file = f"{OUTPUT_DIR}/processed_data.npz"
	    np.savez(output_file,
	             target_distribution=target_distribution,  # For QGAN Loss
	             grid_bounds=(x_edges, y_edges),  # For Discriminator Input
	             X_healthy_pca=X_healthy_pca,  # For visual validation
	             X_anomalous_pca=X_anomalous_pca)  # For final testing

	    # Save the models so we can run new live data later
	    joblib.dump(scaler, f"{OUTPUT_DIR}/scaler.pkl")
	    joblib.dump(pca, f"{OUTPUT_DIR}/pca.pkl")

	    print(f"\n✅ SUCCESS. Pipeline artifacts saved to '{OUTPUT_DIR}/'")

	# Run the pipeline only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    process_data()