QGAN_Project / vGe0.1 /Fprocess_real_data_v3.py

Add files using upload-large-folder tool

0f755ec verified 6 months ago

4.29 kB

	import numpy as np
	import matplotlib.pyplot as plt
	from sklearn.preprocessing import StandardScaler
	from sklearn.decomposition import PCA
	import joblib
	import os

	# --- CONFIGURATION ---
	# Number of principal components kept: the 8 input features are compressed
	# to a 2D topology.
	N_DIM = 2
	# Histogram bins per PCA axis: a 4x4 grid gives 16 discrete states (4 qubits).
	BINS_PER_DIM = 4
	# Directory that receives every artifact written by this script.
	OUTPUT_DIR = "v2_pipeline_output"

	# exist_ok=True replaces the exists()/makedirs() pair: it is free of the
	# check-then-create race and raises if the path exists but is not a directory.
	os.makedirs(OUTPUT_DIR, exist_ok=True)

	def process_optimized_data():
	    """Build the discretized 2D target distribution from the optimized dataset.

	    Pipeline steps:

	    1. Load ``X_train``/``y_train``/``X_test``/``y_test`` from
	       ``vG.0.1/qgan_data_optimized.npz`` (falling back to
	       ``qgan_data_optimized.npz`` in the working directory).
	    2. Merge train and test, then split rows by label: ``0`` is treated as
	       healthy, ``1`` as anomalous.
	    3. Fit a ``StandardScaler`` on the healthy rows only and apply it to both
	       groups.
	    4. Fit a ``PCA`` with ``N_DIM`` components on the scaled healthy rows and
	       project both groups.
	    5. Histogram the healthy projection on a ``BINS_PER_DIM`` x
	       ``BINS_PER_DIM`` grid and flatten it into a probability vector.
	    6. Save a scatter plot with the grid overlaid.
	    7. Save the distribution, grid edges, both projections, the scaler and
	       the PCA model into ``OUTPUT_DIR``.

	    Returns:
	        None. If neither input path exists, an error message is printed and
	        the function returns without writing anything.
	    """
	    print("🚀 Starting Phase I: Data Pipeline (V3 - Optimized Input)...")

	    # 1. LOAD OPTIMIZED DATA (The "Golden Ticket")
	    # This file is expected to already hold the Top 8 features, cleaned of NaNs.
	    file_path = 'vG.0.1/qgan_data_optimized.npz'
	    if not os.path.exists(file_path):
	        # Fallback: look for the same archive in the current directory.
	        fallback_path = 'qgan_data_optimized.npz'
	        if not os.path.exists(fallback_path):
	            # The message intentionally reports the primary path.
	            print(f"❌ Error: '{file_path}' not found.")
	            return
	        file_path = fallback_path

	    print(f" ✅ Loading: {file_path}")
	    # The context manager closes the underlying .npz file handle; the arrays
	    # are fully read into memory when indexed, so they stay valid afterwards.
	    with np.load(file_path) as data:
	        X_train = data['X_train']  # Healthy + disruptive (mixed upstream? - TODO confirm)
	        y_train = data['y_train']  # Labels
	        X_test = data['X_test']
	        y_test = data['y_test']

	    # 2. ISOLATE HEALTHY DATA FOR TRAINING
	    # The "Map" must be learned ONLY from healthy data. Train and test are
	    # merged to maximise the number of healthy samples available for it.
	    X_full = np.concatenate([X_train, X_test])
	    y_full = np.concatenate([y_train, y_test])

	    X_healthy = X_full[y_full == 0]
	    X_anomalous = X_full[y_full == 1]

	    print(f" 📊 Total Healthy Samples (The Map): {len(X_healthy)}")
	    print(f" 📊 Total Anomalous Samples (The Trap): {len(X_anomalous)}")

	    # 3. NORMALIZATION
	    # Selected features may still be in different units. The scaler is fitted
	    # on healthy data only; anomalies are transformed with the same statistics.
	    print(" ⚖️ Re-Normalizing for PCA...")
	    scaler = StandardScaler()
	    X_healthy_scaled = scaler.fit_transform(X_healthy)
	    X_anomalous_scaled = scaler.transform(X_anomalous)

	    # 4. DIMENSIONALITY REDUCTION (8D -> 2D)
	    # The projection is likewise learned from healthy data only.
	    print(f" 📉 Compressing 8 Features -> {N_DIM} Dimensions...")
	    pca = PCA(n_components=N_DIM)
	    X_healthy_pca = pca.fit_transform(X_healthy_scaled)
	    X_anomalous_pca = pca.transform(X_anomalous_scaled)

	    print(f" Explained Variance: {pca.explained_variance_ratio_}")
	    print(f" Total Information Retained: {sum(pca.explained_variance_ratio_):.2%}")

	    # 5. DISCRETIZATION (The Grid)
	    print(" 🕸️ Generating Quantum Target Distribution (4x4 Grid)...")

	    hist, x_edges, y_edges = np.histogram2d(
	        X_healthy_pca[:, 0],
	        X_healthy_pca[:, 1],
	        bins=BINS_PER_DIM,
	        density=True,
	    )

	    # Flatten the grid to one vector (one entry per cell) and renormalise:
	    # density=True yields a probability *density*, so the cells must be
	    # rescaled to sum to 1 before being used as a probability distribution.
	    target_distribution = hist.flatten()
	    target_distribution = target_distribution / np.sum(target_distribution)

	    # 6. VISUALIZATION
	    fig = plt.figure(figsize=(10, 8))
	    # Healthy in blue (The Islands)
	    plt.scatter(X_healthy_pca[:, 0], X_healthy_pca[:, 1],
	                c='blue', alpha=0.3, s=10, label='Healthy (Top 8 Physics)')
	    # Anomalous in red (The Traps)
	    plt.scatter(X_anomalous_pca[:, 0], X_anomalous_pca[:, 1],
	                c='red', alpha=0.3, s=10, label='Disruptions')

	    # Overlay the histogram bin edges as grid lines.
	    for x in x_edges:
	        plt.axvline(x, color='gray', linestyle='--', alpha=0.3)
	    for y in y_edges:
	        plt.axhline(y, color='gray', linestyle='--', alpha=0.3)

	    plt.title("V3 Topology: Optimized Physics Features (8->2 Dim)")
	    plt.xlabel("PC 1")
	    plt.ylabel("PC 2")
	    plt.legend()
	    plt.savefig(f"{OUTPUT_DIR}/real_data_topology_v3.png")
	    # Release the figure so repeated calls do not accumulate open figures.
	    plt.close(fig)
	    print(f" 📸 Topology map saved to '{OUTPUT_DIR}/real_data_topology_v3.png'")

	    # 7. SAVE ARTIFACTS
	    output_file = f"{OUTPUT_DIR}/processed_data.npz"
	    np.savez(output_file,
	             target_distribution=target_distribution,
	             grid_bounds=(x_edges, y_edges),
	             X_healthy_pca=X_healthy_pca,
	             X_anomalous_pca=X_anomalous_pca)

	    # Persist the fitted transformers so new samples can be mapped identically.
	    joblib.dump(scaler, f"{OUTPUT_DIR}/scaler.pkl")
	    joblib.dump(pca, f"{OUTPUT_DIR}/pca.pkl")

	    print("\n✅ SUCCESS. V3 Pipeline Complete. Ready for Quantum Training.")

	# Script entry point: run the full pipeline only when executed directly,
	# not when this module is imported.
	if __name__ == "__main__":
	    process_optimized_data()