"""Phase I data pipeline (V3): optimized features -> 2D PCA map -> grid distribution.

Loads the pre-selected feature arrays from ``qgan_data_optimized.npz``, fits a
scaler and a PCA on the healthy samples only, discretizes the healthy 2D
projection into a ``BINS_PER_DIM`` x ``BINS_PER_DIM`` probability grid, and
writes the plot, arrays and fitted transformers to ``OUTPUT_DIR``.
"""

import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# --- CONFIGURATION ---
N_DIM = 2  # Compress 8 features -> 2D Topology
BINS_PER_DIM = 4  # 4x4 Grid = 16 discrete states (4 Qubits)
OUTPUT_DIR = "v2_pipeline_output"

# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)


def process_optimized_data():
    """Run the full preprocessing pipeline and persist its artifacts.

    Steps:
        1. Load ``X_train``/``y_train``/``X_test``/``y_test`` from the
           optimized ``.npz`` file (preferred location first, then the
           current working directory).
        2. Merge train and test, then split by label: ``0`` is treated as
           healthy, ``1`` as anomalous.
        3. Fit ``StandardScaler`` and ``PCA(n_components=N_DIM)`` on the
           healthy samples only; apply both to the anomalous samples.
        4. Build a normalized 2D histogram of the healthy projection and
           flatten it into the target probability vector.
        5. Save a scatter plot with the grid overlaid, an ``.npz`` with the
           distribution, grid edges and projections, and the fitted scaler
           and PCA as pickles.

    Returns:
        None. If the input file cannot be found, an error is printed and the
        function returns early without writing anything.

    Raises:
        ValueError: If the loaded data contains no healthy (label ``0``)
            samples, since the scaler and PCA cannot be fitted.
    """
    print("πŸš€ Starting Phase I: Data Pipeline (V3 - Optimized Input)...")

    # 1. LOAD OPTIMIZED DATA (The "Golden Ticket")
    # Per the original author, this file already has the top 8 features and
    # is clean (no NaNs); that is not re-validated here.
    primary_path = 'vG.0.1/qgan_data_optimized.npz'
    fallback_path = 'qgan_data_optimized.npz'
    if os.path.exists(primary_path):
        file_path = primary_path
    elif os.path.exists(fallback_path):
        file_path = fallback_path
    else:
        # The message reports the preferred path, as before.
        file_path = primary_path
        print(f"❌ Error: '{file_path}' not found.")
        return

    print(f" βœ… Loading: {file_path}")
    # NpzFile keeps the archive open until closed; the arrays are fully
    # materialized on indexing, so they remain valid after the with-block.
    with np.load(file_path) as data:
        X_train = data['X_train']  # Healthy + Disruptive (mixed upstream? unverified)
        y_train = data['y_train']  # Labels
        X_test = data['X_test']
        y_test = data['y_test']

    # 2. ISOLATE HEALTHY DATA FOR TRAINING
    # The "map" must be learned ONLY from healthy data, so train and test are
    # pooled to maximize the number of healthy samples available.
    X_full = np.concatenate([X_train, X_test])
    y_full = np.concatenate([y_train, y_test])

    X_healthy = X_full[y_full == 0]
    X_anomalous = X_full[y_full == 1]

    print(f" πŸ“Š Total Healthy Samples (The Map): {len(X_healthy)}")
    print(f" πŸ“Š Total Anomalous Samples (The Trap): {len(X_anomalous)}")

    if len(X_healthy) == 0:
        # Fail early with a clear message instead of an opaque estimator error.
        raise ValueError(
            "No healthy samples (label 0) found; cannot fit scaler/PCA."
        )

    # 3. NORMALIZATION
    # Even though features are pre-selected, they may be in different units.
    print(" βš–οΈ Re-Normalizing for PCA...")
    scaler = StandardScaler()
    X_healthy_scaled = scaler.fit_transform(X_healthy)

    # 4. DIMENSIONALITY REDUCTION (8D -> 2D)
    # Compress the physics features down to a 2D "Island Map".
    print(f" πŸ“‰ Compressing 8 Features -> {N_DIM} Dimensions...")
    pca = PCA(n_components=N_DIM)
    X_healthy_pca = pca.fit_transform(X_healthy_scaled)

    if len(X_anomalous):
        X_anomalous_pca = pca.transform(scaler.transform(X_anomalous))
    else:
        # sklearn transformers reject zero-row input; keep the pipeline usable
        # for healthy-only datasets by emitting an empty projection.
        X_anomalous_pca = np.empty((0, N_DIM))

    print(f" Explained Variance: {pca.explained_variance_ratio_}")
    print(f" Total Information Retained: {sum(pca.explained_variance_ratio_):.2%}")

    # 5. DISCRETIZATION (The Grid)
    print(" πŸ•ΈοΈ Generating Quantum Target Distribution (4x4 Grid)...")
    hist, x_edges, y_edges = np.histogram2d(
        X_healthy_pca[:, 0],
        X_healthy_pca[:, 1],
        bins=BINS_PER_DIM,
        density=True,
    )

    # Flatten to a probability vector (BINS_PER_DIM ** 2 entries). The density
    # values are re-normalized so the vector sums to 1 regardless of bin area.
    target_distribution = hist.flatten()
    target_distribution = target_distribution / np.sum(target_distribution)

    # 6. VISUALIZATION
    fig = plt.figure(figsize=(10, 8))

    # Healthy in blue (the islands)
    plt.scatter(X_healthy_pca[:, 0], X_healthy_pca[:, 1],
                c='blue', alpha=0.3, s=10, label='Healthy (Top 8 Physics)')

    # Anomalous in red (the traps)
    plt.scatter(X_anomalous_pca[:, 0], X_anomalous_pca[:, 1],
                c='red', alpha=0.3, s=10, label='Disruptions')

    # Grid lines at the histogram bin edges
    for x in x_edges:
        plt.axvline(x, color='gray', linestyle='--', alpha=0.3)
    for y in y_edges:
        plt.axhline(y, color='gray', linestyle='--', alpha=0.3)

    plt.title("V3 Topology: Optimized Physics Features (8->2 Dim)")
    plt.xlabel("PC 1")
    plt.ylabel("PC 2")
    plt.legend()
    plt.savefig(f"{OUTPUT_DIR}/real_data_topology_v3.png")
    # Release the figure; pyplot otherwise keeps it alive for the process lifetime.
    plt.close(fig)
    print(f" πŸ“Έ Topology map saved to '{OUTPUT_DIR}/real_data_topology_v3.png'")

    # 7. SAVE ARTIFACTS
    output_file = f"{OUTPUT_DIR}/processed_data.npz"
    np.savez(output_file,
             target_distribution=target_distribution,
             grid_bounds=(x_edges, y_edges),
             X_healthy_pca=X_healthy_pca,
             X_anomalous_pca=X_anomalous_pca)

    joblib.dump(scaler, f"{OUTPUT_DIR}/scaler.pkl")
    joblib.dump(pca, f"{OUTPUT_DIR}/pca.pkl")

    print("\nβœ… SUCCESS. V3 Pipeline Complete. Ready for Quantum Training.")


if __name__ == "__main__":
    process_optimized_data()