# QGAN_Project / vGe0.1 /Fprocess_real_data_v3.py
# 1bnjmn3's picture
# Add files using upload-large-folder tool
# 0f755ec verified
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import joblib
import os
# --- CONFIGURATION ---
N_DIM = 2  # Compress 8 features -> 2D Topology
BINS_PER_DIM = 4  # 4x4 Grid = 16 discrete states (4 Qubits)
OUTPUT_DIR = "v2_pipeline_output"

# Create the output directory up front. exist_ok=True makes this a single
# call instead of a check-then-create pair, so a directory that appears
# between the check and the creation can no longer raise FileExistsError.
os.makedirs(OUTPUT_DIR, exist_ok=True)
def process_optimized_data():
    """Turn the optimized dataset into a discrete target distribution.

    Steps, in order:

    1. Load ``qgan_data_optimized.npz`` (looked up under ``vG.0.1/`` first,
       then in the current directory) and pool its train and test splits.
    2. Split the pooled rows into "healthy" (label 0) and "anomalous"
       (label 1).
    3. Fit a ``StandardScaler`` and a ``PCA`` with ``N_DIM`` components on the
       healthy rows only, then apply both to the anomalous rows.
    4. Bin the first two principal components of the healthy rows on a
       ``BINS_PER_DIM`` x ``BINS_PER_DIM`` grid and flatten the result into a
       probability vector that sums to 1.
    5. Save a scatter plot, the processed arrays, and the fitted scaler/PCA
       into ``OUTPUT_DIR``.

    Progress is reported with ``print``. The histogram reads PCA columns 0
    and 1, so ``N_DIM`` must be at least 2.

    Returns:
        None. If the input file is missing, or it contains no healthy rows,
        an error is printed and the function returns without writing
        anything.
    """
    print("πŸš€ Starting Phase I: Data Pipeline (V3 - Optimized Input)...")

    # 1. LOAD OPTIMIZED DATA (The "Golden Ticket")
    # This file already has Top 8 features and is CLEAN (No NaNs)
    file_path = 'vG.0.1/qgan_data_optimized.npz'
    if not os.path.exists(file_path):
        # Fallback check: same file name in the current working directory.
        if os.path.exists('qgan_data_optimized.npz'):
            file_path = 'qgan_data_optimized.npz'
        else:
            print(f"❌ Error: '{file_path}' not found.")
            return

    print(f" βœ… Loading: {file_path}")
    # np.load on an .npz returns a lazy archive that holds the file open;
    # read every array inside the context manager so the handle is released.
    with np.load(file_path) as data:
        X_train = data['X_train']  # Healthy + Disruptive (Mixed in previous steps?)
        y_train = data['y_train']  # Labels
        X_test = data['X_test']
        y_test = data['y_test']

    # 2. ISOLATE HEALTHY DATA FOR TRAINING
    # We must train the "Map" ONLY on healthy data
    # Combine train/test to get maximum healthy samples for the map
    X_full = np.concatenate([X_train, X_test])
    y_full = np.concatenate([y_train, y_test])
    X_healthy = X_full[y_full == 0]
    X_anomalous = X_full[y_full == 1]
    print(f" πŸ“Š Total Healthy Samples (The Map): {len(X_healthy)}")
    print(f" πŸ“Š Total Anomalous Samples (The Trap): {len(X_anomalous)}")

    # The scaler, the PCA and the histogram are all fitted on healthy rows;
    # without any there is nothing to build, so stop with a clear message
    # instead of an opaque error from deep inside the fitting code.
    if len(X_healthy) == 0:
        print(f"❌ Error: no healthy samples (label 0) found in '{file_path}'.")
        return

    # scikit-learn estimators reject zero-row input, so the anomalous set is
    # only pushed through transform() when it actually contains rows.
    has_anomalies = len(X_anomalous) > 0

    # 3. NORMALIZATION
    # Even though features are selected, they might be in different units
    print(" βš–οΈ Re-Normalizing for PCA...")
    scaler = StandardScaler()
    X_healthy_scaled = scaler.fit_transform(X_healthy)
    if has_anomalies:
        X_anomalous_scaled = scaler.transform(X_anomalous)
    else:
        X_anomalous_scaled = X_anomalous

    # 4. DIMENSIONALITY REDUCTION (8D -> 2D)
    # We compress the 8 Physics Features down to a 2D "Island Map"
    print(f" πŸ“‰ Compressing 8 Features -> {N_DIM} Dimensions...")
    pca = PCA(n_components=N_DIM)
    X_healthy_pca = pca.fit_transform(X_healthy_scaled)
    if has_anomalies:
        X_anomalous_pca = pca.transform(X_anomalous_scaled)
    else:
        # Keep the saved array 2-D with the expected number of columns.
        X_anomalous_pca = np.empty((0, N_DIM))
    print(f" Explained Variance: {pca.explained_variance_ratio_}")
    print(f" Total Information Retained: {sum(pca.explained_variance_ratio_):.2%}")

    # 5. DISCRETIZATION (The Grid)
    print(" πŸ•ΈοΈ Generating Quantum Target Distribution (4x4 Grid)...")
    hist, x_edges, y_edges = np.histogram2d(
        X_healthy_pca[:, 0],
        X_healthy_pca[:, 1],
        bins=BINS_PER_DIM,
        density=True
    )
    # Flatten to a probability vector (BINS_PER_DIM ** 2 entries) and
    # renormalize so the entries sum to 1 (density values integrate to 1
    # over area, they do not sum to 1).
    target_distribution = hist.flatten()
    target_distribution = target_distribution / np.sum(target_distribution)

    # 6. VISUALIZATION
    plot_path = f"{OUTPUT_DIR}/real_data_topology_v3.png"
    fig = plt.figure(figsize=(10, 8))
    try:
        # Healthy in Blue (The Islands)
        plt.scatter(X_healthy_pca[:, 0], X_healthy_pca[:, 1],
                    c='blue', alpha=0.3, s=10, label='Healthy (Top 8 Physics)')
        # Anomalous in Red (The Traps)
        plt.scatter(X_anomalous_pca[:, 0], X_anomalous_pca[:, 1],
                    c='red', alpha=0.3, s=10, label='Disruptions')
        # Grid lines at the histogram bin edges
        for x in x_edges:
            plt.axvline(x, color='gray', linestyle='--', alpha=0.3)
        for y in y_edges:
            plt.axhline(y, color='gray', linestyle='--', alpha=0.3)
        plt.title("V3 Topology: Optimized Physics Features (8->2 Dim)")
        plt.xlabel("PC 1")
        plt.ylabel("PC 2")
        plt.legend()
        plt.savefig(plot_path)
    finally:
        # pyplot keeps every figure alive until it is explicitly closed;
        # release it so repeated calls do not accumulate open figures.
        plt.close(fig)
    print(f" πŸ“Έ Topology map saved to '{plot_path}'")

    # 7. SAVE ARTIFACTS
    output_file = f"{OUTPUT_DIR}/processed_data.npz"
    np.savez(output_file,
             target_distribution=target_distribution,
             grid_bounds=(x_edges, y_edges),
             X_healthy_pca=X_healthy_pca,
             X_anomalous_pca=X_anomalous_pca)
    joblib.dump(scaler, f"{OUTPUT_DIR}/scaler.pkl")
    joblib.dump(pca, f"{OUTPUT_DIR}/pca.pkl")
    print("\nβœ… SUCCESS. V3 Pipeline Complete. Ready for Quantum Training.")
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    process_optimized_data()