import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import joblib
import os

# --- CONFIGURATION ---
# Number of principal components kept by the PCA projection.
N_DIM = 2            # Compress 8 features -> 2D Topology
# Number of histogram bins along each principal axis.
BINS_PER_DIM = 4     # 4x4 Grid = 16 discrete states (4 Qubits)
# Directory that receives every artifact written by the pipeline.
OUTPUT_DIR = "v2_pipeline_output"

# exist_ok=True makes the creation idempotent and avoids the race between a
# separate "does it exist?" check and the actual mkdir.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def process_optimized_data():
    """Build the 2-D "map" of healthy data and its discretised distribution.

    Pipeline steps:

    1. Load ``X_train``/``y_train``/``X_test``/``y_test`` from
       ``vG.0.1/qgan_data_optimized.npz``, falling back to
       ``qgan_data_optimized.npz`` in the current directory.
    2. Pool train and test rows and split them by label
       (``0`` -> healthy, ``1`` -> anomalous).
    3. Fit a ``StandardScaler`` and a ``PCA`` with ``N_DIM`` components on the
       healthy rows only, then apply both to the anomalous rows.
    4. Histogram the first two principal components of the healthy rows on a
       ``BINS_PER_DIM`` x ``BINS_PER_DIM`` grid and normalise the result into
       a flat probability vector.
    5. Write a scatter plot, an ``.npz`` bundle, and the pickled scaler and
       PCA objects into ``OUTPUT_DIR``.

    Returns:
        None. When the input file cannot be found, or when the dataset holds
        no healthy rows, an error is printed and the function returns early
        without writing anything.
    """
    print("🚀 Starting Phase I: Data Pipeline (V3 - Optimized Input)...")

    # 1. LOAD OPTIMIZED DATA (The "Golden Ticket")
    # This file already has Top 8 features and is CLEAN (No NaNs)
    file_path = 'vG.0.1/qgan_data_optimized.npz'

    if not os.path.exists(file_path):
        # Fallback: same file name, relative to the current directory.
        if os.path.exists('qgan_data_optimized.npz'):
            file_path = 'qgan_data_optimized.npz'
        else:
            print(f"❌ Error: '{file_path}' not found.")
            return

    print(f"   ✅ Loading: {file_path}")
    # NpzFile keeps the archive open until closed; the context manager
    # releases the handle as soon as the four arrays have been read.
    with np.load(file_path) as data:
        X_train = data['X_train']
        y_train = data['y_train']
        X_test = data['X_test']
        y_test = data['y_test']

    # 2. ISOLATE HEALTHY DATA FOR TRAINING
    # The "map" must be learned from healthy rows only, so train and test are
    # pooled to get as many healthy samples as possible.
    X_full = np.concatenate([X_train, X_test])
    y_full = np.concatenate([y_train, y_test])

    X_healthy = X_full[y_full == 0]
    X_anomalous = X_full[y_full == 1]

    print(f"   📊 Total Healthy Samples (The Map): {len(X_healthy)}")
    print(f"   📊 Total Anomalous Samples (The Trap): {len(X_anomalous)}")

    if len(X_healthy) == 0:
        # Nothing to fit the scaler / PCA on; fail the same way a missing
        # input file does instead of surfacing an opaque estimator error.
        print("❌ Error: no healthy samples (label 0) found in the dataset.")
        return

    # 3. NORMALIZATION
    # Even though features are selected, they might be in different units.
    print("   ⚖️  Re-Normalizing for PCA...")
    scaler = StandardScaler()
    X_healthy_scaled = scaler.fit_transform(X_healthy)

    # 4. DIMENSIONALITY REDUCTION (8D -> 2D)
    print(f"   📉 Compressing 8 Features -> {N_DIM} Dimensions...")
    pca = PCA(n_components=N_DIM)
    X_healthy_pca = pca.fit_transform(X_healthy_scaled)

    if len(X_anomalous):
        X_anomalous_pca = pca.transform(scaler.transform(X_anomalous))
    else:
        # scikit-learn transformers reject zero-sample input, so an
        # anomaly-free dataset is represented by an empty projection.
        X_anomalous_pca = np.empty((0, N_DIM))

    print(f"   Explained Variance: {pca.explained_variance_ratio_}")
    print(f"   Total Information Retained: {sum(pca.explained_variance_ratio_):.2%}")

    # 5. DISCRETIZATION (The Grid)
    print("   🕸️  Generating Quantum Target Distribution (4x4 Grid)...")

    # Only the first two principal components are binned; this requires
    # N_DIM >= 2 and ignores any further components.
    hist, x_edges, y_edges = np.histogram2d(
        X_healthy_pca[:, 0],
        X_healthy_pca[:, 1],
        bins=BINS_PER_DIM,
        density=True,
    )

    # Flatten to a vector of BINS_PER_DIM**2 entries and rescale so the
    # entries sum to 1 (bin probabilities rather than densities).
    target_distribution = hist.flatten()
    target_distribution = target_distribution / np.sum(target_distribution)

    # The module creates OUTPUT_DIR at import time; re-ensure it here so the
    # writes below still succeed if it was removed in the meantime.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # 6. VISUALIZATION
    fig = plt.figure(figsize=(10, 8))
    try:
        # Healthy in Blue (The Islands)
        plt.scatter(X_healthy_pca[:, 0], X_healthy_pca[:, 1],
                    c='blue', alpha=0.3, s=10, label='Healthy (Top 8 Physics)')
        # Anomalous in Red (The Traps)
        plt.scatter(X_anomalous_pca[:, 0], X_anomalous_pca[:, 1],
                    c='red', alpha=0.3, s=10, label='Disruptions')

        # Grid lines at the histogram bin edges.
        for x in x_edges:
            plt.axvline(x, color='gray', linestyle='--', alpha=0.3)
        for y in y_edges:
            plt.axhline(y, color='gray', linestyle='--', alpha=0.3)

        plt.title("V3 Topology: Optimized Physics Features (8->2 Dim)")
        plt.xlabel("PC 1")
        plt.ylabel("PC 2")
        plt.legend()
        plt.savefig(f"{OUTPUT_DIR}/real_data_topology_v3.png")
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
    print(f"   📸 Topology map saved to '{OUTPUT_DIR}/real_data_topology_v3.png'")

    # 7. SAVE ARTIFACTS
    output_file = f"{OUTPUT_DIR}/processed_data.npz"
    # grid_bounds is stored as one stacked array: both edge vectors have
    # BINS_PER_DIM + 1 entries, so the tuple converts to a 2-row array.
    np.savez(output_file,
             target_distribution=target_distribution,
             grid_bounds=(x_edges, y_edges),
             X_healthy_pca=X_healthy_pca,
             X_anomalous_pca=X_anomalous_pca)

    joblib.dump(scaler, f"{OUTPUT_DIR}/scaler.pkl")
    joblib.dump(pca, f"{OUTPUT_DIR}/pca.pkl")

    print("\n✅ SUCCESS. V3 Pipeline Complete. Ready for Quantum Training.")

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    process_optimized_data()