File size: 4,291 Bytes
0f755ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import joblib
import os

# --- CONFIGURATION ---
N_DIM = 2            # Number of principal components kept by the PCA step
BINS_PER_DIM = 4     # Histogram bins per PCA axis: 4x4 Grid = 16 discrete states (4 Qubits)
OUTPUT_DIR = "v2_pipeline_output"  # Receives the plot, the .npz file and the .pkl models

# Create the output directory when the module is loaded. exist_ok=True makes
# this a single atomic call, so a directory created by another process
# between a separate "exists" check and "makedirs" can no longer raise.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def process_optimized_data():
    """Build the healthy-data PCA map and its discretized target distribution.

    Steps, in order:

    1. Load ``X_train``, ``y_train``, ``X_test`` and ``y_test`` from
       ``vG.0.1/qgan_data_optimized.npz``, falling back to
       ``qgan_data_optimized.npz`` in the working directory.
    2. Concatenate train and test, then split rows by label: ``0`` is
       treated as healthy and ``1`` as anomalous. Rows with any other
       label are dropped.
    3. Fit a ``StandardScaler`` and a ``PCA(n_components=N_DIM)`` on the
       healthy rows only, and apply both to the anomalous rows.
    4. Histogram the first two healthy PCA components on a
       ``BINS_PER_DIM`` x ``BINS_PER_DIM`` grid and normalize the flattened
       histogram so it sums to 1.
    5. Save a scatter plot, the processed arrays and the fitted
       scaler/PCA objects under ``OUTPUT_DIR``.

    Returns:
        None. If neither input file exists, an error message is printed
        and the function returns early without writing anything.
    """
    print("πŸš€ Starting Phase I: Data Pipeline (V3 - Optimized Input)...")

    # 1. LOAD OPTIMIZED DATA (The "Golden Ticket")
    # Per the original author's note this file is expected to already hold
    # the top 8 features with no NaNs; neither property is checked here.
    file_path = 'vG.0.1/qgan_data_optimized.npz'

    if not os.path.exists(file_path):
        # Fallback check: same file name in the current working directory
        if os.path.exists('qgan_data_optimized.npz'):
            file_path = 'qgan_data_optimized.npz'
        else:
            print(f"❌ Error: '{file_path}' not found.")
            return

    print(f"   βœ… Loading: {file_path}")
    # np.load on an .npz returns a lazy NpzFile that keeps the archive open;
    # the context manager closes it once the arrays have been read.
    with np.load(file_path) as data:
        X_train = data['X_train']  # Features (may mix healthy and disruptive rows)
        y_train = data['y_train']  # Labels
        X_test = data['X_test']
        y_test = data['y_test']

    # 2. ISOLATE HEALTHY DATA FOR TRAINING
    # The "Map" is fitted ONLY on healthy data. Train and test are merged
    # to get the maximum number of healthy samples for it.
    X_full = np.concatenate([X_train, X_test])
    y_full = np.concatenate([y_train, y_test])

    X_healthy = X_full[y_full == 0]
    X_anomalous = X_full[y_full == 1]

    print(f"   πŸ“Š Total Healthy Samples (The Map): {len(X_healthy)}")
    print(f"   πŸ“Š Total Anomalous Samples (The Trap): {len(X_anomalous)}")

    # 3. NORMALIZATION
    # Features may be in different units, so standardize before PCA.
    # The scaler is fitted on healthy rows and only applied to anomalous rows.
    print("   βš–οΈ  Re-Normalizing for PCA...")
    scaler = StandardScaler()
    X_healthy_scaled = scaler.fit_transform(X_healthy)
    X_anomalous_scaled = scaler.transform(X_anomalous)

    # 4. DIMENSIONALITY REDUCTION (8D -> 2D)
    # PCA is likewise fitted on healthy rows and applied to anomalous rows.
    print(f"   πŸ“‰ Compressing 8 Features -> {N_DIM} Dimensions...")
    pca = PCA(n_components=N_DIM)
    X_healthy_pca = pca.fit_transform(X_healthy_scaled)
    X_anomalous_pca = pca.transform(X_anomalous_scaled)

    print(f"   Explained Variance: {pca.explained_variance_ratio_}")
    print(f"   Total Information Retained: {sum(pca.explained_variance_ratio_):.2%}")

    # 5. DISCRETIZATION (The Grid)
    print("   πŸ•ΈοΈ  Generating Quantum Target Distribution (4x4 Grid)...")

    hist, x_edges, y_edges = np.histogram2d(
        X_healthy_pca[:, 0],
        X_healthy_pca[:, 1],
        bins=BINS_PER_DIM,
        density=True
    )

    # Flatten to a probability vector (BINS_PER_DIM ** 2 entries) and
    # renormalize so the entries sum to 1.
    target_distribution = hist.flatten()
    target_distribution = target_distribution / np.sum(target_distribution)

    # 6. VISUALIZATION
    fig = plt.figure(figsize=(10, 8))
    # Healthy in Blue (The Islands)
    plt.scatter(X_healthy_pca[:, 0], X_healthy_pca[:, 1],
                c='blue', alpha=0.3, s=10, label='Healthy (Top 8 Physics)')
    # Anomalous in Red (The Traps)
    plt.scatter(X_anomalous_pca[:, 0], X_anomalous_pca[:, 1],
                c='red', alpha=0.3, s=10, label='Disruptions')

    # Grid lines at the histogram bin edges
    for x in x_edges:
        plt.axvline(x, color='gray', linestyle='--', alpha=0.3)
    for y in y_edges:
        plt.axhline(y, color='gray', linestyle='--', alpha=0.3)

    plt.title("V3 Topology: Optimized Physics Features (8->2 Dim)")
    plt.xlabel("PC 1")
    plt.ylabel("PC 2")
    plt.legend()
    plt.savefig(f"{OUTPUT_DIR}/real_data_topology_v3.png")
    # pyplot keeps every figure alive until it is closed; release it so
    # repeated calls do not accumulate open figures.
    plt.close(fig)
    print(f"   πŸ“Έ Topology map saved to '{OUTPUT_DIR}/real_data_topology_v3.png'")

    # 7. SAVE ARTIFACTS
    output_file = f"{OUTPUT_DIR}/processed_data.npz"
    np.savez(output_file,
             target_distribution=target_distribution,
             grid_bounds=(x_edges, y_edges),
             X_healthy_pca=X_healthy_pca,
             X_anomalous_pca=X_anomalous_pca)

    joblib.dump(scaler, f"{OUTPUT_DIR}/scaler.pkl")
    joblib.dump(pca, f"{OUTPUT_DIR}/pca.pkl")

    print(f"\nβœ… SUCCESS. V3 Pipeline Complete. Ready for Quantum Training.")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    process_optimized_data()