File size: 4,232 Bytes
c5f84b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Save this as physics_data.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

def _sample_isotropic_cluster(rng, center, variance, count, n_features):
    """Draw `count` rows from a Gaussian with mean `center` on every axis and covariance `variance * I`."""
    mean = np.full(n_features, float(center))
    cov = np.eye(n_features) * variance
    return rng.multivariate_normal(mean, cov, count)


def generate_complex_tokamak_data(n_samples=5000, n_features=50,
                                  anomaly_rate=0.1, seed=42):
    """
    Simulates complex, multi-modal Tokamak data.

    Healthy modes (label 0), each an isotropic Gaussian cluster:
    1. Ramp-up   (25% of n_samples, mean -2.0, variance 0.6, 'The Takeoff')
    2. Flat-top  (50% of n_samples, mean  0.0, variance 0.3, 'The Cruise')
    3. Ramp-down (remaining rows,   mean +2.0, variance 0.6, 'The Landing')

    Anomalous modes (label 1), added ON TOP of the healthy rows:
    1. "Bridge gap"      (60% of anomalies, mean -1.0, variance 0.15)
    2. "Hard disruption" (remaining anomalies, mean 3.5, variance 0.5)

    Args:
        n_samples: Number of healthy rows to generate (must be >= 0).
        n_features: Number of sensor columns (must be >= 1).
        anomaly_rate: Anomalous rows generated per healthy row; the anomaly
            count is int(n_samples * anomaly_rate). Must be >= 0.
        seed: Seed for the private random generator. Pass None for a
            non-reproducible draw.

    Returns:
        A shuffled pandas DataFrame with columns sensor_0..sensor_{n-1}
        plus an integer 'label' column (0 = healthy, 1 = anomalous),
        containing n_samples + int(n_samples * anomaly_rate) rows.

    Raises:
        ValueError: If n_samples or anomaly_rate is negative, or
            n_features is smaller than 1.
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")
    if n_features < 1:
        raise ValueError(f"n_features must be at least 1, got {n_features}")
    if anomaly_rate < 0:
        raise ValueError(f"anomaly_rate must be non-negative, got {anomaly_rate}")

    print(f"--- GENERATING PHYSICS-INFORMED DATA ({n_samples} samples) ---")

    # A private RandomState keeps the run reproducible WITHOUT reseeding the
    # process-wide numpy generator (which would clobber the caller's own
    # seeding). RandomState(seed) yields the same legacy stream that
    # np.random.seed(seed) would, so the draw order below matters.
    rng = np.random.RandomState(seed)

    # --- 1. HEALTHY DATA (The "Three Phases") ---
    # Mode A: Ramp-Up (25% of data) - noisy cluster on the negative side.
    n_a = int(0.25 * n_samples)
    data_a = _sample_isotropic_cluster(rng, -2.0, 0.6, n_a, n_features)

    # Mode B: Flat-Top (50% of data) - the tight, stable core at the origin.
    n_b = int(0.50 * n_samples)
    data_b = _sample_isotropic_cluster(rng, 0.0, 0.3, n_b, n_features)

    # Mode C: Ramp-Down - takes whatever is left so the three modes always
    # add up to exactly n_samples despite the int() truncation above.
    n_c = n_samples - n_a - n_b
    data_c = _sample_isotropic_cluster(rng, 2.0, 0.6, n_c, n_features)

    # Stack them to create the "Healthy Manifold"
    X_healthy = np.vstack([data_a, data_b, data_c])

    # --- 2. ANOMALOUS DATA (The "Trap") ---
    n_anom = int(n_samples * anomaly_rate)

    # Type 1: The "Bridge Gap" - a tight cluster sitting BETWEEN the ramp-up
    # and flat-top modes, so a lazy discriminator mistakes it for a bridge.
    n_anom_1 = int(n_anom * 0.6)
    data_bad_1 = _sample_isotropic_cluster(rng, -1.0, 0.15, n_anom_1, n_features)

    # Type 2: The "Hard Disruption" - well outside every healthy mode.
    n_anom_2 = n_anom - n_anom_1
    data_bad_2 = _sample_isotropic_cluster(rng, 3.5, 0.5, n_anom_2, n_features)

    X_anomalous = np.vstack([data_bad_1, data_bad_2])

    # --- 3. DATAFRAME CREATION ---
    sensor_columns = [f'sensor_{i}' for i in range(n_features)]

    df_healthy = pd.DataFrame(X_healthy, columns=sensor_columns)
    df_healthy['label'] = 0  # 0 = Healthy

    df_anomalous = pd.DataFrame(X_anomalous, columns=sensor_columns)
    df_anomalous['label'] = 1  # 1 = Anomalous

    df_total = pd.concat([df_healthy, df_anomalous], ignore_index=True)

    # Shuffle the deck with the same private generator so the row order is
    # reproducible and the global numpy state stays untouched.
    df_total = df_total.sample(frac=1, random_state=rng).reset_index(drop=True)

    print(f"Generated {len(df_healthy)} healthy and {len(df_anomalous)} anomalous samples.")
    return df_total

if __name__ == "__main__":
    # Build the benchmark table and write it out as CSV (relative path).
    dataset = generate_complex_tokamak_data()
    dataset.to_csv('complex_tokamak_data.csv', index=False)
    print("Saved to 'complex_tokamak_data.csv'")

    # --- VISUAL PROOF ---
    # Project the sensor columns onto their first two principal components
    # so the clusters can be inspected in a 2D scatter plot.
    print("Generating preview plot...")
    labels = dataset['label']
    sensors = dataset.drop(columns='label')
    projected = PCA(n_components=2).fit_transform(sensors)

    plt.figure(figsize=(10, 8))

    # One scatter layer per class, drawn in this order:
    # (label value, colour, opacity, legend entry)
    scatter_layers = (
        (0, 'blue', 0.2, 'Healthy (3 Modes)'),
        (1, 'red', 0.6, 'Anomalies (The Trap)'),
    )
    for label_value, colour, opacity, legend_text in scatter_layers:
        selected = labels == label_value
        plt.scatter(projected[selected, 0], projected[selected, 1],
                    c=colour, alpha=opacity, s=10, label=legend_text)

    plt.title("V2 Benchmark Data: The 'Three Islands' Topology")
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.savefig('v2_data_topology.png')
    print("Plot saved to 'v2_data_topology.png'. Open it to see the 'Islands'.")