| import pandas as pd |
| import numpy as np |
| import matplotlib.pyplot as plt |
| from sklearn.preprocessing import StandardScaler |
| from sklearn.decomposition import PCA |
| import joblib |
| import os |
|
|
| |
| |
# Pipeline configuration shared by process_data().
N_DIM = 2  # number of principal components kept by the PCA step
BINS_PER_DIM = 4  # histogram bins along each principal-component axis
OUTPUT_DIR = "v2_pipeline_output"  # directory that receives every artifact

# Create the output directory up front. exist_ok=True replaces the former
# "check, then create" sequence, which could raise FileExistsError if the
# directory appeared between the check and the makedirs() call.
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
def _load_tokamak_csv():
    """Return the first dataset found among the known locations, or None.

    The candidate paths are relative, so they are resolved against the
    current working directory.
    """
    possible_paths = [
        'real_tokamak_data_v2.csv',
        'vG.0.1/real_tokamak_data_v2.csv',
        'data/real_tokamak_data_v2.csv',
    ]
    for path in possible_paths:
        if os.path.exists(path):
            # NOTE(review): the leading glyphs in the status messages look
            # like mis-encoded emoji -- confirm the file's encoding. This
            # literal was split across two physical lines and is rejoined.
            print(f" โ Found data at: {path}")
            return pd.read_csv(path)
    return None


def process_data():
    """Run the Phase I data-engineering pipeline and write its artifacts.

    Steps:
        1. Load ``real_tokamak_data_v2.csv`` from one of the known locations.
        2. Split rows by ``label`` (0 -> healthy, 1 -> anomalous), keeping
           every column except ``label``, ``shot_id`` and ``machine``.
        3. Fit a ``StandardScaler`` on the healthy rows only, then scale
           both subsets with it.
        4. Fit a ``PCA`` with ``N_DIM`` components on the scaled healthy
           rows, then project both subsets.
        5. Bin the healthy projection on a ``BINS_PER_DIM`` x
           ``BINS_PER_DIM`` grid and normalise it into a flat target
           distribution that sums to 1.
        6. Save a scatter plot, an ``.npz`` archive and the fitted scaler
           and PCA objects under ``OUTPUT_DIR``.

    Returns:
        None. If the dataset cannot be found, an error is printed and the
        function returns early without writing anything.

    Raises:
        KeyError: if the dataset has no ``label`` column.
        ValueError: if the dataset has no rows with ``label == 0``.
    """
    print("๐ Starting Phase I: Data Engineering Pipeline...")

    df = _load_tokamak_csv()
    if df is None:
        print(" โ Error: 'real_tokamak_data_v2.csv' not found in root or vG.0.1/.")
        print(" Current Working Directory:", os.getcwd())
        return

    print(f" ๐ Shape: {df.shape}")

    # Fail early with an explicit message instead of a bare KeyError('label').
    if 'label' not in df.columns:
        raise KeyError("'label' column is required to split healthy and disruptive samples")

    # Identifier and target columns are not features.
    drop_cols = ['label', 'shot_id', 'machine']
    feature_cols = [c for c in df.columns if c not in drop_cols]

    X_healthy = df[df['label'] == 0][feature_cols]
    X_anomalous = df[df['label'] == 1][feature_cols]

    print(f" Healthy Samples (Training): {len(X_healthy)}")
    print(f" Disruptive Samples (Validation): {len(X_anomalous)}")

    # The scaler, the PCA and the target histogram are all fitted on healthy
    # rows only, so there is nothing to fit without them.
    if len(X_healthy) == 0:
        raise ValueError("no healthy samples (label == 0) found; cannot fit the pipeline")

    print(" โ๏ธ Normalizing...")
    scaler = StandardScaler()
    scaler.fit(X_healthy)

    X_healthy_scaled = scaler.transform(X_healthy)
    X_anomalous_scaled = scaler.transform(X_anomalous)

    print(f" ๐ Compressing to {N_DIM} Dimensions (PCA)...")
    pca = PCA(n_components=N_DIM)
    pca.fit(X_healthy_scaled)

    X_healthy_pca = pca.transform(X_healthy_scaled)
    X_anomalous_pca = pca.transform(X_anomalous_scaled)

    print(f" Explained Variance Ratio: {pca.explained_variance_ratio_}")

    print(" ๐ธ๏ธ Generating Quantum Target Distribution...")

    # Only the first two principal components are binned, so this step
    # needs N_DIM >= 2 and ignores any further components.
    hist, x_edges, y_edges = np.histogram2d(
        X_healthy_pca[:, 0],
        X_healthy_pca[:, 1],
        bins=BINS_PER_DIM,
        density=True,
    )

    # An integer `bins` gives equal-width bins, so dividing the density by
    # its sum yields the per-cell probability.
    target_distribution = hist.flatten()
    target_distribution = target_distribution / np.sum(target_distribution)

    print(f" Target Vector Shape: {target_distribution.shape}")
    print(f" Target Vector (First 5): {target_distribution[:5]}")

    fig = plt.figure(figsize=(10, 8))
    try:
        plt.scatter(X_healthy_pca[:, 0], X_healthy_pca[:, 1],
                    c='blue', alpha=0.3, s=10, label='Healthy Plasma (Training)')
        plt.scatter(X_anomalous_pca[:, 0], X_anomalous_pca[:, 1],
                    c='red', alpha=0.3, s=10, label='Disruptions (Testing)')

        # Overlay the histogram grid so the bin boundaries are visible.
        for x in x_edges:
            plt.axvline(x, color='gray', linestyle='--', alpha=0.3)
        for y in y_edges:
            plt.axhline(y, color='gray', linestyle='--', alpha=0.3)

        plt.title(f"Real Data Topology (PCA): {len(df)} Shots")
        plt.xlabel("Principal Component 1")
        plt.ylabel("Principal Component 2")
        plt.legend()
        plt.savefig(f"{OUTPUT_DIR}/real_data_topology.png")
    finally:
        # Release the figure; pyplot otherwise keeps it alive after saving.
        plt.close(fig)
    print(f" ๐ธ Topology map saved to '{OUTPUT_DIR}/real_data_topology.png'")

    output_file = f"{OUTPUT_DIR}/processed_data.npz"
    np.savez(output_file,
             target_distribution=target_distribution,
             grid_bounds=(x_edges, y_edges),
             X_healthy_pca=X_healthy_pca,
             X_anomalous_pca=X_anomalous_pca)

    # Persist the fitted transformers alongside the arrays.
    joblib.dump(scaler, f"{OUTPUT_DIR}/scaler.pkl")
    joblib.dump(pca, f"{OUTPUT_DIR}/pca.pkl")

    # This literal was also split across two physical lines; rejoined here.
    print(f"\nโ SUCCESS. Pipeline artifacts saved to '{OUTPUT_DIR}/'")
|
|
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    process_data()