1bnjmn3
/

QGAN_Project

Model card Files Files and versions

QGAN_Project / vG.0.1 /benchmark_supervised.py

1bnjmn3's picture

Add files using upload-large-folder tool

0f755ec verified 6 months ago

history blame contribute delete

2.11 kB

	import pandas as pd
	import numpy as np
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import StandardScaler
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

	# Classical supervised baseline: fit a Random Forest on the tokamak table,
	# report AUC / accuracy / confusion matrix on a held-out split, then export
	# the highest-ranked features as a compact dataset for the QGAN.

	# Number of features listed in the printed ranking.
	N_REPORTED_FEATURES = 5
	# Number of features exported for the QGAN (small quantum circuit limit).
	N_QGAN_FEATURES = 8

	# 1. Load Data
	print("Loading data...")
	df = pd.read_csv('real_tokamak_data_v2.csv')

	# 2. Cleaning
	y = df['label']
	X = df.drop(['label', 'shot_id'], axis=1)

	# Drop dead sensors: keep only columns whose standard deviation is > 0.
	# Columns whose std is NaN also fail this comparison and are dropped.
	X = X.loc[:, X.std() > 0]
	# Fill remaining NaNs with 0 so the classifier receives no missing values.
	X = X.fillna(0)

	# 3. Split Data
	# Stratify ensures we have the same % of disruptions in train and test
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, test_size=0.2, stratify=y, random_state=42
	)

	print(f"Training on {len(X_train)} shots.")
	print(f"Testing on {len(X_test)} shots.")

	# 4. Train Random Forest (The "Workhorse" of Classical ML)
	clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
	clf.fit(X_train, y_train)

	# 5. Evaluate
	# Column 1 of predict_proba is taken as the probability of disruption
	# (assumes the disruption label is the second entry of clf.classes_).
	y_prob = clf.predict_proba(X_test)[:, 1]
	# Predict once and reuse for both accuracy and the confusion matrix.
	y_pred = clf.predict(X_test)
	auc = roc_auc_score(y_test, y_prob)
	acc = accuracy_score(y_test, y_pred)

	print("\n" + "="*30)
	print("RESULTS: CLASSICAL SUPERVISED BASELINE")
	print("="*30)
	print(f"✅ AUC Score: {auc:.4f}")
	print(f"✅ Accuracy: {acc:.2%}")
	print("-" * 30)
	print("Confusion Matrix:")
	print(confusion_matrix(y_test, y_pred))
	print("="*30)

	# 6. Save the Best Features for QGAN
	# Ask the Random Forest which input signals matter most: argsort is
	# ascending, so reverse it to rank features from most to least important.
	importances = clf.feature_importances_
	indices = np.argsort(importances)[::-1]

	print("\nTop 5 Features (Physics Signals):")
	# Slicing (rather than range(5)) stays safe when fewer than five features
	# survived the dead-sensor filter.
	for idx in indices[:N_REPORTED_FEATURES]:
	    print(f" {X.columns[idx]} (Importance: {importances[idx]:.4f})")

	# Save only the top features for the QGAN (Small quantum circuit limit)
	top_features = X.columns[indices[:N_QGAN_FEATURES]]
	np.savez(
	    'qgan_data_optimized.npz',
	    X_train=X_train[top_features].values,
	    X_test=X_test[top_features].values,
	    y_train=y_train.values,
	    y_test=y_test.values,
	)
	print("\n💾 Saved optimized data to 'qgan_data_optimized.npz'")