Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,7 +8,7 @@ import matplotlib.pyplot as plt
|
|
| 8 |
import numpy as np
|
| 9 |
import pandas as pd
|
| 10 |
from datasets import load_dataset
|
| 11 |
-
from sklearn.ensemble import
|
| 12 |
from sklearn.impute import SimpleImputer
|
| 13 |
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
|
| 14 |
from sklearn.model_selection import train_test_split
|
|
@@ -19,7 +19,9 @@ logging.basicConfig(level=logging.INFO)
|
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
| 21 |
APP_TITLE = "Noise Detection"
|
| 22 |
-
APP_SUBTITLE =
|
|
|
|
|
|
|
| 23 |
|
| 24 |
REPO_CONFIG = {
|
| 25 |
"clean": {
|
|
@@ -66,6 +68,7 @@ NON_FEATURE_COLS = {
|
|
| 66 |
}
|
| 67 |
|
| 68 |
SOFT_EXCLUDE_PATTERNS = ["ideal_", "noisy_", "error_", "sign_ideal_", "sign_noisy_"]
|
|
|
|
| 69 |
_ASSET_CACHE: Dict[str, pd.DataFrame] = {}
|
| 70 |
_COMBINED_CACHE: Optional[pd.DataFrame] = None
|
| 71 |
|
|
@@ -81,7 +84,7 @@ def safe_parse(value):
|
|
| 81 |
|
| 82 |
|
| 83 |
def adjacency_features(adj_value) -> Dict[str, float]:
|
| 84 |
-
"""Derive
|
| 85 |
parsed = safe_parse(adj_value)
|
| 86 |
if not isinstance(parsed, list) or len(parsed) == 0:
|
| 87 |
return {
|
|
@@ -160,7 +163,7 @@ def enrich_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 160 |
|
| 161 |
|
| 162 |
def load_single_dataset(dataset_key: str) -> pd.DataFrame:
|
| 163 |
-
"""Load a
|
| 164 |
if dataset_key not in _ASSET_CACHE:
|
| 165 |
logger.info("Loading dataset: %s", dataset_key)
|
| 166 |
ds = load_dataset(REPO_CONFIG[dataset_key]["repo"])
|
|
@@ -172,7 +175,7 @@ def load_single_dataset(dataset_key: str) -> pd.DataFrame:
|
|
| 172 |
|
| 173 |
|
| 174 |
def load_combined_dataset() -> pd.DataFrame:
|
| 175 |
-
"""Load and merge all noise-condition datasets."""
|
| 176 |
global _COMBINED_CACHE
|
| 177 |
if _COMBINED_CACHE is None:
|
| 178 |
frames = [load_single_dataset(key) for key in REPO_CONFIG.keys()]
|
|
@@ -192,7 +195,7 @@ def load_guide_content() -> str:
|
|
| 192 |
|
| 193 |
|
| 194 |
def get_available_feature_columns(df: pd.DataFrame) -> List[str]:
|
| 195 |
-
"""Return numeric feature columns excluding metadata and
|
| 196 |
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 197 |
features = []
|
| 198 |
for col in numeric_cols:
|
|
@@ -205,7 +208,7 @@ def get_available_feature_columns(df: pd.DataFrame) -> List[str]:
|
|
| 205 |
|
| 206 |
|
| 207 |
def default_feature_selection(features: List[str]) -> List[str]:
|
| 208 |
-
"""
|
| 209 |
preferred = [
|
| 210 |
"gate_entropy",
|
| 211 |
"adj_density",
|
|
@@ -240,7 +243,7 @@ def make_classification_figure(
|
|
| 240 |
ax3 = fig.add_subplot(gs[0, 2])
|
| 241 |
|
| 242 |
cm = confusion_matrix(y_true, y_pred, labels=class_names)
|
| 243 |
-
|
| 244 |
ax1.set_title("Confusion Matrix")
|
| 245 |
ax1.set_xlabel("Predicted")
|
| 246 |
ax1.set_ylabel("Actual")
|
|
@@ -251,10 +254,10 @@ def make_classification_figure(
|
|
| 251 |
for i in range(cm.shape[0]):
|
| 252 |
for j in range(cm.shape[1]):
|
| 253 |
ax1.text(j, i, cm[i, j], ha="center", va="center")
|
| 254 |
-
fig.colorbar(
|
| 255 |
|
| 256 |
-
|
| 257 |
-
ax2.hist(
|
| 258 |
ax2.set_title("Correct vs Incorrect")
|
| 259 |
ax2.set_xlabel("0 = Correct, 1 = Incorrect")
|
| 260 |
ax2.set_ylabel("Count")
|
|
@@ -273,7 +276,7 @@ def make_classification_figure(
|
|
| 273 |
|
| 274 |
|
| 275 |
def build_dataset_profile(df: pd.DataFrame) -> str:
|
| 276 |
-
"""Build a dataset summary for the explorer tab."""
|
| 277 |
return (
|
| 278 |
f"### Dataset profile\n\n"
|
| 279 |
f"**Rows:** {len(df):,} \n"
|
|
@@ -373,11 +376,13 @@ def train_classifier(
|
|
| 373 |
("scaler", StandardScaler()),
|
| 374 |
(
|
| 375 |
"classifier",
|
| 376 |
-
|
| 377 |
n_estimators=trees,
|
| 378 |
max_depth=depth,
|
| 379 |
random_state=seed,
|
| 380 |
n_jobs=-1,
|
|
|
|
|
|
|
| 381 |
),
|
| 382 |
),
|
| 383 |
]
|
|
@@ -387,14 +392,19 @@ def train_classifier(
|
|
| 387 |
y_pred = model.predict(X_test)
|
| 388 |
|
| 389 |
accuracy = float(accuracy_score(y_test, y_pred))
|
| 390 |
-
macro_f1 = float(f1_score(y_test, y_pred, average="macro"))
|
| 391 |
-
weighted_f1 = float(f1_score(y_test, y_pred, average="weighted"))
|
| 392 |
|
| 393 |
classifier = model.named_steps["classifier"]
|
| 394 |
importances = getattr(classifier, "feature_importances_", None)
|
| 395 |
fig = make_classification_figure(y_test.to_numpy(), y_pred, CLASS_ORDER, list(feature_columns), importances)
|
| 396 |
|
| 397 |
-
report = classification_report(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
results = (
|
| 399 |
"### Classification results\n\n"
|
| 400 |
f"**Rows used:** {len(train_df):,} \n"
|
|
|
|
| 8 |
import numpy as np
|
| 9 |
import pandas as pd
|
| 10 |
from datasets import load_dataset
|
| 11 |
+
from sklearn.ensemble import ExtraTreesClassifier
|
| 12 |
from sklearn.impute import SimpleImputer
|
| 13 |
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
|
| 14 |
from sklearn.model_selection import train_test_split
|
|
|
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
| 21 |
APP_TITLE = "Noise Detection"
|
| 22 |
+
APP_SUBTITLE = (
|
| 23 |
+
"Classify quantum circuits into clean, depolarizing, amplitude_damping, or hardware-aware noise conditions."
|
| 24 |
+
)
|
| 25 |
|
| 26 |
REPO_CONFIG = {
|
| 27 |
"clean": {
|
|
|
|
| 68 |
}
|
| 69 |
|
| 70 |
SOFT_EXCLUDE_PATTERNS = ["ideal_", "noisy_", "error_", "sign_ideal_", "sign_noisy_"]
|
| 71 |
+
|
| 72 |
_ASSET_CACHE: Dict[str, pd.DataFrame] = {}
|
| 73 |
_COMBINED_CACHE: Optional[pd.DataFrame] = None
|
| 74 |
|
|
|
|
| 84 |
|
| 85 |
|
| 86 |
def adjacency_features(adj_value) -> Dict[str, float]:
|
| 87 |
+
"""Derive graph statistics from an adjacency matrix."""
|
| 88 |
parsed = safe_parse(adj_value)
|
| 89 |
if not isinstance(parsed, list) or len(parsed) == 0:
|
| 90 |
return {
|
|
|
|
| 163 |
|
| 164 |
|
| 165 |
def load_single_dataset(dataset_key: str) -> pd.DataFrame:
|
| 166 |
+
"""Load a dataset shard from Hugging Face and cache it in memory."""
|
| 167 |
if dataset_key not in _ASSET_CACHE:
|
| 168 |
logger.info("Loading dataset: %s", dataset_key)
|
| 169 |
ds = load_dataset(REPO_CONFIG[dataset_key]["repo"])
|
|
|
|
| 175 |
|
| 176 |
|
| 177 |
def load_combined_dataset() -> pd.DataFrame:
|
| 178 |
+
"""Load and merge all four noise-condition datasets."""
|
| 179 |
global _COMBINED_CACHE
|
| 180 |
if _COMBINED_CACHE is None:
|
| 181 |
frames = [load_single_dataset(key) for key in REPO_CONFIG.keys()]
|
|
|
|
| 195 |
|
| 196 |
|
| 197 |
def get_available_feature_columns(df: pd.DataFrame) -> List[str]:
|
| 198 |
+
"""Return numeric feature columns excluding metadata and target columns."""
|
| 199 |
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 200 |
features = []
|
| 201 |
for col in numeric_cols:
|
|
|
|
| 208 |
|
| 209 |
|
| 210 |
def default_feature_selection(features: List[str]) -> List[str]:
|
| 211 |
+
"""Select a stable default feature subset."""
|
| 212 |
preferred = [
|
| 213 |
"gate_entropy",
|
| 214 |
"adj_density",
|
|
|
|
| 243 |
ax3 = fig.add_subplot(gs[0, 2])
|
| 244 |
|
| 245 |
cm = confusion_matrix(y_true, y_pred, labels=class_names)
|
| 246 |
+
image = ax1.imshow(cm, interpolation="nearest")
|
| 247 |
ax1.set_title("Confusion Matrix")
|
| 248 |
ax1.set_xlabel("Predicted")
|
| 249 |
ax1.set_ylabel("Actual")
|
|
|
|
| 254 |
for i in range(cm.shape[0]):
|
| 255 |
for j in range(cm.shape[1]):
|
| 256 |
ax1.text(j, i, cm[i, j], ha="center", va="center")
|
| 257 |
+
fig.colorbar(image, ax=ax1, fraction=0.046, pad=0.04)
|
| 258 |
|
| 259 |
+
incorrect = (y_true != y_pred).astype(int)
|
| 260 |
+
ax2.hist(incorrect, bins=[-0.5, 0.5, 1.5])
|
| 261 |
ax2.set_title("Correct vs Incorrect")
|
| 262 |
ax2.set_xlabel("0 = Correct, 1 = Incorrect")
|
| 263 |
ax2.set_ylabel("Count")
|
|
|
|
| 276 |
|
| 277 |
|
| 278 |
def build_dataset_profile(df: pd.DataFrame) -> str:
|
| 279 |
+
"""Build a short dataset summary for the explorer tab."""
|
| 280 |
return (
|
| 281 |
f"### Dataset profile\n\n"
|
| 282 |
f"**Rows:** {len(df):,} \n"
|
|
|
|
| 376 |
("scaler", StandardScaler()),
|
| 377 |
(
|
| 378 |
"classifier",
|
| 379 |
+
ExtraTreesClassifier(
|
| 380 |
n_estimators=trees,
|
| 381 |
max_depth=depth,
|
| 382 |
random_state=seed,
|
| 383 |
n_jobs=-1,
|
| 384 |
+
class_weight="balanced",
|
| 385 |
+
min_samples_leaf=1,
|
| 386 |
),
|
| 387 |
),
|
| 388 |
]
|
|
|
|
| 392 |
y_pred = model.predict(X_test)
|
| 393 |
|
| 394 |
accuracy = float(accuracy_score(y_test, y_pred))
|
| 395 |
+
macro_f1 = float(f1_score(y_test, y_pred, average="macro", zero_division=0))
|
| 396 |
+
weighted_f1 = float(f1_score(y_test, y_pred, average="weighted", zero_division=0))
|
| 397 |
|
| 398 |
classifier = model.named_steps["classifier"]
|
| 399 |
importances = getattr(classifier, "feature_importances_", None)
|
| 400 |
fig = make_classification_figure(y_test.to_numpy(), y_pred, CLASS_ORDER, list(feature_columns), importances)
|
| 401 |
|
| 402 |
+
report = classification_report(
|
| 403 |
+
y_test,
|
| 404 |
+
y_pred,
|
| 405 |
+
labels=CLASS_ORDER,
|
| 406 |
+
zero_division=0,
|
| 407 |
+
)
|
| 408 |
results = (
|
| 409 |
"### Classification results\n\n"
|
| 410 |
f"**Rows used:** {len(train_df):,} \n"
|