QSBench committed on
Commit
ffca061
·
verified ·
1 Parent(s): 2635a44

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -104
app.py CHANGED
@@ -1,62 +1,51 @@
1
  import ast
2
  import logging
3
  import re
4
- from typing import Dict, List
5
 
6
  import gradio as gr
7
  import matplotlib.pyplot as plt
8
  import numpy as np
9
  import pandas as pd
10
  from datasets import load_dataset
11
- from sklearn.model_selection import train_test_split
12
  from sklearn.ensemble import HistGradientBoostingClassifier
13
- from sklearn.metrics import classification_report, confusion_matrix
 
 
 
 
14
 
15
  logging.basicConfig(level=logging.INFO)
16
  logger = logging.getLogger(__name__)
17
 
18
  APP_TITLE = "Noise Detection"
19
- APP_SUBTITLE = "Classify circuits by noise type: clean, depolarizing, amplitude_damping, hardware_aware."
 
 
20
 
21
  REPO_CONFIG = {
22
- "Core (Clean)": "QSBench/QSBench-Core-v1.0.0-demo",
23
- "Depolarizing Noise": "QSBench/QSBench-Depolarizing-Demo-v1.0.0",
24
- "Amplitude Damping": "QSBench/QSBench-Amplitude-v1.0.0-demo",
25
- "Hardware-Aware Noise": "QSBench/QSBench-Transpilation-v1.0.0-demo",
26
  }
27
 
 
 
28
  NON_FEATURE_COLS = {
29
- "sample_id",
30
- "sample_seed",
31
- "circuit_hash",
32
- "split",
33
- "circuit_qasm",
34
- "qasm_raw",
35
- "qasm_transpiled",
36
- "circuit_type_resolved",
37
- "circuit_type_requested",
38
- "noise_type",
39
- "noise_prob",
40
- "observable_bases",
41
- "observable_mode",
42
- "backend_device",
43
- "precision_mode",
44
- "circuit_signature",
45
  }
46
 
47
- _SOFT_EXCLUDE_PATTERNS = ["ideal_", "noisy_", "error_", "sign_ideal_", "sign_noisy_"]
48
 
49
  _ASSET_CACHE: Dict[str, pd.DataFrame] = {}
50
-
51
-
52
- def load_dataset_df(dataset_key: str) -> pd.DataFrame:
53
- if dataset_key not in _ASSET_CACHE:
54
- ds = load_dataset(REPO_CONFIG[dataset_key])
55
- df = pd.DataFrame(ds["train"])
56
- df = enrich_dataframe(df)
57
- df["noise_label"] = dataset_key
58
- _ASSET_CACHE[dataset_key] = df
59
- return _ASSET_CACHE[dataset_key]
60
 
61
 
62
  def safe_parse(value):
@@ -72,7 +61,6 @@ def adjacency_features(adj_value) -> Dict[str, float]:
72
  parsed = safe_parse(adj_value)
73
  if not isinstance(parsed, list) or len(parsed) == 0:
74
  return {"adj_edge_count": np.nan, "adj_density": np.nan, "adj_degree_mean": np.nan, "adj_degree_std": np.nan}
75
-
76
  try:
77
  arr = np.array(parsed, dtype=float)
78
  n = arr.shape[0]
@@ -94,13 +82,12 @@ def qasm_features(qasm_value) -> Dict[str, float]:
94
  if not isinstance(qasm_value, str) or not qasm_value.strip():
95
  return {"qasm_length": np.nan, "qasm_line_count": np.nan, "qasm_gate_keyword_count": np.nan,
96
  "qasm_measure_count": np.nan, "qasm_comment_count": np.nan}
97
-
98
  text = qasm_value
99
  lines = [line for line in text.splitlines() if line.strip()]
100
- gate_keywords = re.findall(r"\b(cx|h|x|y|z|rx|ry|rz|u1|u2|u3|u|swap|cz|ccx|rxx|ryy|rzz)\b", text, flags=re.IGNORECASE)
 
101
  measure_count = len(re.findall(r"\bmeasure\b", text, flags=re.IGNORECASE))
102
  comment_count = sum(1 for line in lines if line.strip().startswith("//"))
103
-
104
  return {
105
  "qasm_length": float(len(text)),
106
  "qasm_line_count": float(len(lines)),
@@ -115,7 +102,6 @@ def enrich_dataframe(df: pd.DataFrame) -> pd.DataFrame:
115
  if "adjacency" in df.columns:
116
  adj_df = df["adjacency"].apply(adjacency_features).apply(pd.Series)
117
  df = pd.concat([df, adj_df], axis=1)
118
-
119
  qasm_source = "qasm_transpiled" if "qasm_transpiled" in df.columns else "qasm_raw"
120
  if qasm_source in df.columns:
121
  qasm_df = df[qasm_source].apply(qasm_features).apply(pd.Series)
@@ -123,15 +109,30 @@ def enrich_dataframe(df: pd.DataFrame) -> pd.DataFrame:
123
  return df
124
 
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  def get_available_feature_columns(df: pd.DataFrame) -> List[str]:
127
  numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
128
- features = []
129
- for col in numeric_cols:
130
- if col in NON_FEATURE_COLS:
131
- continue
132
- if any(pattern in col for pattern in _SOFT_EXCLUDE_PATTERNS):
133
- continue
134
- features.append(col)
135
  return sorted(features)
136
 
137
 
@@ -141,76 +142,97 @@ def default_feature_selection(features: List[str]) -> List[str]:
141
  return [f for f in preferred if f in features]
142
 
143
 
144
- def train_classifier(dataset_keys, feature_columns, test_size, seed):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  if not feature_columns:
146
- return None, "No features selected"
147
-
148
- dfs = [load_dataset_df(k) for k in dataset_keys]
149
- df = pd.concat(dfs, axis=0, ignore_index=True)
150
  df = df.dropna(subset=feature_columns + ["noise_label"])
151
-
152
  X = df[feature_columns]
153
  y = df["noise_label"]
154
-
155
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=int(seed), stratify=y)
156
-
157
- model = HistGradientBoostingClassifier(
158
- learning_rate=0.05,
159
- max_iter=200,
160
- max_depth=5,
161
- min_samples_leaf=10,
162
- l2_regularization=0.1,
163
- class_weight="balanced",
164
- random_state=int(seed),
165
- )
166
  model.fit(X_train, y_train)
167
- preds = model.predict(X_test)
168
-
169
- report = classification_report(y_test, preds, output_dict=False)
170
- cm = confusion_matrix(y_test, preds)
 
 
 
171
 
172
- return report, cm.tolist()
173
 
 
174
 
175
- CUSTOM_CSS = """
176
- .gradio-container {max-width: 1400px !important;}
177
- """
178
 
179
  with gr.Blocks(title=APP_TITLE) as demo:
180
  gr.Markdown(f"# 🌌 {APP_TITLE}")
181
  gr.Markdown(APP_SUBTITLE)
182
 
183
- with gr.Tabs():
184
- with gr.TabItem("🧠 Classification"):
185
- dataset_dropdown = gr.CheckboxGroup(list(REPO_CONFIG.keys()), value=list(REPO_CONFIG.keys()), label="Datasets")
186
- feature_picker = gr.CheckboxGroup(label="Input features")
187
-
188
- test_size = gr.Slider(0.1, 0.5, value=0.2, step=0.05, label="Test split")
189
- seed = gr.Number(value=42, label="Random seed")
190
- run_btn = gr.Button("Train & Evaluate", variant="primary")
191
-
192
- metrics = gr.Markdown()
193
- cm_plot = gr.Plot()
194
-
195
- gr.Markdown("---")
196
- gr.Markdown(
197
- "### 🔗 Links\n"
198
- "[Website](https://qsbench.github.io) | "
199
- "[Hugging Face](https://huggingface.co/QSBench) | "
200
- "[GitHub](https://github.com/QSBench)"
201
- )
202
-
203
- dataset_dropdown.change(
204
- lambda datasets: gr.update(choices=get_available_feature_columns(pd.concat([load_dataset_df(k) for k in datasets]))),
205
- [dataset_dropdown],
206
- [feature_picker]
207
- )
208
-
209
- run_btn.click(
210
- train_classifier,
211
- [dataset_dropdown, feature_picker, test_size, seed],
212
- [metrics, cm_plot]
213
- )
214
 
215
  if __name__ == "__main__":
216
  demo.launch(theme=gr.themes.Soft(), css=CUSTOM_CSS)
 
1
  import ast
2
  import logging
3
  import re
4
+ from typing import Dict, List, Optional, Tuple
5
 
6
  import gradio as gr
7
  import matplotlib.pyplot as plt
8
  import numpy as np
9
  import pandas as pd
10
  from datasets import load_dataset
 
11
  from sklearn.ensemble import HistGradientBoostingClassifier
12
+ from sklearn.impute import SimpleImputer
13
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
14
+ from sklearn.model_selection import train_test_split
15
+ from sklearn.pipeline import Pipeline
16
+ from sklearn.preprocessing import StandardScaler
17
 
18
  logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
 
21
  APP_TITLE = "Noise Detection"
22
+ APP_SUBTITLE = (
23
+ "Classify quantum circuits into clean, depolarizing, amplitude_damping, or hardware-aware noise conditions."
24
+ )
25
 
26
  REPO_CONFIG = {
27
+ "clean": {"label": "clean", "repo": "QSBench/QSBench-Core-v1.0.0-demo"},
28
+ "depolarizing": {"label": "depolarizing", "repo": "QSBench/QSBench-Depolarizing-Demo-v1.0.0"},
29
+ "amplitude_damping": {"label": "amplitude_damping", "repo": "QSBench/QSBench-Amplitude-v1.0.0-demo"},
30
+ "hardware_aware": {"label": "hardware_aware", "repo": "QSBench/QSBench-Transpilation-v1.0.0-demo"},
31
  }
32
 
33
+ CLASS_ORDER = ["clean", "depolarizing", "amplitude_damping", "hardware_aware"]
34
+
35
  NON_FEATURE_COLS = {
36
+ "sample_id", "sample_seed", "circuit_hash", "split",
37
+ "circuit_qasm", "qasm_raw", "qasm_transpiled",
38
+ "circuit_type_resolved", "circuit_type_requested",
39
+ "noise_type", "noise_prob", "observable_bases",
40
+ "observable_mode", "backend_device", "precision_mode",
41
+ "circuit_signature", "entanglement", "meyer_wallach",
42
+ "cx_count", "noise_label",
 
 
 
 
 
 
 
 
 
43
  }
44
 
45
+ SOFT_EXCLUDE_PATTERNS = ["ideal_", "noisy_", "error_", "sign_ideal_", "sign_noisy_"]
46
 
47
  _ASSET_CACHE: Dict[str, pd.DataFrame] = {}
48
+ _COMBINED_CACHE: Optional[pd.DataFrame] = None
 
 
 
 
 
 
 
 
 
49
 
50
 
51
  def safe_parse(value):
 
61
  parsed = safe_parse(adj_value)
62
  if not isinstance(parsed, list) or len(parsed) == 0:
63
  return {"adj_edge_count": np.nan, "adj_density": np.nan, "adj_degree_mean": np.nan, "adj_degree_std": np.nan}
 
64
  try:
65
  arr = np.array(parsed, dtype=float)
66
  n = arr.shape[0]
 
82
  if not isinstance(qasm_value, str) or not qasm_value.strip():
83
  return {"qasm_length": np.nan, "qasm_line_count": np.nan, "qasm_gate_keyword_count": np.nan,
84
  "qasm_measure_count": np.nan, "qasm_comment_count": np.nan}
 
85
  text = qasm_value
86
  lines = [line for line in text.splitlines() if line.strip()]
87
+ gate_keywords = re.findall(r"\b(cx|h|x|y|z|rx|ry|rz|u1|u2|u3|u|swap|cz|ccx|rxx|ryy|rzz)\b",
88
+ text, flags=re.IGNORECASE)
89
  measure_count = len(re.findall(r"\bmeasure\b", text, flags=re.IGNORECASE))
90
  comment_count = sum(1 for line in lines if line.strip().startswith("//"))
 
91
  return {
92
  "qasm_length": float(len(text)),
93
  "qasm_line_count": float(len(lines)),
 
102
  if "adjacency" in df.columns:
103
  adj_df = df["adjacency"].apply(adjacency_features).apply(pd.Series)
104
  df = pd.concat([df, adj_df], axis=1)
 
105
  qasm_source = "qasm_transpiled" if "qasm_transpiled" in df.columns else "qasm_raw"
106
  if qasm_source in df.columns:
107
  qasm_df = df[qasm_source].apply(qasm_features).apply(pd.Series)
 
109
  return df
110
 
111
 
112
def load_single_dataset(dataset_key: str) -> pd.DataFrame:
    """Fetch one QSBench repo by key, enrich it, label it, and cache it.

    The frame is memoised in ``_ASSET_CACHE`` so each repo is downloaded
    and feature-engineered at most once per process.
    """
    cached = _ASSET_CACHE.get(dataset_key)
    if cached is None:
        config = REPO_CONFIG[dataset_key]
        raw = load_dataset(config["repo"])
        frame = enrich_dataframe(pd.DataFrame(raw["train"]))
        frame["noise_label"] = config["label"]
        _ASSET_CACHE[dataset_key] = frame
        cached = frame
    return cached
120
+
121
+
122
def load_combined_dataset() -> pd.DataFrame:
    """Concatenate every configured dataset into one labelled frame (cached).

    Rows whose ``noise_label`` is not one of ``CLASS_ORDER`` are dropped so
    downstream training always sees exactly the four known classes.
    """
    global _COMBINED_CACHE
    if _COMBINED_CACHE is not None:
        return _COMBINED_CACHE
    stacked = pd.concat(
        [load_single_dataset(key) for key in REPO_CONFIG],
        ignore_index=True,
    )
    keep = stacked["noise_label"].isin(CLASS_ORDER)
    _COMBINED_CACHE = stacked[keep].copy()
    return _COMBINED_CACHE
130
+
131
+
132
def get_available_feature_columns(df: pd.DataFrame, non_feature_cols=None,
                                  exclude_patterns=None) -> List[str]:
    """Return the sorted numeric columns of *df* usable as model inputs.

    Args:
        df: Source frame (typically the combined QSBench frame).
        non_feature_cols: Column names excluded outright (identifiers, labels,
            raw QASM, ...). Defaults to the module-level ``NON_FEATURE_COLS``.
        exclude_patterns: Substrings that disqualify a column (leakage columns
            such as ``ideal_*``). Defaults to ``SOFT_EXCLUDE_PATTERNS``.

    Returns:
        Alphabetically sorted list of candidate feature column names.
    """
    if non_feature_cols is None:
        non_feature_cols = NON_FEATURE_COLS
    if exclude_patterns is None:
        exclude_patterns = SOFT_EXCLUDE_PATTERNS
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    features = [
        col
        for col in numeric_cols
        if col not in non_feature_cols
        and not any(pattern in col for pattern in exclude_patterns)
    ]
    return sorted(features)
137
 
138
 
 
142
  return [f for f in preferred if f in features]
143
 
144
 
145
def make_classification_figure(y_true, y_pred, class_names, feature_names=None, importances=None):
    """Build a 3-panel diagnostic figure for a fitted classifier.

    Panels: confusion matrix with per-cell counts, a correct-vs-incorrect
    histogram, and (when available) the top-10 feature importances.
    Returns the matplotlib ``Figure``.
    """
    fig = plt.figure(figsize=(20, 6))
    grid = fig.add_gridspec(1, 3)
    cm_ax = fig.add_subplot(grid[0, 0])
    err_ax = fig.add_subplot(grid[0, 1])
    imp_ax = fig.add_subplot(grid[0, 2])

    # Panel 1: confusion matrix, annotated cell by cell.
    matrix = confusion_matrix(y_true, y_pred, labels=class_names)
    image = cm_ax.imshow(matrix, interpolation="nearest")
    cm_ax.set_title("Confusion Matrix")
    cm_ax.set_xlabel("Predicted")
    cm_ax.set_ylabel("Actual")
    ticks = np.arange(len(class_names))
    cm_ax.set_xticks(ticks)
    cm_ax.set_yticks(ticks)
    cm_ax.set_xticklabels(class_names, rotation=45, ha="right")
    cm_ax.set_yticklabels(class_names)
    for row, col in np.ndindex(matrix.shape):
        cm_ax.text(col, row, matrix[row, col], ha="center", va="center")
    fig.colorbar(image, ax=cm_ax, fraction=0.046, pad=0.04)

    # Panel 2: binary histogram of prediction misses.
    misses = (y_true != y_pred).astype(int)
    err_ax.hist(misses, bins=[-0.5, 0.5, 1.5])
    err_ax.set_title("Correct vs Incorrect")
    err_ax.set_xlabel("0 = Correct, 1 = Incorrect")
    err_ax.set_ylabel("Count")

    # Panel 3: importances only when the model supplied a matching vector.
    have_importances = (
        importances is not None
        and feature_names is not None
        and len(importances) == len(feature_names)
    )
    if have_importances:
        top = np.argsort(importances)[-10:]
        imp_ax.barh([feature_names[i] for i in top], importances[top])
        imp_ax.set_title("Top-10 Feature Importances")
        imp_ax.set_xlabel("Importance")
    else:
        imp_ax.text(0.5, 0.5, "Feature importances unavailable", ha="center", va="center")
        imp_ax.set_axis_off()

    fig.tight_layout()
    return fig
183
+
184
+
185
def train_classifier(feature_columns, test_size, max_depth, random_state, n_estimators=200):
    """Train a gradient-boosting noise classifier and report test metrics.

    Returns a ``(figure, markdown)`` pair; when no features are selected the
    figure slot is ``None`` and the markdown carries a user-facing prompt.
    """
    if not feature_columns:
        return None, "### Please select at least one feature."

    data = load_combined_dataset()
    data = data.dropna(subset=feature_columns + ["noise_label"])
    features = data[feature_columns]
    labels = data["noise_label"]

    rng_seed = int(random_state)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=rng_seed, stratify=labels
    )

    model = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("classifier", HistGradientBoostingClassifier(
            max_depth=int(max_depth),
            max_iter=int(n_estimators),
            random_state=rng_seed,
            learning_rate=0.05,
        )),
    ])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    estimator = model.named_steps["classifier"]
    # NOTE(review): HistGradientBoostingClassifier does not expose
    # feature_importances_, so this getattr yields None and the figure
    # helper falls back to its "unavailable" panel — confirm if
    # importances are ever expected here.
    importances = getattr(estimator, "feature_importances_", None)
    fig = make_classification_figure(y_test.to_numpy(), y_pred, CLASS_ORDER, feature_columns, importances)

    report = classification_report(y_test, y_pred, labels=CLASS_ORDER)
    results = f"### Classification report\n```\n{report}\n```"
    return fig, results
212
 
 
213
 
214
CUSTOM_CSS = ".gradio-container {max-width: 1400px !important;}"


with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(f"# 🌌 {APP_TITLE}")
    gr.Markdown(APP_SUBTITLE)

    # gr.TabItem belongs inside a gr.Tabs container.
    with gr.Tabs():
        with gr.TabItem("🧠 Classification"):
            feature_picker = gr.CheckboxGroup(label="Input features", choices=[])
            test_size = gr.Slider(0.1, 0.4, value=0.2, step=0.05, label="Test split")
            max_depth = gr.Slider(1, 30, value=5, step=1, label="Max depth")
            seed = gr.Number(value=42, precision=0, label="Random seed")
            n_estimators = gr.Slider(50, 400, value=200, step=10, label="Iterations")
            run_btn = gr.Button("Train & Evaluate", variant="primary")
            plot = gr.Plot()
            metrics = gr.Markdown()

            dataset_dropdown = gr.Dropdown(list(REPO_CONFIG.keys()), value="clean", label="Dataset")

            def _refresh_features(_selected):
                """Repopulate the feature picker from the combined dataset.

                Offers every usable feature as a choice and pre-selects only
                the preferred default subset.
                """
                feats = get_available_feature_columns(load_combined_dataset())
                return gr.update(choices=feats, value=default_feature_selection(feats))

            # BUG FIX: the previous handler was a one-argument lambda wired
            # with an empty inputs list, so Gradio invoked it with zero
            # arguments and raised TypeError on every change event. It also
            # set `choices` to only the default subset, hiding all other
            # features from the user.
            dataset_dropdown.change(_refresh_features, [dataset_dropdown], [feature_picker])

            run_btn.click(
                train_classifier,
                [feature_picker, test_size, max_depth, seed, n_estimators],
                [plot, metrics],
            )


if __name__ == "__main__":
    demo.launch(theme=gr.themes.Soft(), css=CUSTOM_CSS)