Update app.py
app.py CHANGED

@@ -1,7 +1,5 @@
 import ast
-import glob
 import logging
-import os
 import re
 from typing import Dict, List, Optional, Tuple
 
@@ -9,6 +7,7 @@ import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+from datasets import load_dataset
 from sklearn.ensemble import HistGradientBoostingClassifier
 from sklearn.impute import SimpleImputer
 from sklearn.inspection import permutation_importance
@@ -21,31 +20,29 @@ logger = logging.getLogger(__name__)
 
 APP_TITLE = "Noise Detection"
 APP_SUBTITLE = (
-    "
+    "Classify quantum circuits into clean, depolarizing, amplitude_damping, or hardware-aware noise conditions."
 )
 
-DATA_DIR = os.getenv("QS_DATA_DIR", "data")
-
 REPO_CONFIG = {
     "clean": {
         "label": "clean",
-        "
+        "repo": "QSBench/QSBench-Core-v1.0.0-demo",
     },
     "depolarizing": {
         "label": "depolarizing",
-        "
+        "repo": "QSBench/QSBench-Depolarizing-Demo-v1.0.0",
     },
     "amplitude_damping": {
         "label": "amplitude_damping",
-        "
+        "repo": "QSBench/QSBench-Amplitude-v1.0.0-demo",
     },
     "hardware_aware": {
         "label": "hardware_aware",
-        "
+        "repo": "QSBench/QSBench-Transpilation-v1.0.0-demo",
     },
 }
 
-CLASS_ORDER = ["
+CLASS_ORDER = ["clean", "depolarizing", "amplitude_damping", "hardware_aware"]
 
 NON_FEATURE_COLS = {
     "sample_id",
@@ -68,14 +65,12 @@ NON_FEATURE_COLS = {
     "meyer_wallach",
     "cx_count",
     "noise_label",
-    "source_dataset",
-    "target_label",
 }
 
 SOFT_EXCLUDE_PATTERNS = ["ideal_", "noisy_", "error_", "sign_ideal_", "sign_noisy_"]
 
 _ASSET_CACHE: Dict[str, pd.DataFrame] = {}
-_COMBINED_CACHE:
+_COMBINED_CACHE: Optional[pd.DataFrame] = None
 
 
 def safe_parse(value):
@@ -167,50 +162,32 @@ def enrich_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     return df
 
 
-def _resolve_path(dataset_key: str) -> str:
-    path = REPO_CONFIG[dataset_key]["path"]
-    if not os.path.exists(path):
-        raise FileNotFoundError(
-            f"Local dataset path not found for '{dataset_key}': {path}. "
-            "Set the matching environment variable or place the parquet directory at this path."
-        )
-    return path
-
-
-def _read_parquet_source(path: str) -> pd.DataFrame:
-    """Read a parquet file or a directory of parquet shards."""
-    if os.path.isdir(path):
-        files = sorted(glob.glob(os.path.join(path, "*.parquet")))
-        if not files:
-            raise FileNotFoundError(f"No parquet files found in directory: {path}")
-        frames = [pd.read_parquet(file_path) for file_path in files]
-        return pd.concat(frames, ignore_index=True)
-
-    return pd.read_parquet(path)
-
-
 def load_single_dataset(dataset_key: str) -> pd.DataFrame:
-    """Load a
+    """Load a dataset shard from Hugging Face and cache it in memory."""
     if dataset_key not in _ASSET_CACHE:
-
-
-        df =
+        logger.info("Loading dataset: %s", dataset_key)
+        ds = load_dataset(REPO_CONFIG[dataset_key]["repo"])
+        df = pd.DataFrame(ds["train"])
         df = enrich_dataframe(df)
         df["noise_label"] = REPO_CONFIG[dataset_key]["label"]
-        df["source_dataset"] = dataset_key
        _ASSET_CACHE[dataset_key] = df
     return _ASSET_CACHE[dataset_key]
 
 
-def load_combined_dataset(dataset_keys: List[str]) -> pd.DataFrame:
-    """Load and merge selected
+def load_combined_dataset(dataset_keys: Optional[List[str]] = None) -> pd.DataFrame:
+    """Load and merge selected noise-condition datasets."""
+    global _COMBINED_CACHE
+    if dataset_keys is None:
+        dataset_keys = list(REPO_CONFIG.keys())
+
     cache_key = tuple(sorted(dataset_keys))
-    if
+    if _COMBINED_CACHE is None or not isinstance(_COMBINED_CACHE, pd.DataFrame) or getattr(_COMBINED_CACHE, "_cache_key", None) != cache_key:
         frames = [load_single_dataset(key) for key in dataset_keys]
         combined = pd.concat(frames, ignore_index=True)
-        combined = combined.copy()
-
-
+        combined = combined[combined["noise_label"].isin(CLASS_ORDER)].copy()
+        combined._cache_key = cache_key  # type: ignore[attr-defined]
+        _COMBINED_CACHE = combined
+    return _COMBINED_CACHE
 
 
 def load_guide_content() -> str:
@@ -367,7 +344,7 @@ def train_classifier(
     max_depth: float,
     random_state: float,
 ) -> Tuple[Optional[plt.Figure], str]:
-    """Train a
+    """Train a four-class classifier and return metrics plus a plot."""
     if not dataset_keys:
         return None, "### ❌ Please select at least one dataset."
 
@@ -375,23 +352,15 @@ def train_classifier(
         return None, "### ❌ Please select at least one feature."
 
     df = load_combined_dataset(dataset_keys).copy()
-
-
-
-        return None, "### ❌ Target label could not be created."
-
-    train_df = df.dropna(subset=["target_label"]).copy()
+    required_cols = feature_columns + ["noise_label"]
+    train_df = df.dropna(subset=required_cols).copy()
+    train_df = train_df[train_df["noise_label"].isin(CLASS_ORDER)]
 
     if len(train_df) < 20:
         return None, "### ❌ Not enough rows after filtering missing values."
 
-    X = train_df[feature_columns]
-
-    if X.shape[1] == 0:
-        return None, "### ❌ All selected features are empty in the chosen datasets."
-
-    feature_columns = X.columns.tolist()
-    y = train_df["target_label"]
+    X = train_df[feature_columns]
+    y = train_df["noise_label"]
 
     seed = int(random_state)
     depth = int(max_depth) if max_depth and int(max_depth) > 0 else None