Spaces:

QSBench
/

Noise_Detection

Running

App Files Files Community

QSBench committed 4 days ago

Commit

c97ff0e

verified ·

1 Parent(s): 2aae7a5

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -25

app.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import ast
 import logging
 import re
 from typing import Dict, List, Optional, Tuple
@@ -7,7 +9,6 @@ import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-from datasets import load_dataset
 from sklearn.ensemble import HistGradientBoostingClassifier
 from sklearn.impute import SimpleImputer
 from sklearn.inspection import permutation_importance
@@ -20,29 +21,31 @@ logger = logging.getLogger(__name__)
 APP_TITLE = "Noise Detection"
 APP_SUBTITLE = (
-    "Classify quantum circuits into clean, depolarizing, amplitude_damping, or hardware-aware noise conditions."
 )
 REPO_CONFIG = {
     "clean": {
         "label": "clean",
-        "repo": "QSBench/QSBench-Core-v1.0.0-demo",
     },
     "depolarizing": {
         "label": "depolarizing",
-        "repo": "QSBench/QSBench-Depolarizing-Demo-v1.0.0",
     },
     "amplitude_damping": {
         "label": "amplitude_damping",
-        "repo": "QSBench/QSBench-Amplitude-v1.0.0-demo",
     },
     "hardware_aware": {
         "label": "hardware_aware",
-        "repo": "QSBench/QSBench-Transpilation-v1.0.0-demo",
     },
 }
-CLASS_ORDER = ["clean", "depolarizing", "amplitude_damping", "hardware_aware"]
 NON_FEATURE_COLS = {
     "sample_id",
@@ -65,12 +68,14 @@ NON_FEATURE_COLS = {
     "meyer_wallach",
     "cx_count",
     "noise_label",
 }
 SOFT_EXCLUDE_PATTERNS = ["ideal_", "noisy_", "error_", "sign_ideal_", "sign_noisy_"]
 _ASSET_CACHE: Dict[str, pd.DataFrame] = {}
-_COMBINED_CACHE: Optional[pd.DataFrame] = None
 def safe_parse(value):
@@ -185,26 +190,27 @@ def _read_parquet_source(path: str) -> pd.DataFrame:
 def load_single_dataset(dataset_key: str) -> pd.DataFrame:
-    """Load a dataset shard from Hugging Face and cache it in memory."""
     if dataset_key not in _ASSET_CACHE:
-        logger.info("Loading dataset: %s", dataset_key)
-        ds = load_dataset(REPO_CONFIG[dataset_key]["repo"])
-        df = pd.DataFrame(ds["train"])
         df = enrich_dataframe(df)
         df["noise_label"] = REPO_CONFIG[dataset_key]["label"]
         _ASSET_CACHE[dataset_key] = df
     return _ASSET_CACHE[dataset_key]
-def load_combined_dataset() -> pd.DataFrame:
-    """Load and merge all four noise-condition datasets."""
-    global _COMBINED_CACHE
-    if _COMBINED_CACHE is None:
-        frames = [load_single_dataset(key) for key in REPO_CONFIG.keys()]
         combined = pd.concat(frames, ignore_index=True)
-        combined = combined[combined["noise_label"].isin(CLASS_ORDER)].copy()
-        _COMBINED_CACHE = combined
-    return _COMBINED_CACHE
 def load_guide_content() -> str:
@@ -325,11 +331,15 @@ def refresh_explorer(dataset_key: str, split_name: str) -> Tuple[gr.update, pd.D
     profile_box = build_dataset_profile(df)
     summary_box = (
-        f"### Split summary\n\n"
-        f"**Dataset:** `{dataset_key}`  \n"
-        f"**Label:** `{REPO_CONFIG[dataset_key]['label']}`  \n"
-        f"**Path:** `{REPO_CONFIG[dataset_key]['path']}`  \n"
-        f"**Available splits:** {', '.join(splits)}  \n"
         f"**Preview rows:** {len(display_df)}"
     )

 import ast
+import glob
 import logging
+import os
 import re
 from typing import Dict, List, Optional, Tuple
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 from sklearn.ensemble import HistGradientBoostingClassifier
 from sklearn.impute import SimpleImputer
 from sklearn.inspection import permutation_importance
 APP_TITLE = "Noise Detection"
 APP_SUBTITLE = (
+    "Detect hardware-aware transpilation artifacts versus all other circuit conditions using structural circuit features."
 )
+DATA_DIR = os.getenv("QS_DATA_DIR", "data")
 REPO_CONFIG = {
     "clean": {
         "label": "clean",
+        "path": os.getenv("QS_CLEAN_PATH", os.path.join(DATA_DIR, "core")),
     },
     "depolarizing": {
         "label": "depolarizing",
+        "path": os.getenv("QS_DEPOLARIZING_PATH", os.path.join(DATA_DIR, "depolarizing")),
     },
     "amplitude_damping": {
         "label": "amplitude_damping",
+        "path": os.getenv("QS_AMPLITUDE_PATH", os.path.join(DATA_DIR, "amplitude")),
     },
     "hardware_aware": {
         "label": "hardware_aware",
+        "path": os.getenv("QS_HARDWARE_AWARE_PATH", os.path.join(DATA_DIR, "transpilation")),
     },
 }
+CLASS_ORDER = ["other", "hardware_aware"]
 NON_FEATURE_COLS = {
     "sample_id",
     "meyer_wallach",
     "cx_count",
     "noise_label",
+    "source_dataset",
+    "target_label",
 }
 SOFT_EXCLUDE_PATTERNS = ["ideal_", "noisy_", "error_", "sign_ideal_", "sign_noisy_"]
 _ASSET_CACHE: Dict[str, pd.DataFrame] = {}
+_COMBINED_CACHE: Dict[Tuple[str, ...], pd.DataFrame] = {}
 def safe_parse(value):
 def load_single_dataset(dataset_key: str) -> pd.DataFrame:
+    """Load a local parquet dataset and cache it in memory."""
     if dataset_key not in _ASSET_CACHE:
+        path = _resolve_path(dataset_key)
+        logger.info("Loading local dataset: %s -> %s", dataset_key, path)
+        df = _read_parquet_source(path)
         df = enrich_dataframe(df)
         df["noise_label"] = REPO_CONFIG[dataset_key]["label"]
+        df["source_dataset"] = dataset_key
         _ASSET_CACHE[dataset_key] = df
     return _ASSET_CACHE[dataset_key]
+def load_combined_dataset(dataset_keys: List[str]) -> pd.DataFrame:
+    """Load and merge selected local datasets."""
+    cache_key = tuple(sorted(dataset_keys))
+    if cache_key not in _COMBINED_CACHE:
+        frames = [load_single_dataset(key) for key in dataset_keys]
         combined = pd.concat(frames, ignore_index=True)
+        combined = combined.copy()
+        _COMBINED_CACHE[cache_key] = combined
+    return _COMBINED_CACHE[cache_key]
 def load_guide_content() -> str:
     profile_box = build_dataset_profile(df)
     summary_box = (
+        f"### Split summary\n\n"
+        f"**Dataset:** `{dataset_key}`  \n"
+        f"**Label:** `{REPO_CONFIG[dataset_key]['label']}`  \n"
+        f"**Available splits:** {', '.join(splits)}  \n"
         f"**Preview rows:** {len(display_df)}"
     )