QSBench committed on
Commit
e4692be
·
verified ·
1 Parent(s): d96d8b6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -60
app.py CHANGED
@@ -1,7 +1,5 @@
1
  import ast
2
- import glob
3
  import logging
4
- import os
5
  import re
6
  from typing import Dict, List, Optional, Tuple
7
 
@@ -9,6 +7,7 @@ import gradio as gr
9
  import matplotlib.pyplot as plt
10
  import numpy as np
11
  import pandas as pd
 
12
  from sklearn.ensemble import HistGradientBoostingClassifier
13
  from sklearn.impute import SimpleImputer
14
  from sklearn.inspection import permutation_importance
@@ -21,31 +20,29 @@ logger = logging.getLogger(__name__)
21
 
22
  APP_TITLE = "Noise Detection"
23
  APP_SUBTITLE = (
24
- "Detect hardware-aware transpilation artifacts versus all other circuit conditions using structural circuit features."
25
  )
26
 
27
- DATA_DIR = os.getenv("QS_DATA_DIR", "data")
28
-
29
  REPO_CONFIG = {
30
  "clean": {
31
  "label": "clean",
32
- "path": os.getenv("QS_CLEAN_PATH", os.path.join(DATA_DIR, "core")),
33
  },
34
  "depolarizing": {
35
  "label": "depolarizing",
36
- "path": os.getenv("QS_DEPOLARIZING_PATH", os.path.join(DATA_DIR, "depolarizing")),
37
  },
38
  "amplitude_damping": {
39
  "label": "amplitude_damping",
40
- "path": os.getenv("QS_AMPLITUDE_PATH", os.path.join(DATA_DIR, "amplitude")),
41
  },
42
  "hardware_aware": {
43
  "label": "hardware_aware",
44
- "path": os.getenv("QS_HARDWARE_AWARE_PATH", os.path.join(DATA_DIR, "transpilation")),
45
  },
46
  }
47
 
48
- CLASS_ORDER = ["other", "hardware_aware"]
49
 
50
  NON_FEATURE_COLS = {
51
  "sample_id",
@@ -68,14 +65,12 @@ NON_FEATURE_COLS = {
68
  "meyer_wallach",
69
  "cx_count",
70
  "noise_label",
71
- "source_dataset",
72
- "target_label",
73
  }
74
 
75
  SOFT_EXCLUDE_PATTERNS = ["ideal_", "noisy_", "error_", "sign_ideal_", "sign_noisy_"]
76
 
77
  _ASSET_CACHE: Dict[str, pd.DataFrame] = {}
78
- _COMBINED_CACHE: Dict[Tuple[str, ...], pd.DataFrame] = {}
79
 
80
 
81
  def safe_parse(value):
@@ -167,50 +162,32 @@ def enrich_dataframe(df: pd.DataFrame) -> pd.DataFrame:
167
  return df
168
 
169
 
170
- def _resolve_path(dataset_key: str) -> str:
171
- path = REPO_CONFIG[dataset_key]["path"]
172
- if not os.path.exists(path):
173
- raise FileNotFoundError(
174
- f"Local dataset path not found for '{dataset_key}': {path}. "
175
- "Set the matching environment variable or place the parquet directory at this path."
176
- )
177
- return path
178
-
179
-
180
- def _read_parquet_source(path: str) -> pd.DataFrame:
181
- """Read a parquet file or a directory of parquet shards."""
182
- if os.path.isdir(path):
183
- files = sorted(glob.glob(os.path.join(path, "*.parquet")))
184
- if not files:
185
- raise FileNotFoundError(f"No parquet files found in directory: {path}")
186
- frames = [pd.read_parquet(file_path) for file_path in files]
187
- return pd.concat(frames, ignore_index=True)
188
-
189
- return pd.read_parquet(path)
190
-
191
-
192
  def load_single_dataset(dataset_key: str) -> pd.DataFrame:
193
- """Load a local parquet dataset and cache it in memory."""
194
  if dataset_key not in _ASSET_CACHE:
195
- path = _resolve_path(dataset_key)
196
- logger.info("Loading local dataset: %s -> %s", dataset_key, path)
197
- df = _read_parquet_source(path)
198
  df = enrich_dataframe(df)
199
  df["noise_label"] = REPO_CONFIG[dataset_key]["label"]
200
- df["source_dataset"] = dataset_key
201
  _ASSET_CACHE[dataset_key] = df
202
  return _ASSET_CACHE[dataset_key]
203
 
204
 
205
- def load_combined_dataset(dataset_keys: List[str]) -> pd.DataFrame:
206
- """Load and merge selected local datasets."""
 
 
 
 
207
  cache_key = tuple(sorted(dataset_keys))
208
- if cache_key not in _COMBINED_CACHE:
209
  frames = [load_single_dataset(key) for key in dataset_keys]
210
  combined = pd.concat(frames, ignore_index=True)
211
- combined = combined.copy()
212
- _COMBINED_CACHE[cache_key] = combined
213
- return _COMBINED_CACHE[cache_key]
 
214
 
215
 
216
  def load_guide_content() -> str:
@@ -367,7 +344,7 @@ def train_classifier(
367
  max_depth: float,
368
  random_state: float,
369
  ) -> Tuple[Optional[plt.Figure], str]:
370
- """Train a binary classifier for hardware-aware detection."""
371
  if not dataset_keys:
372
  return None, "### ❌ Please select at least one dataset."
373
 
@@ -375,23 +352,15 @@ def train_classifier(
375
  return None, "### ❌ Please select at least one feature."
376
 
377
  df = load_combined_dataset(dataset_keys).copy()
378
- df["target_label"] = np.where(df["source_dataset"] == "hardware_aware", "hardware_aware", "other")
379
-
380
- if "target_label" not in df.columns:
381
- return None, "### ❌ Target label could not be created."
382
-
383
- train_df = df.dropna(subset=["target_label"]).copy()
384
 
385
  if len(train_df) < 20:
386
  return None, "### ❌ Not enough rows after filtering missing values."
387
 
388
- X = train_df[feature_columns].copy()
389
- X = X.dropna(axis=1, how="all")
390
- if X.shape[1] == 0:
391
- return None, "### ❌ All selected features are empty in the chosen datasets."
392
-
393
- feature_columns = X.columns.tolist()
394
- y = train_df["target_label"]
395
 
396
  seed = int(random_state)
397
  depth = int(max_depth) if max_depth and int(max_depth) > 0 else None
 
1
  import ast
 
2
  import logging
 
3
  import re
4
  from typing import Dict, List, Optional, Tuple
5
 
 
7
  import matplotlib.pyplot as plt
8
  import numpy as np
9
  import pandas as pd
10
+ from datasets import load_dataset
11
  from sklearn.ensemble import HistGradientBoostingClassifier
12
  from sklearn.impute import SimpleImputer
13
  from sklearn.inspection import permutation_importance
 
20
 
21
  APP_TITLE = "Noise Detection"
22
  APP_SUBTITLE = (
23
+ "Classify quantum circuits into clean, depolarizing, amplitude_damping, or hardware-aware noise conditions."
24
  )
25
 
 
 
26
  REPO_CONFIG = {
27
  "clean": {
28
  "label": "clean",
29
+ "repo": "QSBench/QSBench-Core-v1.0.0-demo",
30
  },
31
  "depolarizing": {
32
  "label": "depolarizing",
33
+ "repo": "QSBench/QSBench-Depolarizing-Demo-v1.0.0",
34
  },
35
  "amplitude_damping": {
36
  "label": "amplitude_damping",
37
+ "repo": "QSBench/QSBench-Amplitude-v1.0.0-demo",
38
  },
39
  "hardware_aware": {
40
  "label": "hardware_aware",
41
+ "repo": "QSBench/QSBench-Transpilation-v1.0.0-demo",
42
  },
43
  }
44
 
45
+ CLASS_ORDER = ["clean", "depolarizing", "amplitude_damping", "hardware_aware"]
46
 
47
  NON_FEATURE_COLS = {
48
  "sample_id",
 
65
  "meyer_wallach",
66
  "cx_count",
67
  "noise_label",
 
 
68
  }
69
 
70
  SOFT_EXCLUDE_PATTERNS = ["ideal_", "noisy_", "error_", "sign_ideal_", "sign_noisy_"]
71
 
72
  _ASSET_CACHE: Dict[str, pd.DataFrame] = {}
73
+ _COMBINED_CACHE: Optional[pd.DataFrame] = None
74
 
75
 
76
  def safe_parse(value):
 
162
  return df
163
 
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  def load_single_dataset(dataset_key: str) -> pd.DataFrame:
166
+ """Load a dataset shard from Hugging Face and cache it in memory."""
167
  if dataset_key not in _ASSET_CACHE:
168
+ logger.info("Loading dataset: %s", dataset_key)
169
+ ds = load_dataset(REPO_CONFIG[dataset_key]["repo"])
170
+ df = pd.DataFrame(ds["train"])
171
  df = enrich_dataframe(df)
172
  df["noise_label"] = REPO_CONFIG[dataset_key]["label"]
 
173
  _ASSET_CACHE[dataset_key] = df
174
  return _ASSET_CACHE[dataset_key]
175
 
176
 
177
+ def load_combined_dataset(dataset_keys: Optional[List[str]] = None) -> pd.DataFrame:
178
+ """Load and merge selected noise-condition datasets."""
179
+ global _COMBINED_CACHE
180
+ if dataset_keys is None:
181
+ dataset_keys = list(REPO_CONFIG.keys())
182
+
183
  cache_key = tuple(sorted(dataset_keys))
184
+ if _COMBINED_CACHE is None or not isinstance(_COMBINED_CACHE, pd.DataFrame) or getattr(_COMBINED_CACHE, "_cache_key", None) != cache_key:
185
  frames = [load_single_dataset(key) for key in dataset_keys]
186
  combined = pd.concat(frames, ignore_index=True)
187
+ combined = combined[combined["noise_label"].isin(CLASS_ORDER)].copy()
188
+ combined._cache_key = cache_key # type: ignore[attr-defined]
189
+ _COMBINED_CACHE = combined
190
+ return _COMBINED_CACHE
191
 
192
 
193
  def load_guide_content() -> str:
 
344
  max_depth: float,
345
  random_state: float,
346
  ) -> Tuple[Optional[plt.Figure], str]:
347
+ """Train a four-class classifier and return metrics plus a plot."""
348
  if not dataset_keys:
349
  return None, "### ❌ Please select at least one dataset."
350
 
 
352
  return None, "### ❌ Please select at least one feature."
353
 
354
  df = load_combined_dataset(dataset_keys).copy()
355
+ required_cols = feature_columns + ["noise_label"]
356
+ train_df = df.dropna(subset=required_cols).copy()
357
+ train_df = train_df[train_df["noise_label"].isin(CLASS_ORDER)]
 
 
 
358
 
359
  if len(train_df) < 20:
360
  return None, "### ❌ Not enough rows after filtering missing values."
361
 
362
+ X = train_df[feature_columns]
363
+ y = train_df["noise_label"]
 
 
 
 
 
364
 
365
  seed = int(random_state)
366
  depth = int(max_depth) if max_depth and int(max_depth) > 0 else None