QSBench committed on
Commit
c97ff0e
·
verified ·
1 Parent(s): 2aae7a5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -25
app.py CHANGED
@@ -1,5 +1,7 @@
1
  import ast
 
2
  import logging
 
3
  import re
4
  from typing import Dict, List, Optional, Tuple
5
 
@@ -7,7 +9,6 @@ import gradio as gr
7
  import matplotlib.pyplot as plt
8
  import numpy as np
9
  import pandas as pd
10
- from datasets import load_dataset
11
  from sklearn.ensemble import HistGradientBoostingClassifier
12
  from sklearn.impute import SimpleImputer
13
  from sklearn.inspection import permutation_importance
@@ -20,29 +21,31 @@ logger = logging.getLogger(__name__)
20
 
21
  APP_TITLE = "Noise Detection"
22
  APP_SUBTITLE = (
23
- "Classify quantum circuits into clean, depolarizing, amplitude_damping, or hardware-aware noise conditions."
24
  )
25
 
 
 
26
  REPO_CONFIG = {
27
  "clean": {
28
  "label": "clean",
29
- "repo": "QSBench/QSBench-Core-v1.0.0-demo",
30
  },
31
  "depolarizing": {
32
  "label": "depolarizing",
33
- "repo": "QSBench/QSBench-Depolarizing-Demo-v1.0.0",
34
  },
35
  "amplitude_damping": {
36
  "label": "amplitude_damping",
37
- "repo": "QSBench/QSBench-Amplitude-v1.0.0-demo",
38
  },
39
  "hardware_aware": {
40
  "label": "hardware_aware",
41
- "repo": "QSBench/QSBench-Transpilation-v1.0.0-demo",
42
  },
43
  }
44
 
45
- CLASS_ORDER = ["clean", "depolarizing", "amplitude_damping", "hardware_aware"]
46
 
47
  NON_FEATURE_COLS = {
48
  "sample_id",
@@ -65,12 +68,14 @@ NON_FEATURE_COLS = {
65
  "meyer_wallach",
66
  "cx_count",
67
  "noise_label",
 
 
68
  }
69
 
70
  SOFT_EXCLUDE_PATTERNS = ["ideal_", "noisy_", "error_", "sign_ideal_", "sign_noisy_"]
71
 
72
  _ASSET_CACHE: Dict[str, pd.DataFrame] = {}
73
- _COMBINED_CACHE: Optional[pd.DataFrame] = None
74
 
75
 
76
  def safe_parse(value):
@@ -185,26 +190,27 @@ def _read_parquet_source(path: str) -> pd.DataFrame:
185
 
186
 
187
  def load_single_dataset(dataset_key: str) -> pd.DataFrame:
188
- """Load a dataset shard from Hugging Face and cache it in memory."""
189
  if dataset_key not in _ASSET_CACHE:
190
- logger.info("Loading dataset: %s", dataset_key)
191
- ds = load_dataset(REPO_CONFIG[dataset_key]["repo"])
192
- df = pd.DataFrame(ds["train"])
193
  df = enrich_dataframe(df)
194
  df["noise_label"] = REPO_CONFIG[dataset_key]["label"]
 
195
  _ASSET_CACHE[dataset_key] = df
196
  return _ASSET_CACHE[dataset_key]
197
 
198
 
199
- def load_combined_dataset() -> pd.DataFrame:
200
- """Load and merge all four noise-condition datasets."""
201
- global _COMBINED_CACHE
202
- if _COMBINED_CACHE is None:
203
- frames = [load_single_dataset(key) for key in REPO_CONFIG.keys()]
204
  combined = pd.concat(frames, ignore_index=True)
205
- combined = combined[combined["noise_label"].isin(CLASS_ORDER)].copy()
206
- _COMBINED_CACHE = combined
207
- return _COMBINED_CACHE
208
 
209
 
210
  def load_guide_content() -> str:
@@ -325,11 +331,15 @@ def refresh_explorer(dataset_key: str, split_name: str) -> Tuple[gr.update, pd.D
325
 
326
  profile_box = build_dataset_profile(df)
327
  summary_box = (
328
- f"### Split summary\n\n"
329
- f"**Dataset:** `{dataset_key}` \n"
330
- f"**Label:** `{REPO_CONFIG[dataset_key]['label']}` \n"
331
- f"**Path:** `{REPO_CONFIG[dataset_key]['path']}` \n"
332
- f"**Available splits:** {', '.join(splits)} \n"
 
 
 
 
333
  f"**Preview rows:** {len(display_df)}"
334
  )
335
 
 
1
  import ast
2
+ import glob
3
  import logging
4
+ import os
5
  import re
6
  from typing import Dict, List, Optional, Tuple
7
 
 
9
  import matplotlib.pyplot as plt
10
  import numpy as np
11
  import pandas as pd
 
12
  from sklearn.ensemble import HistGradientBoostingClassifier
13
  from sklearn.impute import SimpleImputer
14
  from sklearn.inspection import permutation_importance
 
21
 
22
  APP_TITLE = "Noise Detection"
23
  APP_SUBTITLE = (
24
+ "Detect hardware-aware transpilation artifacts versus all other circuit conditions using structural circuit features."
25
  )
26
 
27
+ DATA_DIR = os.getenv("QS_DATA_DIR", "data")
28
+
29
  REPO_CONFIG = {
30
  "clean": {
31
  "label": "clean",
32
+ "path": os.getenv("QS_CLEAN_PATH", os.path.join(DATA_DIR, "core")),
33
  },
34
  "depolarizing": {
35
  "label": "depolarizing",
36
+ "path": os.getenv("QS_DEPOLARIZING_PATH", os.path.join(DATA_DIR, "depolarizing")),
37
  },
38
  "amplitude_damping": {
39
  "label": "amplitude_damping",
40
+ "path": os.getenv("QS_AMPLITUDE_PATH", os.path.join(DATA_DIR, "amplitude")),
41
  },
42
  "hardware_aware": {
43
  "label": "hardware_aware",
44
+ "path": os.getenv("QS_HARDWARE_AWARE_PATH", os.path.join(DATA_DIR, "transpilation")),
45
  },
46
  }
47
 
48
+ CLASS_ORDER = ["other", "hardware_aware"]
49
 
50
  NON_FEATURE_COLS = {
51
  "sample_id",
 
68
  "meyer_wallach",
69
  "cx_count",
70
  "noise_label",
71
+ "source_dataset",
72
+ "target_label",
73
  }
74
 
75
  SOFT_EXCLUDE_PATTERNS = ["ideal_", "noisy_", "error_", "sign_ideal_", "sign_noisy_"]
76
 
77
  _ASSET_CACHE: Dict[str, pd.DataFrame] = {}
78
+ _COMBINED_CACHE: Dict[Tuple[str, ...], pd.DataFrame] = {}
79
 
80
 
81
  def safe_parse(value):
 
190
 
191
 
192
  def load_single_dataset(dataset_key: str) -> pd.DataFrame:
193
+ """Load a local parquet dataset and cache it in memory."""
194
  if dataset_key not in _ASSET_CACHE:
195
+ path = _resolve_path(dataset_key)
196
+ logger.info("Loading local dataset: %s -> %s", dataset_key, path)
197
+ df = _read_parquet_source(path)
198
  df = enrich_dataframe(df)
199
  df["noise_label"] = REPO_CONFIG[dataset_key]["label"]
200
+ df["source_dataset"] = dataset_key
201
  _ASSET_CACHE[dataset_key] = df
202
  return _ASSET_CACHE[dataset_key]
203
 
204
 
205
+ def load_combined_dataset(dataset_keys: List[str]) -> pd.DataFrame:
206
+ """Load and merge selected local datasets."""
207
+ cache_key = tuple(sorted(dataset_keys))
208
+ if cache_key not in _COMBINED_CACHE:
209
+ frames = [load_single_dataset(key) for key in dataset_keys]
210
  combined = pd.concat(frames, ignore_index=True)
211
+ combined = combined.copy()
212
+ _COMBINED_CACHE[cache_key] = combined
213
+ return _COMBINED_CACHE[cache_key]
214
 
215
 
216
  def load_guide_content() -> str:
 
331
 
332
  profile_box = build_dataset_profile(df)
333
  summary_box = (
334
+ f"### Split summary
335
+
336
+ "
337
+ f"**Dataset:** `{dataset_key}`
338
+ "
339
+ f"**Label:** `{REPO_CONFIG[dataset_key]['label']}`
340
+ "
341
+ f"**Available splits:** {', '.join(splits)}
342
+ "
343
  f"**Preview rows:** {len(display_df)}"
344
  )
345