QSBench committed on
Commit
ffca061
·
verified ·
1 Parent(s): 2635a44

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -104
app.py CHANGED
@@ -1,62 +1,51 @@
1
  import ast
2
  import logging
3
  import re
4
- from typing import Dict, List
5
 
6
  import gradio as gr
7
  import matplotlib.pyplot as plt
8
  import numpy as np
9
  import pandas as pd
10
  from datasets import load_dataset
11
- from sklearn.model_selection import train_test_split
12
  from sklearn.ensemble import HistGradientBoostingClassifier
13
- from sklearn.metrics import classification_report, confusion_matrix
 
 
 
 
14
 
15
  logging.basicConfig(level=logging.INFO)
16
  logger = logging.getLogger(__name__)
17
 
18
  APP_TITLE = "Noise Detection"
19
- APP_SUBTITLE = "Classify circuits by noise type: clean, depolarizing, amplitude_damping, hardware_aware."
 
 
20
 
21
  REPO_CONFIG = {
22
- "Core (Clean)": "QSBench/QSBench-Core-v1.0.0-demo",
23
- "Depolarizing Noise": "QSBench/QSBench-Depolarizing-Demo-v1.0.0",
24
- "Amplitude Damping": "QSBench/QSBench-Amplitude-v1.0.0-demo",
25
- "Hardware-Aware Noise": "QSBench/QSBench-Transpilation-v1.0.0-demo",
26
  }
27
 
 
 
28
  NON_FEATURE_COLS = {
29
- "sample_id",
30
- "sample_seed",
31
- "circuit_hash",
32
- "split",
33
- "circuit_qasm",
34
- "qasm_raw",
35
- "qasm_transpiled",
36
- "circuit_type_resolved",
37
- "circuit_type_requested",
38
- "noise_type",
39
- "noise_prob",
40
- "observable_bases",
41
- "observable_mode",
42
- "backend_device",
43
- "precision_mode",
44
- "circuit_signature",
45
  }
46
 
47
- _SOFT_EXCLUDE_PATTERNS = ["ideal_", "noisy_", "error_", "sign_ideal_", "sign_noisy_"]
48
 
49
  _ASSET_CACHE: Dict[str, pd.DataFrame] = {}
50
-
51
-
52
- def load_dataset_df(dataset_key: str) -> pd.DataFrame:
53
- if dataset_key not in _ASSET_CACHE:
54
- ds = load_dataset(REPO_CONFIG[dataset_key])
55
- df = pd.DataFrame(ds["train"])
56
- df = enrich_dataframe(df)
57
- df["noise_label"] = dataset_key
58
- _ASSET_CACHE[dataset_key] = df
59
- return _ASSET_CACHE[dataset_key]
60
 
61
 
62
  def safe_parse(value):
@@ -72,7 +61,6 @@ def adjacency_features(adj_value) -> Dict[str, float]:
72
  parsed = safe_parse(adj_value)
73
  if not isinstance(parsed, list) or len(parsed) == 0:
74
  return {"adj_edge_count": np.nan, "adj_density": np.nan, "adj_degree_mean": np.nan, "adj_degree_std": np.nan}
75
-
76
  try:
77
  arr = np.array(parsed, dtype=float)
78
  n = arr.shape[0]
@@ -94,13 +82,12 @@ def qasm_features(qasm_value) -> Dict[str, float]:
94
  if not isinstance(qasm_value, str) or not qasm_value.strip():
95
  return {"qasm_length": np.nan, "qasm_line_count": np.nan, "qasm_gate_keyword_count": np.nan,
96
  "qasm_measure_count": np.nan, "qasm_comment_count": np.nan}
97
-
98
  text = qasm_value
99
  lines = [line for line in text.splitlines() if line.strip()]
100
- gate_keywords = re.findall(r"\b(cx|h|x|y|z|rx|ry|rz|u1|u2|u3|u|swap|cz|ccx|rxx|ryy|rzz)\b", text, flags=re.IGNORECASE)
 
101
  measure_count = len(re.findall(r"\bmeasure\b", text, flags=re.IGNORECASE))
102
  comment_count = sum(1 for line in lines if line.strip().startswith("//"))
103
-
104
  return {
105
  "qasm_length": float(len(text)),
106
  "qasm_line_count": float(len(lines)),
@@ -115,7 +102,6 @@ def enrich_dataframe(df: pd.DataFrame) -> pd.DataFrame:
115
  if "adjacency" in df.columns:
116
  adj_df = df["adjacency"].apply(adjacency_features).apply(pd.Series)
117
  df = pd.concat([df, adj_df], axis=1)
118
-
119
  qasm_source = "qasm_transpiled" if "qasm_transpiled" in df.columns else "qasm_raw"
120
  if qasm_source in df.columns:
121
  qasm_df = df[qasm_source].apply(qasm_features).apply(pd.Series)
@@ -123,15 +109,30 @@ def enrich_dataframe(df: pd.DataFrame) -> pd.DataFrame:
123
  return df
124
 
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  def get_available_feature_columns(df: pd.DataFrame) -> List[str]:
127
  numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
128
- features = []
129
- for col in numeric_cols:
130
- if col in NON_FEATURE_COLS:
131
- continue
132
- if any(pattern in col for pattern in _SOFT_EXCLUDE_PATTERNS):
133
- continue
134
- features.append(col)
135
  return sorted(features)
136
 
137
 
@@ -141,76 +142,97 @@ def default_feature_selection(features: List[str]) -> List[str]:
141
  return [f for f in preferred if f in features]
142
 
143
 
144
- def train_classifier(dataset_keys, feature_columns, test_size, seed):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  if not feature_columns:
146
- return None, "No features selected"
147
-
148
- dfs = [load_dataset_df(k) for k in dataset_keys]
149
- df = pd.concat(dfs, axis=0, ignore_index=True)
150
  df = df.dropna(subset=feature_columns + ["noise_label"])
151
-
152
  X = df[feature_columns]
153
  y = df["noise_label"]
154
-
155
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=int(seed), stratify=y)
156
-
157
- model = HistGradientBoostingClassifier(
158
- learning_rate=0.05,
159
- max_iter=200,
160
- max_depth=5,
161
- min_samples_leaf=10,
162
- l2_regularization=0.1,
163
- class_weight="balanced",
164
- random_state=int(seed),
165
- )
166
  model.fit(X_train, y_train)
167
- preds = model.predict(X_test)
168
-
169
- report = classification_report(y_test, preds, output_dict=False)
170
- cm = confusion_matrix(y_test, preds)
 
 
 
171
 
172
- return report, cm.tolist()
173
 
 
174
 
175
- CUSTOM_CSS = """
176
- .gradio-container {max-width: 1400px !important;}
177
- """
178
 
179
  with gr.Blocks(title=APP_TITLE) as demo:
180
  gr.Markdown(f"# 🌌 {APP_TITLE}")
181
  gr.Markdown(APP_SUBTITLE)
182
 
183
- with gr.Tabs():
184
- with gr.TabItem("🧠 Classification"):
185
- dataset_dropdown = gr.CheckboxGroup(list(REPO_CONFIG.keys()), value=list(REPO_CONFIG.keys()), label="Datasets")
186
- feature_picker = gr.CheckboxGroup(label="Input features")
187
-
188
- test_size = gr.Slider(0.1, 0.5, value=0.2, step=0.05, label="Test split")
189
- seed = gr.Number(value=42, label="Random seed")
190
- run_btn = gr.Button("Train & Evaluate", variant="primary")
191
-
192
- metrics = gr.Markdown()
193
- cm_plot = gr.Plot()
194
-
195
- gr.Markdown("---")
196
- gr.Markdown(
197
- "### 🔗 Links\n"
198
- "[Website](https://qsbench.github.io) | "
199
- "[Hugging Face](https://huggingface.co/QSBench) | "
200
- "[GitHub](https://github.com/QSBench)"
201
- )
202
-
203
- dataset_dropdown.change(
204
- lambda datasets: gr.update(choices=get_available_feature_columns(pd.concat([load_dataset_df(k) for k in datasets]))),
205
- [dataset_dropdown],
206
- [feature_picker]
207
- )
208
-
209
- run_btn.click(
210
- train_classifier,
211
- [dataset_dropdown, feature_picker, test_size, seed],
212
- [metrics, cm_plot]
213
- )
214
 
215
  if __name__ == "__main__":
216
  demo.launch(theme=gr.themes.Soft(), css=CUSTOM_CSS)
 
1
  import ast
2
  import logging
3
  import re
4
+ from typing import Dict, List, Optional, Tuple
5
 
6
  import gradio as gr
7
  import matplotlib.pyplot as plt
8
  import numpy as np
9
  import pandas as pd
10
  from datasets import load_dataset
 
11
  from sklearn.ensemble import HistGradientBoostingClassifier
12
+ from sklearn.impute import SimpleImputer
13
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
14
+ from sklearn.model_selection import train_test_split
15
+ from sklearn.pipeline import Pipeline
16
+ from sklearn.preprocessing import StandardScaler
17
 
18
  logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
 
21
  APP_TITLE = "Noise Detection"
22
+ APP_SUBTITLE = (
23
+ "Classify quantum circuits into clean, depolarizing, amplitude_damping, or hardware-aware noise conditions."
24
+ )
25
 
26
  REPO_CONFIG = {
27
+ "clean": {"label": "clean", "repo": "QSBench/QSBench-Core-v1.0.0-demo"},
28
+ "depolarizing": {"label": "depolarizing", "repo": "QSBench/QSBench-Depolarizing-Demo-v1.0.0"},
29
+ "amplitude_damping": {"label": "amplitude_damping", "repo": "QSBench/QSBench-Amplitude-v1.0.0-demo"},
30
+ "hardware_aware": {"label": "hardware_aware", "repo": "QSBench/QSBench-Transpilation-v1.0.0-demo"},
31
  }
32
 
33
+ CLASS_ORDER = ["clean", "depolarizing", "amplitude_damping", "hardware_aware"]
34
+
35
  NON_FEATURE_COLS = {
36
+ "sample_id", "sample_seed", "circuit_hash", "split",
37
+ "circuit_qasm", "qasm_raw", "qasm_transpiled",
38
+ "circuit_type_resolved", "circuit_type_requested",
39
+ "noise_type", "noise_prob", "observable_bases",
40
+ "observable_mode", "backend_device", "precision_mode",
41
+ "circuit_signature", "entanglement", "meyer_wallach",
42
+ "cx_count", "noise_label",
 
 
 
 
 
 
 
 
 
43
  }
44
 
45
+ SOFT_EXCLUDE_PATTERNS = ["ideal_", "noisy_", "error_", "sign_ideal_", "sign_noisy_"]
46
 
47
  _ASSET_CACHE: Dict[str, pd.DataFrame] = {}
48
+ _COMBINED_CACHE: Optional[pd.DataFrame] = None
 
 
 
 
 
 
 
 
 
49
 
50
 
51
  def safe_parse(value):
 
61
  parsed = safe_parse(adj_value)
62
  if not isinstance(parsed, list) or len(parsed) == 0:
63
  return {"adj_edge_count": np.nan, "adj_density": np.nan, "adj_degree_mean": np.nan, "adj_degree_std": np.nan}
 
64
  try:
65
  arr = np.array(parsed, dtype=float)
66
  n = arr.shape[0]
 
82
  if not isinstance(qasm_value, str) or not qasm_value.strip():
83
  return {"qasm_length": np.nan, "qasm_line_count": np.nan, "qasm_gate_keyword_count": np.nan,
84
  "qasm_measure_count": np.nan, "qasm_comment_count": np.nan}
 
85
  text = qasm_value
86
  lines = [line for line in text.splitlines() if line.strip()]
87
+ gate_keywords = re.findall(r"\b(cx|h|x|y|z|rx|ry|rz|u1|u2|u3|u|swap|cz|ccx|rxx|ryy|rzz)\b",
88
+ text, flags=re.IGNORECASE)
89
  measure_count = len(re.findall(r"\bmeasure\b", text, flags=re.IGNORECASE))
90
  comment_count = sum(1 for line in lines if line.strip().startswith("//"))
 
91
  return {
92
  "qasm_length": float(len(text)),
93
  "qasm_line_count": float(len(lines)),
 
102
  if "adjacency" in df.columns:
103
  adj_df = df["adjacency"].apply(adjacency_features).apply(pd.Series)
104
  df = pd.concat([df, adj_df], axis=1)
 
105
  qasm_source = "qasm_transpiled" if "qasm_transpiled" in df.columns else "qasm_raw"
106
  if qasm_source in df.columns:
107
  qasm_df = df[qasm_source].apply(qasm_features).apply(pd.Series)
 
109
  return df
110
 
111
 
112
def load_single_dataset(dataset_key: str) -> pd.DataFrame:
    """Fetch one QSBench repo by key, enrich it, label it, and cache it.

    The frame is memoised in ``_ASSET_CACHE`` so each repo is downloaded
    and feature-engineered at most once per process.
    """
    cached = _ASSET_CACHE.get(dataset_key)
    if cached is None:
        config = REPO_CONFIG[dataset_key]
        raw = load_dataset(config["repo"])
        frame = enrich_dataframe(pd.DataFrame(raw["train"]))
        frame["noise_label"] = config["label"]
        _ASSET_CACHE[dataset_key] = frame
        cached = frame
    return cached
120
+
121
+
122
def load_combined_dataset() -> pd.DataFrame:
    """Concatenate every configured dataset into one labelled frame (cached).

    Rows whose ``noise_label`` is not one of ``CLASS_ORDER`` are dropped so
    downstream training always sees exactly the four known classes.
    """
    global _COMBINED_CACHE
    if _COMBINED_CACHE is not None:
        return _COMBINED_CACHE
    stacked = pd.concat(
        [load_single_dataset(key) for key in REPO_CONFIG],
        ignore_index=True,
    )
    keep = stacked["noise_label"].isin(CLASS_ORDER)
    _COMBINED_CACHE = stacked[keep].copy()
    return _COMBINED_CACHE
130
+
131
+
132
def get_available_feature_columns(df: pd.DataFrame, non_feature_cols=None,
                                  exclude_patterns=None) -> List[str]:
    """Return the sorted numeric columns of *df* usable as model inputs.

    Args:
        df: Source frame (typically the combined QSBench frame).
        non_feature_cols: Column names excluded outright (identifiers, labels,
            raw QASM, ...). Defaults to the module-level ``NON_FEATURE_COLS``.
        exclude_patterns: Substrings that disqualify a column (leakage columns
            such as ``ideal_*``). Defaults to ``SOFT_EXCLUDE_PATTERNS``.

    Returns:
        Alphabetically sorted list of candidate feature column names.
    """
    if non_feature_cols is None:
        non_feature_cols = NON_FEATURE_COLS
    if exclude_patterns is None:
        exclude_patterns = SOFT_EXCLUDE_PATTERNS
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    features = [
        col
        for col in numeric_cols
        if col not in non_feature_cols
        and not any(pattern in col for pattern in exclude_patterns)
    ]
    return sorted(features)
137
 
138
 
 
142
  return [f for f in preferred if f in features]
143
 
144
 
145
def make_classification_figure(y_true, y_pred, class_names, feature_names=None, importances=None):
    """Build a 3-panel diagnostic figure for a fitted classifier.

    Panels: confusion matrix with per-cell counts, a correct-vs-incorrect
    histogram, and (when available) the top-10 feature importances.
    Returns the matplotlib ``Figure``.
    """
    fig = plt.figure(figsize=(20, 6))
    grid = fig.add_gridspec(1, 3)
    cm_ax = fig.add_subplot(grid[0, 0])
    err_ax = fig.add_subplot(grid[0, 1])
    imp_ax = fig.add_subplot(grid[0, 2])

    # Panel 1: confusion matrix, annotated cell by cell.
    matrix = confusion_matrix(y_true, y_pred, labels=class_names)
    image = cm_ax.imshow(matrix, interpolation="nearest")
    cm_ax.set_title("Confusion Matrix")
    cm_ax.set_xlabel("Predicted")
    cm_ax.set_ylabel("Actual")
    ticks = np.arange(len(class_names))
    cm_ax.set_xticks(ticks)
    cm_ax.set_yticks(ticks)
    cm_ax.set_xticklabels(class_names, rotation=45, ha="right")
    cm_ax.set_yticklabels(class_names)
    for row, col in np.ndindex(matrix.shape):
        cm_ax.text(col, row, matrix[row, col], ha="center", va="center")
    fig.colorbar(image, ax=cm_ax, fraction=0.046, pad=0.04)

    # Panel 2: binary histogram of prediction misses.
    misses = (y_true != y_pred).astype(int)
    err_ax.hist(misses, bins=[-0.5, 0.5, 1.5])
    err_ax.set_title("Correct vs Incorrect")
    err_ax.set_xlabel("0 = Correct, 1 = Incorrect")
    err_ax.set_ylabel("Count")

    # Panel 3: importances only when the model supplied a matching vector.
    have_importances = (
        importances is not None
        and feature_names is not None
        and len(importances) == len(feature_names)
    )
    if have_importances:
        top = np.argsort(importances)[-10:]
        imp_ax.barh([feature_names[i] for i in top], importances[top])
        imp_ax.set_title("Top-10 Feature Importances")
        imp_ax.set_xlabel("Importance")
    else:
        imp_ax.text(0.5, 0.5, "Feature importances unavailable", ha="center", va="center")
        imp_ax.set_axis_off()

    fig.tight_layout()
    return fig
183
+
184
+
185
def train_classifier(feature_columns, test_size, max_depth, random_state, n_estimators=200):
    """Train a gradient-boosting noise classifier and report test metrics.

    Returns a ``(figure, markdown)`` pair; when no features are selected the
    figure slot is ``None`` and the markdown carries a user-facing prompt.
    """
    if not feature_columns:
        return None, "### Please select at least one feature."

    data = load_combined_dataset()
    data = data.dropna(subset=feature_columns + ["noise_label"])
    features = data[feature_columns]
    labels = data["noise_label"]

    rng_seed = int(random_state)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=rng_seed, stratify=labels
    )

    model = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("classifier", HistGradientBoostingClassifier(
            max_depth=int(max_depth),
            max_iter=int(n_estimators),
            random_state=rng_seed,
            learning_rate=0.05,
        )),
    ])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    estimator = model.named_steps["classifier"]
    # NOTE(review): HistGradientBoostingClassifier does not expose
    # feature_importances_, so this getattr yields None and the figure
    # helper falls back to its "unavailable" panel — confirm if
    # importances are ever expected here.
    importances = getattr(estimator, "feature_importances_", None)
    fig = make_classification_figure(y_test.to_numpy(), y_pred, CLASS_ORDER, feature_columns, importances)

    report = classification_report(y_test, y_pred, labels=CLASS_ORDER)
    results = f"### Classification report\n```\n{report}\n```"
    return fig, results
212
 
 
213
 
214
CUSTOM_CSS = ".gradio-container {max-width: 1400px !important;}"


with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(f"# 🌌 {APP_TITLE}")
    gr.Markdown(APP_SUBTITLE)

    # gr.TabItem belongs inside a gr.Tabs container.
    with gr.Tabs():
        with gr.TabItem("🧠 Classification"):
            feature_picker = gr.CheckboxGroup(label="Input features", choices=[])
            test_size = gr.Slider(0.1, 0.4, value=0.2, step=0.05, label="Test split")
            max_depth = gr.Slider(1, 30, value=5, step=1, label="Max depth")
            seed = gr.Number(value=42, precision=0, label="Random seed")
            n_estimators = gr.Slider(50, 400, value=200, step=10, label="Iterations")
            run_btn = gr.Button("Train & Evaluate", variant="primary")
            plot = gr.Plot()
            metrics = gr.Markdown()

            dataset_dropdown = gr.Dropdown(list(REPO_CONFIG.keys()), value="clean", label="Dataset")

            def _refresh_features(_selected):
                """Repopulate the feature picker from the combined dataset.

                Offers every usable feature as a choice and pre-selects only
                the preferred default subset.
                """
                feats = get_available_feature_columns(load_combined_dataset())
                return gr.update(choices=feats, value=default_feature_selection(feats))

            # BUG FIX: the previous handler was a one-argument lambda wired
            # with an empty inputs list, so Gradio invoked it with zero
            # arguments and raised TypeError on every change event. It also
            # set `choices` to only the default subset, hiding all other
            # features from the user.
            dataset_dropdown.change(_refresh_features, [dataset_dropdown], [feature_picker])

            run_btn.click(
                train_classifier,
                [feature_picker, test_size, max_depth, seed, n_estimators],
                [plot, metrics],
            )


if __name__ == "__main__":
    demo.launch(theme=gr.themes.Soft(), css=CUSTOM_CSS)