Spaces:

QSBench
/

Circuit_Family_Classifier

Running

App Files Community

QSBench committed 3 days ago

Commit

9c99c87

verified ·

1 Parent(s): a63cf6b

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -70

app.py CHANGED Viewed

@@ -39,7 +39,6 @@ REPO_CONFIG = {
     }
 }
-TARGET_FAMILIES = ['QFT', 'HEA', 'RANDOM', 'EFFICIENT', 'REAL_AMPLITUDES']
 NON_FEATURE_COLS = {
     "sample_id", "sample_seed", "circuit_hash", "split", "circuit_qasm",
     "qasm_raw", "qasm_transpiled", "circuit_type_resolved", "circuit_type_requested",
@@ -50,6 +49,7 @@ NON_FEATURE_COLS = {
 _ASSET_CACHE = {}
 def load_all_assets(key: str) -> Dict:
     if key not in _ASSET_CACHE:
         logger.info(f"Fetching {key}...")
         ds = load_dataset(REPO_CONFIG[key]["repo"])
@@ -58,16 +58,16 @@ def load_all_assets(key: str) -> Dict:
         _ASSET_CACHE[key] = {"df": pd.DataFrame(ds["train"]), "meta": meta, "report": report}
     return _ASSET_CACHE[key]
-# --- UI LOGIC ---
 def load_guide_content():
     try:
         with open("GUIDE.md", "r", encoding="utf-8") as f:
             return f.read()
     except:
-        return "### ⚠️ GUIDE.md not found."
 def sync_ml_metrics(ds_name: str):
     assets = load_all_assets(ds_name)
     df = assets["df"]
     numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
@@ -75,99 +75,62 @@ def sync_ml_metrics(ds_name: str):
     defaults = [f for f in ["gate_entropy", "meyer_wallach", "adjacency", "depth", "cx_count"] if f in valid_features]
     return gr.update(choices=valid_features, value=defaults)
-Судя по ошибке Found only: ['mixed'], в вашем столбце circuit_type_requested вместо конкретных названий семейств (QFT, HEA и т.д.) записано значение 'mixed'. Это часто случается в демонстрационных подмножествах, где данные уже перемешаны и помечены общим тегом.
-Для классификации нам нужны исходные метки. В датасетах QSBench они обычно находятся в столбце circuit_type_resolved.
-Вот обновленный код функции train_classifier с исправленной логикой выбора столбца и более надежной обработкой ошибок.
-Исправленный код (App Code)
-Python
 def train_classifier(ds_name: str, features: List[str]):
     if not features:
-        return None, "### ❌ Error: No features selected. Please pick structural metrics."
     assets = load_all_assets(ds_name)
     df = assets["df"]
-    # Try 'resolved' column first as 'requested' might contain 'mixed' in demo shards
     target_col = 'circuit_type_resolved' if 'circuit_type_resolved' in df.columns else 'circuit_type_requested'
-    # Clean data: remove NaNs and ensure we have valid target strings
     train_df = df.dropna(subset=features + [target_col])
-    # Filter out rows where the target might be 'mixed' or generic if others are available
-    unique_types = train_df[target_col].unique()
-    if 'mixed' in unique_types and len(unique_types) > 1:
         train_df = train_df[train_df[target_col] != 'mixed']
     X = train_df[features]
     y = train_df[target_col]
-    # Verification: Do we have at least 2 distinct classes to perform classification?
-    current_classes = y.unique()
-    if len(current_classes) < 2:
-        return None, f"### ❌ Classification Error\nFound only one class: `{current_classes}` in column `{target_col}`. " \
-                     "Try a different dataset or check if the source file has labels."
-    # Encode labels to integers
     le = LabelEncoder()
     y_encoded = le.fit_transform(y)
-    class_names = le.classes_
-    # Split dataset
     try:
-        X_train, X_test, y_train, y_test = train_test_split(
-            X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
-        )
-    except ValueError:
-        # Fallback if stratify fails due to very small class sizes
-        X_train, X_test, y_train, y_test = train_test_split(
-            X, y_encoded, test_size=0.2, random_state=42
-        )
-    # Train Random Forest Classifier
-    clf = RandomForestClassifier(n_estimators=100, max_depth=12, n_jobs=-1, random_state=42)
-    clf.fit(X_train, y_train)
     preds = clf.predict(X_test)
-    # Visuals
     sns.set_theme(style="whitegrid")
     fig, axes = plt.subplots(1, 2, figsize=(20, 8))
-    # Plot 1: Confusion Matrix
     cm = confusion_matrix(y_test, preds)
-    sns.heatmap(cm, annot=True, fmt='d', cmap='viridis',
-                xticklabels=class_names, yticklabels=class_names, ax=axes[0], cbar=False)
     axes[0].set_title(f"Confusion Matrix (Acc: {accuracy_score(y_test, preds):.2%})")
-    axes[0].set_xlabel("Predicted Label")
-    axes[0].set_ylabel("True Label")
-    # Plot 2: Feature Importance
     importances = clf.feature_importances_
-    indices = np.argsort(importances)[-10:]
-    axes[1].barh([features[i] for i in indices], importances[indices], color='#2ecc71')
     axes[1].set_title("Top-10 Discriminative Features")
     plt.tight_layout()
-    # Generate text report
-    report_dict = classification_report(y_test, preds, target_names=class_names)
-    summary = f"### 🏆 Classifier Results: {ds_name}\n" \
-              f"**Target Column used:** `{target_col}`\n" \
-              f"**Accuracy:** {accuracy_score(y_test, preds):.2%}\n\n" \
-              f"**Report:**\n```\n{report_dict}\n```"
-    return fig, summary
 def update_explorer(ds_name: str, split_name: str):
     assets = load_all_assets(ds_name)
     df = assets["df"]
-    # Identify splits
     splits = df["split"].unique().tolist() if "split" in df.columns else ["train"]
-    # Ensure current split_name exists in this dataset
     if split_name not in splits:
         split_name = splits[0]
@@ -185,27 +148,27 @@ def update_explorer(ds_name: str, split_name: str):
         f"### 📋 {ds_name} Explorer"
     )
-# --- INTERFACE ---
 with gr.Blocks(theme=gr.themes.Soft(), title="QSBench Classifier") as demo:
     gr.Markdown("# 🌌 QSBench: Circuit Family Classifier")
     with gr.Tabs():
         with gr.TabItem("🔎 Explorer"):
-            meta_txt = gr.Markdown("### Loading...")
             with gr.Row():
                 ds_sel = gr.Dropdown(list(REPO_CONFIG.keys()), value="Core (Clean)", label="Dataset")
                 sp_sel = gr.Dropdown(["train"], value="train", label="Split")
             data_view = gr.Dataframe(interactive=False)
             with gr.Row():
-                c_raw = gr.Code(label="Logic QASM", language="python")
                 c_tr = gr.Code(label="Transpiled QASM", language="python")
         with gr.TabItem("🧠 Classification"):
             with gr.Row():
                 with gr.Column(scale=1):
                     ml_ds_sel = gr.Dropdown(list(REPO_CONFIG.keys()), value="Core (Clean)", label="Environment")
-                    ml_feat_sel = gr.CheckboxGroup(label="Features", choices=[])
-                    train_btn = gr.Button("Run Analysis", variant="primary")
                 with gr.Column(scale=2):
                     p_out = gr.Plot()
                     t_out = gr.Markdown()
@@ -215,14 +178,13 @@ with gr.Blocks(theme=gr.themes.Soft(), title="QSBench Classifier") as demo:
     gr.Markdown("--- \n ### 🔗 [Website](https://qsbench.github.io) | [Hugging Face](https://huggingface.co/QSBench) | [GitHub](https://github.com/QSBench)")
-    # --- UPDATED EVENT LOGIC ---
-    # Triggering the same function for both dropdowns
     ds_sel.change(update_explorer, [ds_sel, sp_sel], [sp_sel, data_view, c_raw, c_tr, meta_txt])
     sp_sel.change(update_explorer, [ds_sel, sp_sel], [sp_sel, data_view, c_raw, c_tr, meta_txt])
     ml_ds_sel.change(sync_ml_metrics, [ml_ds_sel], [ml_feat_sel])
     train_btn.click(train_classifier, [ml_ds_sel, ml_feat_sel], [p_out, t_out])
     demo.load(update_explorer, [ds_sel, sp_sel], [sp_sel, data_view, c_raw, c_tr, meta_txt])
     demo.load(sync_ml_metrics, [ml_ds_sel], [ml_feat_sel])

     }
 }
 NON_FEATURE_COLS = {
     "sample_id", "sample_seed", "circuit_hash", "split", "circuit_qasm",
     "qasm_raw", "qasm_transpiled", "circuit_type_resolved", "circuit_type_requested",
 _ASSET_CACHE = {}
 def load_all_assets(key: str) -> Dict:
+    """Fetch dataset and metadata from Hugging Face."""
     if key not in _ASSET_CACHE:
         logger.info(f"Fetching {key}...")
         ds = load_dataset(REPO_CONFIG[key]["repo"])
         _ASSET_CACHE[key] = {"df": pd.DataFrame(ds["train"]), "meta": meta, "report": report}
     return _ASSET_CACHE[key]
 def load_guide_content():
+    """Load content for the methodology tab."""
     try:
         with open("GUIDE.md", "r", encoding="utf-8") as f:
             return f.read()
     except:
+        return "### ⚠️ GUIDE.md not found. Please upload it to the root directory."
 def sync_ml_metrics(ds_name: str):
+    """Identify numerical features for classification."""
     assets = load_all_assets(ds_name)
     df = assets["df"]
     numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
     defaults = [f for f in ["gate_entropy", "meyer_wallach", "adjacency", "depth", "cx_count"] if f in valid_features]
     return gr.update(choices=valid_features, value=defaults)
 def train_classifier(ds_name: str, features: List[str]):
+    """Train a classifier to detect circuit families."""
     if not features:
+        return None, "### ❌ Error: Select features first."
     assets = load_all_assets(ds_name)
     df = assets["df"]
+    # Logic: use 'resolved' if 'requested' contains 'mixed' tags
     target_col = 'circuit_type_resolved' if 'circuit_type_resolved' in df.columns else 'circuit_type_requested'
+    # Filter 'mixed' out if other classes exist
     train_df = df.dropna(subset=features + [target_col])
+    if 'mixed' in train_df[target_col].unique() and len(train_df[target_col].unique()) > 1:
         train_df = train_df[train_df[target_col] != 'mixed']
     X = train_df[features]
     y = train_df[target_col]
+    if len(y.unique()) < 2:
+        return None, f"### ❌ Error: At least 2 classes needed. Found only: {y.unique()}"
     le = LabelEncoder()
     y_encoded = le.fit_transform(y)
     try:
+        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
+    except:
+        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
+    clf = RandomForestClassifier(n_estimators=100, max_depth=12, n_jobs=-1).fit(X_train, y_train)
     preds = clf.predict(X_test)
+    # Plotting
     sns.set_theme(style="whitegrid")
     fig, axes = plt.subplots(1, 2, figsize=(20, 8))
     cm = confusion_matrix(y_test, preds)
+    sns.heatmap(cm, annot=True, fmt='d', cmap='viridis', xticklabels=le.classes_, yticklabels=le.classes_, ax=axes[0], cbar=False)
     axes[0].set_title(f"Confusion Matrix (Acc: {accuracy_score(y_test, preds):.2%})")
     importances = clf.feature_importances_
+    idx = np.argsort(importances)[-10:]
+    axes[1].barh([features[i] for i in idx], importances[idx], color='#2ecc71')
     axes[1].set_title("Top-10 Discriminative Features")
     plt.tight_layout()
+    report = classification_report(y_test, preds, target_names=le.classes_)
+    return fig, f"### 🏆 Results\n**Target Column:** `{target_col}`\n```\n{report}\n```"
 def update_explorer(ds_name: str, split_name: str):
+    """Manage the Explorer tab data view."""
     assets = load_all_assets(ds_name)
     df = assets["df"]
     splits = df["split"].unique().tolist() if "split" in df.columns else ["train"]
     if split_name not in splits:
         split_name = splits[0]
         f"### 📋 {ds_name} Explorer"
     )
+# --- GRADIO INTERFACE ---
 with gr.Blocks(theme=gr.themes.Soft(), title="QSBench Classifier") as demo:
     gr.Markdown("# 🌌 QSBench: Circuit Family Classifier")
     with gr.Tabs():
         with gr.TabItem("🔎 Explorer"):
+            meta_txt = gr.Markdown("### Initializing...")
             with gr.Row():
                 ds_sel = gr.Dropdown(list(REPO_CONFIG.keys()), value="Core (Clean)", label="Dataset")
                 sp_sel = gr.Dropdown(["train"], value="train", label="Split")
             data_view = gr.Dataframe(interactive=False)
             with gr.Row():
+                c_raw = gr.Code(label="Source QASM", language="python")
                 c_tr = gr.Code(label="Transpiled QASM", language="python")
         with gr.TabItem("🧠 Classification"):
             with gr.Row():
                 with gr.Column(scale=1):
                     ml_ds_sel = gr.Dropdown(list(REPO_CONFIG.keys()), value="Core (Clean)", label="Environment")
+                    ml_feat_sel = gr.CheckboxGroup(label="Structural Metrics", choices=[])
+                    train_btn = gr.Button("Train Classifier", variant="primary")
                 with gr.Column(scale=2):
                     p_out = gr.Plot()
                     t_out = gr.Markdown()
     gr.Markdown("--- \n ### 🔗 [Website](https://qsbench.github.io) | [Hugging Face](https://huggingface.co/QSBench) | [GitHub](https://github.com/QSBench)")
+    # Event Mapping
     ds_sel.change(update_explorer, [ds_sel, sp_sel], [sp_sel, data_view, c_raw, c_tr, meta_txt])
     sp_sel.change(update_explorer, [ds_sel, sp_sel], [sp_sel, data_view, c_raw, c_tr, meta_txt])
     ml_ds_sel.change(sync_ml_metrics, [ml_ds_sel], [ml_feat_sel])
     train_btn.click(train_classifier, [ml_ds_sel, ml_feat_sel], [p_out, t_out])
+    # Startup Load
     demo.load(update_explorer, [ds_sel, sp_sel], [sp_sel, data_view, c_raw, c_tr, meta_txt])
     demo.load(sync_ml_metrics, [ml_ds_sel], [ml_feat_sel])