QSBench commited on
Commit
81da9d5
·
verified ·
1 Parent(s): 33e2b92

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +471 -0
app.py ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import logging
3
+ import os
4
+ import re
5
+ from dataclasses import dataclass
6
+ from typing import Dict, List, Optional, Tuple
7
+
8
+ import gradio as gr
9
+ import matplotlib.pyplot as plt
10
+ import numpy as np
11
+ import pandas as pd
12
+ from sklearn.ensemble import RandomForestRegressor
13
+ from sklearn.impute import SimpleImputer
14
+ from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
15
+ from sklearn.model_selection import train_test_split
16
+ from sklearn.pipeline import Pipeline
17
+ from sklearn.preprocessing import StandardScaler
18
+
19
# -----------------------------------------------------------------------------
# Logging
# -----------------------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# -----------------------------------------------------------------------------
# Configuration
# -----------------------------------------------------------------------------
APP_TITLE = "Entanglement Score Regression"
APP_SUBTITLE = "Predict the continuous Meyer-Wallach entanglement score from circuit topology and gate structure."

# Set this to the CSV file you place in the Space repository.
# You can also override it with an environment variable in Spaces.
DATA_PATH = os.getenv("QS_DATA_PATH", "QSBench-Amplitude-v1.0.0-demo_shard_00000.csv")

# Columns that should never be used as direct features: identifiers, raw text
# payloads, run metadata, and the regression target itself.
NON_FEATURE_COLS = {
    "sample_id",
    "sample_seed",
    "circuit_hash",
    "split",
    "circuit_qasm",
    "qasm_raw",
    "qasm_transpiled",
    "circuit_type_resolved",
    "circuit_type_requested",
    "noise_type",
    "noise_prob",
    "observable_bases",
    "observable_mode",
    "backend_device",
    "precision_mode",
    "circuit_signature",
    "entanglement",
    "meyer_wallach",  # target column
}

# Optional columns to visually hide from the feature picker because they are usually constant
# or less informative in small demo shards. Matching is by substring (see
# get_available_feature_columns).
SOFT_EXCLUDE_PATTERNS = ["ideal_", "noisy_", "error_", "sign_ideal_", "sign_noisy_"]

# Process-wide cache for the loaded dataset (keyed by "df") so UI callbacks
# do not re-read and re-enrich the CSV on every interaction.
_ASSET_CACHE: Dict[str, pd.DataFrame] = {}
62
+
63
+
64
+ # -----------------------------------------------------------------------------
65
+ # Data loading and feature engineering
66
+ # -----------------------------------------------------------------------------
67
+
68
def load_dataset_df() -> pd.DataFrame:
    """Load the demo shard from disk, enrich it, and memoize the result.

    Raises:
        FileNotFoundError: when DATA_PATH does not exist on disk.
    """
    cached = _ASSET_CACHE.get("df")
    if cached is None:
        if not os.path.exists(DATA_PATH):
            raise FileNotFoundError(
                f"Dataset file not found: {DATA_PATH}. "
                "Place the CSV in the Space repository or set QS_DATA_PATH."
            )

        logger.info("Loading dataset from %s", DATA_PATH)
        cached = enrich_dataframe(pd.read_csv(DATA_PATH))
        _ASSET_CACHE["df"] = cached
    return cached
82
+
83
+
84
def safe_parse(value):
    """Safely parse a string representation of a Python literal.

    Returns the evaluated object when *value* is a string containing a valid
    Python literal (list, dict, number, tuple, ...); any other input, or a
    string that fails to parse, is returned unchanged.
    """
    if isinstance(value, str):
        try:
            return ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # These are the failure modes documented for ast.literal_eval
            # (malformed or excessively nested input); anything else is a
            # genuine bug and should propagate instead of being swallowed.
            return value
    return value
92
+
93
+
94
def adjacency_features(adj_value) -> Dict[str, float]:
    """Derive compact topology features from an adjacency matrix.

    Args:
        adj_value: A square adjacency matrix as a nested list, or its string
            representation (decoded via safe_parse).

    Returns:
        Edge count, edge density, and degree mean/std. Every value is NaN when
        the input cannot be interpreted as a numeric matrix.
    """
    # Single fallback payload instead of two duplicated literals, so the
    # feature-name set cannot drift between the two failure paths.
    nan_features = {
        "adj_edge_count": np.nan,
        "adj_density": np.nan,
        "adj_degree_mean": np.nan,
        "adj_degree_std": np.nan,
    }

    parsed = safe_parse(adj_value)
    if not isinstance(parsed, list) or len(parsed) == 0:
        return dict(nan_features)

    try:
        arr = np.array(parsed, dtype=float)
        n = arr.shape[0]
        # For an undirected adjacency matrix, a full sum counts each edge
        # twice, so only the strict upper triangle is tallied.
        edge_count = float(np.triu(arr, k=1).sum())
        possible_edges = float(n * (n - 1) / 2)
        density = edge_count / possible_edges if possible_edges > 0 else np.nan
        degrees = arr.sum(axis=1)
        return {
            "adj_edge_count": edge_count,
            "adj_density": density,
            "adj_degree_mean": float(np.mean(degrees)),
            "adj_degree_std": float(np.std(degrees)),
        }
    except Exception:
        # Ragged rows or non-numeric entries: degrade to NaNs rather than
        # failing the whole enrichment pass.
        return dict(nan_features)
126
+
127
+
128
def qasm_features(qasm_value) -> Dict[str, float]:
    """Compute simple string-level statistics from a QASM program text.

    Non-string or blank input yields a dict of NaNs so downstream code can
    always rely on the same keys being present.
    """
    if not isinstance(qasm_value, str) or not qasm_value.strip():
        return {
            "qasm_length": np.nan,
            "qasm_line_count": np.nan,
            "qasm_gate_keyword_count": np.nan,
            "qasm_measure_count": np.nan,
            "qasm_comment_count": np.nan,
        }

    text = qasm_value
    nonblank = [ln for ln in text.splitlines() if ln.strip()]
    gate_pattern = r"\b(cx|h|x|y|z|rx|ry|rz|u1|u2|u3|u|swap|cz|ccx|rxx|ryy|rzz)\b"
    gate_hits = re.findall(gate_pattern, text, flags=re.IGNORECASE)
    measure_hits = re.findall(r"\bmeasure\b", text, flags=re.IGNORECASE)
    comment_lines = [ln for ln in nonblank if ln.strip().startswith("//")]

    return {
        "qasm_length": float(len(text)),
        "qasm_line_count": float(len(nonblank)),
        "qasm_gate_keyword_count": float(len(gate_hits)),
        "qasm_measure_count": float(len(measure_hits)),
        "qasm_comment_count": float(len(comment_lines)),
    }
152
+
153
+
154
def enrich_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Append derived regression features to a copy of the input frame.

    Adds adjacency-topology columns, QASM text statistics (preferring the
    transpiled QASM when available), and normalizes a few object columns to
    the pandas string dtype.
    """
    out = df.copy()

    if "adjacency" in out.columns:
        topo = out["adjacency"].apply(adjacency_features).apply(pd.Series)
        out = pd.concat([out, topo], axis=1)

    qasm_col = "qasm_transpiled" if "qasm_transpiled" in out.columns else "qasm_raw"
    if qasm_col in out.columns:
        stats = out[qasm_col].apply(qasm_features).apply(pd.Series)
        out = pd.concat([out, stats], axis=1)

    # Normalize obvious object columns that can be safely treated as strings.
    for name in ("noise_type", "backend_device", "precision_mode", "observable_mode"):
        if name in out.columns:
            out[name] = out[name].astype("string")

    return out
173
+
174
+
175
def load_guide_content() -> str:
    """Return the contents of GUIDE.md, or a placeholder when it is absent."""
    guide_path = "GUIDE.md"
    if not os.path.exists(guide_path):
        # Fallback shown in the Guide tab until the manual is committed.
        return (
            "# Guide\n\n"
            "The guide file is not added yet. In the next step, we can build a full user manual "
            "with dataset description, model interpretation, and example workflows."
        )
    with open(guide_path, "r", encoding="utf-8") as f:
        return f.read()
186
+
187
+
188
+ # -----------------------------------------------------------------------------
189
+ # Feature selection helpers
190
+ # -----------------------------------------------------------------------------
191
+
192
def get_available_feature_columns(df: pd.DataFrame) -> List[str]:
    """Return sorted numeric columns eligible as model inputs.

    Excludes the target/metadata columns listed in NON_FEATURE_COLS and any
    column whose name contains one of the SOFT_EXCLUDE_PATTERNS substrings.
    """
    numeric = df.select_dtypes(include=[np.number]).columns
    eligible = [
        name
        for name in numeric
        if name not in NON_FEATURE_COLS
        and not any(pattern in name for pattern in SOFT_EXCLUDE_PATTERNS)
    ]
    return sorted(eligible)
203
+
204
+
205
def default_feature_selection(features: List[str]) -> List[str]:
    """Pick up to eight stable, high-value defaults from *features*.

    Preserves the preference order below, keeping only names that actually
    exist in the available feature list.
    """
    preferred = (
        "gate_entropy",
        "adjacency",
        "adj_density",
        "adj_degree_mean",
        "adj_degree_std",
        "depth",
        "total_gates",
        "single_qubit_gates",
        "two_qubit_gates",
        "cx_count",
        "qasm_length",
        "qasm_line_count",
        "qasm_gate_keyword_count",
    )
    available = set(features)
    picked = [name for name in preferred if name in available]
    return picked[:8]
223
+
224
+
225
+ # -----------------------------------------------------------------------------
226
+ # Visualization helpers
227
+ # -----------------------------------------------------------------------------
228
+
229
def make_regression_figure(y_true: np.ndarray, y_pred: np.ndarray, feature_names: Optional[List[str]] = None, importances: Optional[np.ndarray] = None) -> plt.Figure:
    """Build a three-panel regression summary figure.

    Panels: actual-vs-predicted scatter with a y=x reference, residual
    histogram, and (when available) the top-10 feature importances.
    """
    fig, (scatter_ax, resid_ax, imp_ax) = plt.subplots(1, 3, figsize=(20, 6))

    # Panel 1: actual vs predicted, with a dashed identity line spanning
    # the combined range of both series.
    scatter_ax.scatter(y_true, y_pred, alpha=0.75)
    lo = min(float(np.min(y_true)), float(np.min(y_pred)))
    hi = max(float(np.max(y_true)), float(np.max(y_pred)))
    scatter_ax.plot([lo, hi], [lo, hi], linestyle="--")
    scatter_ax.set_title("Actual vs Predicted")
    scatter_ax.set_xlabel("Actual Meyer-Wallach")
    scatter_ax.set_ylabel("Predicted Meyer-Wallach")

    # Panel 2: residual histogram.
    resid_ax.hist(y_true - y_pred, bins=20)
    resid_ax.set_title("Residual Distribution")
    resid_ax.set_xlabel("Residual")
    resid_ax.set_ylabel("Count")

    # Panel 3: top-10 feature importances, only when names and values align.
    have_importances = (
        importances is not None
        and feature_names is not None
        and len(importances) == len(feature_names)
    )
    if have_importances:
        order = np.argsort(importances)[-10:]
        imp_ax.barh([feature_names[i] for i in order], importances[order])
        imp_ax.set_title("Top-10 Feature Importances")
        imp_ax.set_xlabel("Importance")
    else:
        imp_ax.text(0.5, 0.5, "Feature importances are unavailable.", ha="center", va="center")
        imp_ax.set_axis_off()

    fig.tight_layout()
    return fig
266
+
267
+
268
+ # -----------------------------------------------------------------------------
269
+ # UI callbacks
270
+ # -----------------------------------------------------------------------------
271
+
272
def refresh_explorer(split_name: str) -> Tuple[dict, pd.DataFrame, str, str, str, str]:
    """Refresh the explorer tab outputs for the selected split.

    Args:
        split_name: Split requested by the dropdown; coerced to the first
            available split when it is not present in the dataset.

    Returns:
        (dropdown update, preview dataframe, raw QASM, transpiled QASM,
        dataset-overview markdown, split-summary markdown).
    """
    df = load_dataset_df()
    splits = df["split"].dropna().unique().tolist() if "split" in df.columns else ["train"]
    if not splits:
        splits = ["train"]

    if split_name not in splits:
        split_name = splits[0]

    filtered = df[df["split"] == split_name] if "split" in df.columns else df
    display_df = filtered.head(12).copy()

    # Show the first preview row's QASM when the column exists and is non-empty.
    raw_qasm = display_df["qasm_raw"].iloc[0] if "qasm_raw" in display_df.columns and not display_df.empty else "// N/A"
    transpiled_qasm = display_df["qasm_transpiled"].iloc[0] if "qasm_transpiled" in display_df.columns and not display_df.empty else "// N/A"

    # Guard against shards that lack the target column; the original code
    # raised KeyError here and broke the whole explorer refresh.
    if "meyer_wallach" in df.columns:
        target_range = f"{df['meyer_wallach'].min():.4f} → {df['meyer_wallach'].max():.4f}"
    else:
        target_range = "n/a (column missing)"

    target_info = (
        f"### Dataset overview\n\n"
        f"**Rows:** {len(df):,} \n"
        f"**Visible split:** `{split_name}` \n"
        f"**Target:** `meyer_wallach` \n"
        f"**Target range:** {target_range}"
    )

    summary = (
        f"### Split summary\n\n"
        f"**Available splits:** {', '.join(splits)} \n"
        f"**Preview rows:** {len(display_df)}"
    )

    return (
        gr.update(choices=splits, value=split_name),
        display_df,
        raw_qasm,
        transpiled_qasm,
        target_info,
        summary,
    )
310
+
311
+
312
def sync_feature_picker() -> gr.update:
    """Repopulate the feature picker from the loaded dataset."""
    frame = load_dataset_df()
    choices = get_available_feature_columns(frame)
    return gr.update(choices=choices, value=default_feature_selection(choices))
318
+
319
+
320
def train_regressor(feature_columns: List[str], test_size: float, n_estimators: int, max_depth: int, random_state: int) -> Tuple[Optional[plt.Figure], str]:
    """Train a random-forest regressor on selected features and report metrics.

    Args:
        feature_columns: Numeric dataset columns to use as model inputs.
        test_size: Held-out fraction for evaluation (0-1).
        n_estimators: Number of trees in the forest.
        max_depth: Maximum tree depth; values <= 0 mean unlimited.
        random_state: Seed for the split and the forest.

    Returns:
        (figure, markdown) on success, or (None, error markdown) when the
        inputs are unusable.
    """
    if not feature_columns:
        return None, "### ❌ Please select at least one feature."

    # Gradio sliders and Number boxes may deliver floats; sklearn requires
    # genuine ints for these hyperparameters, so coerce defensively.
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    random_state = int(random_state)

    df = load_dataset_df()
    required_cols = feature_columns + ["meyer_wallach"]

    # A stale UI selection could reference columns that no longer exist;
    # dropna(subset=...) would raise KeyError, so fail with a clear message.
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        return None, f"### ❌ Missing columns in dataset: {', '.join(missing)}"

    train_df = df.dropna(subset=required_cols).copy()

    if len(train_df) < 10:
        return None, "### ❌ Not enough clean rows after filtering missing values."

    X = train_df[feature_columns]
    y = train_df["meyer_wallach"]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )

    # Random forest works well for small, tabular demo data and gives feature
    # importances. The scaler is a no-op for tree splits but keeps the
    # pipeline reusable if a linear model is swapped in later.
    model = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
            (
                "regressor",
                RandomForestRegressor(
                    n_estimators=n_estimators,
                    max_depth=max_depth if max_depth > 0 else None,
                    random_state=random_state,
                    n_jobs=-1,
                ),
            ),
        ]
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    mae = float(mean_absolute_error(y_test, y_pred))
    r2 = float(r2_score(y_test, y_pred))

    regressor = model.named_steps["regressor"]
    importances = getattr(regressor, "feature_importances_", None)
    fig = make_regression_figure(y_test.to_numpy(), y_pred, list(feature_columns), importances)

    results = (
        "### Regression results\n\n"
        f"**Rows used:** {len(train_df):,} \n"
        f"**Test size:** {test_size:.0%} \n"
        f"**RMSE:** {rmse:.4f} \n"
        f"**MAE:** {mae:.4f} \n"
        f"**R²:** {r2:.4f}\n\n"
        "The closer the scatter points are to the diagonal line, the better the model."
    )
    return fig, results
380
+
381
+
382
def build_dataset_profile() -> str:
    """Generate a compact markdown dataset summary for the explorer tab."""
    frame = load_dataset_df()
    mw = frame["meyer_wallach"]
    # Each middle segment keeps a trailing space so joining with "\n"
    # reproduces markdown hard line breaks (" \n").
    segments = [
        "### Dataset profile\n",
        f"**Rows:** {len(frame):,} ",
        f"**Columns:** {len(frame.columns):,} ",
        f"**Meyer-Wallach mean:** {mw.mean():.4f} ",
        f"**Meyer-Wallach std:** {mw.std():.4f} ",
        f"**Meyer-Wallach min/max:** {mw.min():.4f} / {mw.max():.4f}",
    ]
    return "\n".join(segments)
394
+
395
+
396
+ # -----------------------------------------------------------------------------
397
+ # UI
398
+ # -----------------------------------------------------------------------------
399
# Small CSS overrides applied to the whole Blocks app: extra footer spacing
# and a wider container than the Gradio default.
CUSTOM_CSS = """
footer {
margin-top: 1rem;
}
.gradio-container {
max-width: 1400px !important;
}
"""

# Top-level UI layout: three tabs (data explorer, regression playground,
# guide) plus a links footer. Event wiring happens after all components
# are declared so every handler can reference them.
with gr.Blocks(theme=gr.themes.Soft(), title=APP_TITLE, css=CUSTOM_CSS) as demo:
    gr.Markdown(f"# 🌌 {APP_TITLE}")
    gr.Markdown(APP_SUBTITLE)

    with gr.Tabs():
        with gr.TabItem("🔎 Explorer"):
            with gr.Row():
                # Populated with the real split list by refresh_explorer on load.
                split_dropdown = gr.Dropdown(label="Split", choices=["train"], value="train")
                profile_box = gr.Markdown(value="### Loading dataset...")

            with gr.Row():
                explorer_summary = gr.Markdown(value="### Loading split summary...")

            explorer_df = gr.Dataframe(interactive=False, label="Preview rows")

            with gr.Row():
                raw_qasm_code = gr.Code(label="Raw QASM", language="text")
                transpiled_qasm_code = gr.Code(label="Transpiled QASM", language="text")

        with gr.TabItem("🧠 Regression"):
            with gr.Row():
                with gr.Column(scale=1):
                    # Choices are filled by sync_feature_picker on app load.
                    feature_picker = gr.CheckboxGroup(label="Input features", choices=[])
                    test_size_slider = gr.Slider(0.1, 0.4, value=0.2, step=0.05, label="Test split")
                    n_estimators_slider = gr.Slider(50, 400, value=200, step=10, label="Number of trees")
                    max_depth_slider = gr.Slider(2, 30, value=12, step=1, label="Max tree depth")
                    random_state_number = gr.Number(value=42, precision=0, label="Random seed")
                    train_btn = gr.Button("Train & Evaluate", variant="primary")
                with gr.Column(scale=2):
                    plot_output = gr.Plot()
                    metrics_output = gr.Markdown()

        with gr.TabItem("📖 Guide"):
            gr.Markdown(load_guide_content())

    gr.Markdown("---")
    gr.Markdown(
        "### 🔗 Links \n"
        "[Website](https://qsbench.github.io) | [Hugging Face](https://huggingface.co/QSBench) | [GitHub](https://github.com/QSBench)"
    )

    # Bind events.
    # Changing the split re-renders the whole explorer tab.
    split_dropdown.change(
        refresh_explorer,
        inputs=[split_dropdown],
        outputs=[split_dropdown, explorer_df, raw_qasm_code, transpiled_qasm_code, profile_box, explorer_summary],
    )

    train_btn.click(
        train_regressor,
        inputs=[feature_picker, test_size_slider, n_estimators_slider, max_depth_slider, random_state_number],
        outputs=[plot_output, metrics_output],
    )

    # Initial population on page load: explorer contents and feature choices.
    demo.load(
        refresh_explorer,
        inputs=[split_dropdown],
        outputs=[split_dropdown, explorer_df, raw_qasm_code, transpiled_qasm_code, profile_box, explorer_summary],
    )
    demo.load(sync_feature_picker, outputs=[feature_picker])


if __name__ == "__main__":
    demo.launch()