QSBench committed on
Commit
c906129
·
1 Parent(s): dcea369
Files changed (1) hide show
  1. app.py +67 -31
app.py CHANGED
@@ -1,61 +1,98 @@
1
  import gradio as gr
2
  import pandas as pd
 
3
  from datasets import load_dataset
4
  from sklearn.ensemble import RandomForestRegressor
5
  from sklearn.metrics import r2_score
6
  import matplotlib.pyplot as plt
7
 
8
- # Загружаем датасет (все данные в одном сплите 'train')
9
  print("Loading dataset...")
10
  ds_all = load_dataset("QSBench/QSBench-Core-v1.0.0-demo")
11
- # Берём только сплит 'train' (там все строки)
12
  df_all = pd.DataFrame(ds_all['train'])
13
 
14
- # Разделяем по колонке 'split'
15
  splits = {}
16
  for split_name in df_all['split'].unique():
17
  splits[split_name] = df_all[df_all['split'] == split_name].reset_index(drop=True)
18
 
19
  print("Available splits:", list(splits.keys()))
20
 
21
- # Функция для отображения таблицы
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def show_data(split):
23
  if split in splits:
24
  return splits[split].head(10)
25
  else:
26
  return f"Split '{split}' not found"
27
 
28
- # Функция для обучения модели и создания графика
29
  def train_model():
30
- # Проверяем, что есть нужные сплиты
31
  if 'train' not in splits or 'test' not in splits:
32
  return None, "Error: train or test split not found in dataset"
33
-
34
- feature_cols = ["total_gates", "gate_entropy", "meyer_wallach"]
35
- target_col = "ideal_expval_Z_global"
36
-
37
- # Проверяем наличие колонок
38
- if not all(col in splits['train'].columns for col in feature_cols + [target_col]):
39
- missing = [col for col in feature_cols + [target_col] if col not in splits['train'].columns]
40
- return None, f"Error: missing columns: {missing}"
41
-
42
- X_train = splits['train'][feature_cols]
43
  y_train = splits['train'][target_col]
44
- X_test = splits['test'][feature_cols]
45
  y_test = splits['test'][target_col]
46
-
47
  model = RandomForestRegressor(n_estimators=100, random_state=42)
48
  model.fit(X_train, y_train)
49
  y_pred = model.predict(X_test)
50
  r2 = r2_score(y_test, y_pred)
51
-
52
- fig, ax = plt.subplots()
53
- ax.scatter(y_test, y_pred, alpha=0.5)
54
- ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
55
- ax.set_xlabel("True value")
56
- ax.set_ylabel("Predicted")
57
- ax.set_title(f"Predictions vs. Truth (R² = {r2:.4f})")
58
- return fig, f"R² score: {r2:.4f}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  # Интерфейс
61
  with gr.Blocks(title="QSBench Demo Explorer") as demo:
@@ -67,7 +104,7 @@ with gr.Blocks(title="QSBench Demo Explorer") as demo:
67
  👉 **Full datasets (up to 200k samples, noisy versions, 10‑qubit transpilation packs) are available for purchase.**
68
  [Visit the QSBench website](https://qsbench.github.io/)
69
  """)
70
-
71
  with gr.Tabs():
72
  with gr.TabItem("Data Explorer"):
73
  split_selector = gr.Dropdown(
@@ -77,15 +114,14 @@ with gr.Blocks(title="QSBench Demo Explorer") as demo:
77
  )
78
  data_table = gr.Dataframe(label="First 10 rows", interactive=False)
79
  split_selector.change(fn=show_data, inputs=split_selector, outputs=data_table)
80
- # Загружаем данные по умолчанию
81
  demo.load(fn=lambda: show_data(list(splits.keys())[0]), outputs=data_table)
82
-
83
  with gr.TabItem("Model Demo"):
84
  train_button = gr.Button("Train Random Forest")
85
  plot_output = gr.Plot()
86
- text_output = gr.Textbox(label="Result", interactive=False)
87
  train_button.click(fn=train_model, outputs=[plot_output, text_output])
88
-
89
  gr.Markdown("---")
90
  gr.Markdown("""
91
  ### Get the full datasets
 
1
import gradio as gr
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

# Load the demo dataset. All rows live in the single HF split 'train';
# each row carries its logical split name in the 'split' column.
print("Loading dataset...")
ds_all = load_dataset("QSBench/QSBench-Core-v1.0.0-demo")
df_all = pd.DataFrame(ds_all['train'])

# Partition rows by the 'split' column into a dict of DataFrames.
splits = {}
for split_name in df_all['split'].unique():
    splits[split_name] = df_all[df_all['split'] == split_name].reset_index(drop=True)

print("Available splits:", list(splits.keys()))

# Candidate features: numeric columns that are neither targets nor identifiers.
numeric_cols = df_all.select_dtypes(include=[np.number]).columns.tolist()

# Columns that must never be used as features: identifiers, the obvious
# target/expectation-value columns and their error/sign derivatives.
# Using a set gives O(1) membership tests in the comprehension below.
exclude = {
    'sample_id', 'sample_seed',
    'ideal_expval_Z_global', 'ideal_expval_X_global', 'ideal_expval_Y_global',
    'noisy_expval_Z_global', 'noisy_expval_X_global', 'noisy_expval_Y_global',
    'error_Z_global', 'error_X_global', 'error_Y_global',
    'sign_ideal_Z_global', 'sign_noisy_Z_global',
}
# Per-qubit expectation values (ideal/noisy x Z/X/Y x q0..q5) are generated
# instead of hand-enumerating all 36 column names.
exclude.update(
    f"{kind}_expval_{basis}_q{qubit}"
    for kind in ("ideal", "noisy")
    for basis in ("Z", "X", "Y")
    for qubit in range(6)
)
feature_cols = [col for col in numeric_cols
                if col not in exclude and not col.startswith('error_')]

# Regression target: the ideal global Z expectation value.
target_col = "ideal_expval_Z_global"
37
+
38
def show_data(split):
    """Return the first 10 rows of the requested split's DataFrame.

    Falls back to an error-message string when the split name is unknown.
    """
    try:
        return splits[split].head(10)
    except KeyError:
        return f"Split '{split}' not found"
43
 
 
44
def train_model():
    """Train a RandomForestRegressor on the 'train' split and evaluate on 'test'.

    Returns:
        (fig, markdown): a two-panel matplotlib figure (predictions vs. truth,
        top-10 feature importances) and a Markdown explanation of the R² score.
        On any precondition failure returns (None, error message) instead.
    """
    if 'train' not in splits or 'test' not in splits:
        return None, "Error: train or test split not found in dataset"

    # Keep only the candidate features actually present in the data.
    available_features = [col for col in feature_cols if col in splits['train'].columns]
    if not available_features:
        return None, f"Error: no numeric feature columns found (tried: {feature_cols})"
    # Guard the target too — indexing a missing column would raise KeyError
    # instead of honoring the (None, message) error contract.
    if target_col not in splits['train'].columns or target_col not in splits['test'].columns:
        return None, f"Error: target column '{target_col}' not found"

    X_train = splits['train'][available_features]
    y_train = splits['train'][target_col]
    X_test = splits['test'][available_features]
    y_test = splits['test'][target_col]

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: predictions vs. ground truth with the ideal y = x line.
    ax1.scatter(y_test, y_pred, alpha=0.5)
    ax1.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    ax1.set_xlabel("True value")
    ax1.set_ylabel("Predicted")
    ax1.set_title(f"Predictions vs. Truth\nR² = {r2:.4f}")

    # Right panel: top-10 feature importances.
    importances = model.feature_importances_
    indices = np.argsort(importances)[-10:]
    ax2.barh(range(len(indices)), importances[indices])
    ax2.set_yticks(range(len(indices)))
    ax2.set_yticklabels([available_features[i] for i in indices])
    ax2.set_xlabel("Feature importance")
    ax2.set_title("Top 10 most important features")

    plt.tight_layout()

    # Interpret the score from its actual value — the previous text claimed
    # the score was negative no matter what the model achieved, and referred
    # to a hard-coded feature list this function no longer uses.
    if r2 < 0:
        verdict = (
            "The current score is negative, which indicates that the available "
            "circuit features are not strongly predictive of the ideal Z "
            "expectation value on this small dataset. This is expected: quantum "
            "expectation values depend on many subtle circuit details."
        )
    else:
        verdict = (
            "The current score is non-negative, so the selected circuit features "
            "carry some signal about the ideal Z expectation value, though there "
            "is plenty of room for improvement on this small dataset."
        )

    explanation = f"""
**R² score:** {r2:.4f}

**What does it mean?**
R² measures how well the model explains the variance in the target.
- 1.0 = perfect prediction
- 0.0 = model predicts the mean (no better than guessing)
- Negative values = model performs worse than guessing the mean.

{verdict}
Larger datasets with richer features would allow better models.

👉 **Our full datasets** contain up to 200,000 circuits, additional noise models, and more features – perfect for serious Quantum Machine Learning research.
"""
    return fig, explanation
96
 
97
  # Интерфейс
98
  with gr.Blocks(title="QSBench Demo Explorer") as demo:
 
104
  👉 **Full datasets (up to 200k samples, noisy versions, 10‑qubit transpilation packs) are available for purchase.**
105
  [Visit the QSBench website](https://qsbench.github.io/)
106
  """)
107
+
108
  with gr.Tabs():
109
  with gr.TabItem("Data Explorer"):
110
  split_selector = gr.Dropdown(
 
114
  )
115
  data_table = gr.Dataframe(label="First 10 rows", interactive=False)
116
  split_selector.change(fn=show_data, inputs=split_selector, outputs=data_table)
 
117
  demo.load(fn=lambda: show_data(list(splits.keys())[0]), outputs=data_table)
118
+
119
  with gr.TabItem("Model Demo"):
120
  train_button = gr.Button("Train Random Forest")
121
  plot_output = gr.Plot()
122
+ text_output = gr.Markdown()
123
  train_button.click(fn=train_model, outputs=[plot_output, text_output])
124
+
125
  gr.Markdown("---")
126
  gr.Markdown("""
127
  ### Get the full datasets