PraneshJs committed · Commit c347ab7 · verified · 1 Parent(s): 24430f3

Update app.py

Files changed (1): app.py (+499, −278)
app.py CHANGED
@@ -1,19 +1,34 @@
 
  import time
  import numpy as np
  import pandas as pd
- import duckdb
  import gradio as gr
  import matplotlib.pyplot as plt
  from PIL import Image
- import io
- import os
-
- duckdb_con = duckdb.connect(database=":memory:")
-
- # ----------------------------------------------------------
- # Synthetic Data Generator
- # ----------------------------------------------------------

  def generate_data(n_rows: int, n_groups: int = 50) -> pd.DataFrame:
      rng = np.random.default_rng(42)
      ids = np.arange(n_rows)
@@ -28,316 +43,522 @@ def generate_data(n_rows: int, n_groups: int = 50) -> pd.DataFrame:
          {"id": ids, "category": categories, "value1": value1, "value2": value2, "date": dates}
      )

-
- # ----------------------------------------------------------
- # Timing utility
- # ----------------------------------------------------------
-
  def time_function(fn, repeats=3):
-     repeats = int(repeats)
      times = []
      for _ in range(repeats):
          start = time.perf_counter()
          fn()
          end = time.perf_counter()
          times.append(end - start)
-     return np.mean(times), np.std(times), times

-
- # ----------------------------------------------------------
- # Benchmark Operations (Compute + I/O)
- # ----------------------------------------------------------

  # ---- Filter ----
- def bench_filter(df, repeats=3):
-     def pandas_op():
          _ = df[(df["value1"] > 0.5) & (df["category"] == df["category"].iloc[0])]

-     def duckdb_op():
-         duckdb_con.register("df", df)
-         duckdb_con.execute(f"""
-             SELECT *
-             FROM df
-             WHERE value1 > 0.5
-               AND category='{df['category'].iloc[0]}'
-         """).fetchdf()
-
-     p_mean, p_std, p_all = time_function(pandas_op, repeats)
-     d_mean, d_std, d_all = time_function(duckdb_op, repeats)
-
-     return build_result("Filter rows", p_mean, p_std, p_all, d_mean, d_std, d_all)
-
-
- # ---- Groupby ----
- def bench_groupby(df, repeats=3):
-     def pandas_op():
          _ = df.groupby("category")[["value1", "value2"]].mean()

-     def duckdb_op():
-         duckdb_con.register("df", df)
-         duckdb_con.execute("""
-             SELECT category, AVG(value1), AVG(value2)
-             FROM df GROUP BY category
-         """).fetchdf()

-     p_mean, p_std, p_all = time_function(pandas_op, repeats)
-     d_mean, d_std, d_all = time_function(duckdb_op, repeats)

-     return build_result("Groupby mean", p_mean, p_std, p_all, d_mean, d_std, d_all)


  # ---- Join ----
- def bench_join(df, repeats=3):
      categories = df["category"].unique()
      rng = np.random.default_rng(123)
-     dim_df = pd.DataFrame(
-         {"category": categories, "weight": rng.uniform(0.5, 2.0, len(categories))}
-     )

-     def pandas_op():
          _ = df.merge(dim_df, on="category", how="left")

-     def duckdb_op():
-         duckdb_con.register("df", df)
-         duckdb_con.register("dim_df", dim_df)
-         duckdb_con.execute("""
-             SELECT d.*, dim.weight
-             FROM df d
-             LEFT JOIN dim_df dim
-               ON d.category = dim.category
-         """).fetchdf()
-
-     p_mean, p_std, p_all = time_function(pandas_op, repeats)
-     d_mean, d_std, d_all = time_function(duckdb_op, repeats)
-
-     return build_result("Join on category", p_mean, p_std, p_all, d_mean, d_std, d_all)
-
-
-
116
- # ---- Read CSV ----
117
- def bench_read_csv(temp_csv_path, repeats=3):
118
- def pandas_op():
119
- _ = pd.read_csv(temp_csv_path)
120
-
121
- def duckdb_op():
122
- _ = duckdb.read_csv_auto(temp_csv_path)
123
-
124
- p_mean, p_std, p_all = time_function(pandas_op, repeats)
125
- d_mean, d_std, d_all = time_function(duckdb_op, repeats)
126
-
127
- return build_result("Read CSV", p_mean, p_std, p_all, d_mean, d_std, d_all)
128
-
129
-
130
- # ---- Read Parquet ----
131
- def bench_read_parquet(temp_parquet_path, repeats=3):
132
- def pandas_op():
133
- _ = pd.read_parquet(temp_parquet_path)
134
-
135
- def duckdb_op():
136
- _ = duckdb.read_parquet(temp_parquet_path)
137
-
138
- p_mean, p_std, p_all = time_function(pandas_op, repeats)
139
- d_mean, d_std, d_all = time_function(duckdb_op, repeats)
140
-
141
- return build_result("Read Parquet", p_mean, p_std, p_all, d_mean, d_std, d_all)
142
-
143
-
144
- # ---- Write Parquet ----
145
- def bench_write_parquet(df, repeats=3):
146
- def pandas_op():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
          df.to_parquet("temp_pd.parquet")
-
-     def duckdb_op():
-         duckdb_con.register("df", df)
-         duckdb_con.execute("COPY df TO 'temp_duck.parquet' (FORMAT PARQUET)")
-
-     p_mean, p_std, p_all = time_function(pandas_op, repeats)
-     d_mean, d_std, d_all = time_function(duckdb_op, repeats)
-
-     return build_result("Write Parquet", p_mean, p_std, p_all, d_mean, d_std, d_all)
-
-
- # ----------------------------------------------------------
- # Shared result formatting
- # ----------------------------------------------------------
-
- def build_result(op_name, p_mean, p_std, p_all, d_mean, d_std, d_all):
-     speedup = p_mean / d_mean if d_mean > 0 else None
-
-     return {
-         "operation": op_name,
-         "pandas_mean_s": p_mean,
-         "pandas_std_s": p_std,
-         "duckdb_mean_s": d_mean,
-         "duckdb_std_s": d_std,
-         "speedup": speedup,
-         "raw_pandas_runs": p_all,
-         "raw_duckdb_runs": d_all,
-     }
-
-
- # ----------------------------------------------------------
- # Benchmark Dispatcher
- # ----------------------------------------------------------
-
- def run_benchmark(operation, df=None, repeats=3):
-     repeats = int(repeats)
-
-     if operation == "Filter": return bench_filter(df, repeats)
-     if operation == "Groupby": return bench_groupby(df, repeats)
-     if operation == "Join": return bench_join(df, repeats)
-     if operation == "Write Parquet": return bench_write_parquet(df, repeats)
-
-     raise ValueError(f"Unsupported operation: {operation}")
-
-
- # ----------------------------------------------------------
- # Chart generator (PIL Image)
- # ----------------------------------------------------------
-
- def generate_chart(result):
-     fig, ax = plt.subplots(figsize=(4, 3))
-
-     engines = ["Pandas", "DuckDB"]
-     times = [result["pandas_mean_s"], result["duckdb_mean_s"]]
-
-     ax.bar(engines, times)
-     ax.set_ylabel("Time (seconds)")
      ax.set_title(result["operation"])
-
      buf = io.BytesIO()
      plt.tight_layout()
      plt.savefig(buf, format="png")
      buf.seek(0)
      plt.close(fig)
-
      return Image.open(buf)

-
- # ----------------------------------------------------------
- # Markdown result
- # ----------------------------------------------------------
-
- def format_result(result):
-     speed = result["speedup"]
-     verdict = (
-         f"🚀 **DuckDB is ~{speed:.2f}× faster**"
-         if speed > 1
-         else f"🐼 **Pandas is ~{1/speed:.2f}× faster**"
-     )
-
-     md = f"""
- ### 🔬 Benchmark Result — {result['operation']}
-
- | Engine | Mean (s) | Std (s) |
- |--------|----------|---------|
- | Pandas | `{result['pandas_mean_s']:.6f}` | `{result['pandas_std_s']:.6f}` |
- | DuckDB | `{result['duckdb_mean_s']:.6f}` | `{result['duckdb_std_s']:.6f}` |
-
- **Verdict:** {verdict}
-
- <details><summary>Raw timings</summary>
-
- - Pandas: `{[round(x,6) for x in result['raw_pandas_runs']]}`
- - DuckDB: `{[round(x,6) for x in result['raw_duckdb_runs']]}`
- </details>
- """
      return md

-
- # ----------------------------------------------------------
- # Helper to load custom dataset
- # ----------------------------------------------------------
-
- def load_custom_dataset(file):
-     if file.name.endswith(".csv"):
-         return pd.read_csv(file.name)
-     if file.name.endswith(".parquet"):
-         return pd.read_parquet(file.name)
-     if file.name.endswith(".arrow"):
-         return pd.read_feather(file.name)
-     raise ValueError("Unsupported file format")
-
-
- # ----------------------------------------------------------
- # Gradio App
- # ----------------------------------------------------------
-
  theme = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")

- with gr.Blocks(title="DuckDB vs Pandas Benchmark", theme=theme) as demo:
-
-     gr.Markdown("# 🐼 vs 🦆 DuckDB vs Pandas — Performance Playground")

      with gr.Tabs():

-         # ==================================================
-         # 🔥 Synthetic Mode
-         # ==================================================
-         with gr.Tab("🔥 Synthetic Dataset Benchmarks"):
-
-             dataset_size = gr.Radio(["100k", "500k", "2M"], value="100k", label="Dataset Size")
-
-             operation_synth = gr.Radio(
-                 ["Filter", "Groupby", "Join", "Write Parquet"],
-                 label="Operation",
-                 value="Filter"
-             )
-
-             repeats_synth = gr.Slider(1, 7, value=3, label="Repeats")
-
-             btn_synth = gr.Button("🚀 Run Benchmark")
-
-             out_md_synth = gr.Markdown()
-             out_chart_synth = gr.Image()
-
-             def synthetic_runner(size, operation, repeats):
-                 repeats = int(repeats)
                  n = {"100k": 100_000, "500k": 500_000, "2M": 2_000_000}[size]
-
                  df = generate_data(n)
-                 result = run_benchmark(operation, df, repeats)
-                 chart = generate_chart(result)
-
-                 return format_result(result), chart
-
-             btn_synth.click(
-                 synthetic_runner,
-                 [dataset_size, operation_synth, repeats_synth],
-                 [out_md_synth, out_chart_synth],
-             )
-
-
-         # ==================================================
-         # 📁 Custom Dataset Mode
-         # ==================================================
-         with gr.Tab("📁 Custom Dataset Upload"):
-
-             file_in = gr.File(label="Upload CSV / Parquet / Arrow")
-
-             operation_custom = gr.Radio(
-                 ["Filter", "Groupby", "Join", "Write Parquet"],
-                 label="Operation",
-                 value="Filter"
-             )
-
-             repeats_custom = gr.Slider(1, 7, value=3, label="Repeats")
-
-             btn_custom = gr.Button("Run on Uploaded Dataset")
-
-             out_md_custom = gr.Markdown()
-             out_chart_custom = gr.Image()
-
-             def custom_runner(file, operation, repeats):
-                 repeats = int(repeats)
-                 df = load_custom_dataset(file)
-                 result = run_benchmark(operation, df, repeats)
-                 return format_result(result), generate_chart(result)
-
-             btn_custom.click(
-                 custom_runner,
-                 [file_in, operation_custom, repeats_custom],
-                 [out_md_custom, out_chart_custom],
-             )

  if __name__ == "__main__":
-     demo.launch()
 
+ # app.py
  import time
+ import io
+ import os
+ import traceback
+
  import numpy as np
  import pandas as pd
+ import duckdb  # kept for parity if needed
  import gradio as gr
  import matplotlib.pyplot as plt
  from PIL import Image

+ # optional libs: polars, fireducks
+ try:
+     import polars as pl
+     HAS_POLARS = True
+ except Exception:
+     pl = None
+     HAS_POLARS = False
+
+ try:
+     import fireducks.pandas as fd  # FireDucks' pandas-compatible namespace
+     HAS_FIREDUCKS = True
+ except Exception:
+     fd = None
+     HAS_FIREDUCKS = False
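+ # With fireducks.pandas bound to fd, the calls used below (fd.DataFrame,
+ # fd.read_csv, fd.read_parquet) follow the pandas API surface FireDucks mirrors;
+ # the defensive try/except probes further down cover older/other layouts.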
+
+ # -------------------------
+ # Basic utils / data gen
+ # -------------------------
  def generate_data(n_rows: int, n_groups: int = 50) -> pd.DataFrame:
      rng = np.random.default_rng(42)
      ids = np.arange(n_rows)
  ⋮ (lines 35–42 unchanged)
          {"id": ids, "category": categories, "value1": value1, "value2": value2, "date": dates}
      )

  def time_function(fn, repeats=3):
+     repeats = int(max(1, repeats))
      times = []
      for _ in range(repeats):
          start = time.perf_counter()
          fn()
          end = time.perf_counter()
          times.append(end - start)
+     return float(np.mean(times)), float(np.std(times)), [float(t) for t in times]
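+ # Note: the first timed run can include one-time warm-up costs (lazy imports,
+ # OS file caches), so means over a few repeats may overstate steady-state time;
+ # discarding the first run is a common refinement.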
+
+ # -------------------------
+ # Helpers to ensure materialization
+ # -------------------------
+ def materialize_fireducks(maybe_fd_obj):
+     """
+     FireDucks operations are often lazy, so convert results to pandas to make
+     sure we measure real execution. We attempt multiple ways:
+     - if the result is already pandas, return it as-is
+     - if the result has a .to_pandas() method, call it
+     - otherwise hand the object back unchanged
+     """
+     if not HAS_FIREDUCKS:
+         return maybe_fd_obj
+     try:
+         # If it's already pandas
+         if isinstance(maybe_fd_obj, pd.DataFrame):
+             return maybe_fd_obj
+         # common conversion method
+         if hasattr(maybe_fd_obj, "to_pandas"):
+             return maybe_fd_obj.to_pandas()
+         # no known conversion hook; return the object unchanged
+         return maybe_fd_obj
+     except Exception:
+         return maybe_fd_obj
+
+
+ def ensure_polars_from_pandas(df: pd.DataFrame):
+     """Return a Polars DataFrame constructed from pandas (if polars is available)."""
+     if not HAS_POLARS:
+         raise RuntimeError("Polars not installed")
+     # convert pandas -> polars
+     return pl.from_pandas(df)
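+ # The pandas -> Polars conversion is done once by each bench function, outside
+ # the timed closures, so it does not count against Polars in the results.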
+
+ def ensure_fireducks_from_pandas(df: pd.DataFrame):
+     """Return a FireDucks DataFrame constructed from pandas (if fireducks is available).
+     Try a few constructor variants for compatibility across FireDucks versions.
+     """
+     if not HAS_FIREDUCKS:
+         raise RuntimeError("FireDucks not installed")
+     # try common constructor patterns
+     try:
+         # direct constructor, mirroring pd.DataFrame(df)
+         return fd.DataFrame(df)
+     except Exception:
+         pass
+     try:
+         # from_pandas helper, if this version exposes one
+         if hasattr(fd, "from_pandas"):
+             return fd.from_pandas(df)
+     except Exception:
+         pass
+     try:
+         # some docs show Frame.from_pandas or Frame.from_csv
+         if hasattr(fd, "Frame") and hasattr(fd.Frame, "from_pandas"):
+             return fd.Frame.from_pandas(df)
+     except Exception:
+         pass
+     # nothing above worked; give up with a clear error
+     raise RuntimeError("Could not create FireDucks DataFrame with available API")
+
+ # -------------------------
+ # Benchmark operations
+ # Each bench function returns a result dict via build_result()
+ # -------------------------
+ def build_result(op_name, pandas_stats, polars_stats, fireducks_stats):
+     # Each stats tuple = (mean, std, runs_list), or None if unavailable
+     p_mean, p_std, p_runs = pandas_stats if pandas_stats else (None, None, None)
+     pl_mean, pl_std, pl_runs = polars_stats if polars_stats else (None, None, None)
+     fd_mean, fd_std, fd_runs = fireducks_stats if fireducks_stats else (None, None, None)
+
+     # compute basic speedups relative to pandas (if possible)
+     speed_pl = (p_mean / pl_mean) if (p_mean and pl_mean and pl_mean > 0) else None
+     speed_fd = (p_mean / fd_mean) if (p_mean and fd_mean and fd_mean > 0) else None

+     return {
+         "operation": op_name,
+         "pandas_mean_s": p_mean,
+         "pandas_std_s": p_std,
+         "pandas_runs": p_runs,
+         "polars_mean_s": pl_mean,
+         "polars_std_s": pl_std,
+         "polars_runs": pl_runs,
+         "fireducks_mean_s": fd_mean,
+         "fireducks_std_s": fd_std,
+         "fireducks_runs": fd_runs,
+         "speedup_polars_over_pandas": speed_pl,
+         "speedup_fireducks_over_pandas": speed_fd,
+     }
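+ # Speedup semantics: a value above 1.0 means that engine's mean time beat
+ # pandas on this operation; below 1.0 means pandas was faster.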

  # ---- Filter ----
+ def bench_filter(df: pd.DataFrame, repeats=3):
+     # pandas op
+     def p_op():
          _ = df[(df["value1"] > 0.5) & (df["category"] == df["category"].iloc[0])]

+     p_stats = time_function(p_op, repeats)
+
+     # polars op
+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = ensure_polars_from_pandas(df)
+         def pl_op():
+             # polars uses expression style
+             _ = pl_df.filter((pl.col("value1") > 0.5) & (pl.col("category") == pl_df["category"][0])).to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     # fireducks op
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = ensure_fireducks_from_pandas(df)
+         def fd_op():
+             res = fd_df[(fd_df["value1"] > 0.5) & (fd_df["category"] == fd_df["category"].iloc[0])]
+             # materialize
+             _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     return build_result("Filter", p_stats, pl_stats, fd_stats)
+
+ # ---- GroupBy Mean ----
+ def bench_groupby(df: pd.DataFrame, repeats=3):
+     def p_op():
          _ = df.groupby("category")[["value1", "value2"]].mean()

+     p_stats = time_function(p_op, repeats)

+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = ensure_polars_from_pandas(df)
+         def pl_op():
+             # Polars renamed groupby -> group_by in newer releases
+             _ = pl_df.group_by("category").agg([pl.col("value1").mean(), pl.col("value2").mean()]).to_pandas()
+         pl_stats = time_function(pl_op, repeats)

+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = ensure_fireducks_from_pandas(df)
+         def fd_op():
+             res = fd_df.groupby("category")[["value1", "value2"]].mean()
+             _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)

+     return build_result("Groupby mean", p_stats, pl_stats, fd_stats)

  # ---- Join ----
+ def bench_join(df: pd.DataFrame, repeats=3):
      categories = df["category"].unique()
      rng = np.random.default_rng(123)
+     dim_df = pd.DataFrame({"category": categories, "weight": rng.uniform(0.5, 2.0, len(categories))})

+     def p_op():
          _ = df.merge(dim_df, on="category", how="left")

+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = ensure_polars_from_pandas(df)
+         pl_dim = pl.from_pandas(dim_df)
+         def pl_op():
+             _ = pl_df.join(pl_dim, on="category", how="left").to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = ensure_fireducks_from_pandas(df)
+         fd_dim = ensure_fireducks_from_pandas(dim_df)
+         def fd_op():
+             res = fd_df.merge(fd_dim, on="category", how="left")
+             _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     return build_result("Join on category", p_stats, pl_stats, fd_stats)
+
+ # ---- Fillna ----
+ def bench_fillna(df: pd.DataFrame, repeats=3):
+     def p_op():
+         _ = df.fillna(0)
+
+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = ensure_polars_from_pandas(df)
+         def pl_op():
+             _ = pl_df.fill_null(0).to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = ensure_fireducks_from_pandas(df)
+         def fd_op():
+             res = fd_df.fillna(0)
+             _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     return build_result("Fill NA / fillna", p_stats, pl_stats, fd_stats)
+
+ # ---- Dropna ----
+ def bench_dropna(df: pd.DataFrame, repeats=3):
+     def p_op():
+         _ = df.dropna()
+
+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = ensure_polars_from_pandas(df)
+         def pl_op():
+             _ = pl_df.drop_nulls().to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = ensure_fireducks_from_pandas(df)
+         def fd_op():
+             res = fd_df.dropna()
+             _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     return build_result("Drop NA / dropna", p_stats, pl_stats, fd_stats)
+
+ # ---- Sort ----
+ def bench_sort(df: pd.DataFrame, repeats=3):
+     def p_op():
+         _ = df.sort_values("value1")
+
+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = ensure_polars_from_pandas(df)
+         def pl_op():
+             _ = pl_df.sort("value1").to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = ensure_fireducks_from_pandas(df)
+         def fd_op():
+             res = fd_df.sort_values("value1")
+             _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     return build_result("Sort by value1", p_stats, pl_stats, fd_stats)
+
+ # ---- Describe ----
+ def bench_describe(df: pd.DataFrame, repeats=3):
+     def p_op():
+         _ = df.describe()
+
+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = ensure_polars_from_pandas(df)
+         def pl_op():
+             _ = pl_df.describe().to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = ensure_fireducks_from_pandas(df)
+         def fd_op():
+             res = fd_df.describe()
+             _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     return build_result("Describe()", p_stats, pl_stats, fd_stats)
+
+ # ---- Read CSV / Parquet / Write Parquet: these write temp files and measure reads/writes ----
+ def bench_read_csv(df: pd.DataFrame, repeats=3):
+     path = "temp_bench.csv"
+     df.to_csv(path, index=False)
+     def p_op():
+         _ = pd.read_csv(path)
+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         def pl_op():
+             _ = pl.read_csv(path).to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         def fd_op():
+             # FireDucks read
+             try:
+                 res = fd.read_csv(path)
+                 _ = materialize_fireducks(res)
+             except Exception:
+                 # fallback: create from pandas
+                 res = fd.DataFrame(pd.read_csv(path))
+                 _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     # clean up the temp file
+     try:
+         os.remove(path)
+     except Exception:
+         pass
+
+     return build_result("Read CSV", p_stats, pl_stats, fd_stats)
+
+ def bench_read_parquet(df: pd.DataFrame, repeats=3):
+     path = "temp_bench.parquet"
+     df.to_parquet(path, index=False)
+     def p_op():
+         _ = pd.read_parquet(path)
+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         def pl_op():
+             _ = pl.read_parquet(path).to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         def fd_op():
+             try:
+                 res = fd.read_parquet(path)
+                 _ = materialize_fireducks(res)
+             except Exception:
+                 res = fd.DataFrame(pd.read_parquet(path))
+                 _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     try:
+         os.remove(path)
+     except Exception:
+         pass
+
+     return build_result("Read Parquet", p_stats, pl_stats, fd_stats)
+
+ def bench_write_parquet(df: pd.DataFrame, repeats=3):
+     def p_op():
          df.to_parquet("temp_pd.parquet")
+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = pl.from_pandas(df)
+         def pl_op():
+             pl_df.write_parquet("temp_pl.parquet")
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = None
+         try:
+             fd_df = ensure_fireducks_from_pandas(df)
+         except Exception:
+             fd_df = None
+         if fd_df is not None:
+             def fd_op():
+                 try:
+                     # FireDucks may expose to_parquet or write_parquet
+                     if hasattr(fd_df, "to_parquet"):
+                         fd_df.to_parquet("temp_fd.parquet")
+                     else:
+                         # materialize to pandas and write
+                         materialize_fireducks(fd_df).to_parquet("temp_fd.parquet")
+                 except Exception:
+                     materialize_fireducks(fd_df).to_parquet("temp_fd.parquet")
+             fd_stats = time_function(fd_op, repeats)
+
+     # cleanup
+     for p in ["temp_pd.parquet", "temp_pl.parquet", "temp_fd.parquet"]:
+         try:
+             os.remove(p)
+         except Exception:
+             pass
+
+     return build_result("Write Parquet", p_stats, pl_stats, fd_stats)
+
+ # -------------------------
+ # UI helpers: chart and md formatting
+ # -------------------------
+ def generate_chart_three(result):
+     fig, ax = plt.subplots(figsize=(5, 3))
+     labels = []
+     values = []
+     if result["pandas_mean_s"] is not None:
+         labels.append("Pandas")
+         values.append(result["pandas_mean_s"])
+     if result["polars_mean_s"] is not None:
+         labels.append("Polars")
+         values.append(result["polars_mean_s"])
+     if result["fireducks_mean_s"] is not None:
+         labels.append("FireDucks")
+         values.append(result["fireducks_mean_s"])
+     ax.bar(labels, values)
+     ax.set_ylabel("Time (s)")
      ax.set_title(result["operation"])
      buf = io.BytesIO()
      plt.tight_layout()
      plt.savefig(buf, format="png")
      buf.seek(0)
      plt.close(fig)
      return Image.open(buf)

+ def format_result_md(result):
+     md = f"### 🔬 {result['operation']}\n\n"
+     md += "| Engine | Mean (s) | Std (s) |\n|---|---:|---:|\n"
+     md += f"| Pandas | `{result['pandas_mean_s']}` | `{result['pandas_std_s']}` |\n"
+     md += f"| Polars | `{result['polars_mean_s']}` | `{result['polars_std_s']}` |\n"
+     md += f"| FireDucks | `{result['fireducks_mean_s']}` | `{result['fireducks_std_s']}` |\n\n"
+     if result["speedup_polars_over_pandas"]:
+         md += f"- Polars speedup over Pandas: **{result['speedup_polars_over_pandas']:.2f}x**\n"
+     if result["speedup_fireducks_over_pandas"]:
+         md += f"- FireDucks speedup over Pandas: **{result['speedup_fireducks_over_pandas']:.2f}x**\n"
+     md += "\n<details><summary>Raw runs</summary>\n\n"
+     md += f"- Pandas runs: `{result['pandas_runs']}`\n"
+     md += f"- Polars runs: `{result['polars_runs']}`\n"
+     md += f"- FireDucks runs: `{result['fireducks_runs']}`\n"
+     md += "\n</details>\n"
      return md

+ # -------------------------
+ # main dispatcher
+ # -------------------------
+ OPERATION_MAP = {
+     "Filter": bench_filter,
+     "Groupby": bench_groupby,
+     "Join": bench_join,
+     "Fillna": bench_fillna,
+     "Dropna": bench_dropna,
+     "Sort": bench_sort,
+     "Describe": bench_describe,
+     "Read CSV": bench_read_csv,
+     "Read Parquet": bench_read_parquet,
+     "Write Parquet": bench_write_parquet,
+ }
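+ # Usage sketch: OPERATION_MAP["Sort"](generate_data(100_000), repeats=3)
+ # returns the result dict consumed by generate_chart_three / format_result_md.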
+
+ def run_benchmark_dispatch(operation, df, repeats):
+     if operation not in OPERATION_MAP:
+         raise ValueError("Unsupported operation")
+     fn = OPERATION_MAP[operation]
+     return fn(df, repeats)
+
+ # -------------------------
+ # Gradio UI
+ # -------------------------
  theme = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")

+ with gr.Blocks(title="Pandas vs Polars vs FireDucks Benchmark", theme=theme) as demo:
+     gr.Markdown("# 🐼 vs 🔥 vs ⚡ Pandas vs Polars vs FireDucks — Benchmark playground")

      with gr.Tabs():
+         with gr.Tab("Synthetic dataset"):
+             dataset_size = gr.Radio(["100k", "500k", "2M"], value="100k", label="Dataset size")
+             operation = gr.Dropdown(list(OPERATION_MAP.keys()), value="Filter", label="Operation")
+             repeats = gr.Slider(1, 7, value=3, label="Repeats")
+             run_btn = gr.Button("Run benchmark")
+
+             md_out = gr.Markdown()
+             chart_out = gr.Image()
+
+             def run_synth(size, op, reps):
+                 # check libs
+                 if not HAS_POLARS or not HAS_FIREDUCKS:
+                     missing = []
+                     if not HAS_POLARS: missing.append("polars")
+                     if not HAS_FIREDUCKS: missing.append("fireducks")
+                     return f"⚠ Missing libraries: {', '.join(missing)}. Install them in requirements.txt.", None
                  n = {"100k": 100_000, "500k": 500_000, "2M": 2_000_000}[size]
                  df = generate_data(n)
+                 result = run_benchmark_dispatch(op, df, int(reps))
+                 chart = generate_chart_three(result)
+                 md = format_result_md(result)
+                 return md, chart
+
+             run_btn.click(run_synth, [dataset_size, operation, repeats], [md_out, chart_out])
+
+         with gr.Tab("Custom dataset"):
+             file_in = gr.File(label="Upload CSV / Parquet / Feather", file_types=['.csv', '.parquet', '.feather', '.arrow'])
+             operation_c = gr.Dropdown(list(OPERATION_MAP.keys()), value="Filter", label="Operation")
+             repeats_c = gr.Slider(1, 7, value=3, label="Repeats")
+             run_btn_c = gr.Button("Run on uploaded dataset")
+             md_out_c = gr.Markdown()
+             chart_out_c = gr.Image()
+
+             def run_custom(file, op, reps):
+                 if file is None:
+                     return "Upload a dataset file first.", None
+                 # quick load by file extension
+                 fname = file.name
+                 if fname.endswith(".csv"):
+                     df = pd.read_csv(fname)
+                 elif fname.endswith(".parquet"):
+                     df = pd.read_parquet(fname)
+                 elif fname.endswith(".feather") or fname.endswith(".arrow"):
+                     df = pd.read_feather(fname)
+                 else:
+                     return "Unsupported file format", None
+
+                 result = run_benchmark_dispatch(op, df, int(reps))
+                 chart = generate_chart_three(result)
+                 md = format_result_md(result)
+                 return md, chart
+
+             run_btn_c.click(run_custom, [file_in, operation_c, repeats_c], [md_out_c, chart_out_c])
+
+     gr.Markdown("**Note:** This demo requires `polars` and `fireducks` installed in the environment. On HF Spaces add them to `requirements.txt`.")
+     gr.Markdown("Recommended `requirements.txt` (one per line): `pandas`, `polars`, `fireducks`, `gradio`, `matplotlib`, `pillow`, `duckdb`.")

  if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))