# app.py
"""Benchmark playground: Pandas vs Polars vs FireDucks, served through a Gradio UI.

Each bench_* function times the same logical operation in up to three engines
(pandas always; polars and fireducks only when importable) and returns a result
dict built by build_result(). The UI renders a bar chart, relative speed bars,
a "fastest engine" badge, and a markdown summary.
"""

import io
import os
import time
import traceback  # kept for parity with upstream (not used directly here)

import numpy as np
import pandas as pd
import duckdb  # kept for parity (not used directly in these benches)
import gradio as gr
import matplotlib.pyplot as plt
from PIL import Image

# Optional libs
try:
    import polars as pl
    HAS_POLARS = True
except Exception:
    pl = None
    HAS_POLARS = False

# FireDucks new API: import the pandas shim
try:
    import fireducks.pandas as fdpd
    HAS_FIREDUCKS = True
except Exception:
    fdpd = None
    HAS_FIREDUCKS = False


# -------------------------
# Basic utils / data gen
# -------------------------
def generate_data(n_rows: int, n_groups: int = 50) -> pd.DataFrame:
    """Build a deterministic synthetic frame: id, category, two floats, a date.

    Seeded RNG (42) so every benchmark run sees identical data.
    NOTE(review): the generated frame contains no nulls, so the fillna/dropna
    benches measure scan cost rather than actual null handling.
    """
    rng = np.random.default_rng(42)
    ids = np.arange(n_rows)
    categories = rng.integers(0, n_groups, size=n_rows)
    categories = np.array([f"cat_{c}" for c in categories])
    value1 = rng.normal(0, 1, size=n_rows)
    value2 = rng.normal(10, 5, size=n_rows)
    start_date = np.datetime64("2020-01-01")
    dates = start_date + rng.integers(0, 365, size=n_rows).astype("timedelta64[D]")
    return pd.DataFrame(
        {"id": ids, "category": categories, "value1": value1, "value2": value2, "date": dates}
    )


def time_function(fn, repeats=3):
    """Run fn() `repeats` times; return (mean_s, std_s, [per-run seconds])."""
    repeats = int(max(1, repeats))
    times = []
    for _ in range(repeats):
        start = time.perf_counter()
        fn()
        end = time.perf_counter()
        times.append(end - start)
    return float(np.mean(times)), float(np.std(times)), [float(t) for t in times]


# -------------------------
# FireDucks helpers
# -------------------------
def ensure_fireducks_from_pandas(df: pd.DataFrame):
    """Convert a pandas DataFrame into a FireDucks-backed pandas object (shim).

    Tries the DataFrame constructor first, then a from_pandas() helper if the
    installed shim exposes one. Raises RuntimeError when neither works.
    """
    if not HAS_FIREDUCKS:
        raise RuntimeError("FireDucks (fireducks.pandas) not installed")
    # Try common constructors
    try:
        return fdpd.DataFrame(df)
    except Exception:
        pass
    try:
        if hasattr(fdpd, "from_pandas"):
            return fdpd.from_pandas(df)
    except Exception:
        pass
    raise RuntimeError("Could not construct FireDucks DataFrame from pandas with current shim")


def materialize_fireducks(obj):
    """Convert a FireDucks result to pandas (forcing lazy evaluation) when possible."""
    if isinstance(obj, pd.DataFrame):
        return obj
    if HAS_FIREDUCKS:
        try:
            if hasattr(obj, "to_pandas"):
                return obj.to_pandas()
        except Exception:
            pass
    return obj


# -------------------------
# Benchmark helpers
# -------------------------
def build_result(op_name, pandas_stats, polars_stats, fireducks_stats):
    """Assemble the per-operation result dict consumed by the UI renderers.

    Each *_stats argument is either None (engine unavailable/failed) or the
    (mean, std, runs) triple produced by time_function().
    """
    p_mean, p_std, p_runs = pandas_stats if pandas_stats else (None, None, None)
    pl_mean, pl_std, pl_runs = polars_stats if polars_stats else (None, None, None)
    fd_mean, fd_std, fd_runs = fireducks_stats if fireducks_stats else (None, None, None)
    # Speedups are relative to pandas; only defined when both timings exist.
    speed_pl = (p_mean / pl_mean) if (p_mean and pl_mean and pl_mean > 0) else None
    speed_fd = (p_mean / fd_mean) if (p_mean and fd_mean and fd_mean > 0) else None
    return {
        "operation": op_name,
        "pandas_mean_s": p_mean,
        "pandas_std_s": p_std,
        "pandas_runs": p_runs,
        "polars_mean_s": pl_mean,
        "polars_std_s": pl_std,
        "polars_runs": pl_runs,
        "fireducks_mean_s": fd_mean,
        "fireducks_std_s": fd_std,
        "fireducks_runs": fd_runs,
        "speedup_polars_over_pandas": speed_pl,
        "speedup_fireducks_over_pandas": speed_fd,
    }


# -------------------------
# Bench functions (all kept)
# -------------------------
def bench_filter(df: pd.DataFrame, repeats=3):
    """Boolean-mask filter on value1 plus an equality test on category."""
    def p_op():
        _ = df[(df["value1"] > 0.5) & (df["category"] == df["category"].iloc[0])]

    p_stats = time_function(p_op, repeats)

    pl_stats = None
    if HAS_POLARS:
        pl_df = pl.from_pandas(df)

        def pl_op():
            first_cat = pl_df["category"][0]
            _ = pl_df.filter(
                (pl.col("value1") > 0.5) & (pl.col("category") == first_cat)
            ).to_pandas()

        pl_stats = time_function(pl_op, repeats)

    fd_stats = None
    if HAS_FIREDUCKS:
        try:
            fd_df = ensure_fireducks_from_pandas(df)

            def fd_op():
                res = fd_df[
                    (fd_df["value1"] > 0.5)
                    & (fd_df["category"] == fd_df["category"].iloc[0])
                ]
                # Materialize so lazy FireDucks execution is included in the timing.
                _ = materialize_fireducks(res)

            fd_stats = time_function(fd_op, repeats)
        except Exception:
            fd_stats = None
    return build_result("Filter", p_stats, pl_stats, fd_stats)


def bench_groupby(df: pd.DataFrame, repeats=3):
    """Group by category and take the mean of the two value columns."""
    def p_op():
        _ = df.groupby("category")[["value1", "value2"]].mean()

    p_stats = time_function(p_op, repeats)

    pl_stats = None
    if HAS_POLARS:
        pl_df = pl.from_pandas(df)

        def pl_op():
            _ = pl_df.group_by("category").agg(
                [pl.col("value1").mean(), pl.col("value2").mean()]
            ).to_pandas()

        pl_stats = time_function(pl_op, repeats)

    fd_stats = None
    if HAS_FIREDUCKS:
        try:
            fd_df = ensure_fireducks_from_pandas(df)

            def fd_op():
                # FIX: the fireducks.pandas shim mirrors the pandas API, so the
                # method is `groupby` (the previous `group_by` raised
                # AttributeError, silently disabling this bench for FireDucks).
                res = fd_df.groupby("category")[["value1", "value2"]].mean()
                _ = materialize_fireducks(res)

            fd_stats = time_function(fd_op, repeats)
        except Exception:
            fd_stats = None
    return build_result("Groupby mean", p_stats, pl_stats, fd_stats)


def bench_join(df: pd.DataFrame, repeats=3):
    """Left join onto a small category->weight dimension table."""
    categories = df["category"].unique()
    rng = np.random.default_rng(123)
    dim_df = pd.DataFrame(
        {"category": categories, "weight": rng.uniform(0.5, 2.0, len(categories))}
    )

    def p_op():
        _ = df.merge(dim_df, on="category", how="left")

    p_stats = time_function(p_op, repeats)

    pl_stats = None
    if HAS_POLARS:
        pl_df = pl.from_pandas(df)
        pl_dim = pl.from_pandas(dim_df)

        def pl_op():
            _ = pl_df.join(pl_dim, on="category", how="left").to_pandas()

        pl_stats = time_function(pl_op, repeats)

    fd_stats = None
    if HAS_FIREDUCKS:
        try:
            fd_df = ensure_fireducks_from_pandas(df)
            fd_dim = ensure_fireducks_from_pandas(dim_df)

            def fd_op():
                res = fd_df.merge(fd_dim, on="category", how="left")
                _ = materialize_fireducks(res)

            fd_stats = time_function(fd_op, repeats)
        except Exception:
            fd_stats = None
    return build_result("Join on category", p_stats, pl_stats, fd_stats)


def bench_fillna(df: pd.DataFrame, repeats=3):
    """fillna(0) / fill_null(0) over the whole frame."""
    def p_op():
        _ = df.fillna(0)

    p_stats = time_function(p_op, repeats)

    pl_stats = None
    if HAS_POLARS:
        pl_df = pl.from_pandas(df)

        def pl_op():
            _ = pl_df.fill_null(0).to_pandas()

        pl_stats = time_function(pl_op, repeats)

    fd_stats = None
    if HAS_FIREDUCKS:
        try:
            fd_df = ensure_fireducks_from_pandas(df)

            def fd_op():
                res = fd_df.fillna(0)
                _ = materialize_fireducks(res)

            fd_stats = time_function(fd_op, repeats)
        except Exception:
            fd_stats = None
    return build_result("Fill NA / fillna", p_stats, pl_stats, fd_stats)


def bench_dropna(df: pd.DataFrame, repeats=3):
    """dropna() / drop_nulls() over the whole frame."""
    def p_op():
        _ = df.dropna()

    p_stats = time_function(p_op, repeats)

    pl_stats = None
    if HAS_POLARS:
        pl_df = pl.from_pandas(df)

        def pl_op():
            _ = pl_df.drop_nulls().to_pandas()

        pl_stats = time_function(pl_op, repeats)

    fd_stats = None
    if HAS_FIREDUCKS:
        try:
            fd_df = ensure_fireducks_from_pandas(df)

            def fd_op():
                res = fd_df.dropna()
                _ = materialize_fireducks(res)

            fd_stats = time_function(fd_op, repeats)
        except Exception:
            fd_stats = None
    return build_result("Drop NA / dropna", p_stats, pl_stats, fd_stats)


def bench_sort(df: pd.DataFrame, repeats=3):
    """Full sort on the value1 column."""
    def p_op():
        _ = df.sort_values("value1")

    p_stats = time_function(p_op, repeats)

    pl_stats = None
    if HAS_POLARS:
        pl_df = pl.from_pandas(df)

        def pl_op():
            _ = pl_df.sort("value1").to_pandas()

        pl_stats = time_function(pl_op, repeats)

    fd_stats = None
    if HAS_FIREDUCKS:
        try:
            fd_df = ensure_fireducks_from_pandas(df)

            def fd_op():
                res = fd_df.sort_values("value1")
                _ = materialize_fireducks(res)

            fd_stats = time_function(fd_op, repeats)
        except Exception:
            fd_stats = None
    return build_result("Sort by value1", p_stats, pl_stats, fd_stats)


def bench_describe(df: pd.DataFrame, repeats=3):
    """describe() summary statistics."""
    def p_op():
        _ = df.describe()

    p_stats = time_function(p_op, repeats)

    pl_stats = None
    if HAS_POLARS:
        pl_df = pl.from_pandas(df)

        def pl_op():
            _ = pl_df.describe().to_pandas()

        pl_stats = time_function(pl_op, repeats)

    fd_stats = None
    if HAS_FIREDUCKS:
        try:
            fd_df = ensure_fireducks_from_pandas(df)

            def fd_op():
                res = fd_df.describe()
                _ = materialize_fireducks(res)

            fd_stats = time_function(fd_op, repeats)
        except Exception:
            fd_stats = None
    return build_result("Describe()", p_stats, pl_stats, fd_stats)


def bench_read_csv(df: pd.DataFrame, repeats=3):
    """Write a temp CSV once, then time reading it back in each engine."""
    path = "temp_bench.csv"
    df.to_csv(path, index=False)

    def p_op():
        _ = pd.read_csv(path)

    p_stats = time_function(p_op, repeats)

    pl_stats = None
    if HAS_POLARS:
        def pl_op():
            _ = pl.read_csv(path).to_pandas()

        pl_stats = time_function(pl_op, repeats)

    fd_stats = None
    if HAS_FIREDUCKS:
        # FIX: the try/except previously wrapped only the *definition* of
        # fd_op (which cannot raise), so the pandas-roundtrip fallback was
        # dead code. Wrap the timed calls instead so it actually triggers.
        def fd_op():
            _ = materialize_fireducks(fdpd.read_csv(path))

        try:
            fd_stats = time_function(fd_op, repeats)
        except Exception:
            def fd_op_fb():
                _ = materialize_fireducks(fdpd.DataFrame(pd.read_csv(path)))

            try:
                fd_stats = time_function(fd_op_fb, repeats)
            except Exception:
                fd_stats = None

    try:
        os.remove(path)
    except Exception:
        pass
    return build_result("Read CSV", p_stats, pl_stats, fd_stats)


def bench_read_parquet(df: pd.DataFrame, repeats=3):
    """Write a temp Parquet file once, then time reading it back in each engine."""
    path = "temp_bench.parquet"
    df.to_parquet(path, index=False)

    def p_op():
        _ = pd.read_parquet(path)

    p_stats = time_function(p_op, repeats)

    pl_stats = None
    if HAS_POLARS:
        def pl_op():
            _ = pl.read_parquet(path).to_pandas()

        pl_stats = time_function(pl_op, repeats)

    fd_stats = None
    if HAS_FIREDUCKS:
        # Same fallback restructuring as bench_read_csv: exceptions are raised
        # when the op runs, not when it is defined.
        def fd_op():
            _ = materialize_fireducks(fdpd.read_parquet(path))

        try:
            fd_stats = time_function(fd_op, repeats)
        except Exception:
            def fd_op_fb():
                _ = materialize_fireducks(fdpd.DataFrame(pd.read_parquet(path)))

            try:
                fd_stats = time_function(fd_op_fb, repeats)
            except Exception:
                fd_stats = None

    try:
        os.remove(path)
    except Exception:
        pass
    return build_result("Read Parquet", p_stats, pl_stats, fd_stats)


def bench_write_parquet(df: pd.DataFrame, repeats=3):
    """Time writing the frame to Parquet in each engine; temp files are removed."""
    def p_op():
        df.to_parquet("temp_pd.parquet")

    p_stats = time_function(p_op, repeats)

    pl_stats = None
    if HAS_POLARS:
        pl_df = pl.from_pandas(df)

        def pl_op():
            pl_df.write_parquet("temp_pl.parquet")

        pl_stats = time_function(pl_op, repeats)

    fd_stats = None
    if HAS_FIREDUCKS:
        try:
            fd_df = ensure_fireducks_from_pandas(df)

            def fd_op():
                if hasattr(fd_df, "to_parquet"):
                    fd_df.to_parquet("temp_fd.parquet")
                else:
                    # Fall back to a pandas write (includes conversion cost).
                    materialize_fireducks(fd_df).to_parquet("temp_fd.parquet")

            fd_stats = time_function(fd_op, repeats)
        except Exception:
            fd_stats = None

    for p in ["temp_pd.parquet", "temp_pl.parquet", "temp_fd.parquet"]:
        try:
            os.remove(p)
        except Exception:
            pass
    return build_result("Write Parquet", p_stats, pl_stats, fd_stats)


# -------------------------
# UI helpers: chart and images
# -------------------------
def _placeholder_image(width=800, height=200):
    """Small light-gray placeholder returned when there is nothing to plot."""
    return Image.new("RGB", (width, height), color=(250, 250, 250))


def generate_chart_three(result):
    """Bar chart of mean times (seconds) for whichever engines produced results."""
    labels = []
    values = []
    if result["pandas_mean_s"] is not None:
        labels.append("Pandas")
        values.append(result["pandas_mean_s"])
    if result["polars_mean_s"] is not None:
        labels.append("Polars")
        values.append(result["polars_mean_s"])
    if result["fireducks_mean_s"] is not None:
        labels.append("FireDucks")
        values.append(result["fireducks_mean_s"])

    # FIX: guard the no-results case — max(values) below would raise on empty.
    if not values:
        return _placeholder_image(600, 80)

    fig, ax = plt.subplots(figsize=(5, 3))
    ax.bar(labels, values)
    ax.set_ylabel("Time (s)")
    ax.set_title(result["operation"])
    for i, v in enumerate(values):
        ax.text(i, v + max(values) * 0.01, f"{v:.4f}s", ha='center')
    buf = io.BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    buf.seek(0)
    plt.close(fig)
    return Image.open(buf)


def generate_speedbars(result):
    """Horizontal bars showing relative speed.

    Lower time = longer 'speed' bar. We normalize with the fastest (smallest)
    time; multipliers in the labels are relative to the pandas baseline when
    pandas ran, otherwise to the fastest engine.
    """
    # Collect engines & times
    engines = []
    times = []
    if result["pandas_mean_s"] is not None:
        engines.append("Pandas"); times.append(result["pandas_mean_s"])
    if result["polars_mean_s"] is not None:
        engines.append("Polars"); times.append(result["polars_mean_s"])
    if result["fireducks_mean_s"] is not None:
        engines.append("FireDucks"); times.append(result["fireducks_mean_s"])
    if len(times) == 0:
        # return a small empty image
        return _placeholder_image(600, 80)

    fastest = min(times)
    # speed multiplier relative to pandas baseline (if pandas present)
    baseline = result["pandas_mean_s"] if result["pandas_mean_s"] else fastest
    # Normalize lengths: invert times so smaller time -> bigger bar
    inv = [fastest / t for t in times]
    max_inv = max(inv)
    lengths = [int(500 * (v / max_inv)) for v in inv]

    fig, ax = plt.subplots(figsize=(6, len(engines) * 0.6 + 0.5))
    y_pos = np.arange(len(engines))
    ax.barh(y_pos, lengths, align='center')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(engines)
    ax.invert_yaxis()  # fastest on top
    ax.set_xlabel("Relative speed (normalized to fastest)")
    # Annotate multiplier and actual time
    for i, (l, t) in enumerate(zip(lengths, times)):
        mult = baseline / t if baseline and t else None
        label = f"{t:.4f}s"
        if mult:
            label += f" ({mult:.2f}x vs baseline)"
        ax.text(l + 6, i, label, va='center')
    plt.tight_layout()
    buf = io.BytesIO()
    plt.savefig(buf, format="png")
    buf.seek(0)
    plt.close(fig)
    return Image.open(buf)


def format_result_md(result):
    """Render the result dict as a markdown report (table + speedups + raw runs)."""
    md = f"### 🔬 {result['operation']}\n\n"
    md += "| Engine | Mean (s) | Std (s) |\n|---|---:|---:|\n"
    md += f"| Pandas | `{result['pandas_mean_s']}` | `{result['pandas_std_s']}` |\n"
    md += f"| Polars | `{result['polars_mean_s']}` | `{result['polars_std_s']}` |\n"
    md += f"| FireDucks | `{result['fireducks_mean_s']}` | `{result['fireducks_std_s']}` |\n\n"
    if result["speedup_polars_over_pandas"]:
        md += f"- Polars speedup over Pandas: **{result['speedup_polars_over_pandas']:.2f}x**\n"
    if result["speedup_fireducks_over_pandas"]:
        md += f"- FireDucks speedup over Pandas: **{result['speedup_fireducks_over_pandas']:.2f}x**\n"
    # NOTE(review): the surrounding markup was stripped in the mangled source;
    # reconstructed as a collapsible <details> section around the raw runs.
    md += "\n<details><summary>Raw runs</summary>\n\n"
    md += f"- Pandas runs: `{result['pandas_runs']}`\n"
    md += f"- Polars runs: `{result['polars_runs']}`\n"
    md += f"- FireDucks runs: `{result['fireducks_runs']}`\n"
    md += "\n</details>\n"
    return md


def fastest_engine_badge(result):
    """Small HTML badge naming the fastest engine and its mean time."""
    engines = []
    times = []
    if result["pandas_mean_s"] is not None:
        engines.append("Pandas"); times.append(result["pandas_mean_s"])
    if result["polars_mean_s"] is not None:
        engines.append("Polars"); times.append(result["polars_mean_s"])
    if result["fireducks_mean_s"] is not None:
        engines.append("FireDucks"); times.append(result["fireducks_mean_s"])
    if not engines:
        # NOTE(review): HTML tags were stripped in the mangled source;
        # reconstructed as minimal styled divs preserving the visible text.
        return "<div style='padding:8px;border-radius:8px;background:#fef2f2;'>No engines available</div>"
    idx = int(np.argmin(times))
    fastest = engines[idx]
    time_val = times[idx]
    html = f"""
<div style="padding:10px;border-radius:8px;background:#ecfdf5;border:1px solid #10b981;font-weight:600;">
Fastest: {fastest} — {time_val:.4f}s
</div>
"""
    return html


# -------------------------
# Dispatcher map
# -------------------------
OPERATION_MAP = {
    "Filter": bench_filter,
    "Groupby": bench_groupby,
    "Join": bench_join,
    "Fillna": bench_fillna,
    "Dropna": bench_dropna,
    "Sort": bench_sort,
    "Describe": bench_describe,
    "Read CSV": bench_read_csv,
    "Read Parquet": bench_read_parquet,
    "Write Parquet": bench_write_parquet,
}


def run_benchmark_dispatch(operation, df, repeats):
    """Look up the bench function for `operation` and run it on df."""
    if operation not in OPERATION_MAP:
        raise ValueError("Unsupported operation")
    fn = OPERATION_MAP[operation]
    return fn(df, repeats)


# -------------------------
# Gradio UI (Option A layout)
# -------------------------
theme = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")

with gr.Blocks(title="Pandas vs Polars vs FireDucks Benchmark", theme=theme) as demo:
    gr.Markdown("# 🐼 vs 🔥 vs ⚡ Pandas vs Polars vs FireDucks — Benchmark playground")

    with gr.Tabs():
        with gr.Tab("Synthetic dataset"):
            # Controls
            dataset_size = gr.Radio(["100k", "500k", "2M"], value="100k", label="Dataset size")
            operation = gr.Dropdown(list(OPERATION_MAP.keys()), value="Filter", label="Operation")
            repeats = gr.Slider(1, 7, value=3, label="Repeats")
            run_btn = gr.Button("Run benchmark")

            # OUTPUT LAYOUT (Option A): chart top -> speedbars -> fastest badge -> markdown
            chart_out = gr.Image(label="Timing chart (lower is better)", height=300, width=600)
            speedbars_out = gr.Image(label="Relative speedbars (fastest normalized to 1)", height=300, width=600)
            fastest_out = gr.HTML(label="Fastest engine")
            md_out = gr.Markdown()

            def run_synth(size, op, reps):
                # check optional libs
                missing = []
                if not HAS_POLARS:
                    missing.append("polars")
                if not HAS_FIREDUCKS:
                    missing.append("fireducks (fireducks.pandas shim)")
                if missing:
                    # return friendly warning in place of outputs
                    warn = (
                        f"⚠ Missing libraries: {', '.join(missing)}. "
                        "Add them to requirements.txt if you want those engines tested."
                    )
                    # for images, return small placeholder image with warning text
                    img = _placeholder_image()
                    return img, img, f"<div>{warn}</div>", f"**Warning**: {warn}"

                n = {"100k": 100_000, "500k": 500_000, "2M": 2_000_000}[size]
                df = generate_data(n)
                result = run_benchmark_dispatch(op, df, int(reps))
                # Build visuals
                chart = generate_chart_three(result)
                speedbars = generate_speedbars(result)
                fastest_html = fastest_engine_badge(result)
                md = format_result_md(result)
                return chart, speedbars, fastest_html, md

            run_btn.click(
                run_synth,
                [dataset_size, operation, repeats],
                [chart_out, speedbars_out, fastest_out, md_out],
            )

        with gr.Tab("Custom dataset"):
            file_in = gr.File(
                label="Upload CSV / Parquet / Feather / Arrow",
                file_types=['.csv', '.parquet', '.feather', '.arrow'],
            )
            operation_c = gr.Dropdown(list(OPERATION_MAP.keys()), value="Filter", label="Operation")
            repeats_c = gr.Slider(1, 7, value=3, label="Repeats")
            run_btn_c = gr.Button("Run on uploaded dataset")

            chart_out_c = gr.Image(label="Timing chart")
            speedbars_out_c = gr.Image(label="Relative speedbars")
            fastest_out_c = gr.HTML(label="Fastest engine")
            md_out_c = gr.Markdown()

            def run_custom(file, op, reps):
                if file is None:
                    img = _placeholder_image()
                    return img, img, "<div>Upload a dataset file first.</div>", "Upload a dataset file first."
                fname = file.name
                try:
                    if fname.endswith(".csv"):
                        df = pd.read_csv(fname)
                    elif fname.endswith(".parquet"):
                        df = pd.read_parquet(fname)
                    elif fname.endswith(".feather") or fname.endswith(".arrow"):
                        df = pd.read_feather(fname)
                    else:
                        img = _placeholder_image()
                        return img, img, "<div>Unsupported file format</div>", "Unsupported file format"
                except Exception as e:
                    img = _placeholder_image()
                    return img, img, f"<div>Error reading file: {e}</div>", f"Error reading file: {e}"

                result = run_benchmark_dispatch(op, df, int(reps))
                chart = generate_chart_three(result)
                speedbars = generate_speedbars(result)
                fastest_html = fastest_engine_badge(result)
                md = format_result_md(result)
                return chart, speedbars, fastest_html, md

            run_btn_c.click(
                run_custom,
                [file_in, operation_c, repeats_c],
                [chart_out_c, speedbars_out_c, fastest_out_c, md_out_c],
            )

if __name__ == "__main__":
    demo.launch(server_name='0.0.0.0', server_port=int(os.environ.get("PORT", 7860)))