PraneshJs committed · Commit c347ab7 · verified · 1 Parent(s): 24430f3

Update app.py

Files changed (1): app.py (+499, −278)
app.py CHANGED
@@ -1,19 +1,34 @@
 
  import time
  import numpy as np
  import pandas as pd
- import duckdb
  import gradio as gr
  import matplotlib.pyplot as plt
  from PIL import Image
- import io
- import os
-
- duckdb_con = duckdb.connect(database=":memory:")
-
- # ----------------------------------------------------------
- # Synthetic Data Generator
- # ----------------------------------------------------------

  def generate_data(n_rows: int, n_groups: int = 50) -> pd.DataFrame:
      rng = np.random.default_rng(42)
      ids = np.arange(n_rows)
@@ -28,316 +43,522 @@ def generate_data(n_rows: int, n_groups: int = 50) -> pd.DataFrame:
          {"id": ids, "category": categories, "value1": value1, "value2": value2, "date": dates}
      )

-
- # ----------------------------------------------------------
- # Timing utility
- # ----------------------------------------------------------
-
  def time_function(fn, repeats=3):
-     repeats = int(repeats)
      times = []
      for _ in range(repeats):
          start = time.perf_counter()
          fn()
          end = time.perf_counter()
          times.append(end - start)
-     return np.mean(times), np.std(times), times

-
- # ----------------------------------------------------------
- # Benchmark Operations (Compute + I/O)
- # ----------------------------------------------------------

  # ---- Filter ----
- def bench_filter(df, repeats=3):
-     def pandas_op():
          _ = df[(df["value1"] > 0.5) & (df["category"] == df["category"].iloc[0])]

-     def duckdb_op():
-         duckdb_con.register("df", df)
-         duckdb_con.execute(f"""
-             SELECT *
-             FROM df
-             WHERE value1 > 0.5
-               AND category='{df['category'].iloc[0]}'
-         """).fetchdf()
-
-     p_mean, p_std, p_all = time_function(pandas_op, repeats)
-     d_mean, d_std, d_all = time_function(duckdb_op, repeats)
-
-     return build_result("Filter rows", p_mean, p_std, p_all, d_mean, d_std, d_all)
-
-
- # ---- Groupby ----
- def bench_groupby(df, repeats=3):
-     def pandas_op():
          _ = df.groupby("category")[["value1", "value2"]].mean()

-     def duckdb_op():
-         duckdb_con.register("df", df)
-         duckdb_con.execute("""
-             SELECT category, AVG(value1), AVG(value2)
-             FROM df GROUP BY category
-         """).fetchdf()

-     p_mean, p_std, p_all = time_function(pandas_op, repeats)
-     d_mean, d_std, d_all = time_function(duckdb_op, repeats)

-     return build_result("Groupby mean", p_mean, p_std, p_all, d_mean, d_std, d_all)


  # ---- Join ----
- def bench_join(df, repeats=3):
      categories = df["category"].unique()
      rng = np.random.default_rng(123)
-     dim_df = pd.DataFrame(
-         {"category": categories, "weight": rng.uniform(0.5, 2.0, len(categories))}
-     )

-     def pandas_op():
          _ = df.merge(dim_df, on="category", how="left")

-     def duckdb_op():
-         duckdb_con.register("df", df)
-         duckdb_con.register("dim_df", dim_df)
-         duckdb_con.execute("""
-             SELECT d.*, dim.weight
-             FROM df d
-             LEFT JOIN dim_df dim
-               ON d.category = dim.category
-         """).fetchdf()
-
-     p_mean, p_std, p_all = time_function(pandas_op, repeats)
-     d_mean, d_std, d_all = time_function(duckdb_op, repeats)
-
-     return build_result("Join on category", p_mean, p_std, p_all, d_mean, d_std, d_all)
-
-
-
116
- # ---- Read CSV ----
117
- def bench_read_csv(temp_csv_path, repeats=3):
118
- def pandas_op():
119
- _ = pd.read_csv(temp_csv_path)
120
-
121
- def duckdb_op():
122
- _ = duckdb.read_csv_auto(temp_csv_path)
123
-
124
- p_mean, p_std, p_all = time_function(pandas_op, repeats)
125
- d_mean, d_std, d_all = time_function(duckdb_op, repeats)
126
-
127
- return build_result("Read CSV", p_mean, p_std, p_all, d_mean, d_std, d_all)
128
-
129
-
130
- # ---- Read Parquet ----
131
- def bench_read_parquet(temp_parquet_path, repeats=3):
132
- def pandas_op():
133
- _ = pd.read_parquet(temp_parquet_path)
134
-
135
- def duckdb_op():
136
- _ = duckdb.read_parquet(temp_parquet_path)
137
-
138
- p_mean, p_std, p_all = time_function(pandas_op, repeats)
139
- d_mean, d_std, d_all = time_function(duckdb_op, repeats)
140
-
141
- return build_result("Read Parquet", p_mean, p_std, p_all, d_mean, d_std, d_all)
142
-
143
-
144
- # ---- Write Parquet ----
145
- def bench_write_parquet(df, repeats=3):
146
- def pandas_op():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
          df.to_parquet("temp_pd.parquet")
-
-     def duckdb_op():
-         duckdb_con.register("df", df)
-         duckdb_con.execute("COPY df TO 'temp_duck.parquet' (FORMAT PARQUET)")
-
-     p_mean, p_std, p_all = time_function(pandas_op, repeats)
-     d_mean, d_std, d_all = time_function(duckdb_op, repeats)
-
-     return build_result("Write Parquet", p_mean, p_std, p_all, d_mean, d_std, d_all)
-
-
- # ----------------------------------------------------------
- # Shared result formatting
- # ----------------------------------------------------------
-
- def build_result(op_name, p_mean, p_std, p_all, d_mean, d_std, d_all):
-     speedup = p_mean / d_mean if d_mean > 0 else None
-
-     return {
-         "operation": op_name,
-         "pandas_mean_s": p_mean,
-         "pandas_std_s": p_std,
-         "duckdb_mean_s": d_mean,
-         "duckdb_std_s": d_std,
-         "speedup": speedup,
-         "raw_pandas_runs": p_all,
-         "raw_duckdb_runs": d_all,
-     }
-
-
- # ----------------------------------------------------------
- # Benchmark Dispatcher
- # ----------------------------------------------------------
-
- def run_benchmark(operation, df=None, repeats=3):
-     repeats = int(repeats)
-
-     if operation == "Filter": return bench_filter(df, repeats)
-     if operation == "Groupby": return bench_groupby(df, repeats)
-     if operation == "Join": return bench_join(df, repeats)
-     if operation == "Write Parquet": return bench_write_parquet(df, repeats)
-
-     raise ValueError(f"Unsupported operation: {operation}")
-
-
- # ----------------------------------------------------------
- # Chart generator (PIL Image)
- # ----------------------------------------------------------
-
- def generate_chart(result):
-     fig, ax = plt.subplots(figsize=(4, 3))
-
-     engines = ["Pandas", "DuckDB"]
-     times = [result["pandas_mean_s"], result["duckdb_mean_s"]]
-
-     ax.bar(engines, times)
-     ax.set_ylabel("Time (seconds)")
      ax.set_title(result["operation"])
-
      buf = io.BytesIO()
      plt.tight_layout()
      plt.savefig(buf, format="png")
      buf.seek(0)
      plt.close(fig)
-
      return Image.open(buf)

-
- # ----------------------------------------------------------
- # Markdown result
- # ----------------------------------------------------------
-
- def format_result(result):
-     speed = result["speedup"]
-     verdict = (
-         f"🚀 **DuckDB is ~{speed:.2f}× faster**"
-         if speed > 1
-         else f"🐼 **Pandas is ~{1/speed:.2f}× faster**"
-     )
-
-     md = f"""
- ### 🔬 Benchmark Result — {result['operation']}
-
- | Engine | Mean (s) | Std (s) |
- |--------|----------|---------|
- | Pandas | `{result['pandas_mean_s']:.6f}` | `{result['pandas_std_s']:.6f}` |
- | DuckDB | `{result['duckdb_mean_s']:.6f}` | `{result['duckdb_std_s']:.6f}` |
-
- **Verdict:** {verdict}
-
- <details><summary>Raw timings</summary>
-
- - Pandas: `{[round(x,6) for x in result['raw_pandas_runs']]}`
- - DuckDB: `{[round(x,6) for x in result['raw_duckdb_runs']]}`
- </details>
- """
      return md

-
- # ----------------------------------------------------------
- # Helper to load custom dataset
- # ----------------------------------------------------------
-
- def load_custom_dataset(file):
-     if file.name.endswith(".csv"):
-         return pd.read_csv(file.name)
-     if file.name.endswith(".parquet"):
-         return pd.read_parquet(file.name)
-     if file.name.endswith(".arrow"):
-         return pd.read_feather(file.name)
-     raise ValueError("Unsupported file format")
-
-
- # ----------------------------------------------------------
- # Gradio App
- # ----------------------------------------------------------
-
  theme = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")

- with gr.Blocks(title="DuckDB vs Pandas Benchmark", theme=theme) as demo:
-
-     gr.Markdown("# 🐼 vs 🦆 DuckDB vs Pandas — Performance Playground")

      with gr.Tabs():

-         # ==================================================
-         # 🔥 Synthetic Mode
-         # ==================================================
-         with gr.Tab("🔥 Synthetic Dataset Benchmarks"):
-
-             dataset_size = gr.Radio(["100k", "500k", "2M"], value="100k", label="Dataset Size")
-
-             operation_synth = gr.Radio(
-                 ["Filter", "Groupby", "Join", "Write Parquet"],
-                 label="Operation",
-                 value="Filter"
-             )
-
-             repeats_synth = gr.Slider(1, 7, value=3, label="Repeats")
-
-             btn_synth = gr.Button("🚀 Run Benchmark")
-
-             out_md_synth = gr.Markdown()
-             out_chart_synth = gr.Image()
-
-             def synthetic_runner(size, operation, repeats):
-                 repeats = int(repeats)
                  n = {"100k": 100_000, "500k": 500_000, "2M": 2_000_000}[size]
-
                  df = generate_data(n)
-                 result = run_benchmark(operation, df, repeats)
-                 chart = generate_chart(result)
-
-                 return format_result(result), chart
-
-             btn_synth.click(
-                 synthetic_runner,
-                 [dataset_size, operation_synth, repeats_synth],
-                 [out_md_synth, out_chart_synth],
-             )
-
-
-         # ==================================================
-         # 📁 Custom Dataset Mode
-         # ==================================================
-         with gr.Tab("📁 Custom Dataset Upload"):
-
-             file_in = gr.File(label="Upload CSV / Parquet / Arrow")
-
-             operation_custom = gr.Radio(
-                 ["Filter", "Groupby", "Join", "Write Parquet"],
-                 label="Operation",
-                 value="Filter"
-             )
-
-             repeats_custom = gr.Slider(1, 7, value=3, label="Repeats")
-
-             btn_custom = gr.Button("Run on Uploaded Dataset")
-
-             out_md_custom = gr.Markdown()
-             out_chart_custom = gr.Image()
-
-             def custom_runner(file, operation, repeats):
-                 repeats = int(repeats)
-                 df = load_custom_dataset(file)
-                 result = run_benchmark(operation, df, repeats)
-                 return format_result(result), generate_chart(result)
-
-             btn_custom.click(
-                 custom_runner,
-                 [file_in, operation_custom, repeats_custom],
-                 [out_md_custom, out_chart_custom],
-             )

  if __name__ == "__main__":
-     demo.launch()
 
+ # app.py
  import time
+ import io
+ import os
+ import traceback
+
  import numpy as np
  import pandas as pd
+ import duckdb  # kept for parity if needed
  import gradio as gr
  import matplotlib.pyplot as plt
  from PIL import Image

+ # optional libs: polars, fireducks
+ try:
+     import polars as pl
+     HAS_POLARS = True
+ except Exception:
+     pl = None
+     HAS_POLARS = False
+
+ try:
+     import fireducks.pandas as fd  # FireDucks' pandas-compatible namespace
+     HAS_FIREDUCKS = True
+ except Exception:
+     fd = None
+     HAS_FIREDUCKS = False
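+ # With fireducks.pandas bound to fd, the calls used below (fd.DataFrame,
+ # fd.read_csv, fd.read_parquet) follow the pandas API surface FireDucks mirrors;
+ # the defensive try/except probes further down cover older/other layouts.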
+
+ # -------------------------
+ # Basic utils / data gen
+ # -------------------------
  def generate_data(n_rows: int, n_groups: int = 50) -> pd.DataFrame:
      rng = np.random.default_rng(42)
      ids = np.arange(n_rows)
  ⋮ (lines 35–42 unchanged)
          {"id": ids, "category": categories, "value1": value1, "value2": value2, "date": dates}
      )

  def time_function(fn, repeats=3):
+     repeats = int(max(1, repeats))
      times = []
      for _ in range(repeats):
          start = time.perf_counter()
          fn()
          end = time.perf_counter()
          times.append(end - start)
+     return float(np.mean(times)), float(np.std(times)), [float(t) for t in times]
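+ # Note: the first timed run can include one-time warm-up costs (lazy imports,
+ # OS file caches), so means over a few repeats may overstate steady-state time;
+ # discarding the first run is a common refinement.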
+
+ # -------------------------
+ # Helpers to ensure materialization
+ # -------------------------
+ def materialize_fireducks(maybe_fd_obj):
+     """
+     FireDucks operations are often lazy, so convert results to pandas to make
+     sure we measure real execution. We attempt multiple ways:
+     - if the result is already pandas, return it as-is
+     - if the result has a .to_pandas() method, call it
+     - otherwise hand the object back unchanged
+     """
+     if not HAS_FIREDUCKS:
+         return maybe_fd_obj
+     try:
+         # If it's already pandas
+         if isinstance(maybe_fd_obj, pd.DataFrame):
+             return maybe_fd_obj
+         # common conversion method
+         if hasattr(maybe_fd_obj, "to_pandas"):
+             return maybe_fd_obj.to_pandas()
+         # no known conversion hook; return the object unchanged
+         return maybe_fd_obj
+     except Exception:
+         return maybe_fd_obj
+
+
+ def ensure_polars_from_pandas(df: pd.DataFrame):
+     """Return a Polars DataFrame constructed from pandas (if polars is available)."""
+     if not HAS_POLARS:
+         raise RuntimeError("Polars not installed")
+     # convert pandas -> polars
+     return pl.from_pandas(df)
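+ # The pandas -> Polars conversion is done once by each bench function, outside
+ # the timed closures, so it does not count against Polars in the results.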
+
+ def ensure_fireducks_from_pandas(df: pd.DataFrame):
+     """Return a FireDucks DataFrame constructed from pandas (if fireducks is available).
+     Try a few constructor variants for compatibility across FireDucks versions.
+     """
+     if not HAS_FIREDUCKS:
+         raise RuntimeError("FireDucks not installed")
+     # try common constructor patterns
+     try:
+         # direct constructor, mirroring pd.DataFrame(df)
+         return fd.DataFrame(df)
+     except Exception:
+         pass
+     try:
+         # from_pandas helper, if this version exposes one
+         if hasattr(fd, "from_pandas"):
+             return fd.from_pandas(df)
+     except Exception:
+         pass
+     try:
+         # some docs show Frame.from_pandas or Frame.from_csv
+         if hasattr(fd, "Frame") and hasattr(fd.Frame, "from_pandas"):
+             return fd.Frame.from_pandas(df)
+     except Exception:
+         pass
+     # nothing above worked; give up with a clear error
+     raise RuntimeError("Could not create FireDucks DataFrame with available API")
+
+ # -------------------------
+ # Benchmark operations
+ # Each bench function returns a result dict via build_result()
+ # -------------------------
+ def build_result(op_name, pandas_stats, polars_stats, fireducks_stats):
+     # Each stats tuple = (mean, std, runs_list), or None if unavailable
+     p_mean, p_std, p_runs = pandas_stats if pandas_stats else (None, None, None)
+     pl_mean, pl_std, pl_runs = polars_stats if polars_stats else (None, None, None)
+     fd_mean, fd_std, fd_runs = fireducks_stats if fireducks_stats else (None, None, None)
+
+     # compute basic speedups relative to pandas (if possible)
+     speed_pl = (p_mean / pl_mean) if (p_mean and pl_mean and pl_mean > 0) else None
+     speed_fd = (p_mean / fd_mean) if (p_mean and fd_mean and fd_mean > 0) else None

+     return {
+         "operation": op_name,
+         "pandas_mean_s": p_mean,
+         "pandas_std_s": p_std,
+         "pandas_runs": p_runs,
+         "polars_mean_s": pl_mean,
+         "polars_std_s": pl_std,
+         "polars_runs": pl_runs,
+         "fireducks_mean_s": fd_mean,
+         "fireducks_std_s": fd_std,
+         "fireducks_runs": fd_runs,
+         "speedup_polars_over_pandas": speed_pl,
+         "speedup_fireducks_over_pandas": speed_fd,
+     }
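+ # Speedup semantics: a value above 1.0 means that engine's mean time beat
+ # pandas on this operation; below 1.0 means pandas was faster.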

  # ---- Filter ----
+ def bench_filter(df: pd.DataFrame, repeats=3):
+     # pandas op
+     def p_op():
          _ = df[(df["value1"] > 0.5) & (df["category"] == df["category"].iloc[0])]

+     p_stats = time_function(p_op, repeats)
+
+     # polars op
+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = ensure_polars_from_pandas(df)
+         def pl_op():
+             # polars uses expression style
+             _ = pl_df.filter((pl.col("value1") > 0.5) & (pl.col("category") == pl_df["category"][0])).to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     # fireducks op
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = ensure_fireducks_from_pandas(df)
+         def fd_op():
+             res = fd_df[(fd_df["value1"] > 0.5) & (fd_df["category"] == fd_df["category"].iloc[0])]
+             # materialize
+             _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     return build_result("Filter", p_stats, pl_stats, fd_stats)
+
+ # ---- GroupBy Mean ----
+ def bench_groupby(df: pd.DataFrame, repeats=3):
+     def p_op():
          _ = df.groupby("category")[["value1", "value2"]].mean()

+     p_stats = time_function(p_op, repeats)

+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = ensure_polars_from_pandas(df)
+         def pl_op():
+             # Polars renamed groupby -> group_by in newer releases
+             _ = pl_df.group_by("category").agg([pl.col("value1").mean(), pl.col("value2").mean()]).to_pandas()
+         pl_stats = time_function(pl_op, repeats)

+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = ensure_fireducks_from_pandas(df)
+         def fd_op():
+             res = fd_df.groupby("category")[["value1", "value2"]].mean()
+             _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)

+     return build_result("Groupby mean", p_stats, pl_stats, fd_stats)

  # ---- Join ----
+ def bench_join(df: pd.DataFrame, repeats=3):
      categories = df["category"].unique()
      rng = np.random.default_rng(123)
+     dim_df = pd.DataFrame({"category": categories, "weight": rng.uniform(0.5, 2.0, len(categories))})

+     def p_op():
          _ = df.merge(dim_df, on="category", how="left")

+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = ensure_polars_from_pandas(df)
+         pl_dim = pl.from_pandas(dim_df)
+         def pl_op():
+             _ = pl_df.join(pl_dim, on="category", how="left").to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = ensure_fireducks_from_pandas(df)
+         fd_dim = ensure_fireducks_from_pandas(dim_df)
+         def fd_op():
+             res = fd_df.merge(fd_dim, on="category", how="left")
+             _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     return build_result("Join on category", p_stats, pl_stats, fd_stats)
+
+ # ---- Fillna ----
+ def bench_fillna(df: pd.DataFrame, repeats=3):
+     def p_op():
+         _ = df.fillna(0)
+
+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = ensure_polars_from_pandas(df)
+         def pl_op():
+             _ = pl_df.fill_null(0).to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = ensure_fireducks_from_pandas(df)
+         def fd_op():
+             res = fd_df.fillna(0)
+             _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     return build_result("Fill NA / fillna", p_stats, pl_stats, fd_stats)
+
+ # ---- Dropna ----
+ def bench_dropna(df: pd.DataFrame, repeats=3):
+     def p_op():
+         _ = df.dropna()
+
+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = ensure_polars_from_pandas(df)
+         def pl_op():
+             _ = pl_df.drop_nulls().to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = ensure_fireducks_from_pandas(df)
+         def fd_op():
+             res = fd_df.dropna()
+             _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     return build_result("Drop NA / dropna", p_stats, pl_stats, fd_stats)
+
+ # ---- Sort ----
+ def bench_sort(df: pd.DataFrame, repeats=3):
+     def p_op():
+         _ = df.sort_values("value1")
+
+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = ensure_polars_from_pandas(df)
+         def pl_op():
+             _ = pl_df.sort("value1").to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = ensure_fireducks_from_pandas(df)
+         def fd_op():
+             res = fd_df.sort_values("value1")
+             _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     return build_result("Sort by value1", p_stats, pl_stats, fd_stats)
+
+ # ---- Describe ----
+ def bench_describe(df: pd.DataFrame, repeats=3):
+     def p_op():
+         _ = df.describe()
+
+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = ensure_polars_from_pandas(df)
+         def pl_op():
+             _ = pl_df.describe().to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = ensure_fireducks_from_pandas(df)
+         def fd_op():
+             res = fd_df.describe()
+             _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     return build_result("Describe()", p_stats, pl_stats, fd_stats)
+
+ # ---- Read CSV / Parquet / Write Parquet: these write temp files and measure reads/writes ----
+ def bench_read_csv(df: pd.DataFrame, repeats=3):
+     path = "temp_bench.csv"
+     df.to_csv(path, index=False)
+     def p_op():
+         _ = pd.read_csv(path)
+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         def pl_op():
+             _ = pl.read_csv(path).to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         def fd_op():
+             # FireDucks read
+             try:
+                 res = fd.read_csv(path)
+                 _ = materialize_fireducks(res)
+             except Exception:
+                 # fallback: create from pandas
+                 res = fd.DataFrame(pd.read_csv(path))
+                 _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     # clean up the temp file
+     try:
+         os.remove(path)
+     except Exception:
+         pass
+
+     return build_result("Read CSV", p_stats, pl_stats, fd_stats)
+
+ def bench_read_parquet(df: pd.DataFrame, repeats=3):
+     path = "temp_bench.parquet"
+     df.to_parquet(path, index=False)
+     def p_op():
+         _ = pd.read_parquet(path)
+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         def pl_op():
+             _ = pl.read_parquet(path).to_pandas()
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         def fd_op():
+             try:
+                 res = fd.read_parquet(path)
+                 _ = materialize_fireducks(res)
+             except Exception:
+                 res = fd.DataFrame(pd.read_parquet(path))
+                 _ = materialize_fireducks(res)
+         fd_stats = time_function(fd_op, repeats)
+
+     try:
+         os.remove(path)
+     except Exception:
+         pass
+
+     return build_result("Read Parquet", p_stats, pl_stats, fd_stats)
+
+ def bench_write_parquet(df: pd.DataFrame, repeats=3):
+     def p_op():
          df.to_parquet("temp_pd.parquet")
+     p_stats = time_function(p_op, repeats)
+
+     pl_stats = None
+     if HAS_POLARS:
+         pl_df = pl.from_pandas(df)
+         def pl_op():
+             pl_df.write_parquet("temp_pl.parquet")
+         pl_stats = time_function(pl_op, repeats)
+
+     fd_stats = None
+     if HAS_FIREDUCKS:
+         fd_df = None
+         try:
+             fd_df = ensure_fireducks_from_pandas(df)
+         except Exception:
+             fd_df = None
+         if fd_df is not None:
+             def fd_op():
+                 try:
+                     # FireDucks may expose to_parquet or write_parquet
+                     if hasattr(fd_df, "to_parquet"):
+                         fd_df.to_parquet("temp_fd.parquet")
+                     else:
+                         # materialize to pandas and write
+                         materialize_fireducks(fd_df).to_parquet("temp_fd.parquet")
+                 except Exception:
+                     materialize_fireducks(fd_df).to_parquet("temp_fd.parquet")
+             fd_stats = time_function(fd_op, repeats)
+
+     # cleanup
+     for p in ["temp_pd.parquet", "temp_pl.parquet", "temp_fd.parquet"]:
+         try:
+             os.remove(p)
+         except Exception:
+             pass
+
+     return build_result("Write Parquet", p_stats, pl_stats, fd_stats)
+
+ # -------------------------
+ # UI helpers: chart and md formatting
+ # -------------------------
+ def generate_chart_three(result):
+     fig, ax = plt.subplots(figsize=(5, 3))
+     labels = []
+     values = []
+     if result["pandas_mean_s"] is not None:
+         labels.append("Pandas")
+         values.append(result["pandas_mean_s"])
+     if result["polars_mean_s"] is not None:
+         labels.append("Polars")
+         values.append(result["polars_mean_s"])
+     if result["fireducks_mean_s"] is not None:
+         labels.append("FireDucks")
+         values.append(result["fireducks_mean_s"])
+     ax.bar(labels, values)
+     ax.set_ylabel("Time (s)")
      ax.set_title(result["operation"])
      buf = io.BytesIO()
      plt.tight_layout()
      plt.savefig(buf, format="png")
      buf.seek(0)
      plt.close(fig)
      return Image.open(buf)

+ def format_result_md(result):
+     md = f"### 🔬 {result['operation']}\n\n"
+     md += "| Engine | Mean (s) | Std (s) |\n|---|---:|---:|\n"
+     md += f"| Pandas | `{result['pandas_mean_s']}` | `{result['pandas_std_s']}` |\n"
+     md += f"| Polars | `{result['polars_mean_s']}` | `{result['polars_std_s']}` |\n"
+     md += f"| FireDucks | `{result['fireducks_mean_s']}` | `{result['fireducks_std_s']}` |\n\n"
+     if result["speedup_polars_over_pandas"]:
+         md += f"- Polars speedup over Pandas: **{result['speedup_polars_over_pandas']:.2f}x**\n"
+     if result["speedup_fireducks_over_pandas"]:
+         md += f"- FireDucks speedup over Pandas: **{result['speedup_fireducks_over_pandas']:.2f}x**\n"
+     md += "\n<details><summary>Raw runs</summary>\n\n"
+     md += f"- Pandas runs: `{result['pandas_runs']}`\n"
+     md += f"- Polars runs: `{result['polars_runs']}`\n"
+     md += f"- FireDucks runs: `{result['fireducks_runs']}`\n"
+     md += "\n</details>\n"
      return md

+ # -------------------------
+ # main dispatcher
+ # -------------------------
+ OPERATION_MAP = {
+     "Filter": bench_filter,
+     "Groupby": bench_groupby,
+     "Join": bench_join,
+     "Fillna": bench_fillna,
+     "Dropna": bench_dropna,
+     "Sort": bench_sort,
+     "Describe": bench_describe,
+     "Read CSV": bench_read_csv,
+     "Read Parquet": bench_read_parquet,
+     "Write Parquet": bench_write_parquet,
+ }
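+ # Usage sketch: OPERATION_MAP["Sort"](generate_data(100_000), repeats=3)
+ # returns the result dict consumed by generate_chart_three / format_result_md.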
+
+ def run_benchmark_dispatch(operation, df, repeats):
+     if operation not in OPERATION_MAP:
+         raise ValueError("Unsupported operation")
+     fn = OPERATION_MAP[operation]
+     return fn(df, repeats)
+
+ # -------------------------
+ # Gradio UI
+ # -------------------------
  theme = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")

+ with gr.Blocks(title="Pandas vs Polars vs FireDucks Benchmark", theme=theme) as demo:
+     gr.Markdown("# 🐼 vs 🔥 vs ⚡ Pandas vs Polars vs FireDucks — Benchmark playground")

      with gr.Tabs():
+         with gr.Tab("Synthetic dataset"):
+             dataset_size = gr.Radio(["100k", "500k", "2M"], value="100k", label="Dataset size")
+             operation = gr.Dropdown(list(OPERATION_MAP.keys()), value="Filter", label="Operation")
+             repeats = gr.Slider(1, 7, value=3, label="Repeats")
+             run_btn = gr.Button("Run benchmark")
+
+             md_out = gr.Markdown()
+             chart_out = gr.Image()
+
+             def run_synth(size, op, reps):
+                 # check libs
+                 if not HAS_POLARS or not HAS_FIREDUCKS:
+                     missing = []
+                     if not HAS_POLARS: missing.append("polars")
+                     if not HAS_FIREDUCKS: missing.append("fireducks")
+                     return f"⚠ Missing libraries: {', '.join(missing)}. Install them in requirements.txt.", None
                  n = {"100k": 100_000, "500k": 500_000, "2M": 2_000_000}[size]
                  df = generate_data(n)
+                 result = run_benchmark_dispatch(op, df, int(reps))
+                 chart = generate_chart_three(result)
+                 md = format_result_md(result)
+                 return md, chart
+
+             run_btn.click(run_synth, [dataset_size, operation, repeats], [md_out, chart_out])
+
+         with gr.Tab("Custom dataset"):
+             file_in = gr.File(label="Upload CSV / Parquet / Feather", file_types=['.csv', '.parquet', '.feather', '.arrow'])
+             operation_c = gr.Dropdown(list(OPERATION_MAP.keys()), value="Filter", label="Operation")
+             repeats_c = gr.Slider(1, 7, value=3, label="Repeats")
+             run_btn_c = gr.Button("Run on uploaded dataset")
+             md_out_c = gr.Markdown()
+             chart_out_c = gr.Image()
+
+             def run_custom(file, op, reps):
+                 if file is None:
+                     return "Upload a dataset file first.", None
+                 # quick load by file extension
+                 fname = file.name
+                 if fname.endswith(".csv"):
+                     df = pd.read_csv(fname)
+                 elif fname.endswith(".parquet"):
+                     df = pd.read_parquet(fname)
+                 elif fname.endswith(".feather") or fname.endswith(".arrow"):
+                     df = pd.read_feather(fname)
+                 else:
+                     return "Unsupported file format", None
+
+                 result = run_benchmark_dispatch(op, df, int(reps))
+                 chart = generate_chart_three(result)
+                 md = format_result_md(result)
+                 return md, chart
+
+             run_btn_c.click(run_custom, [file_in, operation_c, repeats_c], [md_out_c, chart_out_c])
+
+     gr.Markdown("**Note:** This demo requires `polars` and `fireducks` installed in the environment. On HF Spaces add them to `requirements.txt`.")
+     gr.Markdown("Recommended `requirements.txt` (one per line): `pandas`, `polars`, `fireducks`, `gradio`, `matplotlib`, `pillow`, `duckdb`.")

  if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))