PraneshJs committed on
Commit
eb0b572
·
verified ·
1 Parent(s): 47947ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +224 -499
app.py CHANGED
@@ -1,61 +1,37 @@
1
  import time
2
- import io
3
- import traceback
4
- from typing import Dict, Callable, Any, Tuple
5
-
6
  import numpy as np
7
  import pandas as pd
8
  import duckdb
9
  import gradio as gr
10
  import matplotlib.pyplot as plt
11
  from PIL import Image
12
-
13
- # ----------------- 1. Global Setup -----------------
14
 
15
  duckdb_con = duckdb.connect(database=":memory:")
16
 
17
- # ----------------- 2. Data Generation & Loading -----------------
 
 
18
 
19
- def generate_data(n_rows, n_groups=50):
20
  rng = np.random.default_rng(42)
21
- ids = np.arange(n_rows, dtype=np.int64)
22
-
23
  categories = rng.integers(0, n_groups, size=n_rows)
24
- categories = np.array(["cat_" + str(c) for c in categories], dtype=object)
25
-
26
  value1 = rng.normal(0, 1, size=n_rows)
27
  value2 = rng.normal(10, 5, size=n_rows)
28
-
29
- null_mask = rng.random(n_rows) < 0.05
30
- value1[null_mask] = np.nan
31
-
32
  start_date = np.datetime64("2020-01-01")
33
  dates = start_date + rng.integers(0, 365, size=n_rows).astype("timedelta64[D]")
34
 
35
- df = pd.DataFrame(
36
- {
37
- "id": ids,
38
- "category": categories,
39
- "value1": value1,
40
- "value2": value2,
41
- "date": dates,
42
- }
43
  )
44
- return df
45
 
46
- def load_custom_dataset(file) -> pd.DataFrame:
47
- if file is None:
48
- raise ValueError("No file uploaded.")
49
- name = file.name.lower()
50
- if name.endswith(".csv"):
51
- return pd.read_csv(file.name)
52
- if name.endswith(".parquet"):
53
- return pd.read_parquet(file.name)
54
- if name.endswith(".arrow") or name.endswith(".feather"):
55
- return pd.read_feather(file.name)
56
- raise ValueError("Unsupported file format. Use CSV, Parquet, or Arrow/Feather.")
57
 
58
- # ----------------- 3. Timing Utils -----------------
 
 
59
 
60
  def time_function(fn, repeats=3):
61
  repeats = int(repeats)
@@ -65,106 +41,57 @@ def time_function(fn, repeats=3):
65
  fn()
66
  end = time.perf_counter()
67
  times.append(end - start)
68
- return float(np.mean(times)), float(np.std(times)), [float(t) for t in times]
69
 
70
- def build_result(pm, ps, pr, dm, ds, dr):
71
- if dm > 1e-9:
72
- speedup = pm / dm
73
- else:
74
- speedup = 0.0
75
- return {
76
- "pandas_mean_s": pm,
77
- "pandas_std_s": ps,
78
- "duckdb_mean_s": dm,
79
- "duckdb_std_s": ds,
80
- "speedup": speedup,
81
- "raw_pandas_runs": pr,
82
- "raw_duckdb_runs": dr,
83
- }
84
 
85
- # ----------------- 4. Benchmarks -----------------
 
 
86
 
87
- def bench_filter_simple(df, repeats=3):
 
88
  def pandas_op():
89
- _ = df[(df["value1"] > 0.5) & (df["category"] == "cat_1")]
90
 
91
  def duckdb_op():
92
  duckdb_con.register("df", df)
93
- duckdb_con.execute(
94
- "SELECT * FROM df WHERE value1 > 0.5 AND category = 'cat_1';"
95
- ).fetchdf()
 
 
 
96
 
97
- pm, ps, pr = time_function(pandas_op, repeats)
98
- dm, ds, dr = time_function(duckdb_op, repeats)
99
- return build_result(pm, ps, pr, dm, ds, dr)
100
 
101
- def bench_filter_complex(df, repeats=3):
102
- def pandas_op():
103
- _ = df[
104
- (df["value1"] > 0)
105
- & (df["value2"] < 12)
106
- & (df["date"].between("2020-03-01", "2020-09-30"))
107
- ]
108
 
109
- def duckdb_op():
110
- duckdb_con.register("df", df)
111
- duckdb_con.execute(
112
- "SELECT * FROM df "
113
- "WHERE value1 > 0 "
114
- "AND value2 < 12 "
115
- "AND date BETWEEN DATE '2020-03-01' AND DATE '2020-09-30';"
116
- ).fetchdf()
117
-
118
- pm, ps, pr = time_function(pandas_op, repeats)
119
- dm, ds, dr = time_function(duckdb_op, repeats)
120
- return build_result(pm, ps, pr, dm, ds, dr)
121
-
122
- def bench_groupby_basic(df, repeats=3):
123
- def pandas_op():
124
- _ = df.groupby("category").agg(
125
- mean_value1=("value1", "mean"),
126
- sum_value2=("value2", "sum"),
127
- cnt=("id", "count"),
128
- )
129
 
130
- def duckdb_op():
131
- duckdb_con.register("df", df)
132
- duckdb_con.execute(
133
- "SELECT category, "
134
- "AVG(value1) AS mean_value1, "
135
- "SUM(value2) AS sum_value2, "
136
- "COUNT(*) AS cnt "
137
- "FROM df GROUP BY category;"
138
- ).fetchdf()
139
-
140
- pm, ps, pr = time_function(pandas_op, repeats)
141
- dm, ds, dr = time_function(duckdb_op, repeats)
142
- return build_result(pm, ps, pr, dm, ds, dr)
143
-
144
- def bench_groupby_having(df, repeats=3):
145
  def pandas_op():
146
- agg = df.groupby("category").agg(mean_value1=("value1", "mean"))
147
- _ = agg[agg["mean_value1"] > 0]
148
 
149
  def duckdb_op():
150
  duckdb_con.register("df", df)
151
- duckdb_con.execute(
152
- "SELECT category, AVG(value1) AS mean_value1 "
153
- "FROM df GROUP BY category HAVING AVG(value1) > 0;"
154
- ).fetchdf()
 
 
 
 
 
155
 
156
- pm, ps, pr = time_function(pandas_op, repeats)
157
- dm, ds, dr = time_function(duckdb_op, repeats)
158
- return build_result(pm, ps, pr, dm, ds, dr)
159
 
 
160
  def bench_join(df, repeats=3):
161
  categories = df["category"].unique()
162
  rng = np.random.default_rng(123)
163
  dim_df = pd.DataFrame(
164
- {
165
- "category": categories,
166
- "weight": rng.uniform(0.5, 2.0, size=len(categories)),
167
- }
168
  )
169
 
170
  def pandas_op():
@@ -173,445 +100,243 @@ def bench_join(df, repeats=3):
173
  def duckdb_op():
174
  duckdb_con.register("df", df)
175
  duckdb_con.register("dim_df", dim_df)
176
- duckdb_con.execute(
177
- "SELECT d.*, dim.weight "
178
- "FROM df d LEFT JOIN dim_df dim "
179
- "ON d.category = dim.category;"
180
- ).fetchdf()
 
181
 
182
- pm, ps, pr = time_function(pandas_op, repeats)
183
- dm, ds, dr = time_function(duckdb_op, repeats)
184
- return build_result(pm, ps, pr, dm, ds, dr)
185
 
186
- def bench_order_by(df, repeats=3):
 
 
 
 
187
  def pandas_op():
188
- _ = df.sort_values(["value1", "date"], ascending=[False, True])
189
 
190
  def duckdb_op():
191
- duckdb_con.register("df", df)
192
- duckdb_con.execute(
193
- "SELECT * FROM df ORDER BY value1 DESC, date ASC;"
194
- ).fetchdf()
195
 
196
- pm, ps, pr = time_function(pandas_op, repeats)
197
- dm, ds, dr = time_function(duckdb_op, repeats)
198
- return build_result(pm, ps, pr, dm, ds, dr)
199
 
200
- def bench_window_row_number(df, repeats=3):
 
 
 
 
201
  def pandas_op():
202
- temp = df.sort_values(["category", "value1"], ascending=[True, False]).copy()
203
- temp["rn"] = temp.groupby("category").cumcount() + 1
204
- _ = temp
205
 
206
  def duckdb_op():
207
- duckdb_con.register("df", df)
208
- duckdb_con.execute(
209
- "SELECT *, "
210
- "ROW_NUMBER() OVER (PARTITION BY category ORDER BY value1 DESC) AS rn "
211
- "FROM df;"
212
- ).fetchdf()
213
 
214
- pm, ps, pr = time_function(pandas_op, repeats)
215
- dm, ds, dr = time_function(duckdb_op, repeats)
216
- return build_result(pm, ps, pr, dm, ds, dr)
217
 
218
- def bench_window_running_total(df, repeats=3):
219
- def pandas_op():
220
- temp = df.sort_values("date").copy()
221
- temp["running_sum"] = temp["value1"].fillna(0).cumsum()
222
- _ = temp
223
 
224
- def duckdb_op():
225
- duckdb_con.register("df", df)
226
- duckdb_con.execute(
227
- "SELECT *, "
228
- "SUM(COALESCE(value1, 0)) OVER (ORDER BY date "
229
- "ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_sum "
230
- "FROM df;"
231
- ).fetchdf()
232
-
233
- pm, ps, pr = time_function(pandas_op, repeats)
234
- dm, ds, dr = time_function(duckdb_op, repeats)
235
- return build_result(pm, ps, pr, dm, ds, dr)
236
-
237
- def bench_drop_nulls(df, repeats=3):
238
  def pandas_op():
239
- _ = df[df["value1"].notna()]
240
 
241
  def duckdb_op():
242
  duckdb_con.register("df", df)
243
- duckdb_con.execute(
244
- "SELECT * FROM df WHERE value1 IS NOT NULL;"
245
- ).fetchdf()
246
 
247
- pm, ps, pr = time_function(pandas_op, repeats)
248
- dm, ds, dr = time_function(duckdb_op, repeats)
249
- return build_result(pm, ps, pr, dm, ds, dr)
250
 
251
- def bench_fill_nulls(df, repeats=3):
252
- def pandas_op():
253
- _ = df["value1"].fillna(0)
254
 
255
- def duckdb_op():
256
- duckdb_con.register("df", df)
257
- duckdb_con.execute(
258
- "SELECT COALESCE(value1, 0) AS value1_filled FROM df;"
259
- ).fetchdf()
260
 
261
- pm, ps, pr = time_function(pandas_op, repeats)
262
- dm, ds, dr = time_function(duckdb_op, repeats)
263
- return build_result(pm, ps, pr, dm, ds, dr)
264
 
265
- def bench_distinct_count(df, repeats=3):
266
- def pandas_op():
267
- _ = df["category"].nunique()
268
 
269
- def duckdb_op():
270
- duckdb_con.register("df", df)
271
- duckdb_con.execute(
272
- "SELECT COUNT(DISTINCT category) AS distinct_categories FROM df;"
273
- ).fetchdf()
 
 
 
 
 
274
 
275
- pm, ps, pr = time_function(pandas_op, repeats)
276
- dm, ds, dr = time_function(duckdb_op, repeats)
277
- return build_result(pm, ps, pr, dm, ds, dr)
278
 
279
- def bench_materialize_parquet(df, repeats=3):
280
- def pandas_op():
281
- agg = df.groupby("category").agg(
282
- mean_value1=("value1", "mean"),
283
- sum_value2=("value2", "sum"),
284
- )
285
- agg.to_parquet("pandas_grouped.parquet")
286
 
287
- def duckdb_op():
288
- duckdb_con.register("df", df)
289
- duckdb_con.execute(
290
- "CREATE OR REPLACE TEMP TABLE agg AS "
291
- "SELECT category, AVG(value1) AS mean_value1, "
292
- "SUM(value2) AS sum_value2 FROM df GROUP BY category;"
293
- )
294
- duckdb_con.execute(
295
- "COPY agg TO 'duck_grouped.parquet' (FORMAT PARQUET);"
296
- )
297
-
298
- pm, ps, pr = time_function(pandas_op, repeats)
299
- dm, ds, dr = time_function(duckdb_op, repeats)
300
- return build_result(pm, ps, pr, dm, ds, dr)
301
-
302
- # ----------------- 5. Operation Registry -----------------
303
-
304
- OPERATIONS = {
305
- "Filter (simple WHERE)": {
306
- "sql": "SELECT * FROM df WHERE value1 > 0.5 AND category = 'cat_1';",
307
- "pandas": 'df[(df["value1"] > 0.5) & (df["category"] == "cat_1")]',
308
- "bench": bench_filter_simple,
309
- },
310
- "Filter (complex WHERE + date range)": {
311
- "sql": (
312
- "SELECT * FROM df\n"
313
- "WHERE value1 > 0\n"
314
- " AND value2 < 12\n"
315
- " AND date BETWEEN DATE '2020-03-01' AND DATE '2020-09-30';"
316
- ),
317
- "pandas": (
318
- 'df[(df["value1"] > 0)\n'
319
- ' & (df["value2"] < 12)\n'
320
- ' & (df["date"].between("2020-03-01", "2020-09-30"))]'
321
- ),
322
- "bench": bench_filter_complex,
323
- },
324
- "Groupby (multi-agg)": {
325
- "sql": (
326
- "SELECT category,\n"
327
- " AVG(value1) AS mean_value1,\n"
328
- " SUM(value2) AS sum_value2,\n"
329
- " COUNT(*) AS cnt\n"
330
- "FROM df\n"
331
- "GROUP BY category;"
332
- ),
333
- "pandas": (
334
- 'df.groupby("category").agg(\n'
335
- ' mean_value1=("value1", "mean"),\n'
336
- ' sum_value2=("value2", "sum"),\n'
337
- ' cnt=("id", "count"),\n'
338
- ")"
339
- ),
340
- "bench": bench_groupby_basic,
341
- },
342
- "Groupby + HAVING": {
343
- "sql": (
344
- "SELECT category,\n"
345
- " AVG(value1) AS mean_value1\n"
346
- "FROM df\n"
347
- "GROUP BY category\n"
348
- "HAVING AVG(value1) > 0;"
349
- ),
350
- "pandas": (
351
- 'agg = df.groupby("category").agg(mean_value1=("value1", "mean"))\n'
352
- 'agg[agg["mean_value1"] > 0]'
353
- ),
354
- "bench": bench_groupby_having,
355
- },
356
- "Join (fact ⨝ dim on category)": {
357
- "sql": (
358
- "WITH dim AS (\n"
359
- " SELECT category, AVG(value1) AS weight\n"
360
- " FROM df\n"
361
- " GROUP BY category\n"
362
- ")\n"
363
- "SELECT d.*, dim.weight\n"
364
- "FROM df d\n"
365
- "LEFT JOIN dim ON d.category = dim.category;"
366
- ),
367
- "pandas": (
368
- "dim = df.groupby('category', as_index=False)['value1']"
369
- ".mean().rename(columns={'value1':'weight'})\n"
370
- "df.merge(dim, on='category', how='left')"
371
- ),
372
- "bench": bench_join,
373
- },
374
- "Order By (value1 DESC, date ASC)": {
375
- "sql": "SELECT * FROM df ORDER BY value1 DESC, date ASC;",
376
- "pandas": 'df.sort_values(["value1", "date"], ascending=[False, True])',
377
- "bench": bench_order_by,
378
- },
379
- "Window: ROW_NUMBER() PARTITION BY category": {
380
- "sql": (
381
- "SELECT *,\n"
382
- " ROW_NUMBER() OVER (\n"
383
- " PARTITION BY category\n"
384
- " ORDER BY value1 DESC\n"
385
- " ) AS rn\n"
386
- "FROM df;"
387
- ),
388
- "pandas": (
389
- 'temp = df.sort_values(["category", "value1"], ascending=[True, False])\n'
390
- 'temp["rn"] = temp.groupby("category").cumcount() + 1'
391
- ),
392
- "bench": bench_window_row_number,
393
- },
394
- "Window: Running SUM(value1) OVER (ORDER BY date)": {
395
- "sql": (
396
- "SELECT *,\n"
397
- " SUM(COALESCE(value1, 0)) OVER (\n"
398
- " ORDER BY date\n"
399
- " ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW\n"
400
- " ) AS running_sum\n"
401
- "FROM df;"
402
- ),
403
- "pandas": (
404
- 'temp = df.sort_values("date")\n'
405
- 'temp["running_sum"] = temp["value1"].fillna(0).cumsum()'
406
- ),
407
- "bench": bench_window_running_total,
408
- },
409
- "Drop NULLs (value1 IS NOT NULL)": {
410
- "sql": "SELECT * FROM df WHERE value1 IS NOT NULL;",
411
- "pandas": 'df[df["value1"].notna()]',
412
- "bench": bench_drop_nulls,
413
- },
414
- "Fill NULLs (COALESCE(value1, 0))": {
415
- "sql": "SELECT COALESCE(value1, 0) AS value1_filled FROM df;",
416
- "pandas": 'df["value1"].fillna(0)',
417
- "bench": bench_fill_nulls,
418
- },
419
- "Distinct count (COUNT(DISTINCT category))": {
420
- "sql": "SELECT COUNT(DISTINCT category) AS distinct_categories FROM df;",
421
- "pandas": 'df["category"].nunique()',
422
- "bench": bench_distinct_count,
423
- },
424
- "Materialize Groupby β†’ Parquet": {
425
- "sql": (
426
- "CREATE OR REPLACE TEMP TABLE agg AS\n"
427
- "SELECT category,\n"
428
- " AVG(value1) AS mean_value1,\n"
429
- " SUM(value2) AS sum_value2\n"
430
- "FROM df\n"
431
- "GROUP BY category;\n"
432
- "COPY agg TO 'duck_grouped.parquet' (FORMAT PARQUET);"
433
- ),
434
- "pandas": (
435
- 'agg = df.groupby("category").agg(\n'
436
- ' mean_value1=("value1", "mean"),\n'
437
- ' sum_value2=("value2", "sum"),\n'
438
- ")\n"
439
- 'agg.to_parquet("pandas_grouped.parquet")'
440
- ),
441
- "bench": bench_materialize_parquet,
442
- },
443
- }
444
-
445
- # ----------------- 6. Logic & Formatting -----------------
446
-
447
- def run_benchmark(operation_label, df, repeats):
448
- if operation_label not in OPERATIONS:
449
- raise ValueError("Unknown operation: " + str(operation_label))
450
- op_meta = OPERATIONS[operation_label]
451
- bench_fn = op_meta["bench"]
452
- result = bench_fn(df, repeats)
453
- result["operation"] = operation_label
454
- return result, op_meta
455
 
456
  def generate_chart(result):
457
- fig, ax = plt.subplots(figsize=(6, 4))
 
458
  engines = ["Pandas", "DuckDB"]
459
  times = [result["pandas_mean_s"], result["duckdb_mean_s"]]
460
- colors = ["#1f77b4", "#ff7f0e"]
461
- ax.bar(engines, times, color=colors)
462
  ax.set_ylabel("Time (seconds)")
463
- ax.set_title(str(result.get("operation", "Benchmark Result")))
464
- for i, v in enumerate(times):
465
- ax.text(i, v, "{0:.4f}s".format(v), ha="center", va="bottom")
466
  buf = io.BytesIO()
467
  plt.tight_layout()
468
- plt.savefig(buf, format="png", dpi=100)
469
  buf.seek(0)
470
  plt.close(fig)
 
471
  return Image.open(buf)
472
 
473
- def format_result(result, op_meta):
 
 
 
 
 
474
  speed = result["speedup"]
475
- if speed is None or speed <= 0:
476
- verdict = "Speedup could not be computed."
477
- elif speed > 1:
478
- verdict = "DuckDB is about {0:.2f}x faster than Pandas.".format(speed)
479
- else:
480
- verdict = "Pandas is about {0:.2f}x faster than DuckDB.".format(1.0 / speed)
481
-
482
- sql_code = op_meta["sql"]
483
- pandas_code = op_meta["pandas"]
484
-
485
- raw_pandas_list = ["{0:.6f}".format(x) for x in result["raw_pandas_runs"]]
486
- raw_duck_list = ["{0:.6f}".format(x) for x in result["raw_duckdb_runs"]]
487
-
488
- raw_pandas = ", ".join(raw_pandas_list)
489
- raw_duck = ", ".join(raw_duck_list)
490
-
491
- lines = []
492
- lines.append("Benchmark: " + str(result["operation"]))
493
- lines.append("")
494
- lines.append(
495
- "Pandas mean: {0:.6f} s (std {1:.6f})".format(
496
- result["pandas_mean_s"], result["pandas_std_s"]
497
- )
498
- )
499
- lines.append(
500
- "DuckDB mean: {0:.6f} s (std {1:.6f})".format(
501
- result["duckdb_mean_s"], result["duckdb_std_s"]
502
- )
503
  )
504
- lines.append("Verdict: " + verdict)
505
- lines.append("")
506
- lines.append("Raw timings (seconds):")
507
- lines.append(" Pandas: [" + raw_pandas + "]")
508
- lines.append(" DuckDB: [" + raw_duck + "]")
509
- lines.append("")
510
- lines.append("SQL (DuckDB):")
511
- lines.append(sql_code)
512
- lines.append("")
513
- lines.append("Pandas equivalent:")
514
- lines.append(pandas_code)
515
- return "\n".join(lines)
516
-
517
- # ----------------- 7. Gradio App -----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518
 
519
  theme = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")
520
 
521
- with gr.Blocks(title="DuckDB vs Pandas β€” SQL Analytics Benchmark", theme=theme) as demo:
522
- gr.Markdown(
523
- "# DuckDB vs Pandas β€” SQL Analytics Benchmark\n\n"
524
- "Compare DuckDB (SQL) and Pandas (Python) on realistic analytics operations."
525
- )
526
 
527
  with gr.Tabs():
528
- with gr.Tab("Synthetic Dataset Benchmarks"):
529
- with gr.Row():
530
- with gr.Column(scale=1):
531
- dataset_size = gr.Radio(
532
- ["100k", "500k", "2M"],
533
- value="100k",
534
- label="Dataset Size (synthetic rows)",
535
- )
536
- operation_synth = gr.Dropdown(
537
- choices=list(OPERATIONS.keys()),
538
- value="Filter (simple WHERE)",
539
- label="Operation",
540
- )
541
- repeats_synth = gr.Slider(
542
- 1,
543
- 7,
544
- value=3,
545
- step=1,
546
- label="Timing repeats (average over N runs)",
547
- )
548
- btn_synth = gr.Button("Run Benchmark", variant="primary")
549
-
550
- with gr.Column(scale=1):
551
- out_chart_synth = gr.Image(label="Performance Chart", type="pil")
552
- out_text_synth = gr.Textbox(label="Result", lines=20)
553
-
554
- def synthetic_runner(size, op, repeats):
555
- try:
556
- repeats = int(repeats)
557
- n_map = {"100k": 100000, "500k": 500000, "2M": 2000000}
558
- df = generate_data(n_map[size])
559
- result, meta = run_benchmark(op, df, repeats)
560
- chart = generate_chart(result)
561
- return chart, format_result(result, meta)
562
- except Exception:
563
- return None, "Error:\n" + traceback.format_exc()
564
 
565
  btn_synth.click(
566
  synthetic_runner,
567
  [dataset_size, operation_synth, repeats_synth],
568
- [out_chart_synth, out_text_synth],
569
  )
570
 
571
- with gr.Tab("Custom Dataset Upload"):
572
- gr.Markdown(
573
- "Your file must contain these columns: id, category, value1, value2, date"
 
 
 
 
 
 
 
 
 
574
  )
575
 
576
- with gr.Row():
577
- with gr.Column(scale=1):
578
- file_in = gr.File(label="Upload CSV / Parquet / Arrow")
579
- operation_custom = gr.Dropdown(
580
- choices=list(OPERATIONS.keys()),
581
- value="Filter (simple WHERE)",
582
- label="Operation",
583
- )
584
- repeats_custom = gr.Slider(
585
- 1,
586
- 7,
587
- value=3,
588
- step=1,
589
- label="Timing repeats",
590
- )
591
- btn_custom = gr.Button("Run Benchmark", variant="primary")
592
-
593
- with gr.Column(scale=1):
594
- out_chart_custom = gr.Image(label="Performance Chart", type="pil")
595
- out_text_custom = gr.Textbox(label="Result", lines=20)
596
-
597
- def custom_runner(file, op, repeats):
598
- try:
599
- repeats = int(repeats)
600
- df = load_custom_dataset(file)
601
- required = {"id", "category", "value1", "value2", "date"}
602
- missing = required - set(df.columns)
603
- if missing:
604
- raise ValueError("Missing columns: " + str(sorted(missing)))
605
- result, meta = run_benchmark(op, df, repeats)
606
- chart = generate_chart(result)
607
- return chart, format_result(result, meta)
608
- except Exception:
609
- return None, "Error:\n" + traceback.format_exc()
610
 
611
  btn_custom.click(
612
  custom_runner,
613
  [file_in, operation_custom, repeats_custom],
614
- [out_chart_custom, out_text_custom],
615
  )
616
 
617
  if __name__ == "__main__":
 
1
  import time
 
 
 
 
2
  import numpy as np
3
  import pandas as pd
4
  import duckdb
5
  import gradio as gr
6
  import matplotlib.pyplot as plt
7
  from PIL import Image
8
+ import io
9
+ import os
10
 
11
  duckdb_con = duckdb.connect(database=":memory:")
12
 
13
+ # ----------------------------------------------------------
14
+ # Synthetic Data Generator
15
+ # ----------------------------------------------------------
16
 
17
def generate_data(n_rows: int, n_groups: int = 50, seed: int = 42) -> pd.DataFrame:
    """Build a deterministic synthetic table for the benchmarks.

    Args:
        n_rows: Number of rows to generate. ``0`` yields an empty frame whose
            ``category`` column does not degrade to float64.
        n_groups: Number of distinct ``cat_<k>`` labels to draw from.
        seed: Seed passed to ``np.random.default_rng``. The default (42)
            reproduces the dataset generated before this parameter existed.

    Returns:
        A DataFrame with the columns ``id``, ``category``, ``value1``,
        ``value2`` and ``date``.
    """
    rng = np.random.default_rng(seed)

    # Explicit int64 keeps the id column identical on platforms whose default
    # NumPy integer is 32-bit.
    ids = np.arange(n_rows, dtype=np.int64)

    group_codes = rng.integers(0, n_groups, size=n_rows)
    # dtype=object stops NumPy from inferring float64 for an empty label list.
    categories = np.array([f"cat_{code}" for code in group_codes], dtype=object)

    # The order of the rng calls is part of the contract: changing it would
    # change the generated values for a given seed.
    value1 = rng.normal(0, 1, size=n_rows)
    value2 = rng.normal(10, 5, size=n_rows)

    start_date = np.datetime64("2020-01-01")
    day_offsets = rng.integers(0, 365, size=n_rows).astype("timedelta64[D]")
    dates = start_date + day_offsets

    return pd.DataFrame(
        {
            "id": ids,
            "category": categories,
            "value1": value1,
            "value2": value2,
            "date": dates,
        }
    )
 
30
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ # ----------------------------------------------------------
33
+ # Timing utility
34
+ # ----------------------------------------------------------
35
 
36
  def time_function(fn, repeats=3):
37
  repeats = int(repeats)
 
41
  fn()
42
  end = time.perf_counter()
43
  times.append(end - start)
44
+ return np.mean(times), np.std(times), times
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
+ # ----------------------------------------------------------
48
+ # Benchmark Operations (Compute + I/O)
49
+ # ----------------------------------------------------------
50
 
51
+ # ---- Filter ----
52
def bench_filter(df, repeats=3):
    """Time a two-condition row filter in Pandas and in DuckDB.

    The filter keeps rows where ``value1 > 0.5`` and ``category`` equals the
    category found in the first row of ``df``.

    Args:
        df: DataFrame with at least the columns ``value1`` and ``category``.
        repeats: Number of timed runs per engine.

    Returns:
        The result dictionary produced by ``build_result``.

    Raises:
        ValueError: If ``df`` has no rows, so no reference category exists.
    """
    if df.empty:
        raise ValueError("Cannot benchmark Filter on an empty dataset.")

    # Look the reference value up once, outside the timed region, so both
    # engines are timed on the filter itself.
    target = df["category"].iloc[0]
    if isinstance(target, np.generic):
        # Hand DuckDB a plain Python scalar rather than a NumPy scalar.
        target = target.item()

    def pandas_op():
        _ = df[(df["value1"] > 0.5) & (df["category"] == target)]

    def duckdb_op():
        duckdb_con.register("df", df)
        # Bound parameter instead of string interpolation: category values may
        # come from an uploaded file and can contain quotes.
        duckdb_con.execute(
            "SELECT * FROM df WHERE value1 > 0.5 AND category = ?",
            [target],
        ).fetchdf()

    p_mean, p_std, p_all = time_function(pandas_op, repeats)
    d_mean, d_std, d_all = time_function(duckdb_op, repeats)

    return build_result("Filter rows", p_mean, p_std, p_all, d_mean, d_std, d_all)
 
 
 
 
 
 
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
+ # ---- Groupby ----
72
def bench_groupby(df, repeats=3):
    """Time a per-category mean of ``value1`` and ``value2`` in both engines.

    Args:
        df: DataFrame with the columns ``category``, ``value1`` and ``value2``.
        repeats: Number of timed runs per engine.

    Returns:
        The result dictionary produced by ``build_result``.
    """
    averaged_columns = ["value1", "value2"]
    group_query = """
        SELECT category, AVG(value1), AVG(value2)
        FROM df GROUP BY category
    """

    def run_pandas():
        grouped = df.groupby("category")
        _ = grouped[averaged_columns].mean()

    def run_duckdb():
        duckdb_con.register("df", df)
        cursor = duckdb_con.execute(group_query)
        cursor.fetchdf()

    # time_function returns (mean, std, raw_runs); pass the triples through in order.
    pandas_stats = time_function(run_pandas, repeats)
    duckdb_stats = time_function(run_duckdb, repeats)

    return build_result("Groupby mean", *pandas_stats, *duckdb_stats)
87
 
 
 
 
88
 
89
+ # ---- Join ----
90
  def bench_join(df, repeats=3):
91
  categories = df["category"].unique()
92
  rng = np.random.default_rng(123)
93
  dim_df = pd.DataFrame(
94
+ {"category": categories, "weight": rng.uniform(0.5, 2.0, len(categories))}
 
 
 
95
  )
96
 
97
  def pandas_op():
 
100
  def duckdb_op():
101
  duckdb_con.register("df", df)
102
  duckdb_con.register("dim_df", dim_df)
103
+ duckdb_con.execute("""
104
+ SELECT d.*, dim.weight
105
+ FROM df d
106
+ LEFT JOIN dim_df dim
107
+ ON d.category = dim.category
108
+ """).fetchdf()
109
 
110
+ p_mean, p_std, p_all = time_function(pandas_op, repeats)
111
+ d_mean, d_std, d_all = time_function(duckdb_op, repeats)
 
112
 
113
+ return build_result("Join on category", p_mean, p_std, p_all, d_mean, d_std, d_all)
114
+
115
+
116
+ # ---- Read CSV ----
117
def bench_read_csv(temp_csv_path, repeats=3):
    """Time loading a CSV file into a DataFrame with Pandas and with DuckDB.

    Args:
        temp_csv_path: Path of the CSV file to read.
        repeats: Number of timed runs per engine.

    Returns:
        The result dictionary produced by ``build_result``.
    """

    def pandas_op():
        _ = pd.read_csv(temp_csv_path)

    def duckdb_op():
        # A DuckDB relation is lazy: creating it does not scan the file.
        # Materialize it so the timing covers the same work as pd.read_csv.
        # NOTE(review): duckdb.read_csv is the documented Python entry point;
        # `read_csv_auto` is the SQL table-function name - confirm against the
        # installed DuckDB version.
        _ = duckdb.read_csv(temp_csv_path).fetchdf()

    p_mean, p_std, p_all = time_function(pandas_op, repeats)
    d_mean, d_std, d_all = time_function(duckdb_op, repeats)

    return build_result("Read CSV", p_mean, p_std, p_all, d_mean, d_std, d_all)
128
+
129
+
130
+ # ---- Read Parquet ----
131
def bench_read_parquet(temp_parquet_path, repeats=3):
    """Time loading a Parquet file into a DataFrame with Pandas and with DuckDB.

    Args:
        temp_parquet_path: Path of the Parquet file to read.
        repeats: Number of timed runs per engine.

    Returns:
        The result dictionary produced by ``build_result``.
    """

    def pandas_op():
        _ = pd.read_parquet(temp_parquet_path)

    def duckdb_op():
        # duckdb.read_parquet only builds a lazy relation; fetchdf() forces the
        # actual scan so both engines are timed on a full load.
        _ = duckdb.read_parquet(temp_parquet_path).fetchdf()

    p_mean, p_std, p_all = time_function(pandas_op, repeats)
    d_mean, d_std, d_all = time_function(duckdb_op, repeats)

    return build_result("Read Parquet", p_mean, p_std, p_all, d_mean, d_std, d_all)
 
 
 
 
142
 
143
+
144
+ # ---- Write Parquet ----
145
def bench_write_parquet(df, repeats=3):
    """Time writing ``df`` to a Parquet file with Pandas and with DuckDB.

    Output goes to a private temporary directory that is removed when the
    benchmark finishes, so repeated or concurrent runs neither collide on file
    names nor leave files behind in the working directory.

    Args:
        df: DataFrame to serialize.
        repeats: Number of timed runs per engine.

    Returns:
        The result dictionary produced by ``build_result``.
    """
    import os
    import tempfile

    with tempfile.TemporaryDirectory(prefix="bench_parquet_") as scratch_dir:
        pandas_target = os.path.join(scratch_dir, "temp_pd.parquet")
        duck_target = os.path.join(scratch_dir, "temp_duck.parquet")
        # The path is embedded in a SQL string literal, so double any single
        # quote it may contain.
        escaped_target = duck_target.replace("'", "''")
        copy_sql = f"COPY df TO '{escaped_target}' (FORMAT PARQUET)"

        def pandas_op():
            df.to_parquet(pandas_target)

        def duckdb_op():
            duckdb_con.register("df", df)
            duckdb_con.execute(copy_sql)

        p_mean, p_std, p_all = time_function(pandas_op, repeats)
        d_mean, d_std, d_all = time_function(duckdb_op, repeats)

    return build_result("Write Parquet", p_mean, p_std, p_all, d_mean, d_std, d_all)
 
 
157
 
 
 
 
 
 
158
 
159
+ # ----------------------------------------------------------
160
+ # Shared result formatting
161
+ # ----------------------------------------------------------
162
 
163
def build_result(op_name, p_mean, p_std, p_all, d_mean, d_std, d_all):
    """Bundle the timings of both engines into one result dictionary.

    Args:
        op_name: Human-readable name of the benchmarked operation.
        p_mean: Mean Pandas run time in seconds.
        p_std: Standard deviation of the Pandas run times.
        p_all: Individual Pandas run times.
        d_mean: Mean DuckDB run time in seconds.
        d_std: Standard deviation of the DuckDB run times.
        d_all: Individual DuckDB run times.

    Returns:
        A dict holding the inputs plus ``speedup`` (``p_mean / d_mean``), which
        is ``None`` when ``d_mean`` is not positive.
    """
    if d_mean > 0:
        ratio = p_mean / d_mean
    else:
        # No meaningful ratio without a positive denominator.
        ratio = None

    summary = dict(
        operation=op_name,
        pandas_mean_s=p_mean,
        pandas_std_s=p_std,
        duckdb_mean_s=d_mean,
        duckdb_std_s=d_std,
        speedup=ratio,
        raw_pandas_runs=p_all,
        raw_duckdb_runs=d_all,
    )
    return summary
176
 
 
 
 
177
 
178
+ # ----------------------------------------------------------
179
+ # Benchmark Dispatcher
180
+ # ----------------------------------------------------------
 
 
 
 
181
 
182
def run_benchmark(operation, df=None, repeats=3):
    """Run the benchmark selected by ``operation`` against ``df``.

    Args:
        operation: One of ``"Filter"``, ``"Groupby"``, ``"Join"`` or
            ``"Write Parquet"``.
        df: DataFrame to benchmark on. Every supported operation needs one.
        repeats: Number of timed runs per engine; coerced with ``int()``.

    Returns:
        The result dictionary returned by the selected benchmark function.

    Raises:
        ValueError: If ``operation`` is not supported, or if ``df`` is ``None``.
    """
    repeats = int(repeats)

    # Label -> benchmark function. Keeping this in one table makes the set of
    # supported operations explicit and lets the error message list them.
    handlers = {
        "Filter": bench_filter,
        "Groupby": bench_groupby,
        "Join": bench_join,
        "Write Parquet": bench_write_parquet,
    }

    # Non-string labels are treated as unsupported rather than being hashed.
    handler = handlers.get(operation) if isinstance(operation, str) else None
    if handler is None:
        supported = ", ".join(handlers)
        raise ValueError(
            f"Unsupported operation: {operation} (supported: {supported})"
        )

    if df is None:
        # Fail here with a clear message instead of deep inside a benchmark.
        raise ValueError(f"Operation {operation!r} requires a DataFrame, but df is None.")

    return handler(df, repeats)
191
+
192
+
193
+ # ----------------------------------------------------------
194
+ # Chart generator (PIL Image)
195
+ # ----------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
def generate_chart(result):
    """Render the mean run times of both engines as a bar chart.

    Args:
        result: Result dictionary with the keys ``pandas_mean_s``,
            ``duckdb_mean_s`` and ``operation``.

    Returns:
        A PIL image holding the PNG-rendered chart.
    """
    fig, ax = plt.subplots(figsize=(4, 3))
    try:
        engines = ["Pandas", "DuckDB"]
        times = [result["pandas_mean_s"], result["duckdb_mean_s"]]

        ax.bar(engines, times)
        ax.set_ylabel("Time (seconds)")
        ax.set_title(result["operation"])

        # Use the Figure's own methods: plt.tight_layout()/plt.savefig() act on
        # pyplot's global "current figure", which may not be this one if
        # another chart is being drawn at the same time.
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format="png")
    finally:
        # Always release the figure, even when rendering fails.
        plt.close(fig)

    buf.seek(0)
    image = Image.open(buf)
    # Image.open is lazy; decode now so the image is fully read from the buffer.
    image.load()
    return image
214
 
215
+
216
+ # ----------------------------------------------------------
217
+ # Markdown result
218
+ # ----------------------------------------------------------
219
+
220
def format_result(result):
    """Format a benchmark result dictionary as a Markdown report.

    Args:
        result: Dictionary with the keys ``operation``, ``pandas_mean_s``,
            ``pandas_std_s``, ``duckdb_mean_s``, ``duckdb_std_s``, ``speedup``,
            ``raw_pandas_runs`` and ``raw_duckdb_runs``.

    Returns:
        A Markdown string with a timing table, a verdict line and a collapsible
        list of the raw timings.
    """
    speed = result["speedup"]

    # speedup is None when the DuckDB mean was not positive, and 0 when the
    # Pandas mean was 0; neither supports a "N times faster" statement.
    # `not speed > 0` also catches NaN.
    if speed is None or not speed > 0:
        verdict = "Speedup could not be computed (a measured time was zero)."
    elif speed > 1:
        verdict = f"🚀 **DuckDB is ~{speed:.2f}× faster**"
    else:
        verdict = f"🐼 **Pandas is ~{1 / speed:.2f}× faster**"

    # Plain floats keep the printed list readable even if NumPy scalars arrive.
    pandas_runs = [round(float(x), 6) for x in result["raw_pandas_runs"]]
    duckdb_runs = [round(float(x), 6) for x in result["raw_duckdb_runs"]]

    lines = [
        "",
        f"### 🔬 Benchmark Result — {result['operation']}",
        "",
        "| Engine | Mean (s) | Std (s) |",
        "|--------|----------|---------|",
        f"| Pandas | `{result['pandas_mean_s']:.6f}` | `{result['pandas_std_s']:.6f}` |",
        f"| DuckDB | `{result['duckdb_mean_s']:.6f}` | `{result['duckdb_std_s']:.6f}` |",
        "",
        f"**Verdict:** {verdict}",
        "",
        "<details><summary>Raw timings</summary>",
        "",
        f"- Pandas: `{pandas_runs}`",
        f"- DuckDB: `{duckdb_runs}`",
        "</details>",
        "",
    ]
    return "\n".join(lines)
245
+
246
+
247
+ # ----------------------------------------------------------
248
+ # Helper to load custom dataset
249
+ # ----------------------------------------------------------
250
+
251
def load_custom_dataset(file):
    """Load an uploaded dataset into a DataFrame, choosing the reader by extension.

    Args:
        file: Either an object exposing the file path as ``.name`` or a plain
            path string.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: If no file was provided or the extension is not one of
            ``.csv``, ``.parquet``, ``.arrow`` or ``.feather``.
    """
    if file is None:
        raise ValueError("No file uploaded.")

    # Accept both upload objects (path in .name) and bare path strings.
    path = str(getattr(file, "name", file))
    # Compare case-insensitively so e.g. "DATA.CSV" is recognized.
    lowered = path.lower()

    if lowered.endswith(".csv"):
        return pd.read_csv(path)
    if lowered.endswith(".parquet"):
        return pd.read_parquet(path)
    if lowered.endswith((".arrow", ".feather")):
        return pd.read_feather(path)

    raise ValueError("Unsupported file format. Use CSV, Parquet, or Arrow/Feather.")
259
+
260
+
261
+ # ----------------------------------------------------------
262
+ # Gradio App
263
+ # ----------------------------------------------------------
264
 
265
  theme = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")
266
 
267
with gr.Blocks(title="DuckDB vs Pandas Benchmark", theme=theme) as demo:
    # Operation labels offered in both tabs; they are passed unchanged to
    # run_benchmark as its `operation` argument.
    operation_choices = ["Filter", "Groupby", "Join", "Write Parquet"]

    gr.Markdown("# 🐼 vs 🦆 DuckDB vs Pandas — Performance Playground")

    with gr.Tabs():

        # ==================================================
        # Synthetic mode: benchmark on generated data
        # ==================================================
        with gr.Tab("🔥 Synthetic Dataset Benchmarks"):

            dataset_size = gr.Radio(["100k", "500k", "2M"], value="100k", label="Dataset Size")

            operation_synth = gr.Radio(
                operation_choices,
                label="Operation",
                value="Filter",
            )

            # step=1 restricts the slider to whole repeat counts.
            repeats_synth = gr.Slider(1, 7, value=3, step=1, label="Repeats")

            btn_synth = gr.Button("🚀 Run Benchmark")

            out_md_synth = gr.Markdown()
            out_chart_synth = gr.Image()

            def synthetic_runner(size, operation, repeats):
                """Generate data of the chosen size, benchmark it, return (markdown, chart)."""
                row_counts = {"100k": 100_000, "500k": 500_000, "2M": 2_000_000}
                try:
                    df = generate_data(row_counts[size])
                    result = run_benchmark(operation, df, int(repeats))
                    return format_result(result), generate_chart(result)
                except Exception as exc:
                    # UI boundary: show the failure to the user via gr.Error.
                    raise gr.Error(f"Benchmark failed: {exc}") from exc

            btn_synth.click(
                synthetic_runner,
                [dataset_size, operation_synth, repeats_synth],
                [out_md_synth, out_chart_synth],
            )

        # ==================================================
        # Custom mode: benchmark on an uploaded file
        # ==================================================
        with gr.Tab("📁 Custom Dataset Upload"):

            file_in = gr.File(label="Upload CSV / Parquet / Arrow")

            operation_custom = gr.Radio(
                operation_choices,
                label="Operation",
                value="Filter",
            )

            repeats_custom = gr.Slider(1, 7, value=3, step=1, label="Repeats")

            btn_custom = gr.Button("Run on Uploaded Dataset")

            out_md_custom = gr.Markdown()
            out_chart_custom = gr.Image()

            def custom_runner(file, operation, repeats):
                """Load the uploaded file, benchmark it, return (markdown, chart)."""
                if file is None:
                    # Clicking before uploading would otherwise fail on file.name.
                    raise gr.Error("Please upload a CSV, Parquet, or Arrow file first.")
                try:
                    df = load_custom_dataset(file)
                    result = run_benchmark(operation, df, int(repeats))
                    return format_result(result), generate_chart(result)
                except Exception as exc:
                    # UI boundary: unreadable files and missing columns are
                    # reported to the user via gr.Error.
                    raise gr.Error(f"Benchmark failed: {exc}") from exc

            btn_custom.click(
                custom_runner,
                [file_in, operation_custom, repeats_custom],
                [out_md_custom, out_chart_custom],
            )
341
 
342
  if __name__ == "__main__":