PraneshJs committed on
Commit
47947ce
·
verified ·
1 Parent(s): b91eb38

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +499 -224
app.py CHANGED
@@ -1,37 +1,61 @@
1
  import time
 
 
 
 
2
  import numpy as np
3
  import pandas as pd
4
  import duckdb
5
  import gradio as gr
6
  import matplotlib.pyplot as plt
7
  from PIL import Image
8
- import io
9
- import os
10
 
11
  duckdb_con = duckdb.connect(database=":memory:")
12
 
13
- # ----------------------------------------------------------
14
- # Synthetic Data Generator
15
- # ----------------------------------------------------------
16
 
17
- def generate_data(n_rows: int, n_groups: int = 50) -> pd.DataFrame:
18
  rng = np.random.default_rng(42)
19
- ids = np.arange(n_rows)
 
20
  categories = rng.integers(0, n_groups, size=n_rows)
21
- categories = np.array([f"cat_{c}" for c in categories])
 
22
  value1 = rng.normal(0, 1, size=n_rows)
23
  value2 = rng.normal(10, 5, size=n_rows)
 
 
 
 
24
  start_date = np.datetime64("2020-01-01")
25
  dates = start_date + rng.integers(0, 365, size=n_rows).astype("timedelta64[D]")
26
 
27
- return pd.DataFrame(
28
- {"id": ids, "category": categories, "value1": value1, "value2": value2, "date": dates}
 
 
 
 
 
 
29
  )
 
30
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
- # ----------------------------------------------------------
33
- # Timing utility
34
- # ----------------------------------------------------------
35
 
36
  def time_function(fn, repeats=3):
37
  repeats = int(repeats)
@@ -41,57 +65,106 @@ def time_function(fn, repeats=3):
41
  fn()
42
  end = time.perf_counter()
43
  times.append(end - start)
44
- return np.mean(times), np.std(times), times
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- # ----------------------------------------------------------
48
- # Benchmark Operations (Compute + I/O)
49
- # ----------------------------------------------------------
50
 
51
- # ---- Filter ----
52
- def bench_filter(df, repeats=3):
53
  def pandas_op():
54
- _ = df[(df["value1"] > 0.5) & (df["category"] == df["category"].iloc[0])]
55
 
56
  def duckdb_op():
57
  duckdb_con.register("df", df)
58
- duckdb_con.execute(f"""
59
- SELECT *
60
- FROM df
61
- WHERE value1 > 0.5
62
- AND category='{df['category'].iloc[0]}'
63
- """).fetchdf()
64
 
65
- p_mean, p_std, p_all = time_function(pandas_op, repeats)
66
- d_mean, d_std, d_all = time_function(duckdb_op, repeats)
 
67
 
68
- return build_result("Filter rows", p_mean, p_std, p_all, d_mean, d_std, d_all)
69
-
70
-
71
- # ---- Groupby ----
72
- def bench_groupby(df, repeats=3):
73
  def pandas_op():
74
- _ = df.groupby("category")[["value1", "value2"]].mean()
 
 
 
 
75
 
76
  def duckdb_op():
77
  duckdb_con.register("df", df)
78
- duckdb_con.execute("""
79
- SELECT category, AVG(value1), AVG(value2)
80
- FROM df GROUP BY category
81
- """).fetchdf()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
- p_mean, p_std, p_all = time_function(pandas_op, repeats)
84
- d_mean, d_std, d_all = time_function(duckdb_op, repeats)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
- return build_result("Groupby mean", p_mean, p_std, p_all, d_mean, d_std, d_all)
 
 
 
 
 
87
 
 
 
 
88
 
89
- # ---- Join ----
90
  def bench_join(df, repeats=3):
91
  categories = df["category"].unique()
92
  rng = np.random.default_rng(123)
93
  dim_df = pd.DataFrame(
94
- {"category": categories, "weight": rng.uniform(0.5, 2.0, len(categories))}
 
 
 
95
  )
96
 
97
  def pandas_op():
@@ -100,243 +173,445 @@ def bench_join(df, repeats=3):
100
  def duckdb_op():
101
  duckdb_con.register("df", df)
102
  duckdb_con.register("dim_df", dim_df)
103
- duckdb_con.execute("""
104
- SELECT d.*, dim.weight
105
- FROM df d
106
- LEFT JOIN dim_df dim
107
- ON d.category = dim.category
108
- """).fetchdf()
109
 
110
- p_mean, p_std, p_all = time_function(pandas_op, repeats)
111
- d_mean, d_std, d_all = time_function(duckdb_op, repeats)
 
112
 
113
- return build_result("Join on category", p_mean, p_std, p_all, d_mean, d_std, d_all)
114
-
115
-
116
- # ---- Read CSV ----
117
- def bench_read_csv(temp_csv_path, repeats=3):
118
  def pandas_op():
119
- _ = pd.read_csv(temp_csv_path)
120
 
121
  def duckdb_op():
122
- _ = duckdb.read_csv_auto(temp_csv_path)
123
-
124
- p_mean, p_std, p_all = time_function(pandas_op, repeats)
125
- d_mean, d_std, d_all = time_function(duckdb_op, repeats)
126
-
127
- return build_result("Read CSV", p_mean, p_std, p_all, d_mean, d_std, d_all)
128
 
 
 
 
129
 
130
- # ---- Read Parquet ----
131
- def bench_read_parquet(temp_parquet_path, repeats=3):
132
  def pandas_op():
133
- _ = pd.read_parquet(temp_parquet_path)
 
 
134
 
135
  def duckdb_op():
136
- _ = duckdb.read_parquet(temp_parquet_path)
137
-
138
- p_mean, p_std, p_all = time_function(pandas_op, repeats)
139
- d_mean, d_std, d_all = time_function(duckdb_op, repeats)
140
-
141
- return build_result("Read Parquet", p_mean, p_std, p_all, d_mean, d_std, d_all)
142
 
 
 
 
143
 
144
- # ---- Write Parquet ----
145
- def bench_write_parquet(df, repeats=3):
146
  def pandas_op():
147
- df.to_parquet("temp_pd.parquet")
 
 
148
 
149
  def duckdb_op():
150
  duckdb_con.register("df", df)
151
- duckdb_con.execute("COPY df TO 'temp_duck.parquet' (FORMAT PARQUET)")
152
-
153
- p_mean, p_std, p_all = time_function(pandas_op, repeats)
154
- d_mean, d_std, d_all = time_function(duckdb_op, repeats)
155
-
156
- return build_result("Write Parquet", p_mean, p_std, p_all, d_mean, d_std, d_all)
157
-
 
 
 
 
 
 
 
158
 
159
- # ----------------------------------------------------------
160
- # Shared result formatting
161
- # ----------------------------------------------------------
 
 
162
 
163
- def build_result(op_name, p_mean, p_std, p_all, d_mean, d_std, d_all):
164
- speedup = p_mean / d_mean if d_mean > 0 else None
 
165
 
166
- return {
167
- "operation": op_name,
168
- "pandas_mean_s": p_mean,
169
- "pandas_std_s": p_std,
170
- "duckdb_mean_s": d_mean,
171
- "duckdb_std_s": d_std,
172
- "speedup": speedup,
173
- "raw_pandas_runs": p_all,
174
- "raw_duckdb_runs": d_all,
175
- }
176
 
 
 
 
 
 
177
 
178
- # ----------------------------------------------------------
179
- # Benchmark Dispatcher
180
- # ----------------------------------------------------------
181
 
182
- def run_benchmark(operation, df=None, repeats=3):
183
- repeats = int(repeats)
 
184
 
185
- if operation == "Filter": return bench_filter(df, repeats)
186
- if operation == "Groupby": return bench_groupby(df, repeats)
187
- if operation == "Join": return bench_join(df, repeats)
188
- if operation == "Write Parquet": return bench_write_parquet(df, repeats)
 
189
 
190
- raise ValueError(f"Unsupported operation: {operation}")
 
 
191
 
 
 
 
 
 
 
 
192
 
193
- # ----------------------------------------------------------
194
- # Chart generator (PIL Image)
195
- # ----------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
  def generate_chart(result):
198
- fig, ax = plt.subplots(figsize=(4, 3))
199
-
200
  engines = ["Pandas", "DuckDB"]
201
  times = [result["pandas_mean_s"], result["duckdb_mean_s"]]
202
-
203
- ax.bar(engines, times)
204
  ax.set_ylabel("Time (seconds)")
205
- ax.set_title(result["operation"])
206
-
 
207
  buf = io.BytesIO()
208
  plt.tight_layout()
209
- plt.savefig(buf, format="png")
210
  buf.seek(0)
211
  plt.close(fig)
212
-
213
  return Image.open(buf)
214
 
215
-
216
- # ----------------------------------------------------------
217
- # Markdown result
218
- # ----------------------------------------------------------
219
-
220
- def format_result(result):
221
  speed = result["speedup"]
222
- verdict = (
223
- f"🚀 **DuckDB is ~{speed:.2f}× faster**"
224
- if speed > 1
225
- else f"🐼 **Pandas is ~{1/speed:.2f}× faster**"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  )
227
-
228
- md = f"""
229
- ### 🔬 Benchmark Result — {result['operation']}
230
-
231
- | Engine | Mean (s) | Std (s) |
232
- |--------|----------|---------|
233
- | Pandas | `{result['pandas_mean_s']:.6f}` | `{result['pandas_std_s']:.6f}` |
234
- | DuckDB | `{result['duckdb_mean_s']:.6f}` | `{result['duckdb_std_s']:.6f}` |
235
-
236
- **Verdict:** {verdict}
237
-
238
- <details><summary>Raw timings</summary>
239
-
240
- - Pandas: `{[round(x,6) for x in result['raw_pandas_runs']]}`
241
- - DuckDB: `{[round(x,6) for x in result['raw_duckdb_runs']]}`
242
- </details>
243
- """
244
- return md
245
-
246
-
247
- # ----------------------------------------------------------
248
- # Helper to load custom dataset
249
- # ----------------------------------------------------------
250
-
251
- def load_custom_dataset(file):
252
- if file.name.endswith(".csv"):
253
- return pd.read_csv(file.name)
254
- if file.name.endswith(".parquet"):
255
- return pd.read_parquet(file.name)
256
- if file.name.endswith(".arrow"):
257
- return pd.read_feather(file.name)
258
- raise ValueError("Unsupported file format")
259
-
260
-
261
- # ----------------------------------------------------------
262
- # Gradio App
263
- # ----------------------------------------------------------
264
 
265
  theme = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")
266
 
267
- with gr.Blocks(title="DuckDB vs Pandas Benchmark", theme=theme) as demo:
268
-
269
- gr.Markdown("# 🐼 vs 🦆 DuckDB vs Pandas — Performance Playground")
 
 
270
 
271
  with gr.Tabs():
272
-
273
- # ==================================================
274
- # 🔥 Synthetic Mode
275
- # ==================================================
276
- with gr.Tab("🔥 Synthetic Dataset Benchmarks"):
277
-
278
- dataset_size = gr.Radio(["100k", "500k", "2M"], value="100k", label="Dataset Size")
279
-
280
- operation_synth = gr.Radio(
281
- ["Filter", "Groupby", "Join", "Write Parquet"],
282
- label="Operation",
283
- value="Filter"
284
- )
285
-
286
- repeats_synth = gr.Slider(1, 7, value=3, label="Repeats")
287
-
288
- btn_synth = gr.Button("🚀 Run Benchmark")
289
-
290
- out_md_synth = gr.Markdown()
291
- out_chart_synth = gr.Image()
292
-
293
- def synthetic_runner(size, operation, repeats):
294
- repeats = int(repeats)
295
- n = {"100k": 100_000, "500k": 500_000, "2M": 2_000_000}[size]
296
-
297
- df = generate_data(n)
298
- result = run_benchmark(operation, df, repeats)
299
- chart = generate_chart(result)
300
-
301
- return format_result(result), chart
 
 
 
 
 
 
302
 
303
  btn_synth.click(
304
  synthetic_runner,
305
  [dataset_size, operation_synth, repeats_synth],
306
- [out_md_synth, out_chart_synth],
307
  )
308
 
309
-
310
- # ==================================================
311
- # 📁 Custom Dataset Mode
312
- # ==================================================
313
- with gr.Tab("📁 Custom Dataset Upload"):
314
-
315
- file_in = gr.File(label="Upload CSV / Parquet / Arrow")
316
-
317
- operation_custom = gr.Radio(
318
- ["Filter", "Groupby", "Join", "Write Parquet"],
319
- label="Operation",
320
- value="Filter"
321
  )
322
 
323
- repeats_custom = gr.Slider(1, 7, value=3, label="Repeats")
324
-
325
- btn_custom = gr.Button("Run on Uploaded Dataset")
326
-
327
- out_md_custom = gr.Markdown()
328
- out_chart_custom = gr.Image()
329
-
330
- def custom_runner(file, operation, repeats):
331
- repeats = int(repeats)
332
- df = load_custom_dataset(file)
333
- result = run_benchmark(operation, df, repeats)
334
- return format_result(result), generate_chart(result)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
 
336
  btn_custom.click(
337
  custom_runner,
338
  [file_in, operation_custom, repeats_custom],
339
- [out_md_custom, out_chart_custom],
340
  )
341
 
342
  if __name__ == "__main__":
 
1
  import time
2
+ import io
3
+ import traceback
4
+ from typing import Dict, Callable, Any, Tuple
5
+
6
  import numpy as np
7
  import pandas as pd
8
  import duckdb
9
  import gradio as gr
10
  import matplotlib.pyplot as plt
11
  from PIL import Image
12
+
13
+ # ----------------- 1. Global Setup -----------------
14
 
15
  duckdb_con = duckdb.connect(database=":memory:")
16
 
17
+ # ----------------- 2. Data Generation & Loading -----------------
 
 
18
 
19
def generate_data(n_rows, n_groups=50):
    """Build the deterministic synthetic table used by the benchmarks.

    Args:
        n_rows: Number of rows to generate.
        n_groups: Number of distinct category labels (``cat_0`` ..
            ``cat_{n_groups - 1}``).

    Returns:
        A DataFrame with columns ``id``, ``category``, ``value1``,
        ``value2`` and ``date``. About 5% of ``value1`` is set to NaN.
        The generator is seeded with 42, so repeated calls with the same
        arguments produce the same data.
    """
    rng = np.random.default_rng(42)
    ids = np.arange(n_rows, dtype=np.int64)

    # Draw integer group codes, then map them through a small label table.
    # This formats only ``n_groups`` strings instead of one per row.
    codes = rng.integers(0, n_groups, size=n_rows)
    labels = np.array(["cat_" + str(g) for g in range(n_groups)], dtype=object)
    categories = labels[codes]

    value1 = rng.normal(0, 1, size=n_rows)
    value2 = rng.normal(10, 5, size=n_rows)

    # Punch NaNs into value1 so the NULL-handling benchmarks have work to do.
    null_mask = rng.random(n_rows) < 0.05
    value1[null_mask] = np.nan

    # Random day offsets in [0, 365) added to 2020-01-01.
    start_date = np.datetime64("2020-01-01")
    dates = start_date + rng.integers(0, 365, size=n_rows).astype("timedelta64[D]")

    df = pd.DataFrame(
        {
            "id": ids,
            "category": categories,
            "value1": value1,
            "value2": value2,
            "date": dates,
        }
    )
    return df
45
 
46
def load_custom_dataset(file) -> pd.DataFrame:
    """Read an uploaded dataset into a DataFrame.

    Args:
        file: The uploaded file. Either an object exposing the on-disk path
            as ``.name`` (tempfile-like upload wrapper), or the path itself
            as a ``str`` / ``os.PathLike``.

    Returns:
        The file contents as a DataFrame.

    Raises:
        ValueError: If ``file`` is ``None`` or the extension is not one of
            ``.csv``, ``.parquet``, ``.arrow``, ``.feather``.
    """
    import os  # local import: the module header does not import os

    if file is None:
        raise ValueError("No file uploaded.")

    # A pathlib.Path also has ``.name`` (basename only), so path-like inputs
    # must be resolved before falling back to the attribute.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    # Extension matching is case-insensitive; the original path is what gets read.
    lowered = path.lower()
    if lowered.endswith(".csv"):
        return pd.read_csv(path)
    if lowered.endswith(".parquet"):
        return pd.read_parquet(path)
    if lowered.endswith((".arrow", ".feather")):
        return pd.read_feather(path)
    raise ValueError("Unsupported file format. Use CSV, Parquet, or Arrow/Feather.")
57
 
58
+ # ----------------- 3. Timing Utils -----------------
 
 
59
 
60
  def time_function(fn, repeats=3):
61
  repeats = int(repeats)
 
65
  fn()
66
  end = time.perf_counter()
67
  times.append(end - start)
68
+ return float(np.mean(times)), float(np.std(times)), [float(t) for t in times]
69
 
70
def build_result(pm, ps, pr, dm, ds, dr):
    """Bundle the timing statistics of both engines into one result dict.

    Args:
        pm, ps, pr: Pandas mean, standard deviation and raw run times.
        dm, ds, dr: DuckDB mean, standard deviation and raw run times.

    Returns:
        A dict with the means, stds, raw runs and ``speedup`` (Pandas mean
        divided by DuckDB mean). ``speedup`` is 0.0 when the DuckDB mean is
        not above 1e-9, which avoids dividing by a (near-)zero duration.
    """
    ratio = pm / dm if dm > 1e-9 else 0.0
    return dict(
        pandas_mean_s=pm,
        pandas_std_s=ps,
        duckdb_mean_s=dm,
        duckdb_std_s=ds,
        speedup=ratio,
        raw_pandas_runs=pr,
        raw_duckdb_runs=dr,
    )
84
 
85
+ # ----------------- 4. Benchmarks -----------------
 
 
86
 
87
def bench_filter_simple(df, repeats=3):
    """Time a two-predicate row filter in Pandas and in DuckDB.

    Args:
        df: DataFrame with at least ``value1`` and ``category`` columns.
        repeats: Number of timed runs per engine.

    Returns:
        The result dict produced by ``build_result``.
    """
    query = "SELECT * FROM df WHERE value1 > 0.5 AND category = 'cat_1';"

    def run_pandas():
        keep = (df["value1"] > 0.5) & (df["category"] == "cat_1")
        _ = df[keep]

    def run_duckdb():
        # Registration happens inside the timed call, so its cost is measured too.
        duckdb_con.register("df", df)
        duckdb_con.execute(query).fetchdf()

    pandas_stats = time_function(run_pandas, repeats)
    duckdb_stats = time_function(run_duckdb, repeats)
    return build_result(*pandas_stats, *duckdb_stats)
100
 
101
def bench_filter_complex(df, repeats=3):
    """Time a three-predicate filter (two numeric bounds plus a date range).

    Args:
        df: DataFrame with ``value1``, ``value2`` and ``date`` columns.
        repeats: Number of timed runs per engine.

    Returns:
        The result dict produced by ``build_result``.
    """
    query = (
        "SELECT * FROM df "
        "WHERE value1 > 0 "
        "AND value2 < 12 "
        "AND date BETWEEN DATE '2020-03-01' AND DATE '2020-09-30';"
    )

    def run_pandas():
        positive = df["value1"] > 0
        below_cap = df["value2"] < 12
        in_window = df["date"].between("2020-03-01", "2020-09-30")
        _ = df[positive & below_cap & in_window]

    def run_duckdb():
        # Registration happens inside the timed call, so its cost is measured too.
        duckdb_con.register("df", df)
        duckdb_con.execute(query).fetchdf()

    pandas_stats = time_function(run_pandas, repeats)
    duckdb_stats = time_function(run_duckdb, repeats)
    return build_result(*pandas_stats, *duckdb_stats)
121
+
122
def bench_groupby_basic(df, repeats=3):
    """Time a per-category aggregation (mean, sum and count) in both engines.

    Args:
        df: DataFrame with ``category``, ``value1``, ``value2`` and ``id``.
        repeats: Number of timed runs per engine.

    Returns:
        The result dict produced by ``build_result``.
    """
    query = (
        "SELECT category, "
        "AVG(value1) AS mean_value1, "
        "SUM(value2) AS sum_value2, "
        "COUNT(*) AS cnt "
        "FROM df GROUP BY category;"
    )

    def run_pandas():
        by_category = df.groupby("category")
        _ = by_category.agg(
            mean_value1=("value1", "mean"),
            sum_value2=("value2", "sum"),
            cnt=("id", "count"),
        )

    def run_duckdb():
        # Registration happens inside the timed call, so its cost is measured too.
        duckdb_con.register("df", df)
        duckdb_con.execute(query).fetchdf()

    pandas_stats = time_function(run_pandas, repeats)
    duckdb_stats = time_function(run_duckdb, repeats)
    return build_result(*pandas_stats, *duckdb_stats)
143
+
144
def bench_groupby_having(df, repeats=3):
    """Time a per-category mean followed by a filter on the aggregate.

    Args:
        df: DataFrame with ``category`` and ``value1`` columns.
        repeats: Number of timed runs per engine.

    Returns:
        The result dict produced by ``build_result``.
    """
    query = (
        "SELECT category, AVG(value1) AS mean_value1 "
        "FROM df GROUP BY category HAVING AVG(value1) > 0;"
    )

    def run_pandas():
        means = df.groupby("category").agg(mean_value1=("value1", "mean"))
        positive = means["mean_value1"] > 0
        _ = means[positive]

    def run_duckdb():
        # Registration happens inside the timed call, so its cost is measured too.
        duckdb_con.register("df", df)
        duckdb_con.execute(query).fetchdf()

    pandas_stats = time_function(run_pandas, repeats)
    duckdb_stats = time_function(run_duckdb, repeats)
    return build_result(*pandas_stats, *duckdb_stats)
159
 
 
160
  def bench_join(df, repeats=3):
161
  categories = df["category"].unique()
162
  rng = np.random.default_rng(123)
163
  dim_df = pd.DataFrame(
164
+ {
165
+ "category": categories,
166
+ "weight": rng.uniform(0.5, 2.0, size=len(categories)),
167
+ }
168
  )
169
 
170
  def pandas_op():
 
173
  def duckdb_op():
174
  duckdb_con.register("df", df)
175
  duckdb_con.register("dim_df", dim_df)
176
+ duckdb_con.execute(
177
+ "SELECT d.*, dim.weight "
178
+ "FROM df d LEFT JOIN dim_df dim "
179
+ "ON d.category = dim.category;"
180
+ ).fetchdf()
 
181
 
182
+ pm, ps, pr = time_function(pandas_op, repeats)
183
+ dm, ds, dr = time_function(duckdb_op, repeats)
184
+ return build_result(pm, ps, pr, dm, ds, dr)
185
 
186
def bench_order_by(df, repeats=3):
    """Time a full-table sort by ``value1`` descending, then ``date`` ascending.

    Args:
        df: DataFrame with ``value1`` and ``date`` columns.
        repeats: Number of timed runs per engine.

    Returns:
        The result dict produced by ``build_result``.
    """
    query = "SELECT * FROM df ORDER BY value1 DESC, date ASC;"
    sort_keys = ["value1", "date"]
    directions = [False, True]

    def run_pandas():
        _ = df.sort_values(sort_keys, ascending=directions)

    def run_duckdb():
        # Registration happens inside the timed call, so its cost is measured too.
        duckdb_con.register("df", df)
        duckdb_con.execute(query).fetchdf()

    pandas_stats = time_function(run_pandas, repeats)
    duckdb_stats = time_function(run_duckdb, repeats)
    return build_result(*pandas_stats, *duckdb_stats)
199
 
200
def bench_window_row_number(df, repeats=3):
    """Time numbering rows within each category by descending ``value1``.

    Args:
        df: DataFrame with ``category`` and ``value1`` columns.
        repeats: Number of timed runs per engine.

    Returns:
        The result dict produced by ``build_result``.
    """
    query = (
        "SELECT *, "
        "ROW_NUMBER() OVER (PARTITION BY category ORDER BY value1 DESC) AS rn "
        "FROM df;"
    )

    def run_pandas():
        # Sort first, then number rows per group; the copy keeps ``df`` untouched.
        ranked = df.sort_values(
            ["category", "value1"], ascending=[True, False]
        ).copy()
        ranked["rn"] = ranked.groupby("category").cumcount() + 1
        _ = ranked

    def run_duckdb():
        # Registration happens inside the timed call, so its cost is measured too.
        duckdb_con.register("df", df)
        duckdb_con.execute(query).fetchdf()

    pandas_stats = time_function(run_pandas, repeats)
    duckdb_stats = time_function(run_duckdb, repeats)
    return build_result(*pandas_stats, *duckdb_stats)
217
 
218
def bench_window_running_total(df, repeats=3):
    """Time a running sum of ``value1`` (NULLs counted as 0) ordered by date.

    Args:
        df: DataFrame with ``date`` and ``value1`` columns.
        repeats: Number of timed runs per engine.

    Returns:
        The result dict produced by ``build_result``.
    """
    query = (
        "SELECT *, "
        "SUM(COALESCE(value1, 0)) OVER (ORDER BY date "
        "ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_sum "
        "FROM df;"
    )

    def run_pandas():
        # The copy keeps ``df`` untouched when the new column is attached.
        ordered = df.sort_values("date").copy()
        filled = ordered["value1"].fillna(0)
        ordered["running_sum"] = filled.cumsum()
        _ = ordered

    def run_duckdb():
        # Registration happens inside the timed call, so its cost is measured too.
        duckdb_con.register("df", df)
        duckdb_con.execute(query).fetchdf()

    pandas_stats = time_function(run_pandas, repeats)
    duckdb_stats = time_function(run_duckdb, repeats)
    return build_result(*pandas_stats, *duckdb_stats)
236
+
237
def bench_drop_nulls(df, repeats=3):
    """Time selecting only the rows whose ``value1`` is not null.

    Args:
        df: DataFrame with a ``value1`` column.
        repeats: Number of timed runs per engine.

    Returns:
        The result dict produced by ``build_result``.
    """
    query = "SELECT * FROM df WHERE value1 IS NOT NULL;"

    def run_pandas():
        present = df["value1"].notna()
        _ = df[present]

    def run_duckdb():
        # Registration happens inside the timed call, so its cost is measured too.
        duckdb_con.register("df", df)
        duckdb_con.execute(query).fetchdf()

    pandas_stats = time_function(run_pandas, repeats)
    duckdb_stats = time_function(run_duckdb, repeats)
    return build_result(*pandas_stats, *duckdb_stats)
250
 
251
def bench_fill_nulls(df, repeats=3):
    """Time replacing null ``value1`` entries with 0.

    Args:
        df: DataFrame with a ``value1`` column.
        repeats: Number of timed runs per engine.

    Returns:
        The result dict produced by ``build_result``.
    """
    query = "SELECT COALESCE(value1, 0) AS value1_filled FROM df;"

    def run_pandas():
        column = df["value1"]
        _ = column.fillna(0)

    def run_duckdb():
        # Registration happens inside the timed call, so its cost is measured too.
        duckdb_con.register("df", df)
        duckdb_con.execute(query).fetchdf()

    pandas_stats = time_function(run_pandas, repeats)
    duckdb_stats = time_function(run_duckdb, repeats)
    return build_result(*pandas_stats, *duckdb_stats)
264
 
265
def bench_distinct_count(df, repeats=3):
    """Time counting the distinct values of ``category``.

    Args:
        df: DataFrame with a ``category`` column.
        repeats: Number of timed runs per engine.

    Returns:
        The result dict produced by ``build_result``.
    """
    query = "SELECT COUNT(DISTINCT category) AS distinct_categories FROM df;"

    def run_pandas():
        column = df["category"]
        _ = column.nunique()

    def run_duckdb():
        # Registration happens inside the timed call, so its cost is measured too.
        duckdb_con.register("df", df)
        duckdb_con.execute(query).fetchdf()

    pandas_stats = time_function(run_pandas, repeats)
    duckdb_stats = time_function(run_duckdb, repeats)
    return build_result(*pandas_stats, *duckdb_stats)
278
 
279
def bench_materialize_parquet(df, repeats=3):
    """Time a per-category aggregation that is then written to a Parquet file.

    Pandas writes ``pandas_grouped.parquet``; DuckDB writes
    ``duck_grouped.parquet``. Both paths are relative, and the files are
    overwritten on every timed run.

    Args:
        df: DataFrame with ``category``, ``value1`` and ``value2`` columns.
        repeats: Number of timed runs per engine.

    Returns:
        The result dict produced by ``build_result``.
    """
    create_stmt = (
        "CREATE OR REPLACE TEMP TABLE agg AS "
        "SELECT category, AVG(value1) AS mean_value1, "
        "SUM(value2) AS sum_value2 FROM df GROUP BY category;"
    )
    copy_stmt = "COPY agg TO 'duck_grouped.parquet' (FORMAT PARQUET);"

    def run_pandas():
        summary = df.groupby("category").agg(
            mean_value1=("value1", "mean"),
            sum_value2=("value2", "sum"),
        )
        summary.to_parquet("pandas_grouped.parquet")

    def run_duckdb():
        # Registration happens inside the timed call, so its cost is measured too.
        duckdb_con.register("df", df)
        for statement in (create_stmt, copy_stmt):
            duckdb_con.execute(statement)

    pandas_stats = time_function(run_pandas, repeats)
    duckdb_stats = time_function(run_duckdb, repeats)
    return build_result(*pandas_stats, *duckdb_stats)
301
+
302
+ # ----------------- 5. Operation Registry -----------------
303
+
304
# Registry of benchmarkable operations, keyed by the label shown in the UI.
# Each entry holds:
#   "sql"    - the DuckDB statement(s) displayed to the user,
#   "pandas" - the equivalent Pandas snippet displayed to the user,
#   "bench"  - the function that actually times both engines.
# The displayed snippets should mirror what the bench function executes.
OPERATIONS = {
    "Filter (simple WHERE)": {
        "sql": "SELECT * FROM df WHERE value1 > 0.5 AND category = 'cat_1';",
        "pandas": 'df[(df["value1"] > 0.5) & (df["category"] == "cat_1")]',
        "bench": bench_filter_simple,
    },
    "Filter (complex WHERE + date range)": {
        "sql": (
            "SELECT * FROM df\n"
            "WHERE value1 > 0\n"
            " AND value2 < 12\n"
            " AND date BETWEEN DATE '2020-03-01' AND DATE '2020-09-30';"
        ),
        "pandas": (
            'df[(df["value1"] > 0)\n'
            ' & (df["value2"] < 12)\n'
            ' & (df["date"].between("2020-03-01", "2020-09-30"))]'
        ),
        "bench": bench_filter_complex,
    },
    "Groupby (multi-agg)": {
        "sql": (
            "SELECT category,\n"
            " AVG(value1) AS mean_value1,\n"
            " SUM(value2) AS sum_value2,\n"
            " COUNT(*) AS cnt\n"
            "FROM df\n"
            "GROUP BY category;"
        ),
        "pandas": (
            'df.groupby("category").agg(\n'
            ' mean_value1=("value1", "mean"),\n'
            ' sum_value2=("value2", "sum"),\n'
            ' cnt=("id", "count"),\n'
            ")"
        ),
        "bench": bench_groupby_basic,
    },
    "Groupby + HAVING": {
        "sql": (
            "SELECT category,\n"
            " AVG(value1) AS mean_value1\n"
            "FROM df\n"
            "GROUP BY category\n"
            "HAVING AVG(value1) > 0;"
        ),
        "pandas": (
            'agg = df.groupby("category").agg(mean_value1=("value1", "mean"))\n'
            'agg[agg["mean_value1"] > 0]'
        ),
        "bench": bench_groupby_having,
    },
    "Join (fact ⨝ dim on category)": {
        # bench_join joins against ``dim_df`` (one row per category with a
        # random ``weight``), not against an AVG(value1) CTE, so the text
        # shown to the user describes that query.
        "sql": (
            "-- dim_df: one row per category with a random weight\n"
            "SELECT d.*, dim.weight\n"
            "FROM df d\n"
            "LEFT JOIN dim_df dim ON d.category = dim.category;"
        ),
        # NOTE(review): the Pandas side of bench_join is not visible in this
        # diff; presumably it is this merge against dim_df - confirm.
        "pandas": (
            "# dim_df: one row per category with a random weight\n"
            "df.merge(dim_df, on='category', how='left')"
        ),
        "bench": bench_join,
    },
    "Order By (value1 DESC, date ASC)": {
        "sql": "SELECT * FROM df ORDER BY value1 DESC, date ASC;",
        "pandas": 'df.sort_values(["value1", "date"], ascending=[False, True])',
        "bench": bench_order_by,
    },
    "Window: ROW_NUMBER() PARTITION BY category": {
        "sql": (
            "SELECT *,\n"
            " ROW_NUMBER() OVER (\n"
            " PARTITION BY category\n"
            " ORDER BY value1 DESC\n"
            " ) AS rn\n"
            "FROM df;"
        ),
        "pandas": (
            'temp = df.sort_values(["category", "value1"], ascending=[True, False])\n'
            'temp["rn"] = temp.groupby("category").cumcount() + 1'
        ),
        "bench": bench_window_row_number,
    },
    "Window: Running SUM(value1) OVER (ORDER BY date)": {
        "sql": (
            "SELECT *,\n"
            " SUM(COALESCE(value1, 0)) OVER (\n"
            " ORDER BY date\n"
            " ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW\n"
            " ) AS running_sum\n"
            "FROM df;"
        ),
        "pandas": (
            'temp = df.sort_values("date")\n'
            'temp["running_sum"] = temp["value1"].fillna(0).cumsum()'
        ),
        "bench": bench_window_running_total,
    },
    "Drop NULLs (value1 IS NOT NULL)": {
        "sql": "SELECT * FROM df WHERE value1 IS NOT NULL;",
        "pandas": 'df[df["value1"].notna()]',
        "bench": bench_drop_nulls,
    },
    "Fill NULLs (COALESCE(value1, 0))": {
        "sql": "SELECT COALESCE(value1, 0) AS value1_filled FROM df;",
        "pandas": 'df["value1"].fillna(0)',
        "bench": bench_fill_nulls,
    },
    "Distinct count (COUNT(DISTINCT category))": {
        "sql": "SELECT COUNT(DISTINCT category) AS distinct_categories FROM df;",
        "pandas": 'df["category"].nunique()',
        "bench": bench_distinct_count,
    },
    "Materialize Groupby → Parquet": {
        "sql": (
            "CREATE OR REPLACE TEMP TABLE agg AS\n"
            "SELECT category,\n"
            " AVG(value1) AS mean_value1,\n"
            " SUM(value2) AS sum_value2\n"
            "FROM df\n"
            "GROUP BY category;\n"
            "COPY agg TO 'duck_grouped.parquet' (FORMAT PARQUET);"
        ),
        "pandas": (
            'agg = df.groupby("category").agg(\n'
            ' mean_value1=("value1", "mean"),\n'
            ' sum_value2=("value2", "sum"),\n'
            ")\n"
            'agg.to_parquet("pandas_grouped.parquet")'
        ),
        "bench": bench_materialize_parquet,
    },
}
444
+
445
+ # ----------------- 6. Logic & Formatting -----------------
446
+
447
def run_benchmark(operation_label, df, repeats):
    """Run the benchmark registered under ``operation_label``.

    Args:
        operation_label: A key of ``OPERATIONS``.
        df: DataFrame handed to the benchmark function.
        repeats: Number of timed runs per engine.

    Returns:
        A ``(result, op_meta)`` tuple: the benchmark's result dict with an
        added ``"operation"`` key, and the registry entry for the operation.

    Raises:
        ValueError: If ``operation_label`` is not registered.
    """
    op_meta = OPERATIONS.get(operation_label)
    if op_meta is None:
        raise ValueError("Unknown operation: " + str(operation_label))

    result = op_meta["bench"](df, repeats)
    # Tag the result so the chart and the text report can title themselves.
    result["operation"] = operation_label
    return result, op_meta
455
 
456
def generate_chart(result):
    """Render the mean timings of both engines as a bar chart.

    Args:
        result: Result dict with ``pandas_mean_s`` and ``duckdb_mean_s``;
            ``operation`` is used as the title when present.

    Returns:
        A PIL image of the chart, decoded from an in-memory PNG.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        engines = ["Pandas", "DuckDB"]
        times = [result["pandas_mean_s"], result["duckdb_mean_s"]]
        colors = ["#1f77b4", "#ff7f0e"]
        ax.bar(engines, times, color=colors)
        ax.set_ylabel("Time (seconds)")
        ax.set_title(str(result.get("operation", "Benchmark Result")))
        # Print the exact value above each bar.
        for i, v in enumerate(times):
            ax.text(i, v, "{0:.4f}s".format(v), ha="center", va="bottom")

        buf = io.BytesIO()
        # Use the figure's own methods: the pyplot equivalents act on the
        # global "current figure", which another concurrent request may
        # have replaced in the meantime.
        fig.tight_layout()
        fig.savefig(buf, format="png", dpi=100)
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)
472
 
473
def format_result(result, op_meta):
    """Render a benchmark result as a plain-text report.

    Args:
        result: Result dict containing ``operation``, ``speedup``, the mean
            and std of each engine, and the raw run times.
        op_meta: Registry entry providing the ``sql`` and ``pandas`` snippets.

    Returns:
        A multi-line string with the timings, a verdict, the raw runs and
        both code snippets.
    """
    speed = result["speedup"]
    if speed is None or speed <= 0:
        verdict = "Speedup could not be computed."
    elif speed > 1:
        verdict = f"DuckDB is about {speed:.2f}x faster than Pandas."
    else:
        # A ratio at or below 1 means Pandas won; report the inverse factor.
        verdict = f"Pandas is about {1.0 / speed:.2f}x faster than DuckDB."

    sql_code = op_meta["sql"]
    pandas_code = op_meta["pandas"]

    raw_pandas = ", ".join(f"{t:.6f}" for t in result["raw_pandas_runs"])
    raw_duck = ", ".join(f"{t:.6f}" for t in result["raw_duckdb_runs"])

    report = [
        "Benchmark: " + str(result["operation"]),
        "",
        f"Pandas mean: {result['pandas_mean_s']:.6f} s (std {result['pandas_std_s']:.6f})",
        f"DuckDB mean: {result['duckdb_mean_s']:.6f} s (std {result['duckdb_std_s']:.6f})",
        "Verdict: " + verdict,
        "",
        "Raw timings (seconds):",
        " Pandas: [" + raw_pandas + "]",
        " DuckDB: [" + raw_duck + "]",
        "",
        "SQL (DuckDB):",
        sql_code,
        "",
        "Pandas equivalent:",
        pandas_code,
    ]
    return "\n".join(report)
516
+
517
+ # ----------------- 7. Gradio App -----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518
 
519
  theme = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")
520
 
521
with gr.Blocks(title="DuckDB vs Pandas — SQL Analytics Benchmark", theme=theme) as demo:
    # Page header shown above the tabs.
    gr.Markdown(
        "# DuckDB vs Pandas — SQL Analytics Benchmark\n\n"
        "Compare DuckDB (SQL) and Pandas (Python) on realistic analytics operations."
    )

    with gr.Tabs():
        # ---- Tab 1: benchmarks on generated data ----
        with gr.Tab("Synthetic Dataset Benchmarks"):
            with gr.Row():
                # Left column: inputs.
                with gr.Column(scale=1):
                    dataset_size = gr.Radio(
                        ["100k", "500k", "2M"],
                        value="100k",
                        label="Dataset Size (synthetic rows)",
                    )
                    operation_synth = gr.Dropdown(
                        choices=list(OPERATIONS),
                        value="Filter (simple WHERE)",
                        label="Operation",
                    )
                    repeats_synth = gr.Slider(
                        minimum=1,
                        maximum=7,
                        value=3,
                        step=1,
                        label="Timing repeats (average over N runs)",
                    )
                    btn_synth = gr.Button("Run Benchmark", variant="primary")

                # Right column: outputs.
                with gr.Column(scale=1):
                    out_chart_synth = gr.Image(label="Performance Chart", type="pil")
                    out_text_synth = gr.Textbox(label="Result", lines=20)

            def synthetic_runner(size, op, repeats):
                """Generate a dataset of the chosen size and benchmark ``op`` on it.

                Returns ``(chart, report)``; on any failure returns
                ``(None, traceback text)`` so the error shows up in the UI.
                """
                row_counts = {"100k": 100000, "500k": 500000, "2M": 2000000}
                try:
                    n_repeats = int(repeats)
                    frame = generate_data(row_counts[size])
                    outcome, meta = run_benchmark(op, frame, n_repeats)
                    image = generate_chart(outcome)
                    return image, format_result(outcome, meta)
                except Exception:
                    return None, "Error:\n" + traceback.format_exc()

            btn_synth.click(
                fn=synthetic_runner,
                inputs=[dataset_size, operation_synth, repeats_synth],
                outputs=[out_chart_synth, out_text_synth],
            )

        # ---- Tab 2: benchmarks on an uploaded file ----
        with gr.Tab("Custom Dataset Upload"):
            gr.Markdown(
                "Your file must contain these columns: id, category, value1, value2, date"
            )

            with gr.Row():
                # Left column: inputs.
                with gr.Column(scale=1):
                    file_in = gr.File(label="Upload CSV / Parquet / Arrow")
                    operation_custom = gr.Dropdown(
                        choices=list(OPERATIONS),
                        value="Filter (simple WHERE)",
                        label="Operation",
                    )
                    repeats_custom = gr.Slider(
                        minimum=1,
                        maximum=7,
                        value=3,
                        step=1,
                        label="Timing repeats",
                    )
                    btn_custom = gr.Button("Run Benchmark", variant="primary")

                # Right column: outputs.
                with gr.Column(scale=1):
                    out_chart_custom = gr.Image(label="Performance Chart", type="pil")
                    out_text_custom = gr.Textbox(label="Result", lines=20)

            def custom_runner(file, op, repeats):
                """Load the uploaded file, check its columns and benchmark ``op``.

                Returns ``(chart, report)``; on any failure returns
                ``(None, traceback text)`` so the error shows up in the UI.
                """
                required = {"id", "category", "value1", "value2", "date"}
                try:
                    n_repeats = int(repeats)
                    frame = load_custom_dataset(file)
                    missing = required - set(frame.columns)
                    if missing:
                        raise ValueError("Missing columns: " + str(sorted(missing)))
                    outcome, meta = run_benchmark(op, frame, n_repeats)
                    image = generate_chart(outcome)
                    return image, format_result(outcome, meta)
                except Exception:
                    return None, "Error:\n" + traceback.format_exc()

            btn_custom.click(
                fn=custom_runner,
                inputs=[file_in, operation_custom, repeats_custom],
                outputs=[out_chart_custom, out_text_custom],
            )
616
 
617
  if __name__ == "__main__":