PraneshJs committed on
Commit
b91eb38
·
verified ·
1 Parent(s): 9cdc1a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -72
app.py CHANGED
@@ -6,6 +6,7 @@ import gradio as gr
6
  import matplotlib.pyplot as plt
7
  from PIL import Image
8
  import io
 
9
 
10
  duckdb_con = duckdb.connect(database=":memory:")
11
 
@@ -33,6 +34,7 @@ def generate_data(n_rows: int, n_groups: int = 50) -> pd.DataFrame:
33
  # ----------------------------------------------------------
34
 
35
  def time_function(fn, repeats=3):
 
36
  times = []
37
  for _ in range(repeats):
38
  start = time.perf_counter()
@@ -43,45 +45,37 @@ def time_function(fn, repeats=3):
43
 
44
 
45
  # ----------------------------------------------------------
46
- # Benchmark Operations
47
  # ----------------------------------------------------------
48
 
 
49
  def bench_filter(df, repeats=3):
50
  def pandas_op():
51
  _ = df[(df["value1"] > 0.5) & (df["category"] == df["category"].iloc[0])]
52
 
53
  def duckdb_op():
54
  duckdb_con.register("df", df)
55
- q = f"""
56
- SELECT *
57
- FROM df
58
- WHERE value1 > 0.5
59
- AND category = '{df['category'].iloc[0]}'
60
- """
61
- _ = duckdb_con.execute(q).fetchdf()
62
 
63
  p_mean, p_std, p_all = time_function(pandas_op, repeats)
64
  d_mean, d_std, d_all = time_function(duckdb_op, repeats)
65
 
66
- return {
67
- "operation": "Filter rows with comparison",
68
- "pandas_mean_s": p_mean,
69
- "pandas_std_s": p_std,
70
- "duckdb_mean_s": d_mean,
71
- "duckdb_std_s": d_std,
72
- "speedup": p_mean / d_mean if d_mean > 0 else None,
73
- "raw_pandas_runs": p_all,
74
- "raw_duckdb_runs": d_all,
75
- }
76
 
77
 
 
78
  def bench_groupby(df, repeats=3):
79
  def pandas_op():
80
  _ = df.groupby("category")[["value1", "value2"]].mean()
81
 
82
  def duckdb_op():
83
  duckdb_con.register("df", df)
84
- _ = duckdb_con.execute("""
85
  SELECT category, AVG(value1), AVG(value2)
86
  FROM df GROUP BY category
87
  """).fetchdf()
@@ -89,18 +83,10 @@ def bench_groupby(df, repeats=3):
89
  p_mean, p_std, p_all = time_function(pandas_op, repeats)
90
  d_mean, d_std, d_all = time_function(duckdb_op, repeats)
91
 
92
- return {
93
- "operation": "Groupby mean",
94
- "pandas_mean_s": p_mean,
95
- "pandas_std_s": p_std,
96
- "duckdb_mean_s": d_mean,
97
- "duckdb_std_s": d_std,
98
- "speedup": p_mean / d_mean if d_mean > 0 else None,
99
- "raw_pandas_runs": p_all,
100
- "raw_duckdb_runs": d_all,
101
- }
102
 
103
 
 
104
  def bench_join(df, repeats=3):
105
  categories = df["category"].unique()
106
  rng = np.random.default_rng(123)
@@ -114,7 +100,7 @@ def bench_join(df, repeats=3):
114
  def duckdb_op():
115
  duckdb_con.register("df", df)
116
  duckdb_con.register("dim_df", dim_df)
117
- _ = duckdb_con.execute("""
118
  SELECT d.*, dim.weight
119
  FROM df d
120
  LEFT JOIN dim_df dim
@@ -124,13 +110,66 @@ def bench_join(df, repeats=3):
124
  p_mean, p_std, p_all = time_function(pandas_op, repeats)
125
  d_mean, d_std, d_all = time_function(duckdb_op, repeats)
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  return {
128
- "operation": "Left Join",
129
  "pandas_mean_s": p_mean,
130
  "pandas_std_s": p_std,
131
  "duckdb_mean_s": d_mean,
132
  "duckdb_std_s": d_std,
133
- "speedup": p_mean / d_mean if d_mean > 0 else None,
134
  "raw_pandas_runs": p_all,
135
  "raw_duckdb_runs": d_all,
136
  }
@@ -140,18 +179,19 @@ def bench_join(df, repeats=3):
140
  # Benchmark Dispatcher
141
  # ----------------------------------------------------------
142
 
143
- def run_benchmark(operation, df, repeats):
144
- if operation == "Filter":
145
- return bench_filter(df, repeats)
146
- if operation == "Groupby":
147
- return bench_groupby(df, repeats)
148
- if operation == "Join":
149
- return bench_join(df, repeats)
150
- return None
 
151
 
152
 
153
  # ----------------------------------------------------------
154
- # Chart generator (PIL Image) — FIXED
155
  # ----------------------------------------------------------
156
 
157
  def generate_chart(result):
@@ -162,7 +202,7 @@ def generate_chart(result):
162
 
163
  ax.bar(engines, times)
164
  ax.set_ylabel("Time (seconds)")
165
- ax.set_title("Pandas vs DuckDB Performance")
166
 
167
  buf = io.BytesIO()
168
  plt.tight_layout()
@@ -174,7 +214,7 @@ def generate_chart(result):
174
 
175
 
176
  # ----------------------------------------------------------
177
- # Markdown Formatting
178
  # ----------------------------------------------------------
179
 
180
  def format_result(result):
@@ -204,6 +244,20 @@ def format_result(result):
204
  return md
205
 
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  # ----------------------------------------------------------
208
  # Gradio App
209
  # ----------------------------------------------------------
@@ -216,62 +270,74 @@ with gr.Blocks(title="DuckDB vs Pandas Benchmark", theme=theme) as demo:
216
 
217
  with gr.Tabs():
218
 
219
- # ---------------------- Synthetic Mode ----------------------
 
 
220
  with gr.Tab("🔥 Synthetic Dataset Benchmarks"):
221
- dataset_size = gr.Radio(["100k", "500k", "2M"], label="Dataset Size", value="100k")
222
- operation_synth = gr.Radio(["Filter", "Groupby", "Join"], label="Operation")
 
 
 
 
 
 
 
223
  repeats_synth = gr.Slider(1, 7, value=3, label="Repeats")
224
- synth_btn = gr.Button("🚀 Run Benchmark")
225
 
226
- synth_output = gr.Markdown()
227
- synth_chart = gr.Image(label="Performance Chart")
 
 
228
 
229
  def synthetic_runner(size, operation, repeats):
 
230
  n = {"100k": 100_000, "500k": 500_000, "2M": 2_000_000}[size]
 
231
  df = generate_data(n)
232
  result = run_benchmark(operation, df, repeats)
233
  chart = generate_chart(result)
 
234
  return format_result(result), chart
235
 
236
- synth_btn.click(
237
  synthetic_runner,
238
  [dataset_size, operation_synth, repeats_synth],
239
- [synth_output, synth_chart],
240
  )
241
 
242
- # ---------------------- Custom Dataset Mode ----------------------
 
 
 
243
  with gr.Tab("📁 Custom Dataset Upload"):
244
 
245
- file_input = gr.File(label="Upload a CSV / Parquet / Arrow file")
246
- operation_custom = gr.Radio(["Filter", "Groupby", "Join"], label="Operation")
 
 
 
 
 
 
247
  repeats_custom = gr.Slider(1, 7, value=3, label="Repeats")
248
 
249
- custom_btn = gr.Button("Run on Uploaded Dataset")
250
- custom_out = gr.Markdown()
251
- custom_chart = gr.Image(label="Performance Chart")
252
 
253
- def load_custom_dataset(file):
254
- if file.name.endswith(".csv"):
255
- return pd.read_csv(file.name)
256
- elif file.name.endswith(".parquet"):
257
- return pd.read_parquet(file.name)
258
- elif file.name.endswith(".arrow"):
259
- return pd.read_feather(file.name)
260
- else:
261
- raise ValueError("Unsupported format")
262
 
263
  def custom_runner(file, operation, repeats):
 
264
  df = load_custom_dataset(file)
265
  result = run_benchmark(operation, df, repeats)
266
- chart = generate_chart(result)
267
- return format_result(result), chart
268
 
269
- custom_btn.click(
270
  custom_runner,
271
- [file_input, operation_custom, repeats_custom],
272
- [custom_out, custom_chart],
273
  )
274
 
275
-
276
  if __name__ == "__main__":
277
  demo.launch()
 
6
  import matplotlib.pyplot as plt
7
  from PIL import Image
8
  import io
9
+ import os
10
 
11
  duckdb_con = duckdb.connect(database=":memory:")
12
 
 
34
  # ----------------------------------------------------------
35
 
36
  def time_function(fn, repeats=3):
37
+ repeats = int(repeats)
38
  times = []
39
  for _ in range(repeats):
40
  start = time.perf_counter()
 
45
 
46
 
47
  # ----------------------------------------------------------
48
+ # Benchmark Operations (Compute + I/O)
49
  # ----------------------------------------------------------
50
 
51
+ # ---- Filter ----
def bench_filter(df, repeats=3):
    """Time a two-condition row filter in pandas and in DuckDB.

    Rows are kept when ``value1 > 0.5`` and ``category`` equals the category
    of the first row of ``df``.

    Args:
        df: DataFrame with ``value1`` and ``category`` columns.
        repeats: Number of timed runs per engine.

    Returns:
        The result dictionary produced by ``build_result``.
    """
    def pandas_op():
        _ = df[(df["value1"] > 0.5) & (df["category"] == df["category"].iloc[0])]

    def duckdb_op():
        duckdb_con.register("df", df)
        target = df["category"].iloc[0]
        # NumPy scalars expose .item(); convert to a plain Python value so it
        # can be bound as a query parameter.
        if hasattr(target, "item"):
            target = target.item()
        # Bind the category as a parameter instead of splicing it into the
        # SQL text: a value containing a quote would otherwise break the
        # query, and uploaded datasets are untrusted input.
        duckdb_con.execute(
            """
            SELECT *
            FROM df
            WHERE value1 > 0.5
            AND category = ?
            """,
            [target],
        ).fetchdf()

    p_mean, p_std, p_all = time_function(pandas_op, repeats)
    d_mean, d_std, d_all = time_function(duckdb_op, repeats)

    return build_result("Filter rows", p_mean, p_std, p_all, d_mean, d_std, d_all)
 
 
 
 
 
 
 
 
 
69
 
70
 
71
+ # ---- Groupby ----
72
  def bench_groupby(df, repeats=3):
73
  def pandas_op():
74
  _ = df.groupby("category")[["value1", "value2"]].mean()
75
 
76
  def duckdb_op():
77
  duckdb_con.register("df", df)
78
+ duckdb_con.execute("""
79
  SELECT category, AVG(value1), AVG(value2)
80
  FROM df GROUP BY category
81
  """).fetchdf()
 
83
  p_mean, p_std, p_all = time_function(pandas_op, repeats)
84
  d_mean, d_std, d_all = time_function(duckdb_op, repeats)
85
 
86
+ return build_result("Groupby mean", p_mean, p_std, p_all, d_mean, d_std, d_all)
 
 
 
 
 
 
 
 
 
87
 
88
 
89
+ # ---- Join ----
90
  def bench_join(df, repeats=3):
91
  categories = df["category"].unique()
92
  rng = np.random.default_rng(123)
 
100
  def duckdb_op():
101
  duckdb_con.register("df", df)
102
  duckdb_con.register("dim_df", dim_df)
103
+ duckdb_con.execute("""
104
  SELECT d.*, dim.weight
105
  FROM df d
106
  LEFT JOIN dim_df dim
 
110
  p_mean, p_std, p_all = time_function(pandas_op, repeats)
111
  d_mean, d_std, d_all = time_function(duckdb_op, repeats)
112
 
113
+ return build_result("Join on category", p_mean, p_std, p_all, d_mean, d_std, d_all)
114
+
115
+
116
+ # ---- Read CSV ----
def bench_read_csv(temp_csv_path, repeats=3):
    """Time reading a CSV file into a DataFrame with pandas and with DuckDB.

    Args:
        temp_csv_path: Path of the CSV file to read.
        repeats: Number of timed runs per engine.

    Returns:
        The result dictionary produced by ``build_result``.
    """
    def pandas_op():
        _ = pd.read_csv(temp_csv_path)

    def duckdb_op():
        # The reader returns a lazy relation; .df() forces the scan and
        # converts to pandas so both engines are timed on a full read.
        _ = duckdb.read_csv(temp_csv_path).df()

    p_mean, p_std, p_all = time_function(pandas_op, repeats)
    d_mean, d_std, d_all = time_function(duckdb_op, repeats)

    return build_result("Read CSV", p_mean, p_std, p_all, d_mean, d_std, d_all)
128
+
129
+
130
+ # ---- Read Parquet ----
def bench_read_parquet(temp_parquet_path, repeats=3):
    """Time reading a Parquet file into a DataFrame with pandas and with DuckDB.

    Args:
        temp_parquet_path: Path of the Parquet file to read.
        repeats: Number of timed runs per engine.

    Returns:
        The result dictionary produced by ``build_result``.
    """
    def pandas_op():
        _ = pd.read_parquet(temp_parquet_path)

    def duckdb_op():
        # The reader returns a lazy relation; .df() forces the scan and
        # converts to pandas so both engines are timed on a full read.
        _ = duckdb.read_parquet(temp_parquet_path).df()

    p_mean, p_std, p_all = time_function(pandas_op, repeats)
    d_mean, d_std, d_all = time_function(duckdb_op, repeats)

    return build_result("Read Parquet", p_mean, p_std, p_all, d_mean, d_std, d_all)
142
+
143
+
144
+ # ---- Write Parquet ----
def bench_write_parquet(df, repeats=3):
    """Time writing ``df`` to a Parquet file with pandas and with DuckDB.

    Output files are written into a temporary directory that is removed when
    the benchmark finishes, so repeated or concurrent runs neither collide on
    a shared filename nor leave files behind in the working directory.

    Args:
        df: DataFrame to write.
        repeats: Number of timed runs per engine.

    Returns:
        The result dictionary produced by ``build_result``.
    """
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        pandas_path = os.path.join(tmp_dir, "temp_pd.parquet")
        duck_path = os.path.join(tmp_dir, "temp_duck.parquet")
        # The path is embedded in a SQL string literal: use forward slashes
        # and double any single quote so the literal stays well-formed.
        duck_literal = duck_path.replace("\\", "/").replace("'", "''")

        def pandas_op():
            df.to_parquet(pandas_path)

        def duckdb_op():
            duckdb_con.register("df", df)
            duckdb_con.execute(f"COPY df TO '{duck_literal}' (FORMAT PARQUET)")

        p_mean, p_std, p_all = time_function(pandas_op, repeats)
        d_mean, d_std, d_all = time_function(duckdb_op, repeats)

    return build_result("Write Parquet", p_mean, p_std, p_all, d_mean, d_std, d_all)
157
+
158
+
159
+ # ----------------------------------------------------------
160
+ # Shared result formatting
161
+ # ----------------------------------------------------------
162
+
def build_result(op_name, p_mean, p_std, p_all, d_mean, d_std, d_all):
    """Bundle the timing statistics of one benchmark into a dictionary.

    Args:
        op_name: Label stored under ``"operation"``.
        p_mean, p_std, p_all: pandas mean, standard deviation and raw runs.
        d_mean, d_std, d_all: DuckDB mean, standard deviation and raw runs.

    Returns:
        A dict with the statistics plus ``"speedup"`` (pandas mean divided by
        DuckDB mean), which is ``None`` when the DuckDB mean is not positive.
    """
    # Avoid dividing by a zero (or otherwise non-positive) DuckDB mean.
    if d_mean > 0:
        ratio = p_mean / d_mean
    else:
        ratio = None

    summary = {}
    summary["operation"] = op_name
    summary["pandas_mean_s"] = p_mean
    summary["pandas_std_s"] = p_std
    summary["duckdb_mean_s"] = d_mean
    summary["duckdb_std_s"] = d_std
    summary["speedup"] = ratio
    summary["raw_pandas_runs"] = p_all
    summary["raw_duckdb_runs"] = d_all
    return summary
 
179
  # Benchmark Dispatcher
180
  # ----------------------------------------------------------
181
 
def run_benchmark(operation, df=None, repeats=3):
    """Run the benchmark named by ``operation`` against ``df``.

    Args:
        operation: One of ``"Filter"``, ``"Groupby"``, ``"Join"`` or
            ``"Write Parquet"``.
        df: DataFrame to benchmark on. Required by every supported operation.
        repeats: Number of timed runs per engine; coerced to ``int``.

    Returns:
        The result dictionary of the selected benchmark.

    Raises:
        ValueError: If ``operation`` is not supported, or if ``df`` is None.
    """
    repeats = int(repeats)

    supported = ("Filter", "Groupby", "Join", "Write Parquet")
    if operation not in supported:
        raise ValueError(f"Unsupported operation: {operation}")

    # df defaults to None, but every operation needs data; fail here with a
    # clear message rather than deep inside a benchmark.
    if df is None:
        raise ValueError(f"Operation {operation!r} requires a DataFrame, but df is None")

    if operation == "Filter":
        return bench_filter(df, repeats)
    if operation == "Groupby":
        return bench_groupby(df, repeats)
    if operation == "Join":
        return bench_join(df, repeats)
    return bench_write_parquet(df, repeats)
191
 
192
 
193
  # ----------------------------------------------------------
194
+ # Chart generator (PIL Image)
195
  # ----------------------------------------------------------
196
 
197
  def generate_chart(result):
 
202
 
203
  ax.bar(engines, times)
204
  ax.set_ylabel("Time (seconds)")
205
+ ax.set_title(result["operation"])
206
 
207
  buf = io.BytesIO()
208
  plt.tight_layout()
 
214
 
215
 
216
  # ----------------------------------------------------------
217
+ # Markdown result
218
  # ----------------------------------------------------------
219
 
220
  def format_result(result):
 
244
  return md
245
 
246
 
247
+ # ----------------------------------------------------------
248
+ # Helper to load custom dataset
249
+ # ----------------------------------------------------------
250
+
def load_custom_dataset(file):
    """Load an uploaded dataset into a pandas DataFrame.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            upload object exposing the path as its ``name`` attribute.

    Returns:
        The DataFrame read from the file.

    Raises:
        ValueError: If no file was provided, or the extension is not
            ``.csv``, ``.parquet`` or ``.arrow`` (matched case-insensitively).
    """
    import os

    if file is None:
        raise ValueError("No file provided")

    # Accept plain paths as well as upload objects carrying the path in .name.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    # Compare on a lower-cased copy so "DATA.CSV" is recognised too.
    lowered = path.lower()
    if lowered.endswith(".csv"):
        return pd.read_csv(path)
    if lowered.endswith(".parquet"):
        return pd.read_parquet(path)
    if lowered.endswith(".arrow"):
        return pd.read_feather(path)
    raise ValueError("Unsupported file format")
259
+
260
+
261
  # ----------------------------------------------------------
262
  # Gradio App
263
  # ----------------------------------------------------------
 
270
 
271
  with gr.Tabs():
272
 
273
+ # ==================================================
274
+ # 🔥 Synthetic Mode
275
+ # ==================================================
276
  with gr.Tab("🔥 Synthetic Dataset Benchmarks"):
277
+
278
+ dataset_size = gr.Radio(["100k", "500k", "2M"], value="100k", label="Dataset Size")
279
+
280
+ operation_synth = gr.Radio(
281
+ ["Filter", "Groupby", "Join", "Write Parquet"],
282
+ label="Operation",
283
+ value="Filter"
284
+ )
285
+
286
  repeats_synth = gr.Slider(1, 7, value=3, label="Repeats")
 
287
 
288
+ btn_synth = gr.Button("🚀 Run Benchmark")
289
+
290
+ out_md_synth = gr.Markdown()
291
+ out_chart_synth = gr.Image()
292
 
293
  def synthetic_runner(size, operation, repeats):
294
+ repeats = int(repeats)
295
  n = {"100k": 100_000, "500k": 500_000, "2M": 2_000_000}[size]
296
+
297
  df = generate_data(n)
298
  result = run_benchmark(operation, df, repeats)
299
  chart = generate_chart(result)
300
+
301
  return format_result(result), chart
302
 
303
+ btn_synth.click(
304
  synthetic_runner,
305
  [dataset_size, operation_synth, repeats_synth],
306
+ [out_md_synth, out_chart_synth],
307
  )
308
 
309
+
310
+ # ==================================================
311
+ # 📁 Custom Dataset Mode
312
+ # ==================================================
313
  with gr.Tab("📁 Custom Dataset Upload"):
314
 
315
+ file_in = gr.File(label="Upload CSV / Parquet / Arrow")
316
+
317
+ operation_custom = gr.Radio(
318
+ ["Filter", "Groupby", "Join", "Write Parquet"],
319
+ label="Operation",
320
+ value="Filter"
321
+ )
322
+
323
  repeats_custom = gr.Slider(1, 7, value=3, label="Repeats")
324
 
325
+ btn_custom = gr.Button("Run on Uploaded Dataset")
 
 
326
 
327
+ out_md_custom = gr.Markdown()
328
+ out_chart_custom = gr.Image()
 
 
 
 
 
 
 
329
 
330
  def custom_runner(file, operation, repeats):
331
+ repeats = int(repeats)
332
  df = load_custom_dataset(file)
333
  result = run_benchmark(operation, df, repeats)
334
+ return format_result(result), generate_chart(result)
 
335
 
336
+ btn_custom.click(
337
  custom_runner,
338
+ [file_in, operation_custom, repeats_custom],
339
+ [out_md_custom, out_chart_custom],
340
  )
341
 
 
342
  if __name__ == "__main__":
343
  demo.launch()