PraneshJs committed on
Commit
3f19c1a
·
verified ·
1 Parent(s): 946ad0f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +274 -0
app.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import numpy as np
3
+ import pandas as pd
4
+ import duckdb
5
+ import gradio as gr
6
+ import matplotlib.pyplot as plt
7
+ import io
8
+
9
+ duckdb_con = duckdb.connect(database=":memory:")
10
+
11
+ # ----------------------------------------------------------
12
+ # Synthetic Data Generator
13
+ # ----------------------------------------------------------
14
+
15
def generate_data(n_rows: int, n_groups: int = 50, seed: int = 42) -> pd.DataFrame:
    """Build a reproducible synthetic table used by the benchmarks.

    Args:
        n_rows: Number of rows to generate.
        n_groups: Number of distinct ``cat_<i>`` labels to draw from.
        seed: Seed for the NumPy random generator. The default (42) keeps the
            output identical to earlier versions that hard-coded the seed.

    Returns:
        A DataFrame with the columns ``id`` (0..n_rows-1), ``category``
        (``"cat_<i>"`` strings), ``value1`` (normal, mean 0, std 1),
        ``value2`` (normal, mean 10, std 5) and ``date`` (a day offset of
        0..364 added to 2020-01-01).
    """
    rng = np.random.default_rng(seed)

    # NOTE: the order of the draws below determines the generated values for a
    # given seed; keep it stable so results stay reproducible.
    ids = np.arange(n_rows)
    group_codes = rng.integers(0, n_groups, size=n_rows)
    # dtype=str keeps the column textual even when n_rows == 0 (an empty list
    # would otherwise be inferred as float64).
    categories = np.array([f"cat_{code}" for code in group_codes], dtype=str)
    value1 = rng.normal(0, 1, size=n_rows)
    value2 = rng.normal(10, 5, size=n_rows)
    start_date = np.datetime64("2020-01-01")
    dates = start_date + rng.integers(0, 365, size=n_rows).astype("timedelta64[D]")

    return pd.DataFrame(
        {
            "id": ids,
            "category": categories,
            "value1": value1,
            "value2": value2,
            "date": dates,
        }
    )
28
+
29
+
30
+ # ----------------------------------------------------------
31
+ # Timing utility
32
+ # ----------------------------------------------------------
33
+
34
def time_function(fn, repeats=3):
    """Call ``fn`` several times and report wall-clock statistics.

    Args:
        fn: Zero-argument callable to time.
        repeats: Number of timed calls. Integral floats such as ``3.0`` (which
            UI sliders may deliver) are accepted and truncated to ``int``.

    Returns:
        A tuple ``(mean, std, times)`` where ``times`` is the list of
        per-call durations in seconds measured with ``time.perf_counter``.

    Raises:
        ValueError: If ``repeats`` is smaller than 1, since the statistics
            would otherwise be NaN.
    """
    n_runs = int(repeats)
    if n_runs < 1:
        raise ValueError(f"repeats must be at least 1, got {repeats!r}")

    times = []
    for _ in range(n_runs):
        start = time.perf_counter()
        fn()
        times.append(time.perf_counter() - start)
    return np.mean(times), np.std(times), times
42
+
43
+
44
+ # ----------------------------------------------------------
45
+ # Benchmark Operations
46
+ # ----------------------------------------------------------
47
+
48
def bench_filter(df, repeats=3):
    """Time a two-condition row filter in pandas and in DuckDB.

    Both engines keep rows where ``value1 > 0.5`` and ``category`` equals the
    category of the first row of ``df``.

    Args:
        df: DataFrame with at least the columns ``value1`` and ``category``.
        repeats: Number of timed runs per engine (forwarded to
            ``time_function``).

    Returns:
        A dict with the operation label, mean/std seconds for each engine,
        the pandas/DuckDB ``speedup`` ratio (``None`` when the DuckDB mean is
        not positive) and the raw per-run timings.

    Raises:
        IndexError: If ``df`` has no rows (there is no first category).
        KeyError: If a required column is missing.
    """
    # Resolve the comparison value once so both engines filter on exactly the
    # same value and the lookup itself is not part of either measurement.
    target = df["category"].iloc[0]
    if isinstance(target, np.generic):
        # Hand DuckDB a plain Python scalar rather than a NumPy scalar.
        target = target.item()

    def pandas_op():
        _ = df[(df["value1"] > 0.5) & (df["category"] == target)]

    def duckdb_op():
        duckdb_con.register("df", df)
        # Bind the value as a parameter instead of splicing it into the SQL
        # text: category values containing quotes (e.g. from an uploaded file)
        # would otherwise break or alter the query.
        q = """
            SELECT *
            FROM df
            WHERE value1 > 0.5
              AND category = ?
        """
        _ = duckdb_con.execute(q, [target]).fetchdf()

    p_mean, p_std, p_all = time_function(pandas_op, repeats)
    d_mean, d_std, d_all = time_function(duckdb_op, repeats)

    return {
        "operation": "Filter rows with comparison",
        "pandas_mean_s": p_mean,
        "pandas_std_s": p_std,
        "duckdb_mean_s": d_mean,
        "duckdb_std_s": d_std,
        "speedup": p_mean / d_mean if d_mean > 0 else None,
        "raw_pandas_runs": p_all,
        "raw_duckdb_runs": d_all,
    }
75
+
76
+
77
def bench_groupby(df, repeats=3):
    """Time a per-category mean of ``value1`` and ``value2`` in both engines.

    Args:
        df: DataFrame with the columns ``category``, ``value1`` and ``value2``.
        repeats: Number of timed runs per engine.

    Returns:
        A dict with the operation label, mean/std seconds for each engine,
        the pandas/DuckDB ``speedup`` ratio (``None`` when the DuckDB mean is
        not positive) and the raw per-run timings.
    """
    query = """
        SELECT category, AVG(value1), AVG(value2)
        FROM df GROUP BY category
    """

    def run_pandas():
        df.groupby("category")[["value1", "value2"]].mean()

    def run_duckdb():
        # The frame is (re-)registered on every run, so that cost is included
        # in the DuckDB timing.
        duckdb_con.register("df", df)
        duckdb_con.execute(query).fetchdf()

    pandas_avg, pandas_sd, pandas_runs = time_function(run_pandas, repeats)
    duckdb_avg, duckdb_sd, duckdb_runs = time_function(run_duckdb, repeats)

    if duckdb_avg > 0:
        ratio = pandas_avg / duckdb_avg
    else:
        ratio = None

    summary = {"operation": "Groupby mean"}
    summary["pandas_mean_s"] = pandas_avg
    summary["pandas_std_s"] = pandas_sd
    summary["duckdb_mean_s"] = duckdb_avg
    summary["duckdb_std_s"] = duckdb_sd
    summary["speedup"] = ratio
    summary["raw_pandas_runs"] = pandas_runs
    summary["raw_duckdb_runs"] = duckdb_runs
    return summary
101
+
102
+
103
def bench_join(df, repeats=3):
    """Time a left join of ``df`` against a small per-category lookup table.

    The lookup table has one row per distinct ``category`` in ``df`` and a
    ``weight`` column drawn uniformly from [0.5, 2.0) with a fixed seed (123).

    Args:
        df: DataFrame with a ``category`` column.
        repeats: Number of timed runs per engine.

    Returns:
        A dict with the operation label, mean/std seconds for each engine,
        the pandas/DuckDB ``speedup`` ratio (``None`` when the DuckDB mean is
        not positive) and the raw per-run timings.
    """
    distinct_categories = df["category"].unique()
    weight_rng = np.random.default_rng(123)
    weights = weight_rng.uniform(0.5, 2.0, len(distinct_categories))
    dim_df = pd.DataFrame({"category": distinct_categories, "weight": weights})

    join_sql = """
        SELECT d.*, dim.weight
        FROM df d
        LEFT JOIN dim_df dim
        ON d.category = dim.category
    """

    def run_pandas():
        df.merge(dim_df, on="category", how="left")

    def run_duckdb():
        # Both frames are registered on every run, so that cost is part of
        # the DuckDB timing.
        duckdb_con.register("df", df)
        duckdb_con.register("dim_df", dim_df)
        duckdb_con.execute(join_sql).fetchdf()

    pandas_avg, pandas_sd, pandas_runs = time_function(run_pandas, repeats)
    duckdb_avg, duckdb_sd, duckdb_runs = time_function(run_duckdb, repeats)

    if duckdb_avg > 0:
        ratio = pandas_avg / duckdb_avg
    else:
        ratio = None

    summary = {"operation": "Left Join"}
    summary["pandas_mean_s"] = pandas_avg
    summary["pandas_std_s"] = pandas_sd
    summary["duckdb_mean_s"] = duckdb_avg
    summary["duckdb_std_s"] = duckdb_sd
    summary["speedup"] = ratio
    summary["raw_pandas_runs"] = pandas_runs
    summary["raw_duckdb_runs"] = duckdb_runs
    return summary
136
+
137
+
138
+ # ----------------------------------------------------------
139
+ # Benchmark Dispatcher
140
+ # ----------------------------------------------------------
141
+
142
def run_benchmark(operation, df, repeats):
    """Route a benchmark request to the matching ``bench_*`` function.

    Args:
        operation: One of ``"Filter"``, ``"Groupby"`` or ``"Join"``.
        df: DataFrame handed to the selected benchmark.
        repeats: Number of timed runs per engine.

    Returns:
        The result dict of the selected benchmark, or ``None`` when
        ``operation`` is not one of the known names.
    """
    # Guard clause: anything unrecognised yields None, as before.
    if operation not in ("Filter", "Groupby", "Join"):
        return None

    if operation == "Join":
        selected = bench_join
    elif operation == "Groupby":
        selected = bench_groupby
    else:
        selected = bench_filter
    return selected(df, repeats)
150
+
151
+
152
+ # ----------------------------------------------------------
153
+ # Chart generator (NEW)
154
+ # ----------------------------------------------------------
155
+
156
def generate_chart(result):
    """Render a two-bar chart of the mean run time per engine as a PNG.

    Args:
        result: Benchmark result dict; only ``pandas_mean_s`` and
            ``duckdb_mean_s`` are read.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 that contains the PNG bytes.
    """
    # NOTE(review): callers pass the returned buffer straight to gr.Image;
    # confirm that the installed Gradio version accepts a BytesIO there.
    fig, ax = plt.subplots(figsize=(4, 3))
    buf = io.BytesIO()
    try:
        engines = ["Pandas", "DuckDB"]
        times = [result["pandas_mean_s"], result["duckdb_mean_s"]]

        ax.bar(engines, times)
        ax.set_ylabel("Time (seconds)")
        ax.set_title("Pandas vs DuckDB Performance")

        # Use the figure's own methods rather than pyplot's "current figure",
        # so a chart rendered for another request cannot be saved by mistake.
        fig.tight_layout()
        fig.savefig(buf, format="png")
    finally:
        # Without closing, pyplot keeps every figure alive and memory grows
        # with each benchmark run.
        plt.close(fig)

    buf.seek(0)
    return buf
171
+
172
+
173
+ # ----------------------------------------------------------
174
+ # Formatting Results
175
+ # ----------------------------------------------------------
176
+
177
def format_result(result):
    """Render a benchmark result dict as a Markdown report.

    Args:
        result: Dict with the keys ``operation``, ``pandas_mean_s``,
            ``pandas_std_s``, ``duckdb_mean_s``, ``duckdb_std_s``,
            ``speedup``, ``raw_pandas_runs`` and ``raw_duckdb_runs``.

    Returns:
        A Markdown string with a timing table, a verdict line and a
        collapsible list of the raw timings.
    """
    speed = result["speedup"]

    # ``speedup`` is None when the DuckDB mean was not positive, and may be 0
    # when the pandas mean was 0; neither supports a meaningful ratio (and
    # 1/0 would fail), so report that instead of crashing. ``not speed > 0``
    # also catches NaN.
    if speed is None or not speed > 0:
        verdict = "**Inconclusive: timings were too small to compare**"
    elif speed > 1:
        verdict = f"🚀 **DuckDB is ~{speed:.2f}× faster**"
    else:
        verdict = f"🐼 **Pandas is ~{1 / speed:.2f}× faster**"

    pandas_runs = [round(x, 6) for x in result["raw_pandas_runs"]]
    duckdb_runs = [round(x, 6) for x in result["raw_duckdb_runs"]]

    # Lines are joined explicitly so the Markdown is flush-left regardless of
    # how this function is indented.
    lines = [
        "",
        f"### 🔬 Benchmark Result — {result['operation']}",
        "",
        "| Engine | Mean (s) | Std (s) |",
        "|--------|----------|---------|",
        f"| Pandas | `{result['pandas_mean_s']:.6f}` | `{result['pandas_std_s']:.6f}` |",
        f"| DuckDB | `{result['duckdb_mean_s']:.6f}` | `{result['duckdb_std_s']:.6f}` |",
        "",
        f"**Verdict:** {verdict}",
        "",
        "<details><summary>Raw timings</summary>",
        "",
        f"- Pandas: `{pandas_runs}`",
        f"- DuckDB: `{duckdb_runs}`",
        "</details>",
        "",
    ]
    return "\n".join(lines)
202
+
203
+
204
+ # ----------------------------------------------------------
205
+ # Gradio App
206
+ # ----------------------------------------------------------
207
+
208
theme = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")

with gr.Blocks(title="DuckDB vs Pandas Benchmark", theme=theme) as demo:

    gr.Markdown("# 🐼 vs 🦆 DuckDB vs Pandas Performance Playground")

    with gr.Tabs():

        # ---------------------- Synthetic Mode ----------------------
        with gr.Tab("🔥 Synthetic Dataset Benchmarks"):
            dataset_size = gr.Radio(["100k", "500k", "2M"], label="Dataset Size", value="100k")
            operation_synth = gr.Radio(["Filter", "Groupby", "Join"], label="Operation")
            # step=1 keeps the slider on whole numbers; fractional repeat
            # counts cannot be used to drive a loop.
            repeats_synth = gr.Slider(1, 7, value=3, step=1, label="Repeats")
            synth_btn = gr.Button("🚀 Run Benchmark")

            synth_output = gr.Markdown()
            synth_chart = gr.Image(label="Performance Chart")

            def synthetic_runner(size, operation, repeats):
                """Generate synthetic data, run the chosen benchmark, and return (markdown, chart)."""
                # The operation radio has no default, so it may be unset.
                if not operation:
                    raise gr.Error("Please choose an operation before running the benchmark.")
                n = {"100k": 100_000, "500k": 500_000, "2M": 2_000_000}[size]
                df = generate_data(n)
                result = run_benchmark(operation, df, int(repeats))
                if result is None:
                    raise gr.Error(f"Unknown operation: {operation!r}")
                chart = generate_chart(result)
                return format_result(result), chart

            synth_btn.click(
                synthetic_runner,
                [dataset_size, operation_synth, repeats_synth],
                [synth_output, synth_chart],
            )

        # ---------------------- Custom Dataset Mode ----------------------
        with gr.Tab("📁 Custom Dataset Upload"):

            file_input = gr.File(label="Upload a CSV / Parquet / Arrow file")
            operation_custom = gr.Radio(["Filter", "Groupby", "Join"], label="Operation")
            repeats_custom = gr.Slider(1, 7, value=3, step=1, label="Repeats")

            custom_btn = gr.Button("Run on Uploaded Dataset")
            custom_out = gr.Markdown()
            custom_chart = gr.Image(label="Performance Chart")

            # Columns each benchmark reads from the uploaded table.
            required_columns = {
                "Filter": ["value1", "category"],
                "Groupby": ["category", "value1", "value2"],
                "Join": ["category"],
            }

            def load_custom_dataset(file):
                """Read an uploaded CSV, Parquet or Arrow/Feather file into a DataFrame.

                Raises:
                    ValueError: If no file was given or the extension is not supported.
                """
                if file is None:
                    raise ValueError("No file uploaded")
                # Accept both an object exposing ``.name`` and a plain path string.
                path = getattr(file, "name", file)
                lowered = str(path).lower()
                if lowered.endswith(".csv"):
                    return pd.read_csv(path)
                if lowered.endswith(".parquet"):
                    return pd.read_parquet(path)
                if lowered.endswith((".arrow", ".feather")):
                    return pd.read_feather(path)
                raise ValueError("Unsupported format")

            def custom_runner(file, operation, repeats):
                """Run the chosen benchmark on the uploaded dataset and return (markdown, chart)."""
                if not operation:
                    raise gr.Error("Please choose an operation before running the benchmark.")
                try:
                    df = load_custom_dataset(file)
                except ValueError as err:
                    # Surface the reason in the UI instead of a generic failure.
                    raise gr.Error(str(err)) from err

                missing = [c for c in required_columns.get(operation, []) if c not in df.columns]
                if missing:
                    raise gr.Error(
                        f"The {operation} benchmark needs the column(s): {', '.join(missing)}"
                    )
                if df.empty:
                    raise gr.Error("The uploaded dataset has no rows.")

                result = run_benchmark(operation, df, int(repeats))
                if result is None:
                    raise gr.Error(f"Unknown operation: {operation!r}")
                chart = generate_chart(result)
                return format_result(result), chart

            custom_btn.click(
                custom_runner,
                [file_input, operation_custom, repeats_custom],
                [custom_out, custom_chart],
            )


if __name__ == "__main__":
    demo.launch()