iBrokeTheCode committed on
Commit
87c1f4c
·
1 Parent(s): 62ec48c

chore: Add first version of main dashboard

Browse files
Files changed (3) hide show
  1. README.md +12 -2
  2. app.py +126 -277
  3. app_bk.py +416 -0
README.md CHANGED
@@ -13,9 +13,19 @@ short_description: Extract, Load, Transform Pipeline applied to an E-Commerce
13
 
14
  ## Table of Contents
15
 
16
- 1. [Stack](#stack)
 
17
 
18
- ## 1. Stack
 
 
 
 
 
 
 
 
 
19
 
20
  - [Marimo](https://github.com/marimo-team/marimo): A Python library for building interactive dashboards.
21
  - [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces-config-reference): A platform for hosting and sharing interactive machine learning demos and applications.
 
13
 
14
  ## Table of Contents
15
 
16
+ 1. [Description](#1-description)
17
+ 2. [Stack](#2-stack)
18
 
19
+ ## 1. Description
20
+
21
+ This project analyzes e-commerce data from a Brazilian marketplace to explore key business metrics related to **revenue** and **delivery performance**. Using an interactive Marimo application, the analysis provides insights into:
22
+
23
+ - **Revenue:** Annual revenue, popular product categories, and sales by state.
24
+ - **Delivery:** Delivery performance, including time-to-delivery and its correlation with public holidays.
25
+
26
+ The data pipeline processes information from [multiple CSV files](https://www.kaggle.com/datasets/olistbr/brazilian-ecommerce) and a [public API](https://date.nager.at/Api), storing and analyzing the results using Python. The final interactive report is presented as a Hugging Face Space built with Marimo.
27
+
28
+ ## 2. Stack
29
 
30
  - [Marimo](https://github.com/marimo-team/marimo): A Python library for building interactive dashboards.
31
  - [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces-config-reference): A platform for hosting and sharing interactive machine learning demos and applications.
app.py CHANGED
@@ -17,7 +17,7 @@ def _():
17
 
18
  @app.cell
19
  def _(mo):
20
- mo.md(r"""# E-Commerce ELT Pipeline""")
21
  return
22
 
23
 
@@ -25,41 +25,25 @@ def _(mo):
25
  def _(mo):
26
  mo.md(
27
  r"""
28
- 💡 Want a step-by-step walkthrough instead?
 
 
 
29
 
30
- You can check the Jupyter notebook version here: 👉 [Jupyter version](https://huggingface.co/spaces/iBrokeTheCode/E-Commerce_ELT/blob/main/tutorial_app.ipynb)
31
- """
32
- )
33
- return
34
 
 
35
 
36
- @app.cell
37
- def _(mo):
38
- mo.md(r"""## 1. Description""")
39
- return
40
 
 
41
 
42
- @app.cell
43
- def _(mo):
44
- mo.md(
45
- r"""
46
- This project analyzes e-commerce data from a Brazilian marketplace to explore key business metrics related to **revenue** and **delivery performance**. Using an interactive Marimo application, the analysis provides insights into:
47
-
48
- * **Revenue:** Annual revenue, popular product categories, and sales by state.
49
- * **Delivery:** Delivery performance, including time-to-delivery and its correlation with public holidays.
50
-
51
- The data pipeline processes information from multiple CSV files and a public API, storing and analyzing the results using Python. The final interactive report is presented as a Hugging Face Space built with Marimo.
52
  """
53
  )
54
  return
55
 
56
 
57
- @app.cell
58
- def _(mo):
59
- mo.md(r"""## 2. ETL""")
60
- return
61
-
62
-
63
  @app.cell
64
  def _():
65
  from pandas import DataFrame
@@ -70,6 +54,19 @@ def _():
70
  from src.extract import extract
71
  from src.load import load
72
  from src.transform import QueryEnum, run_queries
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  return (
74
  DataFrame,
75
  Path,
@@ -78,18 +75,21 @@ def _():
78
  create_engine,
79
  extract,
80
  load,
 
 
 
 
 
 
 
 
 
81
  run_queries,
82
  )
83
 
84
 
85
  @app.cell
86
- def _(mo):
87
- mo.md(r"""### 2.1 Extract and Load""")
88
- return
89
-
90
-
91
- @app.cell
92
- def _(Path, config, create_engine, extract, load):
93
  DB_PATH = Path(config.SQLITE_DB_ABSOLUTE_PATH)
94
 
95
  if DB_PATH.exists() and DB_PATH.stat().st_size > 0:
@@ -107,308 +107,157 @@ def _(Path, config, create_engine, extract, load):
107
 
108
  load(dataframes=csv_dataframes, database=ENGINE)
109
  print("ETL process complete.")
110
- return (ENGINE,)
111
 
112
-
113
- @app.cell
114
- def _(mo):
115
- mo.md(r"""### 2.2 Transform""")
116
- return
117
-
118
-
119
- @app.cell
120
- def _(DataFrame, ENGINE, run_queries):
121
  query_results: dict[str, DataFrame] = run_queries(database=ENGINE)
122
  return (query_results,)
123
 
124
 
125
- @app.cell
126
- def _(mo):
127
- mo.md(r"""**A. Revenue by Month and Year**""")
128
- return
129
-
130
-
131
  @app.cell
132
  def _(QueryEnum, query_results: "dict[str, DataFrame]"):
 
133
  revenue_by_month_year = query_results[QueryEnum.REVENUE_BY_MONTH_YEAR.value]
134
- revenue_by_month_year
135
- return (revenue_by_month_year,)
136
-
137
-
138
- @app.cell
139
- def _(mo):
140
- mo.md(r"""**B. Top 10 Revenue by categories**""")
141
- return
142
-
143
 
144
- @app.cell
145
- def _(QueryEnum, query_results: "dict[str, DataFrame]"):
146
  top_10_revenue_categories = query_results[
147
  QueryEnum.TOP_10_REVENUE_CATEGORIES.value
148
  ]
149
- top_10_revenue_categories
150
- return (top_10_revenue_categories,)
151
 
152
-
153
- @app.cell
154
- def _(mo):
155
- mo.md(r"""**C. Top 10 Least Revenue by Categories**""")
156
- return
157
-
158
-
159
- @app.cell
160
- def _(QueryEnum, query_results: "dict[str, DataFrame]"):
161
  top_10_least_revenue_categories = query_results[
162
  QueryEnum.TOP_10_LEAST_REVENUE_CATEGORIES.value
163
  ]
164
- top_10_least_revenue_categories
165
- return (top_10_least_revenue_categories,)
166
-
167
 
168
- @app.cell
169
- def _(mo):
170
- mo.md(r"""**D. Revenue per State**""")
171
- return
172
-
173
-
174
- @app.cell
175
- def _(QueryEnum, query_results: "dict[str, DataFrame]"):
176
  revenue_per_state = query_results[QueryEnum.REVENUE_PER_STATE.value]
177
- revenue_per_state
178
- return (revenue_per_state,)
179
 
180
-
181
- @app.cell
182
- def _(mo):
183
- mo.md(r"""**E. Delivery Date Difference**""")
184
- return
185
-
186
-
187
- @app.cell
188
- def _(QueryEnum, query_results: "dict[str, DataFrame]"):
189
  delivery_date_difference = query_results[
190
  QueryEnum.DELIVERY_DATE_DIFFERENCE.value
191
  ]
192
- delivery_date_difference
193
- return (delivery_date_difference,)
194
-
195
-
196
- @app.cell
197
- def _(mo):
198
- mo.md(r"""**F. Real vs. Predicted Delivered Time**""")
199
- return
200
 
201
-
202
- @app.cell
203
- def _(QueryEnum, query_results: "dict[str, DataFrame]"):
204
  real_vs_estimated_delivery_time = query_results[
205
  QueryEnum.REAL_VS_ESTIMATED_DELIVERED_TIME.value
206
  ]
207
- real_vs_estimated_delivery_time
208
- return (real_vs_estimated_delivery_time,)
209
-
210
-
211
- @app.cell
212
- def _(mo):
213
- mo.md(r"""**G. Global Amount of Order Status**""")
214
- return
215
-
216
 
217
- @app.cell
218
- def _(QueryEnum, query_results: "dict[str, DataFrame]"):
219
  global_amount_order_status = query_results[
220
  QueryEnum.GLOBAL_AMOUNT_ORDER_STATUS.value
221
  ]
222
- global_amount_order_status
223
- return (global_amount_order_status,)
224
-
225
-
226
- @app.cell
227
- def _(mo):
228
- mo.md(r"""**H. Orders per Day and Holidays in 2017**""")
229
- return
230
 
231
-
232
- @app.cell
233
- def _(QueryEnum, query_results: "dict[str, DataFrame]"):
234
  orders_per_day_and_holidays = query_results[
235
  QueryEnum.ORDERS_PER_DAY_AND_HOLIDAYS_2017.value
236
  ]
237
- orders_per_day_and_holidays
238
- return (orders_per_day_and_holidays,)
239
-
240
-
241
- @app.cell
242
- def _(mo):
243
- mo.md(r"""**I. Freight Value Weight Relationship**""")
244
- return
245
-
246
 
247
- @app.cell
248
- def _(QueryEnum, query_results: "dict[str, DataFrame]"):
249
  freight_value_weight_relationship = query_results[
250
  QueryEnum.GET_FREIGHT_VALUE_WEIGHT_RELATIONSHIP.value
251
  ]
252
- freight_value_weight_relationship
253
- return (freight_value_weight_relationship,)
254
-
255
-
256
- @app.cell
257
- def _(mo):
258
- mo.md(r"""## 3. Plots""")
259
- return
260
-
261
-
262
- @app.cell
263
- def _():
264
- from src.plots import (
265
- plot_revenue_by_month_year,
266
- plot_real_vs_predicted_delivered_time,
267
- plot_global_amount_order_status,
268
- plot_revenue_per_state,
269
- plot_top_10_least_revenue_categories,
270
- plot_top_10_revenue_categories_amount,
271
- plot_top_10_revenue_categories,
272
- plot_freight_value_weight_relationship,
273
- plot_delivery_date_difference,
274
- plot_order_amount_per_day_with_holidays,
275
- )
276
  return (
277
- plot_delivery_date_difference,
278
- plot_freight_value_weight_relationship,
279
- plot_global_amount_order_status,
280
- plot_order_amount_per_day_with_holidays,
281
- plot_real_vs_predicted_delivered_time,
282
- plot_revenue_by_month_year,
283
- plot_revenue_per_state,
284
- plot_top_10_least_revenue_categories,
285
- plot_top_10_revenue_categories,
286
- plot_top_10_revenue_categories_amount,
287
  )
288
 
289
 
290
  @app.cell
291
  def _(mo):
292
- mo.md(r"""**A. Revenue by Month in 2017**""")
293
- return
294
-
295
-
296
- @app.cell
297
- def _(plot_revenue_by_month_year, revenue_by_month_year):
298
- plot_revenue_by_month_year(df=revenue_by_month_year, year=2017)
299
- return
300
-
301
-
302
- @app.cell
303
- def _(mo):
304
- mo.md(r"""**B. Real vs. Predicted Delivered Time**""")
305
- return
306
-
307
-
308
- @app.cell
309
- def _(plot_real_vs_predicted_delivered_time, real_vs_estimated_delivery_time):
310
- plot_real_vs_predicted_delivered_time(
311
- df=real_vs_estimated_delivery_time, year=2017
312
- )
313
- return
314
-
315
-
316
- @app.cell
317
- def _(mo):
318
- mo.md(r"""**C. Global Amount of Order Status**""")
319
- return
320
-
321
-
322
- @app.cell
323
- def _(global_amount_order_status, plot_global_amount_order_status):
324
- plot_global_amount_order_status(df=global_amount_order_status)
325
- return
326
-
327
-
328
- @app.cell
329
- def _(mo):
330
- mo.md(r"""**D. Revenue per State**""")
331
- return
332
-
333
-
334
- @app.cell
335
- def _(plot_revenue_per_state, revenue_per_state):
336
- plot_revenue_per_state(df=revenue_per_state)
337
- return
338
-
339
-
340
- @app.cell
341
- def _(mo):
342
- mo.md(r"""**E. Top 10 Least Revenue by Categories**""")
343
- return
344
-
345
-
346
- @app.cell
347
- def _(plot_top_10_least_revenue_categories, top_10_least_revenue_categories):
348
- plot_top_10_least_revenue_categories(df=top_10_least_revenue_categories)
349
- return
350
-
351
-
352
- @app.cell
353
- def _(mo):
354
- mo.md(r"""**F. Top 10 Revenue Categories Amount**""")
355
- return
356
-
357
-
358
- @app.cell
359
- def _(plot_top_10_revenue_categories_amount, top_10_revenue_categories):
360
- plot_top_10_revenue_categories_amount(df=top_10_revenue_categories)
361
- return
362
-
363
-
364
- @app.cell
365
- def _(mo):
366
- mo.md(r"""**G. Top 10 Revenue by Categories**""")
367
- return
368
-
369
-
370
- @app.cell
371
- def _(plot_top_10_revenue_categories, top_10_revenue_categories):
372
- plot_top_10_revenue_categories(df=top_10_revenue_categories)
373
- return
374
-
375
-
376
- @app.cell
377
- def _(mo):
378
- mo.md(r"""**H. Freight Value vs. Product Weight**""")
379
  return
380
 
381
 
382
  @app.cell
383
  def _(
384
  freight_value_weight_relationship,
 
 
 
385
  plot_freight_value_weight_relationship,
 
 
 
 
 
 
 
 
 
 
 
 
 
386
  ):
387
- plot_freight_value_weight_relationship(df=freight_value_weight_relationship)
388
- return
389
-
390
-
391
- @app.cell
392
- def _(mo):
393
- mo.md(r"""**I. Diffrence Between Deliver Estimated Date and Delivery Date**""")
394
- return
395
-
 
 
396
 
397
- @app.cell
398
- def _(delivery_date_difference, plot_delivery_date_difference):
399
- plot_delivery_date_difference(df=delivery_date_difference)
400
- return
 
 
 
 
 
 
401
 
 
 
 
 
 
 
 
 
 
 
 
402
 
403
- @app.cell
404
- def _(mo):
405
- mo.md(r"""**J. Order Amount per Day with Holidays**""")
406
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
 
408
 
409
  @app.cell
410
- def _(orders_per_day_and_holidays, plot_order_amount_per_day_with_holidays):
411
- plot_order_amount_per_day_with_holidays(df=orders_per_day_and_holidays)
 
 
 
 
 
 
 
412
  return
413
 
414
 
 
17
 
18
  @app.cell
19
  def _(mo):
20
+ mo.md(r"""# 📦 Brazilian E-Commerce Dashboard""")
21
  return
22
 
23
 
 
25
  def _(mo):
26
  mo.md(
27
  r"""
28
+ This interactive dashboard explores insights from the [Brazilian e-commerce dataset](https://www.kaggle.com/datasets/olistbr/brazilian-ecommerce) and the [Public Holiday API](https://date.nager.at/Api) :
29
+ - Sales performance by category and state
30
+ - Delivery efficiency
31
+ - Seasonal trends and holidays impact
32
 
33
+ Use the tabs above to explore different insights!
 
 
 
34
 
35
+ _Built with Marimo._
36
 
37
+ ---
 
 
 
38
 
39
+ 💡 **Want a step-by-step walkthrough instead?**
40
 
41
+ You can check the Jupyter notebook version here: 👉 [Jupyter notebook](https://huggingface.co/spaces/iBrokeTheCode/E-Commerce_ELT/blob/main/tutorial_app.ipynb)
 
 
 
 
 
 
 
 
 
42
  """
43
  )
44
  return
45
 
46
 
 
 
 
 
 
 
47
  @app.cell
48
  def _():
49
  from pandas import DataFrame
 
54
  from src.extract import extract
55
  from src.load import load
56
  from src.transform import QueryEnum, run_queries
57
+
58
+ from src.plots import (
59
+ plot_revenue_by_month_year,
60
+ plot_real_vs_predicted_delivered_time,
61
+ plot_global_amount_order_status,
62
+ plot_revenue_per_state,
63
+ plot_top_10_least_revenue_categories,
64
+ plot_top_10_revenue_categories_amount,
65
+ plot_top_10_revenue_categories,
66
+ plot_freight_value_weight_relationship,
67
+ plot_delivery_date_difference,
68
+ plot_order_amount_per_day_with_holidays,
69
+ )
70
  return (
71
  DataFrame,
72
  Path,
 
75
  create_engine,
76
  extract,
77
  load,
78
+ plot_freight_value_weight_relationship,
79
+ plot_global_amount_order_status,
80
+ plot_order_amount_per_day_with_holidays,
81
+ plot_real_vs_predicted_delivered_time,
82
+ plot_revenue_by_month_year,
83
+ plot_revenue_per_state,
84
+ plot_top_10_least_revenue_categories,
85
+ plot_top_10_revenue_categories,
86
+ plot_top_10_revenue_categories_amount,
87
  run_queries,
88
  )
89
 
90
 
91
  @app.cell
92
+ def _(DataFrame, Path, config, create_engine, extract, load, run_queries):
 
 
 
 
 
 
93
  DB_PATH = Path(config.SQLITE_DB_ABSOLUTE_PATH)
94
 
95
  if DB_PATH.exists() and DB_PATH.stat().st_size > 0:
 
107
 
108
  load(dataframes=csv_dataframes, database=ENGINE)
109
  print("ETL process complete.")
 
110
 
 
 
 
 
 
 
 
 
 
111
  query_results: dict[str, DataFrame] = run_queries(database=ENGINE)
112
  return (query_results,)
113
 
114
 
 
 
 
 
 
 
115
  @app.cell
116
  def _(QueryEnum, query_results: "dict[str, DataFrame]"):
117
+ # **A. Revenue by Month and Year**
118
  revenue_by_month_year = query_results[QueryEnum.REVENUE_BY_MONTH_YEAR.value]
 
 
 
 
 
 
 
 
 
119
 
120
+ # **B. Top 10 Revenue by categories**
 
121
  top_10_revenue_categories = query_results[
122
  QueryEnum.TOP_10_REVENUE_CATEGORIES.value
123
  ]
 
 
124
 
125
+ # **C. Top 10 Least Revenue by Categories**
 
 
 
 
 
 
 
 
126
  top_10_least_revenue_categories = query_results[
127
  QueryEnum.TOP_10_LEAST_REVENUE_CATEGORIES.value
128
  ]
 
 
 
129
 
130
+ # **D. Revenue per State**
 
 
 
 
 
 
 
131
  revenue_per_state = query_results[QueryEnum.REVENUE_PER_STATE.value]
 
 
132
 
133
+ # **E. Delivery Date Difference**
 
 
 
 
 
 
 
 
134
  delivery_date_difference = query_results[
135
  QueryEnum.DELIVERY_DATE_DIFFERENCE.value
136
  ]
 
 
 
 
 
 
 
 
137
 
138
+ # **F. Real vs. Predicted Delivered Time**
 
 
139
  real_vs_estimated_delivery_time = query_results[
140
  QueryEnum.REAL_VS_ESTIMATED_DELIVERED_TIME.value
141
  ]
 
 
 
 
 
 
 
 
 
142
 
143
+ # **G. Global Amount of Order Status**
 
144
  global_amount_order_status = query_results[
145
  QueryEnum.GLOBAL_AMOUNT_ORDER_STATUS.value
146
  ]
 
 
 
 
 
 
 
 
147
 
148
+ # **H. Orders per Day and Holidays in 2017**
 
 
149
  orders_per_day_and_holidays = query_results[
150
  QueryEnum.ORDERS_PER_DAY_AND_HOLIDAYS_2017.value
151
  ]
 
 
 
 
 
 
 
 
 
152
 
153
+ # **I. Freight Value Weight Relationship**
 
154
  freight_value_weight_relationship = query_results[
155
  QueryEnum.GET_FREIGHT_VALUE_WEIGHT_RELATIONSHIP.value
156
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  return (
158
+ freight_value_weight_relationship,
159
+ global_amount_order_status,
160
+ orders_per_day_and_holidays,
161
+ real_vs_estimated_delivery_time,
162
+ revenue_by_month_year,
163
+ revenue_per_state,
164
+ top_10_least_revenue_categories,
165
+ top_10_revenue_categories,
 
 
166
  )
167
 
168
 
169
  @app.cell
170
  def _(mo):
171
+ mo.md(r"""## Insights""")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  return
173
 
174
 
175
  @app.cell
176
  def _(
177
  freight_value_weight_relationship,
178
+ global_amount_order_status,
179
+ mo,
180
+ orders_per_day_and_holidays,
181
  plot_freight_value_weight_relationship,
182
+ plot_global_amount_order_status,
183
+ plot_order_amount_per_day_with_holidays,
184
+ plot_real_vs_predicted_delivered_time,
185
+ plot_revenue_by_month_year,
186
+ plot_revenue_per_state,
187
+ plot_top_10_least_revenue_categories,
188
+ plot_top_10_revenue_categories,
189
+ plot_top_10_revenue_categories_amount,
190
+ real_vs_estimated_delivery_time,
191
+ revenue_by_month_year,
192
+ revenue_per_state,
193
+ top_10_least_revenue_categories,
194
+ top_10_revenue_categories,
195
  ):
196
+ overview_tab = mo.vstack(
197
+ [
198
+ mo.md("### Global Order Status Overview"),
199
+ mo.hstack(
200
+ [
201
+ global_amount_order_status,
202
+ plot_global_amount_order_status(df=global_amount_order_status),
203
+ ]
204
+ ),
205
+ ]
206
+ )
207
 
208
+ revenue_tab = mo.vstack(
209
+ [
210
+ mo.md("### Revenue by Month and Year"),
211
+ mo.ui.table(revenue_by_month_year),
212
+ plot_revenue_by_month_year(df=revenue_by_month_year, year=2017),
213
+ mo.md("### Revenue by State"),
214
+ mo.ui.table(revenue_per_state),
215
+ plot_revenue_per_state(revenue_per_state),
216
+ ]
217
+ )
218
 
219
+ categories_tab = mo.vstack(
220
+ [
221
+ mo.md("### Top 10 Revenue Categories"),
222
+ mo.ui.table(top_10_revenue_categories),
223
+ plot_top_10_revenue_categories(top_10_revenue_categories),
224
+ plot_top_10_revenue_categories_amount(top_10_revenue_categories),
225
+ mo.md("### Bottom 10 Revenue Categories"),
226
+ mo.ui.table(top_10_least_revenue_categories),
227
+ plot_top_10_least_revenue_categories(top_10_least_revenue_categories),
228
+ ]
229
+ )
230
 
231
+ delivery_tab = mo.vstack(
232
+ [
233
+ mo.md("### Freight Value vs Product Weight"),
234
+ mo.ui.table(freight_value_weight_relationship),
235
+ plot_freight_value_weight_relationship(
236
+ freight_value_weight_relationship
237
+ ),
238
+ mo.md("### Real vs Estimated Delivery Time"),
239
+ mo.ui.table(real_vs_estimated_delivery_time),
240
+ plot_real_vs_predicted_delivered_time(
241
+ df=real_vs_estimated_delivery_time, year=2017
242
+ ),
243
+ mo.md("### Orders and Holidays"),
244
+ mo.ui.table(orders_per_day_and_holidays),
245
+ plot_order_amount_per_day_with_holidays(orders_per_day_and_holidays),
246
+ ]
247
+ )
248
+ return categories_tab, delivery_tab, overview_tab, revenue_tab
249
 
250
 
251
  @app.cell
252
+ def _(categories_tab, delivery_tab, mo, overview_tab, revenue_tab):
253
+ mo.ui.tabs(
254
+ {
255
+ "📊 Overview": overview_tab,
256
+ "💰 Revenue": revenue_tab,
257
+ "📦 Categories": categories_tab,
258
+ "🚚 Freight & Delivery": delivery_tab,
259
+ }
260
+ )
261
  return
262
 
263
 
app_bk.py ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import marimo
2
+
3
+ __generated_with = "0.14.16"
4
+ app = marimo.App(width="medium")
5
+
6
+
7
+ @app.cell
8
+ def _():
9
+ import marimo as mo
10
+
11
+ # /// script
12
+ # [tool.marimo.display]
13
+ # theme = "dark"
14
+ # ///
15
+ return (mo,)
16
+
17
+
18
+ @app.cell
19
+ def _(mo):
20
+ mo.md(r"""# E-Commerce ELT Pipeline""")
21
+ return
22
+
23
+
24
+ @app.cell
25
+ def _(mo):
26
+ mo.md(
27
+ r"""
28
+ 💡 Want a step-by-step walkthrough instead?
29
+
30
+ You can check the Jupyter notebook version here: 👉 [Jupyter version](https://huggingface.co/spaces/iBrokeTheCode/E-Commerce_ELT/blob/main/tutorial_app.ipynb)
31
+ """
32
+ )
33
+ return
34
+
35
+
36
+ @app.cell
37
+ def _(mo):
38
+ mo.md(r"""## 1. Description""")
39
+ return
40
+
41
+
42
+ @app.cell
43
+ def _(mo):
44
+ mo.md(
45
+ r"""
46
+ This project analyzes e-commerce data from a Brazilian marketplace to explore key business metrics related to **revenue** and **delivery performance**. Using an interactive Marimo application, the analysis provides insights into:
47
+
48
+ * **Revenue:** Annual revenue, popular product categories, and sales by state.
49
+ * **Delivery:** Delivery performance, including time-to-delivery and its correlation with public holidays.
50
+
51
+ The data pipeline processes information from multiple CSV files and a public API, storing and analyzing the results using Python. The final interactive report is presented as a Hugging Face Space built with Marimo.
52
+ """
53
+ )
54
+ return
55
+
56
+
57
+ @app.cell
58
+ def _(mo):
59
+ mo.md(r"""## 2. ETL""")
60
+ return
61
+
62
+
63
+ @app.cell
64
+ def _():
65
+ from pandas import DataFrame
66
+ from pathlib import Path
67
+ from sqlalchemy import create_engine
68
+
69
+ from src import config
70
+ from src.extract import extract
71
+ from src.load import load
72
+ from src.transform import QueryEnum, run_queries
73
+ return (
74
+ DataFrame,
75
+ Path,
76
+ QueryEnum,
77
+ config,
78
+ create_engine,
79
+ extract,
80
+ load,
81
+ run_queries,
82
+ )
83
+
84
+
85
+ @app.cell
86
+ def _(mo):
87
+ mo.md(r"""### 2.1 Extract and Load""")
88
+ return
89
+
90
+
91
+ @app.cell
92
+ def _(Path, config, create_engine, extract, load):
93
+ DB_PATH = Path(config.SQLITE_DB_ABSOLUTE_PATH)
94
+
95
+ if DB_PATH.exists() and DB_PATH.stat().st_size > 0:
96
+ print("Database found. Skipping ETL process.")
97
+ ENGINE = create_engine(f"sqlite:///{DB_PATH}", echo=False)
98
+ else:
99
+ print("Database not found or empty. Starting ETL process...")
100
+ ENGINE = create_engine(f"sqlite:///{DB_PATH}", echo=False)
101
+
102
+ csv_dataframes = extract(
103
+ csv_folder=config.DATASET_ROOT_PATH,
104
+ csv_table_mapping=config.get_csv_to_table_mapping(),
105
+ public_holidays_url=config.PUBLIC_HOLIDAYS_URL,
106
+ )
107
+
108
+ load(dataframes=csv_dataframes, database=ENGINE)
109
+ print("ETL process complete.")
110
+ return (ENGINE,)
111
+
112
+
113
+ @app.cell
114
+ def _(mo):
115
+ mo.md(r"""### 2.2 Transform""")
116
+ return
117
+
118
+
119
+ @app.cell
120
+ def _(DataFrame, ENGINE, run_queries):
121
+ query_results: dict[str, DataFrame] = run_queries(database=ENGINE)
122
+ return (query_results,)
123
+
124
+
125
+ @app.cell
126
+ def _(mo):
127
+ mo.md(r"""**A. Revenue by Month and Year**""")
128
+ return
129
+
130
+
131
+ @app.cell
132
+ def _(QueryEnum, query_results: "dict[str, DataFrame]"):
133
+ revenue_by_month_year = query_results[QueryEnum.REVENUE_BY_MONTH_YEAR.value]
134
+ revenue_by_month_year
135
+ return (revenue_by_month_year,)
136
+
137
+
138
+ @app.cell
139
+ def _(mo):
140
+ mo.md(r"""**B. Top 10 Revenue by categories**""")
141
+ return
142
+
143
+
144
+ @app.cell
145
+ def _(QueryEnum, query_results: "dict[str, DataFrame]"):
146
+ top_10_revenue_categories = query_results[
147
+ QueryEnum.TOP_10_REVENUE_CATEGORIES.value
148
+ ]
149
+ top_10_revenue_categories
150
+ return (top_10_revenue_categories,)
151
+
152
+
153
+ @app.cell
154
+ def _(mo):
155
+ mo.md(r"""**C. Top 10 Least Revenue by Categories**""")
156
+ return
157
+
158
+
159
+ @app.cell
160
+ def _(QueryEnum, query_results: "dict[str, DataFrame]"):
161
+ top_10_least_revenue_categories = query_results[
162
+ QueryEnum.TOP_10_LEAST_REVENUE_CATEGORIES.value
163
+ ]
164
+ top_10_least_revenue_categories
165
+ return (top_10_least_revenue_categories,)
166
+
167
+
168
+ @app.cell
169
+ def _(mo):
170
+ mo.md(r"""**D. Revenue per State**""")
171
+ return
172
+
173
+
174
+ @app.cell
175
+ def _(QueryEnum, query_results: "dict[str, DataFrame]"):
176
+ revenue_per_state = query_results[QueryEnum.REVENUE_PER_STATE.value]
177
+ revenue_per_state
178
+ return (revenue_per_state,)
179
+
180
+
181
+ @app.cell
182
+ def _(mo):
183
+ mo.md(r"""**E. Delivery Date Difference**""")
184
+ return
185
+
186
+
187
+ @app.cell
188
+ def _(QueryEnum, query_results: "dict[str, DataFrame]"):
189
+ delivery_date_difference = query_results[
190
+ QueryEnum.DELIVERY_DATE_DIFFERENCE.value
191
+ ]
192
+ delivery_date_difference
193
+ return (delivery_date_difference,)
194
+
195
+
196
+ @app.cell
197
+ def _(mo):
198
+ mo.md(r"""**F. Real vs. Predicted Delivered Time**""")
199
+ return
200
+
201
+
202
+ @app.cell
203
+ def _(QueryEnum, query_results: "dict[str, DataFrame]"):
204
+ real_vs_estimated_delivery_time = query_results[
205
+ QueryEnum.REAL_VS_ESTIMATED_DELIVERED_TIME.value
206
+ ]
207
+ real_vs_estimated_delivery_time
208
+ return (real_vs_estimated_delivery_time,)
209
+
210
+
211
+ @app.cell
212
+ def _(mo):
213
+ mo.md(r"""**G. Global Amount of Order Status**""")
214
+ return
215
+
216
+
217
+ @app.cell
218
+ def _(QueryEnum, query_results: "dict[str, DataFrame]"):
219
+ global_amount_order_status = query_results[
220
+ QueryEnum.GLOBAL_AMOUNT_ORDER_STATUS.value
221
+ ]
222
+ global_amount_order_status
223
+ return (global_amount_order_status,)
224
+
225
+
226
+ @app.cell
227
+ def _(mo):
228
+ mo.md(r"""**H. Orders per Day and Holidays in 2017**""")
229
+ return
230
+
231
+
232
+ @app.cell
233
+ def _(QueryEnum, query_results: "dict[str, DataFrame]"):
234
+ orders_per_day_and_holidays = query_results[
235
+ QueryEnum.ORDERS_PER_DAY_AND_HOLIDAYS_2017.value
236
+ ]
237
+ orders_per_day_and_holidays
238
+ return (orders_per_day_and_holidays,)
239
+
240
+
241
+ @app.cell
242
+ def _(mo):
243
+ mo.md(r"""**I. Freight Value Weight Relationship**""")
244
+ return
245
+
246
+
247
+ @app.cell
248
+ def _(QueryEnum, query_results: "dict[str, DataFrame]"):
249
+ freight_value_weight_relationship = query_results[
250
+ QueryEnum.GET_FREIGHT_VALUE_WEIGHT_RELATIONSHIP.value
251
+ ]
252
+ freight_value_weight_relationship
253
+ return (freight_value_weight_relationship,)
254
+
255
+
256
+ @app.cell
257
+ def _(mo):
258
+ mo.md(r"""## 3. Plots""")
259
+ return
260
+
261
+
262
+ @app.cell
263
+ def _():
264
+ from src.plots import (
265
+ plot_revenue_by_month_year,
266
+ plot_real_vs_predicted_delivered_time,
267
+ plot_global_amount_order_status,
268
+ plot_revenue_per_state,
269
+ plot_top_10_least_revenue_categories,
270
+ plot_top_10_revenue_categories_amount,
271
+ plot_top_10_revenue_categories,
272
+ plot_freight_value_weight_relationship,
273
+ plot_delivery_date_difference,
274
+ plot_order_amount_per_day_with_holidays,
275
+ )
276
+ return (
277
+ plot_delivery_date_difference,
278
+ plot_freight_value_weight_relationship,
279
+ plot_global_amount_order_status,
280
+ plot_order_amount_per_day_with_holidays,
281
+ plot_real_vs_predicted_delivered_time,
282
+ plot_revenue_by_month_year,
283
+ plot_revenue_per_state,
284
+ plot_top_10_least_revenue_categories,
285
+ plot_top_10_revenue_categories,
286
+ plot_top_10_revenue_categories_amount,
287
+ )
288
+
289
+
290
+ @app.cell
291
+ def _(mo):
292
+ mo.md(r"""**A. Revenue by Month in 2017**""")
293
+ return
294
+
295
+
296
+ @app.cell
297
+ def _(plot_revenue_by_month_year, revenue_by_month_year):
298
+ plot_revenue_by_month_year(df=revenue_by_month_year, year=2017)
299
+ return
300
+
301
+
302
+ @app.cell
303
+ def _(mo):
304
+ mo.md(r"""**B. Real vs. Predicted Delivered Time**""")
305
+ return
306
+
307
+
308
+ @app.cell
309
+ def _(plot_real_vs_predicted_delivered_time, real_vs_estimated_delivery_time):
310
+ plot_real_vs_predicted_delivered_time(
311
+ df=real_vs_estimated_delivery_time, year=2017
312
+ )
313
+ return
314
+
315
+
316
+ @app.cell
317
+ def _(mo):
318
+ mo.md(r"""**C. Global Amount of Order Status**""")
319
+ return
320
+
321
+
322
+ @app.cell
323
+ def _(global_amount_order_status, plot_global_amount_order_status):
324
+ plot_global_amount_order_status(df=global_amount_order_status)
325
+ return
326
+
327
+
328
+ @app.cell
329
+ def _(mo):
330
+ mo.md(r"""**D. Revenue per State**""")
331
+ return
332
+
333
+
334
+ @app.cell
335
+ def _(plot_revenue_per_state, revenue_per_state):
336
+ plot_revenue_per_state(df=revenue_per_state)
337
+ return
338
+
339
+
340
+ @app.cell
341
+ def _(mo):
342
+ mo.md(r"""**E. Top 10 Least Revenue by Categories**""")
343
+ return
344
+
345
+
346
+ @app.cell
347
+ def _(plot_top_10_least_revenue_categories, top_10_least_revenue_categories):
348
+ plot_top_10_least_revenue_categories(df=top_10_least_revenue_categories)
349
+ return
350
+
351
+
352
+ @app.cell
353
+ def _(mo):
354
+ mo.md(r"""**F. Top 10 Revenue Categories Amount**""")
355
+ return
356
+
357
+
358
+ @app.cell
359
+ def _(plot_top_10_revenue_categories_amount, top_10_revenue_categories):
360
+ plot_top_10_revenue_categories_amount(df=top_10_revenue_categories)
361
+ return
362
+
363
+
364
+ @app.cell
365
+ def _(mo):
366
+ mo.md(r"""**G. Top 10 Revenue by Categories**""")
367
+ return
368
+
369
+
370
+ @app.cell
371
+ def _(plot_top_10_revenue_categories, top_10_revenue_categories):
372
+ plot_top_10_revenue_categories(df=top_10_revenue_categories)
373
+ return
374
+
375
+
376
+ @app.cell
377
+ def _(mo):
378
+ mo.md(r"""**H. Freight Value vs. Product Weight**""")
379
+ return
380
+
381
+
382
+ @app.cell
383
+ def _(
384
+ freight_value_weight_relationship,
385
+ plot_freight_value_weight_relationship,
386
+ ):
387
+ plot_freight_value_weight_relationship(df=freight_value_weight_relationship)
388
+ return
389
+
390
+
391
+ @app.cell
392
+ def _(mo):
393
+ mo.md(r"""**I. Diffrence Between Deliver Estimated Date and Delivery Date**""")
394
+ return
395
+
396
+
397
+ @app.cell
398
+ def _(delivery_date_difference, plot_delivery_date_difference):
399
+ plot_delivery_date_difference(df=delivery_date_difference)
400
+ return
401
+
402
+
403
+ @app.cell
404
+ def _(mo):
405
+ mo.md(r"""**J. Order Amount per Day with Holidays**""")
406
+ return
407
+
408
+
409
+ @app.cell
410
+ def _(orders_per_day_and_holidays, plot_order_amount_per_day_with_holidays):
411
+ plot_order_amount_per_day_with_holidays(df=orders_per_day_and_holidays)
412
+ return
413
+
414
+
415
+ if __name__ == "__main__":
416
+ app.run()