File size: 9,549 Bytes
87c1f4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
import marimo

__generated_with = "0.14.16"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo

    # /// script
    # [tool.marimo.display]
    # theme = "dark"
    # ///
    return (mo,)


@app.cell
def _(mo):
    mo.md(r"""# E-Commerce ELT Pipeline""")
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    💡 Want a step-by-step walkthrough instead?

    You can check the Jupyter notebook version here: 👉 [Jupyter version](https://huggingface.co/spaces/iBrokeTheCode/E-Commerce_ELT/blob/main/tutorial_app.ipynb)
    """
    )
    return


@app.cell
def _(mo):
    mo.md(r"""## 1. Description""")
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    This project analyzes e-commerce data from a Brazilian marketplace to explore key business metrics related to **revenue** and **delivery performance**. Using an interactive Marimo application, the analysis provides insights into:

    * **Revenue:** Annual revenue, popular product categories, and sales by state.
    * **Delivery:** Delivery performance, including time-to-delivery and its correlation with public holidays.

    The data pipeline processes information from multiple CSV files and a public API, storing and analyzing the results using Python. The final interactive report is presented as a Hugging Face Space built with Marimo.
    """
    )
    return


@app.cell
def _(mo):
    mo.md(r"""## 2. ETL""")
    return


@app.cell
def _():
    from pandas import DataFrame
    from pathlib import Path
    from sqlalchemy import create_engine

    from src import config
    from src.extract import extract
    from src.load import load
    from src.transform import QueryEnum, run_queries
    return (
        DataFrame,
        Path,
        QueryEnum,
        config,
        create_engine,
        extract,
        load,
        run_queries,
    )


@app.cell
def _(mo):
    mo.md(r"""### 2.1 Extract and Load""")
    return


@app.cell
def _(Path, config, create_engine, extract, load):
    DB_PATH = Path(config.SQLITE_DB_ABSOLUTE_PATH)

    if DB_PATH.exists() and DB_PATH.stat().st_size > 0:
        print("Database found. Skipping ETL process.")
        ENGINE = create_engine(f"sqlite:///{DB_PATH}", echo=False)
    else:
        print("Database not found or empty. Starting ETL process...")
        ENGINE = create_engine(f"sqlite:///{DB_PATH}", echo=False)

        csv_dataframes = extract(
            csv_folder=config.DATASET_ROOT_PATH,
            csv_table_mapping=config.get_csv_to_table_mapping(),
            public_holidays_url=config.PUBLIC_HOLIDAYS_URL,
        )

        load(dataframes=csv_dataframes, database=ENGINE)
        print("ETL process complete.")
    return (ENGINE,)


@app.cell
def _(mo):
    mo.md(r"""### 2.2 Transform""")
    return


@app.cell
def _(DataFrame, ENGINE, run_queries):
    query_results: dict[str, DataFrame] = run_queries(database=ENGINE)
    return (query_results,)


@app.cell
def _(mo):
    mo.md(r"""**A. Revenue by Month and Year**""")
    return


@app.cell
def _(QueryEnum, query_results: "dict[str, DataFrame]"):
    revenue_by_month_year = query_results[QueryEnum.REVENUE_BY_MONTH_YEAR.value]
    revenue_by_month_year
    return (revenue_by_month_year,)


@app.cell
def _(mo):
    mo.md(r"""**B. Top 10 Revenue by categories**""")
    return


@app.cell
def _(QueryEnum, query_results: "dict[str, DataFrame]"):
    top_10_revenue_categories = query_results[
        QueryEnum.TOP_10_REVENUE_CATEGORIES.value
    ]
    top_10_revenue_categories
    return (top_10_revenue_categories,)


@app.cell
def _(mo):
    mo.md(r"""**C. Top 10 Least Revenue by Categories**""")
    return


@app.cell
def _(QueryEnum, query_results: "dict[str, DataFrame]"):
    top_10_least_revenue_categories = query_results[
        QueryEnum.TOP_10_LEAST_REVENUE_CATEGORIES.value
    ]
    top_10_least_revenue_categories
    return (top_10_least_revenue_categories,)


@app.cell
def _(mo):
    mo.md(r"""**D. Revenue per State**""")
    return


@app.cell
def _(QueryEnum, query_results: "dict[str, DataFrame]"):
    revenue_per_state = query_results[QueryEnum.REVENUE_PER_STATE.value]
    revenue_per_state
    return (revenue_per_state,)


@app.cell
def _(mo):
    mo.md(r"""**E. Delivery Date Difference**""")
    return


@app.cell
def _(QueryEnum, query_results: "dict[str, DataFrame]"):
    delivery_date_difference = query_results[
        QueryEnum.DELIVERY_DATE_DIFFERENCE.value
    ]
    delivery_date_difference
    return (delivery_date_difference,)


@app.cell
def _(mo):
    mo.md(r"""**F. Real vs. Predicted Delivered Time**""")
    return


@app.cell
def _(QueryEnum, query_results: "dict[str, DataFrame]"):
    real_vs_estimated_delivery_time = query_results[
        QueryEnum.REAL_VS_ESTIMATED_DELIVERED_TIME.value
    ]
    real_vs_estimated_delivery_time
    return (real_vs_estimated_delivery_time,)


@app.cell
def _(mo):
    mo.md(r"""**G. Global Amount of Order Status**""")
    return


@app.cell
def _(QueryEnum, query_results: "dict[str, DataFrame]"):
    global_amount_order_status = query_results[
        QueryEnum.GLOBAL_AMOUNT_ORDER_STATUS.value
    ]
    global_amount_order_status
    return (global_amount_order_status,)


@app.cell
def _(mo):
    mo.md(r"""**H. Orders per Day and Holidays in 2017**""")
    return


@app.cell
def _(QueryEnum, query_results: "dict[str, DataFrame]"):
    orders_per_day_and_holidays = query_results[
        QueryEnum.ORDERS_PER_DAY_AND_HOLIDAYS_2017.value
    ]
    orders_per_day_and_holidays
    return (orders_per_day_and_holidays,)


@app.cell
def _(mo):
    mo.md(r"""**I. Freight Value Weight Relationship**""")
    return


@app.cell
def _(QueryEnum, query_results: "dict[str, DataFrame]"):
    freight_value_weight_relationship = query_results[
        QueryEnum.GET_FREIGHT_VALUE_WEIGHT_RELATIONSHIP.value
    ]
    freight_value_weight_relationship
    return (freight_value_weight_relationship,)


@app.cell
def _(mo):
    mo.md(r"""## 3. Plots""")
    return


@app.cell
def _():
    from src.plots import (
        plot_revenue_by_month_year,
        plot_real_vs_predicted_delivered_time,
        plot_global_amount_order_status,
        plot_revenue_per_state,
        plot_top_10_least_revenue_categories,
        plot_top_10_revenue_categories_amount,
        plot_top_10_revenue_categories,
        plot_freight_value_weight_relationship,
        plot_delivery_date_difference,
        plot_order_amount_per_day_with_holidays,
    )
    return (
        plot_delivery_date_difference,
        plot_freight_value_weight_relationship,
        plot_global_amount_order_status,
        plot_order_amount_per_day_with_holidays,
        plot_real_vs_predicted_delivered_time,
        plot_revenue_by_month_year,
        plot_revenue_per_state,
        plot_top_10_least_revenue_categories,
        plot_top_10_revenue_categories,
        plot_top_10_revenue_categories_amount,
    )


@app.cell
def _(mo):
    mo.md(r"""**A. Revenue by Month in 2017**""")
    return


@app.cell
def _(plot_revenue_by_month_year, revenue_by_month_year):
    plot_revenue_by_month_year(df=revenue_by_month_year, year=2017)
    return


@app.cell
def _(mo):
    mo.md(r"""**B. Real vs. Predicted Delivered Time**""")
    return


@app.cell
def _(plot_real_vs_predicted_delivered_time, real_vs_estimated_delivery_time):
    plot_real_vs_predicted_delivered_time(
        df=real_vs_estimated_delivery_time, year=2017
    )
    return


@app.cell
def _(mo):
    mo.md(r"""**C. Global Amount of Order Status**""")
    return


@app.cell
def _(global_amount_order_status, plot_global_amount_order_status):
    plot_global_amount_order_status(df=global_amount_order_status)
    return


@app.cell
def _(mo):
    mo.md(r"""**D. Revenue per State**""")
    return


@app.cell
def _(plot_revenue_per_state, revenue_per_state):
    plot_revenue_per_state(df=revenue_per_state)
    return


@app.cell
def _(mo):
    mo.md(r"""**E. Top 10 Least Revenue by Categories**""")
    return


@app.cell
def _(plot_top_10_least_revenue_categories, top_10_least_revenue_categories):
    plot_top_10_least_revenue_categories(df=top_10_least_revenue_categories)
    return


@app.cell
def _(mo):
    mo.md(r"""**F. Top 10 Revenue Categories Amount**""")
    return


@app.cell
def _(plot_top_10_revenue_categories_amount, top_10_revenue_categories):
    plot_top_10_revenue_categories_amount(df=top_10_revenue_categories)
    return


@app.cell
def _(mo):
    mo.md(r"""**G. Top 10 Revenue by Categories**""")
    return


@app.cell
def _(plot_top_10_revenue_categories, top_10_revenue_categories):
    plot_top_10_revenue_categories(df=top_10_revenue_categories)
    return


@app.cell
def _(mo):
    mo.md(r"""**H. Freight Value vs. Product Weight**""")
    return


@app.cell
def _(
    freight_value_weight_relationship,
    plot_freight_value_weight_relationship,
):
    plot_freight_value_weight_relationship(df=freight_value_weight_relationship)
    return


@app.cell
def _(mo):
    mo.md(r"""**I. Diffrence Between Deliver Estimated Date and Delivery Date**""")
    return


@app.cell
def _(delivery_date_difference, plot_delivery_date_difference):
    plot_delivery_date_difference(df=delivery_date_difference)
    return


@app.cell
def _(mo):
    mo.md(r"""**J. Order Amount per Day with Holidays**""")
    return


@app.cell
def _(orders_per_day_and_holidays, plot_order_amount_per_day_with_holidays):
    plot_order_amount_per_day_with_holidays(df=orders_per_day_and_holidays)
    return


if __name__ == "__main__":
    app.run()