Spaces:
Sleeping
Sleeping
| import marimo | |
| __generated_with = "0.14.16" | |
| app = marimo.App(width="medium") | |
| def _(): | |
| import marimo as mo | |
| # /// script | |
| # [tool.marimo.display] | |
| # theme = "dark" | |
| # /// | |
| return (mo,) | |
| def _(mo): | |
| mo.md(r"""# E-Commerce ELT Pipeline""") | |
| return | |
| def _(mo): | |
| mo.md( | |
| r""" | |
| 💡 Want a step-by-step walkthrough instead? | |
| You can check the Jupyter notebook version here: 👉 [Jupyter version](https://huggingface.co/spaces/iBrokeTheCode/E-Commerce_ELT/blob/main/tutorial_app.ipynb) | |
| """ | |
| ) | |
| return | |
| def _(mo): | |
| mo.md(r"""## 1. Description""") | |
| return | |
| def _(mo): | |
| mo.md( | |
| r""" | |
| This project analyzes e-commerce data from a Brazilian marketplace to explore key business metrics related to **revenue** and **delivery performance**. Using an interactive Marimo application, the analysis provides insights into: | |
| * **Revenue:** Annual revenue, popular product categories, and sales by state. | |
| * **Delivery:** Delivery performance, including time-to-delivery and its correlation with public holidays. | |
| The data pipeline processes information from multiple CSV files and a public API, storing and analyzing the results using Python. The final interactive report is presented as a Hugging Face Space built with Marimo. | |
| """ | |
| ) | |
| return | |
| def _(mo): | |
| mo.md(r"""## 2. ETL""") | |
| return | |
| def _(): | |
| from pandas import DataFrame | |
| from pathlib import Path | |
| from sqlalchemy import create_engine | |
| from src import config | |
| from src.extract import extract | |
| from src.load import load | |
| from src.transform import QueryEnum, run_queries | |
| return ( | |
| DataFrame, | |
| Path, | |
| QueryEnum, | |
| config, | |
| create_engine, | |
| extract, | |
| load, | |
| run_queries, | |
| ) | |
| def _(mo): | |
| mo.md(r"""### 2.1 Extract and Load""") | |
| return | |
| def _(Path, config, create_engine, extract, load): | |
| DB_PATH = Path(config.SQLITE_DB_ABSOLUTE_PATH) | |
| if DB_PATH.exists() and DB_PATH.stat().st_size > 0: | |
| print("Database found. Skipping ETL process.") | |
| ENGINE = create_engine(f"sqlite:///{DB_PATH}", echo=False) | |
| else: | |
| print("Database not found or empty. Starting ETL process...") | |
| ENGINE = create_engine(f"sqlite:///{DB_PATH}", echo=False) | |
| csv_dataframes = extract( | |
| csv_folder=config.DATASET_ROOT_PATH, | |
| csv_table_mapping=config.get_csv_to_table_mapping(), | |
| public_holidays_url=config.PUBLIC_HOLIDAYS_URL, | |
| ) | |
| load(dataframes=csv_dataframes, database=ENGINE) | |
| print("ETL process complete.") | |
| return (ENGINE,) | |
| def _(mo): | |
| mo.md(r"""### 2.2 Transform""") | |
| return | |
| def _(DataFrame, ENGINE, run_queries): | |
| query_results: dict[str, DataFrame] = run_queries(database=ENGINE) | |
| return (query_results,) | |
| def _(mo): | |
| mo.md(r"""**A. Revenue by Month and Year**""") | |
| return | |
| def _(QueryEnum, query_results: "dict[str, DataFrame]"): | |
| revenue_by_month_year = query_results[QueryEnum.REVENUE_BY_MONTH_YEAR.value] | |
| revenue_by_month_year | |
| return (revenue_by_month_year,) | |
| def _(mo): | |
| mo.md(r"""**B. Top 10 Revenue by categories**""") | |
| return | |
| def _(QueryEnum, query_results: "dict[str, DataFrame]"): | |
| top_10_revenue_categories = query_results[ | |
| QueryEnum.TOP_10_REVENUE_CATEGORIES.value | |
| ] | |
| top_10_revenue_categories | |
| return (top_10_revenue_categories,) | |
| def _(mo): | |
| mo.md(r"""**C. Top 10 Least Revenue by Categories**""") | |
| return | |
| def _(QueryEnum, query_results: "dict[str, DataFrame]"): | |
| top_10_least_revenue_categories = query_results[ | |
| QueryEnum.TOP_10_LEAST_REVENUE_CATEGORIES.value | |
| ] | |
| top_10_least_revenue_categories | |
| return (top_10_least_revenue_categories,) | |
| def _(mo): | |
| mo.md(r"""**D. Revenue per State**""") | |
| return | |
| def _(QueryEnum, query_results: "dict[str, DataFrame]"): | |
| revenue_per_state = query_results[QueryEnum.REVENUE_PER_STATE.value] | |
| revenue_per_state | |
| return (revenue_per_state,) | |
| def _(mo): | |
| mo.md(r"""**E. Delivery Date Difference**""") | |
| return | |
| def _(QueryEnum, query_results: "dict[str, DataFrame]"): | |
| delivery_date_difference = query_results[ | |
| QueryEnum.DELIVERY_DATE_DIFFERENCE.value | |
| ] | |
| delivery_date_difference | |
| return (delivery_date_difference,) | |
| def _(mo): | |
| mo.md(r"""**F. Real vs. Predicted Delivered Time**""") | |
| return | |
| def _(QueryEnum, query_results: "dict[str, DataFrame]"): | |
| real_vs_estimated_delivery_time = query_results[ | |
| QueryEnum.REAL_VS_ESTIMATED_DELIVERED_TIME.value | |
| ] | |
| real_vs_estimated_delivery_time | |
| return (real_vs_estimated_delivery_time,) | |
| def _(mo): | |
| mo.md(r"""**G. Global Amount of Order Status**""") | |
| return | |
| def _(QueryEnum, query_results: "dict[str, DataFrame]"): | |
| global_amount_order_status = query_results[ | |
| QueryEnum.GLOBAL_AMOUNT_ORDER_STATUS.value | |
| ] | |
| global_amount_order_status | |
| return (global_amount_order_status,) | |
| def _(mo): | |
| mo.md(r"""**H. Orders per Day and Holidays in 2017**""") | |
| return | |
| def _(QueryEnum, query_results: "dict[str, DataFrame]"): | |
| orders_per_day_and_holidays = query_results[ | |
| QueryEnum.ORDERS_PER_DAY_AND_HOLIDAYS_2017.value | |
| ] | |
| orders_per_day_and_holidays | |
| return (orders_per_day_and_holidays,) | |
| def _(mo): | |
| mo.md(r"""**I. Freight Value Weight Relationship**""") | |
| return | |
| def _(QueryEnum, query_results: "dict[str, DataFrame]"): | |
| freight_value_weight_relationship = query_results[ | |
| QueryEnum.GET_FREIGHT_VALUE_WEIGHT_RELATIONSHIP.value | |
| ] | |
| freight_value_weight_relationship | |
| return (freight_value_weight_relationship,) | |
| def _(mo): | |
| mo.md(r"""## 3. Plots""") | |
| return | |
| def _(): | |
| from src.plots import ( | |
| plot_revenue_by_month_year, | |
| plot_real_vs_predicted_delivered_time, | |
| plot_global_amount_order_status, | |
| plot_revenue_per_state, | |
| plot_top_10_least_revenue_categories, | |
| plot_top_10_revenue_categories_amount, | |
| plot_top_10_revenue_categories, | |
| plot_freight_value_weight_relationship, | |
| plot_delivery_date_difference, | |
| plot_order_amount_per_day_with_holidays, | |
| ) | |
| return ( | |
| plot_delivery_date_difference, | |
| plot_freight_value_weight_relationship, | |
| plot_global_amount_order_status, | |
| plot_order_amount_per_day_with_holidays, | |
| plot_real_vs_predicted_delivered_time, | |
| plot_revenue_by_month_year, | |
| plot_revenue_per_state, | |
| plot_top_10_least_revenue_categories, | |
| plot_top_10_revenue_categories, | |
| plot_top_10_revenue_categories_amount, | |
| ) | |
| def _(mo): | |
| mo.md(r"""**A. Revenue by Month in 2017**""") | |
| return | |
| def _(plot_revenue_by_month_year, revenue_by_month_year): | |
| plot_revenue_by_month_year(df=revenue_by_month_year, year=2017) | |
| return | |
| def _(mo): | |
| mo.md(r"""**B. Real vs. Predicted Delivered Time**""") | |
| return | |
| def _(plot_real_vs_predicted_delivered_time, real_vs_estimated_delivery_time): | |
| plot_real_vs_predicted_delivered_time( | |
| df=real_vs_estimated_delivery_time, year=2017 | |
| ) | |
| return | |
| def _(mo): | |
| mo.md(r"""**C. Global Amount of Order Status**""") | |
| return | |
| def _(global_amount_order_status, plot_global_amount_order_status): | |
| plot_global_amount_order_status(df=global_amount_order_status) | |
| return | |
| def _(mo): | |
| mo.md(r"""**D. Revenue per State**""") | |
| return | |
| def _(plot_revenue_per_state, revenue_per_state): | |
| plot_revenue_per_state(df=revenue_per_state) | |
| return | |
| def _(mo): | |
| mo.md(r"""**E. Top 10 Least Revenue by Categories**""") | |
| return | |
| def _(plot_top_10_least_revenue_categories, top_10_least_revenue_categories): | |
| plot_top_10_least_revenue_categories(df=top_10_least_revenue_categories) | |
| return | |
| def _(mo): | |
| mo.md(r"""**F. Top 10 Revenue Categories Amount**""") | |
| return | |
| def _(plot_top_10_revenue_categories_amount, top_10_revenue_categories): | |
| plot_top_10_revenue_categories_amount(df=top_10_revenue_categories) | |
| return | |
| def _(mo): | |
| mo.md(r"""**G. Top 10 Revenue by Categories**""") | |
| return | |
| def _(plot_top_10_revenue_categories, top_10_revenue_categories): | |
| plot_top_10_revenue_categories(df=top_10_revenue_categories) | |
| return | |
| def _(mo): | |
| mo.md(r"""**H. Freight Value vs. Product Weight**""") | |
| return | |
| def _( | |
| freight_value_weight_relationship, | |
| plot_freight_value_weight_relationship, | |
| ): | |
| plot_freight_value_weight_relationship(df=freight_value_weight_relationship) | |
| return | |
| def _(mo): | |
| mo.md(r"""**I. Diffrence Between Deliver Estimated Date and Delivery Date**""") | |
| return | |
| def _(delivery_date_difference, plot_delivery_date_difference): | |
| plot_delivery_date_difference(df=delivery_date_difference) | |
| return | |
| def _(mo): | |
| mo.md(r"""**J. Order Amount per Day with Holidays**""") | |
| return | |
| def _(orders_per_day_and_holidays, plot_order_amount_per_day_with_holidays): | |
| plot_order_amount_per_day_with_holidays(df=orders_per_day_and_holidays) | |
| return | |
| if __name__ == "__main__": | |
| app.run() | |