Spaces:
Sleeping
Sleeping
Commit ·
cdd27bb
1
Parent(s): a312419
chore: Generate database
Browse files- app.py +102 -0
- src/extract.py +1 -1
app.py
CHANGED
|
@@ -49,5 +49,107 @@ def _(mo):
|
|
| 49 |
return
|
| 50 |
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
if __name__ == "__main__":
|
| 53 |
app.run()
|
|
|
|
| 49 |
return
|
| 50 |
|
| 51 |
|
| 52 |
+
@app.cell
|
| 53 |
+
def _(mo):
|
| 54 |
+
mo.md(r"""## 2. ETL""")
|
| 55 |
+
return
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@app.cell
|
| 59 |
+
def _():
|
| 60 |
+
from pandas import DataFrame
|
| 61 |
+
from pathlib import Path
|
| 62 |
+
from sqlalchemy import create_engine
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
from src import config
|
| 66 |
+
from src.extract import extract
|
| 67 |
+
from src.load import load
|
| 68 |
+
from src.transform import QueryEnum, run_queries
|
| 69 |
+
return (
|
| 70 |
+
DataFrame,
|
| 71 |
+
Path,
|
| 72 |
+
QueryEnum,
|
| 73 |
+
config,
|
| 74 |
+
create_engine,
|
| 75 |
+
extract,
|
| 76 |
+
load,
|
| 77 |
+
run_queries,
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
@app.cell
|
| 82 |
+
def _(Path, config, create_engine):
|
| 83 |
+
# Create the sqlite database
|
| 84 |
+
Path(config.SQLITE_DB_ABSOLUTE_PATH).touch()
|
| 85 |
+
|
| 86 |
+
# Create the database connection
|
| 87 |
+
ENGINE = create_engine(
|
| 88 |
+
r"sqlite:///{}".format(config.SQLITE_DB_ABSOLUTE_PATH), echo=False
|
| 89 |
+
)
|
| 90 |
+
return (ENGINE,)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
@app.cell
|
| 94 |
+
def _(mo):
|
| 95 |
+
mo.md(r"""### 2.1 Extract""")
|
| 96 |
+
return
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
@app.cell
|
| 100 |
+
def _(config, extract):
|
| 101 |
+
csv_folder = config.DATASET_ROOT_PATH
|
| 102 |
+
public_holidays_url = config.PUBLIC_HOLIDAYS_URL
|
| 103 |
+
|
| 104 |
+
# Get the mapping of the csv files to the table names
|
| 105 |
+
csv_table_mapping = config.get_csv_to_table_mapping()
|
| 106 |
+
|
| 107 |
+
# Extract the data from the csv files, holidays and load them into the dataframes
|
| 108 |
+
csv_dataframes = extract(
|
| 109 |
+
csv_folder=csv_folder,
|
| 110 |
+
csv_table_mapping=csv_table_mapping,
|
| 111 |
+
public_holidays_url=public_holidays_url,
|
| 112 |
+
)
|
| 113 |
+
return (csv_dataframes,)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
@app.cell
|
| 117 |
+
def _(mo):
|
| 118 |
+
mo.md(r"""### 2.2 Load""")
|
| 119 |
+
return
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
@app.cell
|
| 123 |
+
def _(ENGINE, csv_dataframes, load):
|
| 124 |
+
# Store dataframes in SQLite database (our Data Warehouse in this case)
|
| 125 |
+
load(dataframes=csv_dataframes, database=ENGINE)
|
| 126 |
+
return
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
@app.cell
|
| 130 |
+
def _(mo):
|
| 131 |
+
mo.md(r"""### 2.3 Transform""")
|
| 132 |
+
return
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
@app.cell
|
| 136 |
+
def _(DataFrame, ENGINE, run_queries):
|
| 137 |
+
query_results: dict[str, DataFrame] = run_queries(database=ENGINE)
|
| 138 |
+
return (query_results,)
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
@app.cell
|
| 142 |
+
def _(QueryEnum, query_results: "dict[str, DataFrame]"):
|
| 143 |
+
# Transforming the revenue_by_month_year query to a table
|
| 144 |
+
revenue_by_month_year = query_results[QueryEnum.REVENUE_BY_MONTH_YEAR.value]
|
| 145 |
+
revenue_by_month_year
|
| 146 |
+
return
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
@app.cell
|
| 150 |
+
def _():
|
| 151 |
+
return
|
| 152 |
+
|
| 153 |
+
|
| 154 |
if __name__ == "__main__":
|
| 155 |
app.run()
|
src/extract.py
CHANGED
|
@@ -25,7 +25,7 @@ def get_public_holidays(url: str, year: str) -> DataFrame:
|
|
| 25 |
data = DataFrame(response.json())
|
| 26 |
|
| 27 |
# Drop the columns types and countries
|
| 28 |
-
df = data.drop(["types", "
|
| 29 |
# Convert the date column to datetime
|
| 30 |
df["date"] = to_datetime(df["date"])
|
| 31 |
|
|
|
|
| 25 |
data = DataFrame(response.json())
|
| 26 |
|
| 27 |
# Drop the columns types and countries
|
| 28 |
+
df = data.drop(["types", "counties"], axis=1) # Miss spelling in the API
|
| 29 |
# Convert the date column to datetime
|
| 30 |
df["date"] = to_datetime(df["date"])
|
| 31 |
|