Spaces:

LaelaZ
/

ecommerce-data-pipeline

Sleeping

App Files Files Community

LaelaZ committed 5 days ago

Commit

16decd8

verified ·

1 Parent(s): 503e4c7

Deploy CommercePipeline to HF Spaces (Docker)

Browse files

Files changed (28) hide show

.streamlit/config.toml +57 -0
Dockerfile +29 -0
LICENSE +21 -0
Makefile +52 -0
dashboard/app.py +649 -0
docker-compose.yml +16 -0
pipeline/__init__.py +16 -0
pipeline/__main__.py +6 -0
pipeline/cli.py +79 -0
pipeline/config.py +56 -0
pipeline/flow.py +88 -0
pipeline/ingest.py +240 -0
pipeline/load.py +53 -0
pipeline/orchestrate.py +74 -0
pipeline/quality.py +210 -0
pipeline/sql/marts/customer_cohort_retention.sql +39 -0
pipeline/sql/marts/daily_revenue.sql +28 -0
pipeline/sql/marts/funnel_conversion.sql +31 -0
pipeline/sql/marts/int_order_revenue.sql +23 -0
pipeline/sql/marts/top_products.sql +23 -0
pipeline/sql/staging/stg_customers.sql +9 -0
pipeline/sql/staging/stg_events.sql +16 -0
pipeline/sql/staging/stg_order_items.sql +13 -0
pipeline/sql/staging/stg_orders.sql +11 -0
pipeline/sql/staging/stg_products.sql +10 -0
pipeline/transform.py +73 -0
pyproject.toml +34 -0
requirements.txt +11 -0

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,57 @@

+# CommercePipeline — dashboard theme.
+# Identity: ANALYTICS / BI — crisp, data-forward, professional.
+#
+# Distinct visual identity: a confident TEAL accent on cool slate neutrals,
+# with monospace numerals for figures (the data-tool hallmark). This is its
+# own identity, deliberately separate from the sibling portfolio projects.
+[theme]
+base = "light"
+# --- Brand ---
+primaryColor = "#0d9488"               # teal-600 — accent / interactive
+backgroundColor = "#f5f7f9"            # cool, slate-tinted canvas
+secondaryBackgroundColor = "#ffffff"   # white — cards / inputs
+textColor = "#0f1b2a"                  # near slate-900 — body text
+borderColor = "#e2e8f0"                # hairline borders on cards/inputs
+linkColor = "#0d9488"
+# --- Type ---
+font = "sans-serif"                    # clean system/Inter-like body stack
+headingFont = "sans-serif"
+codeFont = "monospace"                 # tabular numerals + code blocks
+baseRadius = "0.85rem"                 # rounded-xl cards, consistent corners
+showWidgetBorder = true
+# --- Cohesive chart palettes (one identity across every Altair chart) ---
+# Teal-led categorical scale (data-forward, not a rainbow).
+chartCategoricalColors = [
+  "#0d9488",  # teal-600
+  "#2563eb",  # blue-600
+  "#7c3aed",  # violet-600
+  "#d97706",  # amber-600
+  "#059669",  # emerald-600
+  "#db2777",  # pink-600
+  "#0891b2",  # cyan-600
+]
+# Single-hue teal ramp for sequential / heatmap data.
+# Streamlit documents this option as an array of ten colors, so the ramp runs
+# from cyan-50 through teal-900 to supply all ten stops.
+chartSequentialColors = [
+  "#ecfeff", "#cffafe", "#99f6e4", "#5eead4", "#2dd4bf",
+  "#14b8a6", "#0d9488", "#0f766e", "#115e59", "#134e4a",
+]
+# --- Dark sidebar = a focused "control rail", BI-product style ---
+[theme.sidebar]
+backgroundColor = "#0f1b2a"
+secondaryBackgroundColor = "#16263a"
+textColor = "#cbd5e1"
+primaryColor = "#2dd4bf"               # brighter teal pops on dark
+borderColor = "#22344a"
+linkColor = "#2dd4bf"
+[server]
+headless = true
+runOnSave = true
+[browser]
+gatherUsageStats = false

Dockerfile ADDED Viewed

	@@ -0,0 +1,29 @@

+# CommercePipeline -- single image that runs the pipeline and serves the dashboard.
+FROM python:3.11-slim
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1
+WORKDIR /app
+# Install dependencies first for better layer caching.
+COPY requirements.txt ./
+RUN pip install -r requirements.txt
+# Copy the project.
+COPY pipeline ./pipeline
+COPY dashboard ./dashboard
+# Streamlit looks for .streamlit/config.toml relative to the working directory;
+# without this copy the theme and [server]/[browser] settings never reach the image.
+COPY .streamlit ./.streamlit
+COPY pyproject.toml README.md ./
+# Build the warehouse at image-build time so the container starts with data ready.
+# (Re-run at container start by the CMD below.)
+RUN python -m pipeline run
+EXPOSE 8501
+HEALTHCHECK --interval=30s --timeout=5s --start-period=10s \
+    CMD python -c "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:8501/_stcore/health').read()==b'ok' else 1)"
+# Rebuild marts on start (cheap, ~1s) then serve the dashboard.
+CMD ["sh", "-c", "python -m pipeline run && streamlit run dashboard/app.py --server.port 8501 --server.address 0.0.0.0 --server.headless true"]

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2026 Laela Zorana
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

Makefile ADDED Viewed

	@@ -0,0 +1,52 @@

+# CommercePipeline -- Makefile DAG.
+# The pipeline target is itself a small DAG: ingest -> load -> transform -> quality,
+# wired through file-stamp prerequisites so `make` only reruns what changed.
+PYTHON ?= python3
+PORT ?= 8501
+RAW_DIR := data/raw
+WAREHOUSE := data/warehouse/commerce.duckdb
+RAW_STAMP := $(RAW_DIR)/.ingested
+.DEFAULT_GOAL := help
+.PHONY: help install ingest load transform quality pipeline test dashboard demo clean lint
+help: ## Show this help
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) \
+		| awk 'BEGIN {FS = ":.*?## "}; {printf "  \033[36m%-12s\033[0m %s\n", $$1, $$2}'
+install: ## Install runtime + dev dependencies
+	$(PYTHON) -m pip install -r requirements.txt
+$(RAW_STAMP): pipeline/ingest.py pipeline/config.py
+	$(PYTHON) -m pipeline ingest
+	@touch $(RAW_STAMP)
+ingest: $(RAW_STAMP) ## Generate the synthetic raw dataset
+$(WAREHOUSE): $(RAW_STAMP) pipeline/load.py pipeline/transform.py $(wildcard pipeline/sql/**/*.sql)
+	$(PYTHON) -m pipeline load
+	$(PYTHON) -m pipeline transform
+load: $(WAREHOUSE) ## Load raw files into DuckDB
+transform: $(WAREHOUSE) ## Build staging + mart models
+quality: $(WAREHOUSE) ## Run the data-quality gate (fails on violations)
+	$(PYTHON) -m pipeline quality
+pipeline: ## Run the full pipeline: ingest -> load -> transform -> quality
+	$(PYTHON) -m pipeline run
+test: ## Run the test suite
+	$(PYTHON) -m pytest -q
+dashboard: ## Serve the Streamlit dashboard (PORT overrideable)
+	$(PYTHON) -m streamlit run dashboard/app.py --server.port $(PORT)
+demo: pipeline ## Run the pipeline, then launch the dashboard
+	$(MAKE) dashboard
+# The ingest stamp is a dotfile, so the shell glob under RAW_DIR does not match
+# it. Remove it explicitly; otherwise `make ingest` after `make clean` would be
+# considered up to date and the raw data would never be regenerated.
+clean: ## Remove generated data and caches
+	rm -rf $(RAW_DIR)/* $(RAW_STAMP) data/warehouse/* .pytest_cache
+	find . -type d -name __pycache__ -prune -exec rm -rf {} +

dashboard/app.py ADDED Viewed

	@@ -0,0 +1,649 @@

+"""Streamlit dashboard for the CommercePipeline marts.
+Reads the DuckDB warehouse produced by the pipeline (read-only) and presents it
+as a polished, data-forward BI product: a branded header, a bento-style KPI
+grid, monospace numerals, one cohesive teal chart palette, and a clear
+lineage / quality-gate section.
+The visual identity is ANALYTICS / BI — a confident teal accent on cool slate
+neutrals, distinct from the rest of the portfolio. Theme tokens live in
+``.streamlit/config.toml``; this file mirrors the same tokens so the Altair
+charts and custom CSS stay in lock-step.
+Run with::
+    streamlit run dashboard/app.py
+If the warehouse does not exist yet, the app explains how to build it rather
+than crashing.
+"""
+from __future__ import annotations
+import sys
+import textwrap
+from pathlib import Path
+import altair as alt
+import duckdb
+import pandas as pd
+import streamlit as st
+# Streamlit renders via markdown, which turns 4+ space-indented lines into a code
+# block — that would print our inline HTML/CSS as visible text instead of applying
+# it. Dedent every HTML string so tags start at column 0 and render as HTML.
+_st_markdown = st.markdown
+def _dedented_markdown(body, *args, **kwargs):
+    if isinstance(body, str):
+        body = textwrap.dedent(body).strip("\n")
+    return _st_markdown(body, *args, **kwargs)
+st.markdown = _dedented_markdown
+# Make the `pipeline` package importable when run via `streamlit run`.
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from pipeline.config import get_settings  # noqa: E402
+# --- Brand palette — ANALYTICS / BI identity (teal on slate) ------------------
+# Mirrors .streamlit/config.toml so charts + CSS share one language.
+TEAL = "#0d9488"          # teal-600 — primary accent
+TEAL_500 = "#14b8a6"      # teal-500 — line strokes
+TEAL_SOFT = "#99f6e4"     # teal-200 — area fill
+INK = "#0f1b2a"           # near slate-900 — body / values
+SLATE_500 = "#64748b"     # muted labels
+GRID = "#eef2f6"          # hairline chart grid
+# Ordered categorical scale used across every chart for one colour identity.
+CATEGORY_RANGE = ["#0d9488", "#2563eb", "#7c3aed", "#d97706", "#059669", "#db2777", "#0891b2"]
+# Single-hue teal ramp for sequential / heatmap encodings.
+TEAL_RAMP = ["#ecfeff", "#99f6e4", "#2dd4bf", "#0d9488", "#0f766e", "#115e59"]
+# Monospace stack for tabular numerals (the data-tool hallmark).
+MONO = "ui-monospace, 'SF Mono', 'JetBrains Mono', 'Roboto Mono', Menlo, monospace"
+st.set_page_config(
+    page_title="CommercePipeline — Analytics",
+    page_icon="📊",
+    layout="wide",
+    initial_sidebar_state="collapsed",
+    menu_items={"about": "CommercePipeline · trustworthy e-commerce analytics. Built by Laela Zorana."},
+)
+settings = get_settings()
+# --- One cohesive Altair theme for every chart -------------------------------
+# A single config so all charts share one visual language: a clean sans body,
+# hairline horizontal grid, no chart borders, and teal-led colour ranges.
+_LABEL = {"labelColor": SLATE_500, "titleColor": SLATE_500,
+          "labelFontSize": 11, "titleFontSize": 11, "labelFontWeight": 500}
+_CHART_CONFIG = {
+    "config": {
+        "background": "transparent",
+        "font": "Inter, ui-sans-serif, system-ui, -apple-system, 'Segoe UI', Roboto, sans-serif",
+        "view": {"stroke": "transparent"},
+        "axis": {
+            "domain": False, "ticks": False, "labelPadding": 8, "titlePadding": 12,
+            "gridColor": GRID, "gridWidth": 1, **_LABEL,
+        },
+        "axisX": {"grid": False},
+        "axisY": {"grid": True},
+        "legend": {**_LABEL, "symbolType": "circle", "symbolSize": 90},
+        "range": {"category": CATEGORY_RANGE, "heatmap": TEAL_RAMP, "ramp": TEAL_RAMP},
+        "title": {"color": INK, "fontSize": 13, "fontWeight": 600, "anchor": "start", "dy": -4},
+        "bar": {"color": TEAL},
+        "rect": {"stroke": "#ffffff", "strokeWidth": 1.5},
+    }
+}
+# Altair 5.5+ uses ``alt.theme``; fall back to the legacy registry on older 5.x.
+try:  # pragma: no cover - thin shim around the charting lib
+    alt.theme.register("commerce_bi", enable=True)(lambda: _CHART_CONFIG)
+except AttributeError:  # pragma: no cover
+    alt.themes.register("commerce_bi", lambda: _CHART_CONFIG)
+    alt.themes.enable("commerce_bi")
+# --- Global styling -----------------------------------------------------------
+def inject_styles() -> None:
+    """Hide default Streamlit chrome and apply the teal BI identity: branded
+    header, bento KPI cards with an accent rail, and monospace numerals."""
+    st.markdown(
+        f"""
+        <style>
+          :root {{
+            --teal:#0d9488; --teal-700:#0f766e; --teal-50:#f0fdfa;
+            --ink:#0f1b2a; --muted:#64748b; --line:#e6ebf0; --mono:{MONO};
+          }}
+          html, body, [class*="css"], .stApp,
+          button, input, textarea, select {{ font-family:'Inter', ui-sans-serif, system-ui, -apple-system, 'Segoe UI', Roboto, sans-serif; }}
+          /* Remove default Streamlit clutter. */
+          #MainMenu, footer, header[data-testid="stHeader"] {{ visibility:hidden; }}
+          .stDeployButton {{ display:none; }}
+          .block-container {{ padding-top:2.1rem; padding-bottom:3rem; max-width:1200px; }}
+          /* Restrained, BI-grade canvas: a single cool wash, no rainbow. */
+          .stApp {{
+            background:
+              radial-gradient(46rem 40rem at 104% -10%, rgba(13,148,136,0.07), transparent 55%),
+              radial-gradient(40rem 38rem at -6% -8%, rgba(37,99,235,0.05), transparent 52%),
+              #f5f7f9;
+          }}
+          /* Branded header. */
+          .cp-header {{ display:flex; align-items:center; gap:0.85rem; margin-bottom:0.3rem; }}
+          .cp-logo {{
+            display:grid; place-items:center; width:46px; height:46px; border-radius:13px;
+            background:linear-gradient(135deg,#0d9488,#0f766e); color:#fff;
+            box-shadow:0 8px 18px -7px rgba(13,148,136,0.6); flex:0 0 auto;
+          }}
+          .cp-logo svg {{ width:25px; height:25px; }}
+          .cp-title {{ font-size:1.68rem; font-weight:800; letter-spacing:-0.022em; line-height:1.05; color:var(--ink); }}
+          .cp-title .grad {{
+            background:linear-gradient(100deg,#0d9488,#14b8a6);
+            -webkit-background-clip:text; background-clip:text; color:transparent;
+          }}
+          .cp-eyebrow {{ font-size:0.7rem; font-weight:700; letter-spacing:0.15em; text-transform:uppercase; color:#94a3b8; }}
+          .cp-lede {{ color:#475569; font-size:1.0rem; line-height:1.6; max-width:50rem; margin:0.55rem 0 0.2rem; }}
+          .cp-lede b {{ color:var(--teal-700); font-weight:600; }}
+          /* Pill / badge row under the header. */
+          .cp-pills {{ display:flex; flex-wrap:wrap; gap:0.5rem; margin-top:0.95rem; }}
+          .cp-pill {{
+            display:inline-flex; align-items:center; gap:0.45rem;
+            font-size:0.76rem; font-weight:600; color:#334e68;
+            background:#fff; border:1px solid var(--line);
+            padding:0.3rem 0.7rem; border-radius:999px;
+            box-shadow:0 1px 2px rgba(15,27,42,0.04);
+          }}
+          .cp-pill b {{ font-family:var(--mono); font-weight:700; font-variant-numeric:tabular-nums; color:var(--ink); }}
+          .cp-pill.ok {{ color:var(--teal-700); border-color:rgba(13,148,136,0.3); background:var(--teal-50); }}
+          .cp-pill.bad {{ color:#b91c1c; border-color:rgba(220,38,38,0.3); background:#fef2f2; }}
+          .cp-dot {{ width:7px; height:7px; border-radius:999px; background:currentColor;
+                     box-shadow:0 0 0 3px color-mix(in srgb, currentColor 18%, transparent); }}
+          /* Section label. */
+          .cp-section {{ font-size:1.1rem; font-weight:700; color:var(--ink); letter-spacing:-0.01em;
+                         margin:0.2rem 0 0.1rem; display:flex; align-items:baseline; gap:0.55rem; }}
+          .cp-section .num {{ font-family:var(--mono); font-size:0.8rem; color:var(--teal);
+                              font-weight:700; letter-spacing:0.04em; }}
+          .cp-sub {{ color:var(--muted); font-size:0.88rem; margin:0 0 0.45rem; }}
+          /* Bento KPI cards. */
+          div[data-testid="stMetric"] {{
+            background:#ffffff; border:1px solid var(--line); border-radius:16px;
+            padding:1.0rem 1.15rem 0.9rem; position:relative; overflow:hidden;
+            box-shadow:0 1px 2px rgba(15,27,42,0.04), 0 12px 28px -18px rgba(15,27,42,0.22);
+            transition:transform .12s ease, box-shadow .12s ease, border-color .12s ease;
+          }}
+          div[data-testid="stMetric"]:hover {{
+            transform:translateY(-2px); border-color:#d7e0e8;
+            box-shadow:0 1px 2px rgba(15,27,42,0.05), 0 18px 36px -18px rgba(13,148,136,0.34);
+          }}
+          div[data-testid="stMetricLabel"] p {{
+            font-size:0.7rem !important; font-weight:600 !important; letter-spacing:0.07em;
+            text-transform:uppercase; color:var(--muted) !important;
+          }}
+          /* Monospace, tabular numerals — the data-tool hallmark. */
+          div[data-testid="stMetricValue"] {{
+            font-family:var(--mono) !important; font-size:1.7rem !important; font-weight:700 !important;
+            color:var(--ink) !important; letter-spacing:-0.01em; font-variant-numeric:tabular-nums;
+          }}
+          div[data-testid="stMetricDelta"] {{ font-size:0.76rem !important; font-weight:600 !important; }}
+          div[data-testid="stMetricDelta"] div {{ font-variant-numeric:tabular-nums; }}
+          /* Top "proof" cards get a teal accent rail + tinted ground. */
+          .cp-proof div[data-testid="stMetric"] {{ border-top:3px solid var(--teal); background:linear-gradient(180deg,#fbfffe,#ffffff); }}
+          /* Quality-gate check grid. */
+          .cp-gate {{ display:grid; grid-template-columns:repeat(auto-fill,minmax(225px,1fr)); gap:0.5rem; margin:0.3rem 0 0.2rem; }}
+          .cp-check {{ display:flex; align-items:center; gap:0.55rem; background:#fff; border:1px solid var(--line);
+                       border-radius:11px; padding:0.55rem 0.75rem; font-size:0.8rem; }}
+          .cp-check .ic {{ flex:0 0 auto; width:18px; height:18px; border-radius:6px; display:grid; place-items:center;
+                           font-size:0.7rem; font-weight:800; color:#fff; }}
+          .cp-check.pass .ic {{ background:var(--teal); }}
+          .cp-check.fail .ic {{ background:#dc2626; }}
+          .cp-check .nm {{ font-weight:600; color:#334e68; }}
+          .cp-check .rel {{ margin-left:auto; font-family:var(--mono); font-size:0.7rem; color:#94a3b8; }}
+          /* Lineage flow strip. */
+          .cp-flow {{ display:flex; flex-wrap:wrap; align-items:stretch; gap:0.4rem; margin:0.2rem 0 0.4rem; }}
+          .cp-stage {{ flex:1 1 150px; background:#fff; border:1px solid var(--line); border-radius:13px;
+                       padding:0.7rem 0.85rem; box-shadow:0 1px 2px rgba(15,27,42,0.04); }}
+          .cp-stage .st-n {{ font-family:var(--mono); font-size:0.68rem; font-weight:700; color:var(--teal); }}
+          .cp-stage .st-t {{ font-size:0.9rem; font-weight:700; color:var(--ink); margin-top:0.1rem; }}
+          .cp-stage .st-d {{ font-size:0.74rem; color:var(--muted); margin-top:0.15rem; line-height:1.4; }}
+          .cp-stage .st-v {{ font-family:var(--mono); font-size:0.74rem; color:var(--teal-700); font-weight:600; margin-top:0.3rem; }}
+          .cp-arrow {{ align-self:center; color:#cbd5e1; font-weight:700; }}
+          .cp-stage.gate {{ border-color:rgba(13,148,136,0.35); background:var(--teal-50); }}
+          /* Tighten radio / slider chrome. */
+          div[data-testid="stRadio"] label p, div[data-testid="stSlider"] label p {{ font-weight:600; color:#334155; }}
+          /* Footer. */
+          .cp-footer {{ margin-top:2.4rem; padding-top:1.1rem; border-top:1px solid var(--line);
+                        display:flex; flex-wrap:wrap; gap:0.5rem 1.2rem; align-items:center;
+                        justify-content:space-between; color:var(--muted); font-size:0.82rem; }}
+          .cp-footer a {{ color:var(--teal); text-decoration:none; font-weight:600; }}
+          .cp-footer a:hover {{ text-decoration:underline; }}
+          .cp-footer code {{ font-family:var(--mono); background:var(--teal-50); color:var(--teal-700);
+                             padding:0.08rem 0.4rem; border-radius:6px; font-size:0.76rem; }}
+        </style>
+        """,
+        unsafe_allow_html=True,
+    )
+inject_styles()
+# --- Data access --------------------------------------------------------------
+@st.cache_data(show_spinner=False)
+def load_mart(query: str) -> pd.DataFrame:
+    con = duckdb.connect(str(settings.db_path), read_only=True)
+    try:
+        return con.execute(query).df()
+    finally:
+        con.close()
+@st.cache_data(show_spinner=False)
+def pipeline_health() -> dict:
+    """Headline pipeline proof, derived read-only from the warehouse.
+    Returns raw rows ingested, mart count, and the data-quality gate result so
+    the dashboard can show the same trust signals the build enforces.
+    """
+    from pipeline import quality  # local import keeps page load light
+    con = duckdb.connect(str(settings.db_path), read_only=True)
+    try:
+        raw_tables = [
+            r[0]
+            for r in con.execute(
+                "SELECT table_name FROM information_schema.tables "
+                "WHERE table_schema = 'raw' ORDER BY table_name;"
+            ).fetchall()
+        ]
+        raw_rows = sum(
+            con.execute(f"SELECT count(*) FROM raw.{t}").fetchone()[0] for t in raw_tables
+        )
+        marts = [
+            r[0]
+            for r in con.execute(
+                "SELECT table_name FROM information_schema.tables "
+                "WHERE table_schema = 'marts' AND table_name NOT LIKE 'int\\_%' ESCAPE '\\' "
+                "ORDER BY table_name;"
+            ).fetchall()
+        ]
+        results = quality.run(con, settings, raise_on_fail=False)
+        gates_passed = sum(1 for r in results if r.passed)
+        gates_total = len(results)
+        checks = [
+            {"name": r.name, "relation": r.relation, "passed": r.passed, "failing": r.failing_rows}
+            for r in results
+        ]
+    finally:
+        con.close()
+    return {
+        "raw_rows": raw_rows,
+        "raw_tables": len(raw_tables),
+        "marts": marts,
+        "gates_passed": gates_passed,
+        "gates_total": gates_total,
+        "checks": checks,
+    }
+def warehouse_ready() -> bool:
+    if not settings.db_path.exists():
+        return False
+    try:
+        con = duckdb.connect(str(settings.db_path), read_only=True)
+        try:
+            con.execute("SELECT 1 FROM marts.daily_revenue LIMIT 1;")
+            return True
+        finally:
+            con.close()
+    except Exception:
+        return False
+# --- Branded header -----------------------------------------------------------
+st.markdown(
+    """
+    <div class="cp-header">
+      <span class="cp-logo">
+        <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.1"
+             stroke-linecap="round" stroke-linejoin="round">
+          <path d="M3 3v18h18"/><path d="M7 14l3-4 3 3 5-7"/>
+        </svg>
+      </span>
+      <div>
+        <div class="cp-eyebrow">E-commerce analytics pipeline</div>
+        <div class="cp-title">Commerce<span class="grad">Pipeline</span></div>
+      </div>
+    </div>
+    """,
+    unsafe_allow_html=True,
+)
+st.markdown(
+    "<p class='cp-lede'>Raw operational data &rarr; a DuckDB warehouse &rarr; "
+    "layered SQL marts, behind a <b>data-quality gate that fails the build when the "
+    "numbers can&rsquo;t be trusted</b>. Every figure below is served read-only straight "
+    "from the warehouse the pipeline produced.</p>",
+    unsafe_allow_html=True,
+)
+if not warehouse_ready():
+    st.markdown("<div class='cp-pills'><span class='cp-pill'>Warehouse not built yet</span></div>", unsafe_allow_html=True)
+    st.warning(
+        "No warehouse found yet. Build it first:\n\n"
+        "```bash\nmake pipeline    # or: python -m pipeline run\n```\n\n"
+        f"Expected database at `{settings.db_path}`."
+    )
+    st.stop()
+health = pipeline_health()
+gate_ok = health["gates_passed"] == health["gates_total"]
+st.markdown(
+    f"""
+    <div class="cp-pills">
+      <span class="cp-pill {'ok' if gate_ok else 'bad'}">
+        <span class="cp-dot"></span>
+        {'Quality gate passing' if gate_ok else 'Quality gate FAILED'} ·
+        <b>{health['gates_passed']}/{health['gates_total']}</b>
+      </span>
+      <span class="cp-pill"><b>{health['raw_rows']:,}</b>&nbsp;rows ingested · <b>{health['raw_tables']}</b>&nbsp;source tables</span>
+      <span class="cp-pill"><b>{len(health['marts'])}</b>&nbsp;analytics marts</span>
+      <span class="cp-pill">DuckDB · in-process warehouse</span>
+    </div>
+    """,
+    unsafe_allow_html=True,
+)
+st.write("")
+# --- Headline proof KPIs ------------------------------------------------------
+st.markdown(
+    "<div class='cp-section'><span class='num'>01</span>Pipeline health</div>"
+    "<div class='cp-sub'>The trust signals the build itself enforces, surfaced in the product.</div>",
+    unsafe_allow_html=True,
+)
+st.markdown("<div class='cp-proof'>", unsafe_allow_html=True)
+p1, p2, p3, p4 = st.columns(4)
+p1.metric("Rows processed", f"{health['raw_rows']:,}", f"{health['raw_tables']} source tables")
+p2.metric("Analytics marts", f"{len(health['marts'])}", "staging → marts")
+p3.metric(
+    "Data-quality gates",
+    f"{health['gates_passed']}/{health['gates_total']}",
+    "all passing" if gate_ok else "failing",
+    delta_color="normal" if gate_ok else "inverse",
+)
+p4.metric("Warehouse", "DuckDB", "no server required")
+st.markdown("</div>", unsafe_allow_html=True)
+# --- Business KPIs ------------------------------------------------------------
+daily = load_mart(
+    "SELECT order_date, orders, customers, units_sold, revenue, gross_profit, "
+    "avg_order_value, margin_pct FROM marts.daily_revenue ORDER BY order_date"
+)
+daily["order_date"] = pd.to_datetime(daily["order_date"])
+total_rev = float(daily["revenue"].sum())
+total_orders = int(daily["orders"].sum())
+total_profit = float(daily["gross_profit"].sum())
+total_units = int(daily["units_sold"].sum())
+aov = total_rev / total_orders if total_orders else 0.0
+margin = (total_profit / total_rev * 100) if total_rev else 0.0
+st.write("")
+st.markdown(
+    "<div class='cp-section'><span class='num'>02</span>Business performance</div>"
+    f"<div class='cp-sub'>Modelled from completed orders across {len(daily):,} active days.</div>",
+    unsafe_allow_html=True,
+)
+k1, k2, k3, k4 = st.columns(4)
+k1.metric("Revenue", f"${total_rev:,.0f}", f"{total_units:,} units sold")
+k2.metric("Completed orders", f"{total_orders:,}")
+k3.metric("Avg order value", f"${aov:,.2f}")
+k4.metric("Gross margin", f"{margin:.1f}%", f"${total_profit:,.0f} profit")
+st.write("")
+# --- Revenue trend ------------------------------------------------------------
+st.markdown(
+    "<div class='cp-section'><span class='num'>03</span>Revenue trend</div>",
+    unsafe_allow_html=True,
+)
+granularity = st.radio(
+    "Granularity", ["Daily", "Weekly", "Monthly"], horizontal=True, index=1, label_visibility="collapsed"
+)
+freq = {"Daily": "D", "Weekly": "W", "Monthly": "MS"}[granularity]
+trend = (
+    daily.set_index("order_date")
+    .resample(freq)[["revenue", "gross_profit", "orders"]]
+    .sum()
+    .reset_index()
+)
+rev_chart = (
+    alt.Chart(trend)
+    .mark_area(
+        opacity=0.95,
+        line={"color": TEAL_500, "strokeWidth": 2.4},
+        color=alt.Gradient(
+            gradient="linear",
+            stops=[
+                alt.GradientStop(color="#ffffff", offset=0),
+                alt.GradientStop(color=TEAL_SOFT, offset=1),
+            ],
+            x1=1, x2=1, y1=1, y2=0,
+        ),
+    )
+    .encode(
+        x=alt.X("order_date:T", title=None, axis=alt.Axis(grid=False)),
+        y=alt.Y("revenue:Q", title="Revenue ($)", axis=alt.Axis(format="$~s", grid=True)),
+        tooltip=[
+            alt.Tooltip("order_date:T", title="Period"),
+            alt.Tooltip("revenue:Q", title="Revenue", format="$,.0f"),
+            alt.Tooltip("gross_profit:Q", title="Gross profit", format="$,.0f"),
+            alt.Tooltip("orders:Q", title="Orders", format=",.0f"),
+        ],
+    )
+    .properties(height=300)
+)
+st.altair_chart(rev_chart, use_container_width=True)
+st.write("")
+# --- Top products + funnel ----------------------------------------------------
+left, right = st.columns([3, 2], gap="large")
+with left:
+    st.markdown(
+        "<div class='cp-section'><span class='num'>04</span>Top products by revenue</div>",
+        unsafe_allow_html=True,
+    )
+    top_n = st.slider("Show top N", 5, 25, 10, label_visibility="collapsed")
+    top = load_mart(
+        "SELECT product_name, category, units_sold, revenue, margin_pct, revenue_rank "
+        f"FROM marts.top_products ORDER BY revenue_rank LIMIT {top_n}"
+    )
+    bar = (
+        alt.Chart(top)
+        .mark_bar(cornerRadiusEnd=4)
+        .encode(
+            x=alt.X("revenue:Q", title="Revenue ($)", axis=alt.Axis(format="$~s")),
+            y=alt.Y("product_name:N", sort="-x", title=None),
+            color=alt.Color(
+                "category:N",
+                scale=alt.Scale(range=CATEGORY_RANGE),
+                legend=alt.Legend(title="Category", orient="bottom", columns=3),
+            ),
+            tooltip=[
+                alt.Tooltip("product_name:N", title="Product"),
+                alt.Tooltip("category:N", title="Category"),
+                alt.Tooltip("units_sold:Q", title="Units", format=",.0f"),
+                alt.Tooltip("revenue:Q", title="Revenue", format="$,.0f"),
+                alt.Tooltip("margin_pct:Q", title="Margin", format=".1%"),
+            ],
+        )
+        .properties(height=380)
+    )
+    st.altair_chart(bar, use_container_width=True)
+with right:
+    st.markdown(
+        "<div class='cp-section'><span class='num'>05</span>Conversion funnel</div>",
+        unsafe_allow_html=True,
+    )
+    funnel = load_mart(
+        "SELECT step_name, step_index, sessions, pct_of_top, step_conversion "
+        "FROM marts.funnel_conversion ORDER BY step_index"
+    )
+    funnel["label"] = funnel["step_name"].str.replace("_", " ").str.title()
+    base = alt.Chart(funnel).encode(
+        # Sort by the model's own step_index so order is data-driven, not list-driven.
+        y=alt.Y("label:N", sort=alt.SortField(field="step_index", order="ascending"), title=None),
+    )
+    funnel_bars = base.mark_bar(color=TEAL, cornerRadiusEnd=4).encode(
+        x=alt.X("sessions:Q", title="Sessions", axis=alt.Axis(format="~s")),
+        tooltip=[
+            alt.Tooltip("label:N", title="Step"),
+            alt.Tooltip("sessions:Q", title="Sessions", format=",.0f"),
+            alt.Tooltip("pct_of_top:Q", title="% of top", format=".1%"),
+            alt.Tooltip("step_conversion:Q", title="Step conversion", format=".1%"),
+        ],
+    )
+    funnel_text = base.mark_text(align="left", dx=5, color=SLATE_500, fontWeight=600).encode(
+        x=alt.X("sessions:Q"),
+        text=alt.Text("pct_of_top:Q", format=".0%"),
+    )
+    st.altair_chart((funnel_bars + funnel_text).properties(height=210), use_container_width=True)
+    overall = float(funnel.iloc[-1]["pct_of_top"]) if len(funnel) else 0.0
+    st.metric("Overall view → purchase", f"{overall:.1%}")
+st.write("")
+# --- Cohort retention heatmap -------------------------------------------------
+# Section heading (numbered 06) rendered as raw HTML so the cp-* CSS classes apply.
+st.markdown(
+    "<div class='cp-section'><span class='num'>06</span>Customer cohort retention</div>"
+    "<div class='cp-sub'>Share of each signup cohort still ordering N months later.</div>",
+    unsafe_allow_html=True,
+)
+# Only months 0-11 are fetched; cohort_month is formatted as YYYY-MM for display.
+cohort = load_mart(
+    "SELECT strftime(cohort_month, '%Y-%m') AS cohort, month_number, retention_rate "
+    "FROM marts.customer_cohort_retention "
+    "WHERE month_number BETWEEN 0 AND 11 ORDER BY cohort_month, month_number"
+)
+# One rectangle per (cohort, month_number) pair; both axes are ordinal and the
+# fill colour encodes retention_rate on the TEAL_RAMP colour range.
+heat = (
+    alt.Chart(cohort)
+    .mark_rect(stroke="#ffffff", strokeWidth=1.5, cornerRadius=2)
+    .encode(
+        x=alt.X("month_number:O", title="Months since signup", axis=alt.Axis(labelAngle=0)),
+        y=alt.Y("cohort:O", title="Signup cohort"),
+        color=alt.Color(
+            "retention_rate:Q",
+            title="Retention",
+            scale=alt.Scale(range=TEAL_RAMP),
+            legend=alt.Legend(format=".0%", orient="right"),
+        ),
+        tooltip=[
+            alt.Tooltip("cohort:N", title="Cohort"),
+            alt.Tooltip("month_number:O", title="Month #"),
+            alt.Tooltip("retention_rate:Q", title="Retention", format=".1%"),
+        ],
+    )
+    .properties(height=330)
+)
+st.altair_chart(heat, use_container_width=True)
+# --- Lineage / architecture ---------------------------------------------------
+st.write("")
+# Section heading (numbered 07) rendered as raw HTML so the cp-* CSS classes apply.
+st.markdown(
+    "<div class='cp-section'><span class='num'>07</span>Lineage &amp; quality gate</div>"
+    "<div class='cp-sub'>A dependency-free flow composes four stages; this dashboard "
+    "reads only the marts the gate signed off on.</div>",
+    unsafe_allow_html=True,
+)
+# Visible lineage flow strip (bento stages → quality gate → dashboard).
+# marts_count feeds the TRANSFORM card below; the other figures are read
+# straight from the `health` mapping inside the f-string.
+marts_count = len(health["marts"])
+st.markdown(
+    f"""
+    <div class="cp-flow">
+      <div class="cp-stage">
+        <div class="st-n">01 · INGEST</div><div class="st-t">Generate</div>
+        <div class="st-d">Seeded synthetic generator</div>
+        <div class="st-v">{health['raw_rows']:,} rows · {health['raw_tables']} tables</div>
+      </div>
+      <div class="cp-arrow">&rarr;</div>
+      <div class="cp-stage">
+        <div class="st-n">02 · LOAD</div><div class="st-t">Warehouse</div>
+        <div class="st-d">Register raw files into DuckDB</div>
+        <div class="st-v">schema: raw</div>
+      </div>
+      <div class="cp-arrow">&rarr;</div>
+      <div class="cp-stage">
+        <div class="st-n">03 · TRANSFORM</div><div class="st-t">SQL marts</div>
+        <div class="st-d">staging &rarr; intermediate &rarr; marts</div>
+        <div class="st-v">{marts_count} marts</div>
+      </div>
+      <div class="cp-arrow">&rarr;</div>
+      <div class="cp-stage gate">
+        <div class="st-n">04 · QUALITY GATE</div><div class="st-t">Fail-closed</div>
+        <div class="st-d">A single failure exits non-zero &amp; halts the build</div>
+        <div class="st-v">{health['gates_passed']}/{health['gates_total']} passing</div>
+      </div>
+    </div>
+    """,
+    unsafe_allow_html=True,
+)
+# Per-check status grid — the gate, made legible.
+# Sort key is (passed, name): False orders before True, so failing checks are
+# listed first, then everything is alphabetical by check name.
+_checks = sorted(health["checks"], key=lambda c: (c["passed"], c["name"]))
+# One <div> per check; the CSS class and icon both switch on c['passed'].
+# NOTE(review): name/relation are interpolated into HTML unescaped — fine while
+# they come from the pipeline's own check definitions; confirm that stays true.
+_rows = "".join(
+    f"<div class='cp-check {'pass' if c['passed'] else 'fail'}'>"
+    f"<span class='ic'>{'✓' if c['passed'] else '!'}</span>"
+    f"<span class='nm'>{c['name']}</span>"
+    f"<span class='rel'>{c['relation']}</span></div>"
+    for c in _checks
+)
+st.markdown(f"<div class='cp-gate'>{_rows}</div>", unsafe_allow_html=True)
+# Collapsed-by-default expander holding a Markdown table of the four stages,
+# filled with the same `health` figures used by the flow strip. The doubled
+# braces near the end of the f-string render as literal { } in the CLI hint.
+with st.expander("Stage-by-stage detail", expanded=False):
+    st.markdown(
+        f"""
+A dependency-free flow composes four stages; the dashboard you are looking at
+reads the marts the **quality gate** signed off on.
+| # | Stage | What runs | Output |
+|---|-------|-----------|--------|
+| **1** | **Ingest** | Seeded synthetic generator | {health['raw_rows']:,} raw rows across {health['raw_tables']} Parquet/CSV tables |
+| **2** | **Load** | Register raw files into DuckDB | `raw` schema |
+| **3** | **Transform** | Layered SQL: staging → intermediate → marts | {len(health['marts'])} marts ({", ".join(f"`{m}`" for m in health['marts'])}) |
+| **4** | **Quality gate** | Declarative checks (not-null, unique, ranges, accepted values, referential integrity, mart sanity) | **{health['gates_passed']}/{health['gates_total']} passing** — a single failure exits non-zero and **halts the build** |
+```text
+ingest ─▶ load ─▶ transform ─▶ quality gate ─▶ dashboard
+ (gen)   (DuckDB)  (SQL marts)   (fail-closed)   (you are here)
+```
+Rebuild any time with `make pipeline` (or `python -m pipeline run`). Stages are
+addressable individually: `python -m pipeline {{ingest,load,transform,quality}}`.
+        """
+    )
+# --- Footer -------------------------------------------------------------------
+# Shows attribution, the warehouse file name (settings.db_path.name) and the
+# row count of the `daily` frame, labelled as "active days".
+st.markdown(
+    f"""
+    <div class="cp-footer">
+      <span>Built by <b>Laela Zorana</b> · CommercePipeline — trustworthy e-commerce analytics.</span>
+      <span>Source: <code>{settings.db_path.name}</code> · {len(daily):,} active days ·
+            rebuild with <code>make pipeline</code></span>
+    </div>
+    """,
+    unsafe_allow_html=True,
+)

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,16 @@

+# Run the whole thing with: docker compose up --build
+# Then open http://localhost:8501
+services:
+  commerce-pipeline:
+    # Build context is this directory; the resulting image is tagged below.
+    build: .
+    image: commerce-pipeline:latest
+    ports:
+      # host:container — the host side is the port used in the URL above.
+      - "8501:8501"
+    environment:
+      # Override any synthetic-data knob here, e.g. a bigger dataset:
+      # CP_N_ORDERS: "50000"
+      CP_SEED: "42"
+    volumes:
+      # Persist the generated warehouse on the host (optional).
+      - ./data:/app/data
+    # Restart automatically unless the container was stopped explicitly.
+    restart: unless-stopped

pipeline/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+"""CommercePipeline: a small but complete e-commerce analytics pipeline.
+Stages
+------
+ingest    -> generate deterministic synthetic raw data (Parquet/CSV)
+load      -> register raw files into a DuckDB warehouse
+transform -> build staging + mart models with SQL
+quality   -> enforce data-quality gates (fails the run on violation)
+The stages are composed by :mod:`pipeline.flow` and exposed on the CLI.
+"""
+from pipeline.config import Settings, get_settings
+__all__ = ["Settings", "get_settings"]
+__version__ = "1.0.0"

pipeline/__main__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""Enable ``python -m pipeline <stage>``."""
+from pipeline.cli import main
+# Hand main()'s integer return value to the interpreter as the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())

pipeline/cli.py ADDED Viewed

	@@ -0,0 +1,79 @@

+"""Command-line entry point for the pipeline.
+Examples
+--------
+    python -m pipeline run          # full pipeline ingest -> load -> transform -> quality
+    python -m pipeline ingest       # just regenerate raw data
+    python -m pipeline transform    # rebuild marts from the existing warehouse
+    python -m pipeline quality      # re-run the data-quality gate
+"""
+from __future__ import annotations
+import argparse
+import logging
+import sys
+from pipeline import ingest, load, quality, transform
+from pipeline.config import get_settings
+from pipeline.flow import run_pipeline
+from pipeline.load import connect
+def _configure_logging(verbose: bool) -> None:
+    logging.basicConfig(
+        level=logging.INFO if verbose else logging.WARNING,
+        format="%(message)s",
+        stream=sys.stderr,
+    )
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(prog="commerce-pipeline", description=__doc__)
+    parser.add_argument(
+        "stage",
+        choices=["run", "ingest", "load", "transform", "quality"],
+        help="pipeline stage to execute ('run' = full pipeline)",
+    )
+    parser.add_argument("-q", "--quiet", action="store_true", help="suppress progress logs")
+    args = parser.parse_args(argv)
+    _configure_logging(verbose=not args.quiet)
+    s = get_settings()
+    try:
+        if args.stage == "run":
+            summary = run_pipeline(s)
+            print(summary.render())
+            return 0
+        if args.stage == "ingest":
+            counts = ingest.run(s)
+            print(f"ingest complete: {sum(counts.values()):,} rows across {len(counts)} tables")
+            return 0
+        con = connect(s)
+        try:
+            if args.stage == "load":
+                counts = load.run(con, s)
+                print(f"load complete: {counts}")
+            elif args.stage == "transform":
+                marts = transform.run(con, s)
+                print(f"transform complete: built marts {marts}")
+            elif args.stage == "quality":
+                results = quality.run(con, s)
+                print(f"quality complete: {len(results)} checks passed")
+        finally:
+            con.close()
+        return 0
+    except quality.DataQualityError as exc:
+        print(f"DATA QUALITY GATE FAILED: {exc}", file=sys.stderr)
+        return 1
+    except FileNotFoundError as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 2
+if __name__ == "__main__":  # pragma: no cover
+    raise SystemExit(main())

pipeline/config.py ADDED Viewed

	@@ -0,0 +1,56 @@

+"""Central configuration for the pipeline.
+Paths are resolved relative to the project root so the pipeline behaves the
+same whether it is invoked from the repo root, a Makefile target, or CI.
+Everything is overridable via environment variables, which keeps the code
+container- and cloud-friendly without introducing a config framework.
+"""
+from __future__ import annotations
+import os
+from dataclasses import dataclass, field
+from functools import lru_cache
+from pathlib import Path
+# pipeline/config.py -> project root is two levels up.
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+def _env_path(var: str, default: Path) -> Path:
+    raw = os.environ.get(var)
+    return Path(raw).expanduser().resolve() if raw else default
+@dataclass(frozen=True)
+class Settings:
+    """Immutable run configuration."""
+    project_root: Path = PROJECT_ROOT
+    raw_dir: Path = field(default_factory=lambda: _env_path("CP_RAW_DIR", PROJECT_ROOT / "data" / "raw"))
+    warehouse_dir: Path = field(
+        default_factory=lambda: _env_path("CP_WAREHOUSE_DIR", PROJECT_ROOT / "data" / "warehouse")
+    )
+    sql_dir: Path = field(default_factory=lambda: PROJECT_ROOT / "pipeline" / "sql")
+    # Synthetic-data knobs (deterministic given the seed).
+    seed: int = int(os.environ.get("CP_SEED", "42"))
+    n_customers: int = int(os.environ.get("CP_N_CUSTOMERS", "2000"))
+    n_orders: int = int(os.environ.get("CP_N_ORDERS", "12000"))
+    n_products: int = int(os.environ.get("CP_N_PRODUCTS", "120"))
+    start_date: str = os.environ.get("CP_START_DATE", "2024-01-01")
+    end_date: str = os.environ.get("CP_END_DATE", "2024-12-31")
+    @property
+    def db_path(self) -> Path:
+        return self.warehouse_dir / "commerce.duckdb"
+    def ensure_dirs(self) -> None:
+        self.raw_dir.mkdir(parents=True, exist_ok=True)
+        self.warehouse_dir.mkdir(parents=True, exist_ok=True)
+@lru_cache(maxsize=1)
+def get_settings() -> Settings:
+    """Return process-wide settings (cached)."""
+    return Settings()

pipeline/flow.py ADDED Viewed

	@@ -0,0 +1,88 @@

+"""Orchestration: run the full pipeline ingest -> load -> transform -> quality.
+This is a plain, dependency-free DAG runner. Each stage is a function returning
+a small summary; the flow threads a single DuckDB connection through load,
+transform, and quality so they all operate on the same open warehouse database.
+If `prefect` is installed, ``pipeline.orchestrate`` exposes the same DAG as a
+Prefect flow for schedulable/observable runs -- but it is entirely optional and
+the project runs end to end without it.
+"""
+from __future__ import annotations
+import logging
+import time
+from dataclasses import asdict, dataclass, field
+from typing import Any
+from pipeline import ingest, load, quality, transform
+from pipeline.config import Settings, get_settings
+from pipeline.load import connect
+log = logging.getLogger("commerce.flow")
+@dataclass
+class RunSummary:
+    raw_counts: dict[str, int] = field(default_factory=dict)
+    loaded_counts: dict[str, int] = field(default_factory=dict)
+    marts: list[str] = field(default_factory=list)
+    quality_passed: int = 0
+    quality_failed: int = 0
+    seconds: float = 0.0
+    def as_dict(self) -> dict[str, Any]:
+        return asdict(self)
+    def render(self) -> str:
+        lines = [
+            "=" * 60,
+            "  CommercePipeline run summary",
+            "=" * 60,
+            f"  raw rows written : {sum(self.raw_counts.values()):,}",
+        ]
+        for name, n in self.raw_counts.items():
+            lines.append(f"      - {name:<12} {n:>10,}")
+        lines.append(f"  marts produced   : {len(self.marts)} -> {', '.join(self.marts)}")
+        lines.append(
+            f"  quality gates    : {self.quality_passed} passed, {self.quality_failed} failed"
+        )
+        lines.append(f"  elapsed          : {self.seconds:.2f}s")
+        lines.append("=" * 60)
+        return "\n".join(lines)
+def run_pipeline(settings: Settings | None = None) -> RunSummary:
+    """Execute the full pipeline. Raises on a data-quality gate failure."""
+    s = settings or get_settings()
+    t0 = time.perf_counter()
+    summary = RunSummary()
+    log.info("[1/4] ingest: generating synthetic dataset (seed=%d)", s.seed)
+    summary.raw_counts = ingest.run(s)
+    con = connect(s)
+    try:
+        log.info("[2/4] load: registering raw files into DuckDB")
+        summary.loaded_counts = load.run(con, s)
+        log.info("[3/4] transform: building staging + mart models")
+        summary.marts = transform.run(con, s)
+        log.info("[4/4] quality: enforcing data-quality gates")
+        results = quality.run(con, s, raise_on_fail=False)
+        summary.quality_passed = sum(1 for r in results if r.passed)
+        summary.quality_failed = sum(1 for r in results if not r.passed)
+        failed = [r for r in results if not r.passed]
+    finally:
+        con.close()
+    summary.seconds = time.perf_counter() - t0
+    if failed:
+        names = ", ".join(f"{r.name} ({r.failing_rows} rows)" for r in failed)
+        raise quality.DataQualityError(
+            f"pipeline halted: {len(failed)} data-quality gate(s) failed: {names}"
+        )
+    return summary

pipeline/ingest.py ADDED Viewed

	@@ -0,0 +1,240 @@

+"""Ingest stage: generate a realistic, deterministic synthetic e-commerce dataset.
+We use a seeded ``numpy`` generator so every run produces byte-identical data,
+which makes the downstream marts and tests reproducible. Five raw tables are
+emitted, mirroring a typical operational store:
+* ``customers``    - one row per customer (signup date, marketing channel, country)
+* ``products``     - product catalogue (category, price, cost)
+* ``orders``       - one row per order (customer, status, timestamp)
+* ``order_items``  - order line items (product, quantity, unit price)
+* ``events``       - clickstream funnel events (view -> cart -> checkout -> purchase)
+Outputs are written as Parquet (analytics-native, typed) with a CSV mirror of
+``orders`` so the raw layer is also human-inspectable.
+"""
+from __future__ import annotations
+import logging
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+import numpy as np
+import pandas as pd
+from pipeline.config import Settings, get_settings
+log = logging.getLogger("commerce.ingest")
+# Reference data ----------------------------------------------------------------
+CHANNELS = ["organic", "paid_search", "social", "email", "referral", "affiliate"]
+CHANNEL_WEIGHTS = [0.34, 0.22, 0.18, 0.12, 0.09, 0.05]
+COUNTRIES = ["US", "GB", "DE", "FR", "CA", "AU", "NL", "SE"]
+COUNTRY_WEIGHTS = [0.42, 0.14, 0.10, 0.08, 0.09, 0.07, 0.06, 0.04]
+CATEGORIES = ["Apparel", "Home", "Electronics", "Beauty", "Outdoors", "Toys"]
+ORDER_STATUSES = ["completed", "completed", "completed", "completed", "refunded", "cancelled"]
+# Funnel: probability of progressing to the next step.
+FUNNEL_STEPS = ["view", "add_to_cart", "checkout", "purchase"]
+@dataclass
+class RawTables:
+    """Container for the five in-memory raw tables produced by ``generate``."""
+    # Field order matters: ``generate`` constructs this positionally.
+    customers: pd.DataFrame
+    products: pd.DataFrame
+    orders: pd.DataFrame
+    order_items: pd.DataFrame
+    events: pd.DataFrame
+def _daterange_days(start: str, end: str) -> int:
+    d0 = datetime.fromisoformat(start)
+    d1 = datetime.fromisoformat(end)
+    return (d1 - d0).days
+def generate(settings: Settings | None = None) -> RawTables:
+    """Build the in-memory raw tables. Pure function of ``settings`` + seed.
+    All randomness comes from one ``numpy`` generator seeded with
+    ``settings.seed``, so the order of the ``rng`` calls below determines the
+    output; reordering them changes the generated data.
+    Args:
+        settings: run configuration; falls back to ``get_settings()``.
+    Returns:
+        A ``RawTables`` holding customers, products, orders, order_items and events.
+    """
+    s = settings or get_settings()
+    rng = np.random.default_rng(s.seed)
+    span_days = _daterange_days(s.start_date, s.end_date)
+    start = datetime.fromisoformat(s.start_date)
+    # --- customers -----------------------------------------------------------
+    cust_ids = np.arange(1, s.n_customers + 1)
+    # Upper bound is exclusive, so span_days + 1 makes end_date itself reachable.
+    signup_offsets = rng.integers(0, span_days + 1, size=s.n_customers)
+    customers = pd.DataFrame(
+        {
+            "customer_id": cust_ids,
+            "signup_date": [(start + timedelta(days=int(o))).date() for o in signup_offsets],
+            "channel": rng.choice(CHANNELS, size=s.n_customers, p=CHANNEL_WEIGHTS),
+            "country": rng.choice(COUNTRIES, size=s.n_customers, p=COUNTRY_WEIGHTS),
+        }
+    )
+    # --- products ------------------------------------------------------------
+    prod_ids = np.arange(1, s.n_products + 1)
+    # Gamma-distributed prices shifted by 5.0, rounded to cents.
+    base_price = np.round(rng.gamma(shape=2.2, scale=18.0, size=s.n_products) + 5.0, 2)
+    margin = rng.uniform(0.35, 0.65, size=s.n_products)  # gross margin fraction
+    products = pd.DataFrame(
+        {
+            "product_id": prod_ids,
+            "product_name": [f"SKU-{i:04d}" for i in prod_ids],
+            "category": rng.choice(CATEGORIES, size=s.n_products),
+            "unit_price": base_price,
+            "unit_cost": np.round(base_price * (1.0 - margin), 2),
+        }
+    )
+    # --- orders --------------------------------------------------------------
+    # Repeat-purchase behaviour: pick customers with replacement, weighted so a
+    # subset of customers buy more often (a realistic long tail).
+    cust_affinity = rng.gamma(shape=1.6, scale=1.0, size=s.n_customers)
+    cust_affinity /= cust_affinity.sum()
+    order_customers = rng.choice(cust_ids, size=s.n_orders, replace=True, p=cust_affinity)
+    # Order date must be on/after that customer's signup date.
+    signup_by_id = dict(zip(cust_ids, signup_offsets))
+    order_offsets = np.empty(s.n_orders, dtype=int)
+    for i, c in enumerate(order_customers):
+        lo = int(signup_by_id[c])
+        # A customer who signed up on the last day can only order on that day.
+        order_offsets[i] = rng.integers(lo, span_days + 1) if lo < span_days else span_days
+    # NOTE(review): no seasonal (Q4) lift is applied here — each order day is
+    # drawn uniformly between the customer's signup day and end_date.
+    order_ids = np.arange(1, s.n_orders + 1)
+    order_ts = [start + timedelta(days=int(o), seconds=int(rng.integers(0, 86400))) for o in order_offsets]
+    orders = pd.DataFrame(
+        {
+            "order_id": order_ids,
+            "customer_id": order_customers,
+            "order_ts": order_ts,
+            # ORDER_STATUSES repeats "completed", which weights the unweighted draw.
+            "status": rng.choice(ORDER_STATUSES, size=s.n_orders),
+        }
+    ).sort_values("order_ts", kind="stable", ignore_index=True)
+    # Reassign sequential ids by time so order_id is monotonic with order_ts.
+    orders["order_id"] = np.arange(1, s.n_orders + 1)
+    # --- order_items ---------------------------------------------------------
+    # 1-4 line items per order.
+    n_items = rng.integers(1, 5, size=s.n_orders)
+    # Repeat each order_id once per line item so items line up with their order.
+    item_order_ids = np.repeat(orders["order_id"].to_numpy(), n_items)
+    total_items = int(n_items.sum())
+    item_products = rng.choice(prod_ids, size=total_items)
+    quantities = rng.integers(1, 6, size=total_items)
+    # Capture unit price at time of sale (small jitter to mimic promos/price drift).
+    price_lookup = products.set_index("product_id")["unit_price"]
+    sale_unit_price = np.round(
+        price_lookup.loc[item_products].to_numpy() * rng.uniform(0.9, 1.0, size=total_items), 2
+    )
+    order_items = pd.DataFrame(
+        {
+            "order_item_id": np.arange(1, total_items + 1),
+            "order_id": item_order_ids,
+            "product_id": item_products,
+            "quantity": quantities,
+            "unit_price": sale_unit_price,
+        }
+    )
+    # --- events (funnel) -----------------------------------------------------
+    events = _generate_events(rng, orders, start, span_days)
+    return RawTables(customers, products, orders, order_items, events)
+def _generate_events(rng, orders: pd.DataFrame, start: datetime, span_days: int) -> pd.DataFrame:
+    """Generate a clickstream funnel.
+    Every completed order yields a full view->purchase chain (so purchase events
+    reconcile with orders). On top of that we add abandoned sessions that drop
+    out partway, which is what makes funnel_conversion interesting.
+    """
+    rows: list[dict] = []
+    event_id = 1
+    completed = orders[orders["status"] == "completed"]
+    # Funded chains for real purchases.
+    for order_id, customer_id, ts in zip(
+        completed["order_id"], completed["customer_id"], completed["order_ts"]
+    ):
+        session = int(order_id)
+        t = ts - timedelta(minutes=int(rng.integers(3, 40)))
+        for step in FUNNEL_STEPS:
+            rows.append(
+                {
+                    "event_id": event_id,
+                    "session_id": session,
+                    "customer_id": int(customer_id),
+                    "event_type": step,
+                    "event_ts": t,
+                }
+            )
+            event_id += 1
+            t += timedelta(seconds=int(rng.integers(20, 600)))
+    # Abandoned sessions (no order). Roughly 2x the purchase sessions. These
+    # never reach 'purchase' -- by construction the only sessions that purchase
+    # are the funded chains above, so funnel purchases reconcile 1:1 with
+    # completed orders. We advance at most to 'checkout'.
+    n_abandoned = len(completed) * 2
+    abandon_steps = FUNNEL_STEPS[1:-1]  # add_to_cart, checkout (no purchase)
+    drop_probs = [0.55, 0.30]  # P(advance) into add_to_cart, then checkout
+    base_session = int(orders["order_id"].max()) + 1
+    for k in range(n_abandoned):
+        session = base_session + k
+        customer_id = int(rng.choice(orders["customer_id"]))
+        offset = int(rng.integers(0, span_days + 1))
+        t = start + timedelta(days=offset, seconds=int(rng.integers(0, 86400)))
+        rows.append(
+            {
+                "event_id": event_id,
+                "session_id": session,
+                "customer_id": customer_id,
+                "event_type": "view",
+                "event_ts": t,
+            }
+        )
+        event_id += 1
+        for step, p in zip(abandon_steps, drop_probs):
+            if rng.random() > p:
+                break
+            t += timedelta(seconds=int(rng.integers(20, 600)))
+            rows.append(
+                {
+                    "event_id": event_id,
+                    "session_id": session,
+                    "customer_id": customer_id,
+                    "event_type": step,
+                    "event_ts": t,
+                }
+            )
+            event_id += 1
+    events = pd.DataFrame(rows)
+    return events.sort_values("event_ts", kind="stable", ignore_index=True)
+def write_raw(tables: RawTables, settings: Settings | None = None) -> dict[str, int]:
+    """Persist raw tables to the raw directory. Returns row counts per table."""
+    s = settings or get_settings()
+    s.ensure_dirs()
+    counts: dict[str, int] = {}
+    for name in ("customers", "products", "orders", "order_items", "events"):
+        df: pd.DataFrame = getattr(tables, name)
+        df.to_parquet(s.raw_dir / f"{name}.parquet", index=False)
+        counts[name] = len(df)
+    # Human-readable CSV mirror of the headline table.
+    tables.orders.to_csv(s.raw_dir / "orders.csv", index=False)
+    log.info("wrote raw tables: %s", counts)
+    return counts
+def run(settings: Settings | None = None) -> dict[str, int]:
+    """Ingest entry point used by the flow/CLI."""
+    s = settings or get_settings()
+    tables = generate(s)
+    return write_raw(tables, s)
+if __name__ == "__main__":  # pragma: no cover
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+    print(run())

pipeline/load.py ADDED Viewed

	@@ -0,0 +1,53 @@

+"""Load stage: register the raw Parquet files into a DuckDB warehouse.
+We load into a ``raw`` schema with explicit ``CREATE TABLE AS SELECT`` so the
+warehouse is materialised (self-contained, queryable with the DuckDB CLI, and
+safe to ship to the dashboard) rather than relying on live file scans.
+"""
+from __future__ import annotations
+import logging
+import duckdb
+from pipeline.config import Settings, get_settings
+log = logging.getLogger("commerce.load")
+RAW_TABLES = ("customers", "products", "orders", "order_items", "events")
+def connect(settings: Settings | None = None, read_only: bool = False) -> duckdb.DuckDBPyConnection:
+    """Open a connection to the warehouse database."""
+    s = settings or get_settings()
+    s.ensure_dirs()
+    return duckdb.connect(str(s.db_path), read_only=read_only)
+def run(con: duckdb.DuckDBPyConnection | None = None, settings: Settings | None = None) -> dict[str, int]:
+    """Load all raw Parquet files into ``raw.*`` tables. Returns row counts."""
+    s = settings or get_settings()
+    owns_con = con is None
+    con = con or connect(s)
+    try:
+        con.execute("CREATE SCHEMA IF NOT EXISTS raw;")
+        counts: dict[str, int] = {}
+        for name in RAW_TABLES:
+            path = s.raw_dir / f"{name}.parquet"
+            if not path.exists():
+                raise FileNotFoundError(
+                    f"raw table {name!r} not found at {path} - run the ingest stage first"
+                )
+            con.execute(f"CREATE OR REPLACE TABLE raw.{name} AS SELECT * FROM read_parquet(?);", [str(path)])
+            counts[name] = con.execute(f"SELECT count(*) FROM raw.{name};").fetchone()[0]
+        log.info("loaded raw tables into DuckDB: %s", counts)
+        return counts
+    finally:
+        if owns_con:
+            con.close()
+if __name__ == "__main__":  # pragma: no cover
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+    print(run())

pipeline/orchestrate.py ADDED Viewed

	@@ -0,0 +1,74 @@

+"""Optional Prefect orchestration of the same DAG exposed in :mod:`pipeline.flow`.
+Prefect is **not** a hard dependency. The Makefile DAG and ``pipeline.flow`` run
+the pipeline end to end without it. Install the extra to get a schedulable,
+observable flow::
+    pip install prefect
+    python -m pipeline.orchestrate          # run the flow once
+    prefect server start                    # (optional) UI for run history
+Each stage is a Prefect ``@task`` so retries, logging, and the run graph come for
+free; the orchestration mirrors ``flow.run_pipeline`` exactly.
+"""
+from __future__ import annotations
+import sys
+from pipeline import ingest, load, quality, transform
+from pipeline.config import get_settings
+from pipeline.load import connect
+try:
+    from prefect import flow, task
+except ImportError:  # pragma: no cover - optional dependency
+    print(
+        "Prefect is not installed. Install it with `pip install prefect`,\n"
+        "or run the dependency-free pipeline with `python -m pipeline run`.",
+        file=sys.stderr,
+    )
+    raise SystemExit(2)
+@task(name="ingest", retries=1)
+def ingest_task(settings) -> dict:
+    return ingest.run(settings)
+@task(name="load")
+def load_task(settings) -> dict:
+    con = connect(settings)
+    try:
+        return load.run(con, settings)
+    finally:
+        con.close()
+@task(name="transform")
+def transform_task(settings) -> list:
+    con = connect(settings)
+    try:
+        return transform.run(con, settings)
+    finally:
+        con.close()
+@task(name="quality")
+def quality_task(settings) -> int:
+    # raise_on_fail=True so a failed gate fails the Prefect run.
+    results = quality.run(settings=settings, raise_on_fail=True)
+    return len(results)
+@flow(name="commerce-pipeline")
+def commerce_pipeline_flow() -> None:
+    settings = get_settings()
+    raw = ingest_task(settings)
+    loaded = load_task(settings, wait_for=[raw])
+    marts = transform_task(settings, wait_for=[loaded])
+    quality_task(settings, wait_for=[marts])
+if __name__ == "__main__":  # pragma: no cover
+    commerce_pipeline_flow()

pipeline/quality.py ADDED Viewed

	@@ -0,0 +1,210 @@

+"""Data-quality stage: declarative checks that gate the pipeline.
+Each check is a small dataclass that compiles to a single COUNT query returning
+the number of *violating* rows. A non-zero count fails the check; any failed
+check makes :func:`run` raise :class:`DataQualityError`, which the flow surfaces
+as a non-zero exit code. This is the "quality gate" -- bad data stops the run.
+Supported check types:
+    * not_null            - column has no NULLs
+    * unique              - column (or column set) has no duplicates
+    * accepted_range      - numeric column within [min, max]
+    * accepted_values     - column only contains a given value set
+    * relationship        - every FK value exists in a referenced PK column
+    * expression          - arbitrary boolean SQL that must hold for every row
+"""
+from __future__ import annotations
+import logging
+from dataclasses import dataclass, field
+from typing import Iterable, Sequence
+import duckdb
+from pipeline.config import Settings, get_settings
+log = logging.getLogger("commerce.quality")
+class DataQualityError(RuntimeError):
+    """Raised when one or more data-quality checks fail.
+    Subclasses ``RuntimeError`` and adds no behaviour of its own; the message
+    passed by the raiser describes the failure.
+    """
+@dataclass
+class CheckResult:
+    """Outcome of running a single ``Check``."""
+    # Name and target relation copied from the check that produced this result.
+    name: str
+    relation: str
+    # True when the check found zero violating rows.
+    passed: bool
+    # Number of violating rows counted by the check's query.
+    failing_rows: int
+    # Optional free-text detail; defaults to an empty string.
+    detail: str = ""
+@dataclass
+class Check:
+    """A single declarative data-quality assertion."""
+    name: str
+    relation: str
+    kind: str
+    column: str | None = None
+    columns: Sequence[str] | None = None
+    min: float | None = None
+    max: float | None = None
+    values: Sequence[object] | None = None
+    to_relation: str | None = None
+    to_column: str | None = None
+    expression: str | None = None
+    def _violation_query(self) -> str:
+        rel = self.relation
+        if self.kind == "not_null":
+            return f"SELECT count(*) FROM {rel} WHERE {self.column} IS NULL"
+        if self.kind == "unique":
+            cols = ", ".join(self.columns or [self.column])  # type: ignore[list-item]
+            # rows that participate in a duplicate group
+            return (
+                f"SELECT COALESCE(sum(c), 0) FROM ("
+                f"  SELECT count(*) AS c FROM {rel} GROUP BY {cols} HAVING count(*) > 1"
+                f") d"
+            )
+        if self.kind == "accepted_range":
+            conds = []
+            if self.min is not None:
+                conds.append(f"{self.column} < {self.min}")
+            if self.max is not None:
+                conds.append(f"{self.column} > {self.max}")
+            conds.append(f"{self.column} IS NULL")
+            return f"SELECT count(*) FROM {rel} WHERE {' OR '.join(conds)}"
+        if self.kind == "accepted_values":
+            rendered = ", ".join(_sql_literal(v) for v in (self.values or []))
+            return f"SELECT count(*) FROM {rel} WHERE {self.column} NOT IN ({rendered})"
+        if self.kind == "relationship":
+            return (
+                f"SELECT count(*) FROM {rel} child "
+                f"LEFT JOIN {self.to_relation} parent "
+                f"  ON child.{self.column} = parent.{self.to_column} "
+                f"WHERE child.{self.column} IS NOT NULL AND parent.{self.to_column} IS NULL"
+            )
+        if self.kind == "expression":
+            return f"SELECT count(*) FROM {rel} WHERE NOT ({self.expression})"
+        raise ValueError(f"unknown check kind: {self.kind!r}")
+    def run(self, con: duckdb.DuckDBPyConnection) -> CheckResult:
+        failing = con.execute(self._violation_query()).fetchone()[0] or 0
+        return CheckResult(
+            name=self.name,
+            relation=self.relation,
+            passed=failing == 0,
+            failing_rows=int(failing),
+        )
+def _sql_literal(value: object) -> str:
+    if isinstance(value, str):
+        escaped = value.replace("'", "''")
+        return f"'{escaped}'"
+    if isinstance(value, bool):
+        return "TRUE" if value else "FALSE"
+    return str(value)
+def default_checks() -> list[Check]:
+    """The suite enforced on every pipeline run."""
+    return [
+        # --- raw integrity ---
+        Check("customers.pk_unique", "raw.customers", "unique", column="customer_id"),
+        Check("customers.id_not_null", "raw.customers", "not_null", column="customer_id"),
+        Check("products.pk_unique", "raw.products", "unique", column="product_id"),
+        Check("orders.pk_unique", "raw.orders", "unique", column="order_id"),
+        Check("order_items.pk_unique", "raw.order_items", "unique", column="order_item_id"),
+        # referential integrity
+        Check(
+            "orders.customer_fk",
+            "raw.orders",
+            "relationship",
+            column="customer_id",
+            to_relation="raw.customers",
+            to_column="customer_id",
+        ),
+        Check(
+            "order_items.order_fk",
+            "raw.order_items",
+            "relationship",
+            column="order_id",
+            to_relation="raw.orders",
+            to_column="order_id",
+        ),
+        Check(
+            "order_items.product_fk",
+            "raw.order_items",
+            "relationship",
+            column="product_id",
+            to_relation="raw.products",
+            to_column="product_id",
+        ),
+        # accepted ranges / values
+        Check("products.price_positive", "raw.products", "accepted_range", column="unit_price", min=0.01),
+        Check("products.cost_nonneg", "raw.products", "accepted_range", column="unit_cost", min=0.0),
+        Check("order_items.qty_range", "raw.order_items", "accepted_range", column="quantity", min=1, max=100),
+        Check(
+            "orders.status_values",
+            "raw.orders",
+            "accepted_values",
+            column="status",
+            values=["completed", "refunded", "cancelled"],
+        ),
+        # --- mart sanity ---
+        Check("daily_revenue.nonneg", "marts.daily_revenue", "accepted_range", column="revenue", min=0.0),
+        Check(
+            "daily_revenue.margin_bounded",
+            "marts.daily_revenue",
+            "expression",
+            expression="margin_pct BETWEEN -1 AND 1",
+        ),
+        Check(
+            "cohort.retention_bounded",
+            "marts.customer_cohort_retention",
+            "expression",
+            expression="retention_rate >= 0 AND retention_rate <= 1",
+        ),
+        Check(
+            "funnel.monotonic_nonincreasing",
+            "marts.funnel_conversion",
+            "expression",
+            expression="pct_of_top <= 1.0",
+        ),
+    ]
+def run(
+    con: duckdb.DuckDBPyConnection | None = None,
+    settings: Settings | None = None,
+    checks: Iterable[Check] | None = None,
+    raise_on_fail: bool = True,
+) -> list[CheckResult]:
+    """Run all checks. Raises :class:`DataQualityError` if any fail."""
+    s = settings or get_settings()
+    owns_con = con is None
+    con = con or duckdb.connect(str(s.db_path), read_only=True)
+    checks = list(checks if checks is not None else default_checks())
+    try:
+        results = [c.run(con) for c in checks]
+    finally:
+        if owns_con:
+            con.close()
+    failed = [r for r in results if not r.passed]
+    for r in results:
+        status = "PASS" if r.passed else "FAIL"
+        log.info("  [%s] %-32s %s (%d violating rows)", status, r.name, r.relation, r.failing_rows)
+    if failed and raise_on_fail:
+        names = ", ".join(f"{r.name} ({r.failing_rows} rows)" for r in failed)
+        raise DataQualityError(f"{len(failed)} data-quality check(s) failed: {names}")
+    return results
+if __name__ == "__main__":  # pragma: no cover
+    # Script entry point: emit check results as bare messages at INFO level.
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+    # run() raises by default when any check fails, so reaching the print
+    # below means the whole suite passed.
+    run()
+    print("all data-quality checks passed")

pipeline/sql/marts/customer_cohort_retention.sql ADDED Viewed

	@@ -0,0 +1,39 @@

+-- Mart: monthly cohort retention.
+-- A customer's cohort is their signup month. For each cohort we measure how many
+-- customers placed a completed order N months later (month_number = 0,1,2,...).
+-- Output: one row per (cohort_month, month_number) that has at least one active
+-- customer; combinations with no activity are absent rather than zero-filled.
+CREATE OR REPLACE TABLE marts.customer_cohort_retention AS
+WITH cohorts AS (
+    SELECT customer_id, signup_month AS cohort_month
+    FROM staging.stg_customers
+),
+-- Distinct active months per customer (from completed orders).
+activity AS (
+    SELECT DISTINCT customer_id, order_month AS activity_month
+    FROM marts.int_order_revenue
+    WHERE is_completed
+),
+-- Denominator: every customer in the cohort, whether or not they ever ordered.
+cohort_size AS (
+    SELECT cohort_month, count(*) AS cohort_customers
+    FROM cohorts
+    GROUP BY cohort_month
+),
+retained AS (
+    SELECT
+        c.cohort_month,
+        -- whole months between signup and activity
+        (date_diff('month', c.cohort_month, a.activity_month)) AS month_number,
+        count(DISTINCT c.customer_id) AS active_customers
+    FROM cohorts c
+    JOIN activity a USING (customer_id)
+    -- Ignore activity dated before the signup month so month_number is never negative.
+    WHERE a.activity_month >= c.cohort_month
+    GROUP BY c.cohort_month, month_number
+)
+SELECT
+    r.cohort_month,
+    r.month_number,
+    s.cohort_customers,
+    r.active_customers,
+    -- NULLIF turns a zero denominator into NULL instead of a division error.
+    CAST(r.active_customers AS DOUBLE) / NULLIF(s.cohort_customers, 0) AS retention_rate
+FROM retained r
+JOIN cohort_size s USING (cohort_month)
+ORDER BY r.cohort_month, r.month_number;

pipeline/sql/marts/daily_revenue.sql ADDED Viewed

	@@ -0,0 +1,28 @@

+-- Mart: daily revenue and order KPIs from completed orders.
+-- Only rows with is_completed are kept, so refunded/cancelled orders contribute
+-- to none of the columns below; this model does not report them separately.
+CREATE OR REPLACE TABLE marts.daily_revenue AS
+WITH completed AS (
+    SELECT * FROM marts.int_order_revenue WHERE is_completed
+),
+-- One row per order_date; days with no completed orders produce no row.
+by_day AS (
+    SELECT
+        order_date,
+        count(*)                          AS orders,
+        count(DISTINCT customer_id)       AS customers,
+        sum(n_items)                      AS units_sold,
+        CAST(sum(gross_revenue) AS DECIMAL(14, 2)) AS revenue,
+        CAST(sum(gross_profit)  AS DECIMAL(14, 2)) AS gross_profit
+    FROM completed
+    GROUP BY order_date
+)
+SELECT
+    order_date,
+    orders,
+    customers,
+    units_sold,
+    revenue,
+    gross_profit,
+    -- NULLIF guards both ratios: a zero denominator yields NULL, not an error.
+    CAST(revenue / NULLIF(orders, 0) AS DECIMAL(12, 2)) AS avg_order_value,
+    -- margin_pct is a fraction (gross_profit / revenue), not a percentage.
+    CAST(gross_profit / NULLIF(revenue, 0) AS DECIMAL(6, 4)) AS margin_pct
+FROM by_day
+ORDER BY order_date;

pipeline/sql/marts/funnel_conversion.sql ADDED Viewed

	@@ -0,0 +1,31 @@

+-- Mart: funnel conversion across the view -> add_to_cart -> checkout -> purchase steps.
+-- Sessions are counted at the deepest step they reached, then we report the count
+-- reaching each step and the step-over-step + overall conversion rates.
+-- A session counts as having reached every step at or below its deepest step,
+-- even if it has no event recorded for the earlier steps.
+-- NOTE(review): if staging.stg_events has no rows, the CROSS JOIN below yields
+-- nothing and this table is empty rather than four zero rows -- confirm that
+-- consumers handle an empty funnel.
+CREATE OR REPLACE TABLE marts.funnel_conversion AS
+WITH session_depth AS (
+    SELECT session_id, max(funnel_step) AS max_step
+    FROM staging.stg_events
+    GROUP BY session_id
+),
+-- Step indexes must match the funnel_step values assigned in staging.stg_events.
+steps(step_name, step_index) AS (
+    VALUES ('view', 1), ('add_to_cart', 2), ('checkout', 3), ('purchase', 4)
+),
+reached AS (
+    SELECT
+        s.step_name,
+        s.step_index,
+        count(*) FILTER (WHERE d.max_step >= s.step_index) AS sessions
+    FROM steps s
+    CROSS JOIN session_depth d
+    GROUP BY s.step_name, s.step_index
+)
+SELECT
+    step_name,
+    step_index,
+    sessions,
+    -- Share of the first step's sessions; NULL when the first step has 0 sessions.
+    CAST(sessions AS DOUBLE)
+        / NULLIF(first_value(sessions) OVER (ORDER BY step_index), 0) AS pct_of_top,
+    -- Share of the previous step's sessions; NULL on the first step (no previous
+    -- row) and when the previous step has 0 sessions.
+    CAST(sessions AS DOUBLE)
+        / NULLIF(lag(sessions) OVER (ORDER BY step_index), 0) AS step_conversion
+FROM reached
+ORDER BY step_index;

pipeline/sql/marts/int_order_revenue.sql ADDED Viewed

	@@ -0,0 +1,23 @@

+-- Intermediate: one row per order with rolled-up revenue/profit from its items.
+-- Shared by several marts, so we materialise it as a table.
+-- Every order in staging.stg_orders is kept, whatever its status; downstream
+-- models filter on is_completed themselves.
+CREATE OR REPLACE TABLE marts.int_order_revenue AS
+SELECT
+    o.order_id,
+    o.customer_id,
+    o.order_date,
+    o.order_month,
+    o.status,
+    o.is_completed,
+    -- Orders with no matching items get zeros rather than NULLs.
+    COALESCE(i.n_items, 0)            AS n_items,
+    COALESCE(i.gross_revenue, 0)      AS gross_revenue,
+    COALESCE(i.gross_profit, 0)       AS gross_profit
+FROM staging.stg_orders o
+-- LEFT JOIN so orders without any line item are not dropped.
+LEFT JOIN (
+    SELECT
+        order_id,
+        sum(quantity)          AS n_items,
+        sum(line_revenue)      AS gross_revenue,
+        sum(line_gross_profit) AS gross_profit
+    FROM staging.stg_order_items
+    GROUP BY order_id
+) i USING (order_id);

pipeline/sql/marts/top_products.sql ADDED Viewed

	@@ -0,0 +1,23 @@

+-- Mart: product leaderboard by revenue, with category and units.
+-- Built from completed orders only so it reflects realised sales.
+-- Products with no line item on a completed order do not appear.
+CREATE OR REPLACE TABLE marts.top_products AS
+WITH completed_items AS (
+    SELECT oi.*
+    FROM staging.stg_order_items oi
+    JOIN marts.int_order_revenue o USING (order_id)
+    WHERE o.is_completed
+)
+SELECT
+    p.product_id,
+    p.product_name,
+    p.category,
+    sum(ci.quantity)                              AS units_sold,
+    count(DISTINCT ci.order_id)                   AS orders,
+    CAST(sum(ci.line_revenue)      AS DECIMAL(14, 2)) AS revenue,
+    CAST(sum(ci.line_gross_profit) AS DECIMAL(14, 2)) AS gross_profit,
+    -- NULLIF turns zero revenue into a NULL margin instead of a division error.
+    CAST(sum(ci.line_gross_profit) / NULLIF(sum(ci.line_revenue), 0) AS DECIMAL(6, 4)) AS margin_pct,
+    -- product_id breaks ties so products with equal revenue get the same rank
+    -- on every rebuild instead of an arbitrary one.
+    row_number() OVER (ORDER BY sum(ci.line_revenue) DESC, p.product_id) AS revenue_rank
+FROM completed_items ci
+JOIN staging.stg_products p USING (product_id)
+GROUP BY p.product_id, p.product_name, p.category
+ORDER BY revenue DESC, p.product_id;

pipeline/sql/staging/stg_customers.sql ADDED Viewed

	@@ -0,0 +1,9 @@

+-- Staging: typed customers with normalised channel/country and a derived signup
+-- cohort month. This view does not de-duplicate: every row of raw.customers
+-- passes through.
+CREATE OR REPLACE VIEW staging.stg_customers AS
+SELECT
+    customer_id,
+    CAST(signup_date AS DATE)                      AS signup_date,
+    -- First day of the signup month; used as the cohort key downstream.
+    date_trunc('month', CAST(signup_date AS DATE)) AS signup_month,
+    lower(channel)                                 AS channel,
+    upper(country)                                 AS country
+FROM raw.customers;

pipeline/sql/staging/stg_events.sql ADDED Viewed

	@@ -0,0 +1,16 @@

+-- Staging: funnel events normalised to an ordinal step index.
+CREATE OR REPLACE VIEW staging.stg_events AS
+SELECT
+    event_id,
+    session_id,
+    customer_id,
+    lower(event_type) AS event_type,
+    -- Matching is case-insensitive. Any event type outside the four funnel
+    -- steps (including NULL) maps to 0.
+    CASE lower(event_type)
+        WHEN 'view'        THEN 1
+        WHEN 'add_to_cart' THEN 2
+        WHEN 'checkout'    THEN 3
+        WHEN 'purchase'    THEN 4
+        ELSE 0
+    END               AS funnel_step,
+    CAST(event_ts AS TIMESTAMP) AS event_ts
+FROM raw.events;

pipeline/sql/staging/stg_order_items.sql ADDED Viewed

	@@ -0,0 +1,13 @@

+-- Staging: order line items with computed line revenue and cost.
+-- Revenue uses the unit_price stored on the order item; cost uses the unit_cost
+-- from staging.stg_products.
+CREATE OR REPLACE VIEW staging.stg_order_items AS
+SELECT
+    oi.order_item_id,
+    oi.order_id,
+    oi.product_id,
+    oi.quantity,
+    CAST(oi.unit_price AS DECIMAL(10, 2))                          AS unit_price,
+    CAST(oi.unit_price * oi.quantity AS DECIMAL(12, 2))            AS line_revenue,
+    CAST(p.unit_cost * oi.quantity AS DECIMAL(12, 2))             AS line_cost,
+    CAST((oi.unit_price - p.unit_cost) * oi.quantity AS DECIMAL(12, 2)) AS line_gross_profit
+FROM raw.order_items oi
+-- Inner join: items whose product_id has no row in stg_products are dropped.
+JOIN staging.stg_products p USING (product_id);

pipeline/sql/staging/stg_orders.sql ADDED Viewed

	@@ -0,0 +1,11 @@

+-- Staging: orders with split date/time parts for downstream rollups.
+CREATE OR REPLACE VIEW staging.stg_orders AS
+SELECT
+    order_id,
+    customer_id,
+    CAST(order_ts AS TIMESTAMP)            AS order_ts,
+    CAST(order_ts AS DATE)                 AS order_date,
+    -- First day of the order's month.
+    date_trunc('month', CAST(order_ts AS DATE)) AS order_month,
+    lower(status)                          AS status,
+    -- Case-insensitive match on 'completed'; NULL when status is NULL, which
+    -- a `WHERE is_completed` filter treats the same as FALSE.
+    (lower(status) = 'completed')          AS is_completed
+FROM raw.orders;

pipeline/sql/staging/stg_products.sql ADDED Viewed

	@@ -0,0 +1,10 @@

+-- Staging: products with a derived unit gross margin.
+CREATE OR REPLACE VIEW staging.stg_products AS
+SELECT
+    product_id,
+    product_name,
+    category,
+    CAST(unit_price AS DECIMAL(10, 2)) AS unit_price,
+    CAST(unit_cost AS DECIMAL(10, 2))  AS unit_cost,
+    -- Absolute margin per unit (price minus cost), not a ratio.
+    CAST(unit_price - unit_cost AS DECIMAL(10, 2)) AS unit_margin
+FROM raw.products;

pipeline/transform.py ADDED Viewed

	@@ -0,0 +1,73 @@

+"""Transform stage: build staging views and mart tables from SQL files.
+The SQL lives in ``pipeline/sql`` and is executed in a deterministic,
+dependency-aware order. Keeping the transformation in plain ``.sql`` files (rather than
+inline strings) means the models read like a dbt project and can be inspected or
+run by hand with the DuckDB CLI.
+"""
+from __future__ import annotations
+import logging
+import duckdb
+from pipeline.config import Settings, get_settings
+log = logging.getLogger("commerce.transform")
+# Execution order. Staging first (views over raw), then the intermediate model,
+# then the marts (which may depend on the intermediate model but not on each other).
+# Paths are relative to the configured SQL directory and are run in list order.
+# stg_products must stay ahead of stg_order_items, whose view selects from
+# staging.stg_products.
+STAGING_MODELS = [
+    "staging/stg_customers.sql",
+    "staging/stg_products.sql",
+    "staging/stg_orders.sql",
+    "staging/stg_order_items.sql",
+    "staging/stg_events.sql",
+]
+# Run after all staging models; each mart reads staging views and/or
+# marts.int_order_revenue.
+MART_MODELS = [
+    "marts/int_order_revenue.sql",  # intermediate; must precede the marts below
+    "marts/daily_revenue.sql",
+    "marts/top_products.sql",
+    "marts/customer_cohort_retention.sql",
+    "marts/funnel_conversion.sql",
+]
+def _run_model(con: duckdb.DuckDBPyConnection, sql_dir, rel_path: str) -> None:
+    path = sql_dir / rel_path
+    sql = path.read_text(encoding="utf-8")
+    log.info("building model %s", rel_path)
+    con.execute(sql)
+def run(con: duckdb.DuckDBPyConnection | None = None, settings: Settings | None = None) -> list[str]:
+    """Execute all models. Returns the list of mart relation names built."""
+    s = settings or get_settings()
+    owns_con = con is None
+    con = con or duckdb.connect(str(s.db_path))
+    try:
+        con.execute("CREATE SCHEMA IF NOT EXISTS staging;")
+        con.execute("CREATE SCHEMA IF NOT EXISTS marts;")
+        for model in STAGING_MODELS:
+            _run_model(con, s.sql_dir, model)
+        for model in MART_MODELS:
+            _run_model(con, s.sql_dir, model)
+        marts = [
+            row[0]
+            for row in con.execute(
+                "SELECT table_name FROM information_schema.tables "
+                "WHERE table_schema = 'marts' ORDER BY table_name;"
+            ).fetchall()
+        ]
+        log.info("built marts: %s", marts)
+        return marts
+    finally:
+        if owns_con:
+            con.close()
+if __name__ == "__main__":  # pragma: no cover
+    # Script entry point: build all models with default settings and print
+    # the resulting list of mart table names.
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+    print(run())

pyproject.toml ADDED Viewed

	@@ -0,0 +1,34 @@

+# Packaging metadata for the pipeline package, built with setuptools.
+[build-system]
+requires = ["setuptools>=68"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "commerce-pipeline"
+version = "1.0.0"
+description = "An end-to-end e-commerce analytics pipeline: synthetic ingest, DuckDB warehouse, SQL marts, data-quality gates, and a Streamlit dashboard."
+readme = "README.md"
+requires-python = ">=3.9"
+license = { text = "MIT" }
+authors = [{ name = "Laela Zorana" }]
+# Core runtime dependencies; dashboard and test tooling are optional extras below.
+dependencies = [
+    "duckdb>=1.0",
+    "pandas>=2.0",
+    "pyarrow>=14.0",
+]
+[project.optional-dependencies]
+dashboard = ["streamlit>=1.30", "altair>=5.0"]
+dev = ["pytest>=8.0"]
+# Installs a `commerce-pipeline` console command that calls pipeline.cli:main.
+[project.scripts]
+commerce-pipeline = "pipeline.cli:main"
+[tool.setuptools.packages.find]
+include = ["pipeline*"]
+# Include the .sql model files under pipeline/sql in the built package.
+[tool.setuptools.package-data]
+pipeline = ["sql/**/*.sql"]
+[tool.pytest.ini_options]
+addopts = "-q"
+testpaths = ["tests"]

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+# Core pipeline (ingest -> load -> transform -> quality)
+duckdb>=1.0
+pandas>=2.0
+pyarrow>=14.0
+# Dashboard
+streamlit>=1.30
+altair>=5.0
+# Tests
+pytest>=8.0