{ "dag_id": "ecommerce_etl_pipeline", "description": "Daily ETL pipeline for e-commerce data warehouse", "schedule_interval": "0 2 * * *", "start_date": "2025-01-01", "catchup": false, "tags": ["etl", "ecommerce", "daily"], "default_args": { "owner": "data_engineering", "retries": 3, "retry_delay_minutes": 5, "email_on_failure": true }, "tasks": [ { "task_id": "extract_customers", "operator": "PythonOperator", "description": "Extract customer data from source database", "upstream_dependencies": [], "downstream_dependencies": ["transform_customers"], "source": "postgres://source_db/customers", "target": "s3://data-lake/raw/customers/" }, { "task_id": "extract_orders", "operator": "PythonOperator", "description": "Extract orders data from source database", "upstream_dependencies": [], "downstream_dependencies": ["transform_orders"], "source": "postgres://source_db/orders", "target": "s3://data-lake/raw/orders/" }, { "task_id": "extract_products", "operator": "PythonOperator", "description": "Extract products data from source database", "upstream_dependencies": [], "downstream_dependencies": ["transform_products"], "source": "postgres://source_db/products", "target": "s3://data-lake/raw/products/" }, { "task_id": "extract_order_items", "operator": "PythonOperator", "description": "Extract order items from source database", "upstream_dependencies": [], "downstream_dependencies": ["transform_order_items"], "source": "postgres://source_db/order_items", "target": "s3://data-lake/raw/order_items/" }, { "task_id": "transform_customers", "operator": "SparkSubmitOperator", "description": "Clean and transform customer data", "upstream_dependencies": ["extract_customers"], "downstream_dependencies": ["load_dim_customers"], "source": "s3://data-lake/raw/customers/", "target": "s3://data-lake/transformed/customers/" }, { "task_id": "transform_orders", "operator": "SparkSubmitOperator", "description": "Clean and transform orders data", "upstream_dependencies": ["extract_orders"], "downstream_dependencies": ["load_fct_orders"], "source": "s3://data-lake/raw/orders/", "target": "s3://data-lake/transformed/orders/" }, { "task_id": "transform_products", "operator": "SparkSubmitOperator", "description": "Clean and transform products data", "upstream_dependencies": ["extract_products"], "downstream_dependencies": ["load_dim_products"], "source": "s3://data-lake/raw/products/", "target": "s3://data-lake/transformed/products/" }, { "task_id": "transform_order_items", "operator": "SparkSubmitOperator", "description": "Clean and transform order items data", "upstream_dependencies": ["extract_order_items"], "downstream_dependencies": ["load_fct_orders"], "source": "s3://data-lake/raw/order_items/", "target": "s3://data-lake/transformed/order_items/" }, { "task_id": "load_dim_customers", "operator": "SnowflakeOperator", "description": "Load customer dimension to Snowflake", "upstream_dependencies": ["transform_customers"], "downstream_dependencies": ["build_customer_metrics"], "source": "s3://data-lake/transformed/customers/", "target": "snowflake://warehouse/analytics.dim_customers" }, { "task_id": "load_dim_products", "operator": "SnowflakeOperator", "description": "Load product dimension to Snowflake", "upstream_dependencies": ["transform_products"], "downstream_dependencies": ["build_sales_report"], "source": "s3://data-lake/transformed/products/", "target": "snowflake://warehouse/analytics.dim_products" }, { "task_id": "load_fct_orders", "operator": "SnowflakeOperator", "description": "Load orders fact table to Snowflake", "upstream_dependencies": ["transform_orders", "transform_order_items"], "downstream_dependencies": ["build_customer_metrics", "build_sales_report"], "source": ["s3://data-lake/transformed/orders/", "s3://data-lake/transformed/order_items/"], "target": "snowflake://warehouse/analytics.fct_orders" }, { "task_id": "build_customer_metrics", "operator": "SnowflakeOperator", "description": "Calculate customer lifetime value and metrics", "upstream_dependencies": ["load_dim_customers", "load_fct_orders"], "downstream_dependencies": ["publish_to_bi"], "source": ["analytics.dim_customers", "analytics.fct_orders"], "target": "snowflake://warehouse/analytics.rpt_customer_metrics" }, { "task_id": "build_sales_report", "operator": "SnowflakeOperator", "description": "Build daily sales report", "upstream_dependencies": ["load_dim_products", "load_fct_orders"], "downstream_dependencies": ["publish_to_bi"], "source": ["analytics.dim_products", "analytics.fct_orders"], "target": "snowflake://warehouse/analytics.rpt_daily_sales" }, { "task_id": "publish_to_bi", "operator": "PythonOperator", "description": "Publish reports to BI tool", "upstream_dependencies": ["build_customer_metrics", "build_sales_report"], "downstream_dependencies": ["notify_stakeholders"], "source": ["analytics.rpt_customer_metrics", "analytics.rpt_daily_sales"], "target": "tableau://server/ecommerce_dashboard" }, { "task_id": "notify_stakeholders", "operator": "EmailOperator", "description": "Send completion notification", "upstream_dependencies": ["publish_to_bi"], "downstream_dependencies": [] } ], "notes": "Sample Airflow DAG representing a complete ETL pipeline with extract, transform, load, and reporting stages." }