Spaces:
Sleeping
Sleeping
Upload 9 files
Browse files- .env.example +34 -0
- .gitignore +63 -0
- Dockerfile +38 -0
- README.md +163 -7
- app.py +145 -0
- docker-compose.yml +111 -0
- dvc.yaml +51 -0
- params.yaml +24 -0
- requirements.txt +47 -0
.env.example
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# .env.example — Copy to .env and fill in your values.
|
| 2 |
+
# Never commit .env to Git.
|
| 3 |
+
|
| 4 |
+
# Database
|
| 5 |
+
POSTGRES_PASSWORD=your_secure_password_here
|
| 6 |
+
DATABASE_URL=postgresql://admin:your_secure_password_here@localhost:5432/flights
|
| 7 |
+
|
| 8 |
+
# AWS
|
| 9 |
+
AWS_ACCESS_KEY_ID=AKIA...
|
| 10 |
+
AWS_SECRET_ACCESS_KEY=...
|
| 11 |
+
AWS_DEFAULT_REGION=us-east-1
|
| 12 |
+
S3_RAW_BUCKET=flight-delay-raw-data
|
| 13 |
+
S3_PROCESSED_BUCKET=flight-delay-processed
|
| 14 |
+
|
| 15 |
+
# MLflow
|
| 16 |
+
MLFLOW_TRACKING_URI=http://localhost:5000
|
| 17 |
+
|
| 18 |
+
# BTS API (optional — bulk CSV download doesn't need it)
|
| 19 |
+
BTS_API_KEY=
|
| 20 |
+
|
| 21 |
+
# Airflow
|
| 22 |
+
AIRFLOW_FERNET_KEY=
|
| 23 |
+
AIRFLOW_SECRET_KEY=changeme_in_production
|
| 24 |
+
|
| 25 |
+
# Grafana
|
| 26 |
+
GRAFANA_PASSWORD=admin
|
| 27 |
+
|
| 28 |
+
# Monitoring
|
| 29 |
+
DRIFT_SHARE_THRESHOLD=0.30
|
| 30 |
+
SQS_RETRAIN_QUEUE_URL=https://sqs.us-east-1.amazonaws.com/123456789/retrain-queue
|
| 31 |
+
|
| 32 |
+
# Deployment
|
| 33 |
+
API_ENDPOINT=https://your-alb-dns.us-east-1.elb.amazonaws.com
|
| 34 |
+
MODEL_VERSION=v1.0.0
|
.gitignore
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*.egg-info/
|
| 5 |
+
.eggs/
|
| 6 |
+
dist/
|
| 7 |
+
build/
|
| 8 |
+
.venv/
|
| 9 |
+
venv/
|
| 10 |
+
env/
|
| 11 |
+
|
| 12 |
+
# Data (tracked by DVC, not Git)
|
| 13 |
+
data/raw/
|
| 14 |
+
data/processed/
|
| 15 |
+
!data/.gitkeep
|
| 16 |
+
|
| 17 |
+
# Models (tracked by DVC or MLflow)
|
| 18 |
+
models/*.pkl
|
| 19 |
+
models/*.joblib
|
| 20 |
+
!models/.gitkeep
|
| 21 |
+
|
| 22 |
+
# MLflow
|
| 23 |
+
mlruns/
|
| 24 |
+
mlartifacts/
|
| 25 |
+
|
| 26 |
+
# Reports
|
| 27 |
+
reports/*.png
|
| 28 |
+
reports/*.html
|
| 29 |
+
reports/*.json
|
| 30 |
+
!reports/.gitkeep
|
| 31 |
+
|
| 32 |
+
# Metrics (DVC manages)
|
| 33 |
+
metrics/
|
| 34 |
+
|
| 35 |
+
# Env
|
| 36 |
+
.env
|
| 37 |
+
.env.*
|
| 38 |
+
!.env.example
|
| 39 |
+
|
| 40 |
+
# Editors
|
| 41 |
+
.vscode/
|
| 42 |
+
.idea/
|
| 43 |
+
*.swp
|
| 44 |
+
|
| 45 |
+
# Testing
|
| 46 |
+
.pytest_cache/
|
| 47 |
+
.coverage
|
| 48 |
+
coverage.xml
|
| 49 |
+
htmlcov/
|
| 50 |
+
|
| 51 |
+
# OS
|
| 52 |
+
.DS_Store
|
| 53 |
+
Thumbs.db
|
| 54 |
+
|
| 55 |
+
# Jupyter
|
| 56 |
+
.ipynb_checkpoints/
|
| 57 |
+
*.ipynb
|
| 58 |
+
|
| 59 |
+
# Terraform
|
| 60 |
+
terraform/.terraform/
|
| 61 |
+
terraform/*.tfstate
|
| 62 |
+
terraform/*.tfstate.backup
|
| 63 |
+
terraform/.terraform.lock.hcl
|
Dockerfile
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Multi-stage build — keeps the final image small and free of build-time tooling
|
| 2 |
+
# ── Stage 1: Build ────────────────────────────────────────────────────────────
|
| 3 |
+
FROM python:3.11-slim AS builder
|
| 4 |
+
|
| 5 |
+
WORKDIR /app
|
| 6 |
+
COPY requirements.txt .
|
| 7 |
+
RUN pip install --no-cache-dir --user -r requirements.txt
|
| 8 |
+
|
| 9 |
+
# ── Stage 2: Runtime ──────────────────────────────────────────────────────────
|
| 10 |
+
FROM python:3.11-slim
|
| 11 |
+
|
| 12 |
+
WORKDIR /app
|
| 13 |
+
|
| 14 |
+
# Non-root user for security
|
| 15 |
+
RUN useradd -m -u 1001 appuser
|
| 16 |
+
|
| 17 |
+
# Copy installed packages from builder
|
| 18 |
+
COPY --from=builder /root/.local /home/appuser/.local
|
| 19 |
+
|
| 20 |
+
# Copy application source
|
| 21 |
+
COPY --chown=appuser:appuser . .
|
| 22 |
+
|
| 23 |
+
# Create writable dirs the app needs
|
| 24 |
+
RUN mkdir -p models data/raw data/processed reports metrics \
|
| 25 |
+
&& chown -R appuser:appuser models data reports metrics
|
| 26 |
+
|
| 27 |
+
USER appuser
|
| 28 |
+
|
| 29 |
+
ENV PATH=/home/appuser/.local/bin:$PATH \
|
| 30 |
+
PYTHONUNBUFFERED=1 \
|
| 31 |
+
PYTHONDONTWRITEBYTECODE=1
|
| 32 |
+
|
| 33 |
+
EXPOSE 8000
|
| 34 |
+
|
| 35 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=15s --retries=3 \
|
| 36 |
+
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"
|
| 37 |
+
|
| 38 |
+
CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]
|
README.md
CHANGED
|
@@ -1,13 +1,169 @@
|
|
| 1 |
---
|
| 2 |
-
title: Flight Delay Prediction
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
-
pinned:
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Flight Delay Prediction Platform
|
| 3 |
+
emoji: ✈️
|
| 4 |
+
colorFrom: blue
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.31.0
|
| 8 |
app_file: app.py
|
| 9 |
+
pinned: true
|
| 10 |
+
license: mit
|
| 11 |
+
tags:
|
| 12 |
+
- machine-learning
|
| 13 |
+
- tabular-classification
|
| 14 |
+
- xgboost
|
| 15 |
+
- flight-delay
|
| 16 |
+
- mlops
|
| 17 |
---
|
| 18 |
|
| 19 |
+
# ✈️ Flight Delay Prediction Platform
|
| 20 |
+
|
| 21 |
+
> End-to-end ML system: ETL → Training → MLOps → FastAPI → Cloud Deploy → Monitoring
|
| 22 |
+
|
| 23 |
+
[CI status](https://github.com/YOUR_USERNAME/flight-delay-platform/actions)
|
| 24 |
+
[Python](https://python.org)
|
| 25 |
+
[License: MIT](LICENSE)
|
| 26 |
+
|
| 27 |
+
---
|
| 28 |
+
|
| 29 |
+
## 🗺 Architecture
|
| 30 |
+
|
| 31 |
+
```
|
| 32 |
+
BTS API / OpenSky
|
| 33 |
+
│
|
| 34 |
+
▼
|
| 35 |
+
┌─────────┐ ┌──────────┐ ┌──────────┐
|
| 36 |
+
│ ETL │───▶│ Postgres│───▶│ Feature │
|
| 37 |
+
│ Airflow │ │ + S3 │ │ Store │
|
| 38 |
+
└─────────┘ └──────────┘ └──────────┘
|
| 39 |
+
│
|
| 40 |
+
▼
|
| 41 |
+
┌──────────────┐
|
| 42 |
+
│ XGBoost / │
|
| 43 |
+
│ LightGBM │
|
| 44 |
+
│ Training │
|
| 45 |
+
└──────┬───────┘
|
| 46 |
+
│ MLflow
|
| 47 |
+
▼
|
| 48 |
+
┌──────────────┐
|
| 49 |
+
│ FastAPI │
|
| 50 |
+
│ Inference │──▶ AWS ECS / GCP Cloud Run
|
| 51 |
+
└──────┬───────┘
|
| 52 |
+
│
|
| 53 |
+
▼
|
| 54 |
+
┌──────────────┐
|
| 55 |
+
│ Evidently │
|
| 56 |
+
│ + Grafana │
|
| 57 |
+
└──────────────┘
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
## 🚀 Quick Start
|
| 61 |
+
|
| 62 |
+
```bash
|
| 63 |
+
# 1. Clone
|
| 64 |
+
git clone https://huggingface.co/spaces/YOUR_USERNAME/flight-delay-platform
|
| 65 |
+
cd flight-delay-platform
|
| 66 |
+
|
| 67 |
+
# 2. Install
|
| 68 |
+
pip install -r requirements.txt
|
| 69 |
+
|
| 70 |
+
# 3. Run local stack
|
| 71 |
+
docker-compose up -d
|
| 72 |
+
|
| 73 |
+
# 4. Trigger ETL
|
| 74 |
+
python -m etl.extract --start-date 2024-01-01 --end-date 2024-06-30
|
| 75 |
+
|
| 76 |
+
# 5. Train models
|
| 77 |
+
python -m ml.train
|
| 78 |
+
|
| 79 |
+
# 6. Start API
|
| 80 |
+
uvicorn api.main:app --reload --port 8000
|
| 81 |
+
|
| 82 |
+
# 7. Try a prediction
|
| 83 |
+
curl -X POST http://localhost:8000/predict \
|
| 84 |
+
-H "Content-Type: application/json" \
|
| 85 |
+
-d '{"dep_hour":8,"dep_dayofweek":1,"dep_month":3,
|
| 86 |
+
"carrier_enc":3,"origin_enc":10,"dest_enc":25,
|
| 87 |
+
"crs_elapsed_time":185,"distance":1400,
|
| 88 |
+
"origin_delay_rate":0.22,"is_weekend":0,"is_peak_hour":1}'
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
## 📂 Project Structure
|
| 92 |
+
|
| 93 |
+
```
|
| 94 |
+
flight-delay-platform/
|
| 95 |
+
├── app.py # Gradio demo (HF Spaces entry point)
|
| 96 |
+
├── etl/ # Data pipeline
|
| 97 |
+
│ ├── extract.py
|
| 98 |
+
│ ├── transform.py
|
| 99 |
+
│ ├── validate.py
|
| 100 |
+
│ └── load.py
|
| 101 |
+
├── ml/ # Model training
|
| 102 |
+
│ ├── train.py
|
| 103 |
+
│ ├── evaluate.py
|
| 104 |
+
│ ├── features.py
|
| 105 |
+
│ └── config.py
|
| 106 |
+
├── mlops/ # Model registry & promotion
|
| 107 |
+
│ ├── model_registry.py
|
| 108 |
+
│ └── promote.py
|
| 109 |
+
├── api/ # FastAPI serving
|
| 110 |
+
│ ├── main.py
|
| 111 |
+
│ ├── schemas.py
|
| 112 |
+
│ └── middleware.py
|
| 113 |
+
├── monitoring/ # Drift detection & metrics
|
| 114 |
+
│ ├── drift_detector.py
|
| 115 |
+
│ └── prometheus.yml
|
| 116 |
+
├── dags/ # Airflow DAGs
|
| 117 |
+
│ ├── etl_dag.py
|
| 118 |
+
│ └── retrain_dag.py
|
| 119 |
+
├── tests/ # Pytest suite
|
| 120 |
+
│ ├── test_etl.py
|
| 121 |
+
│ ├── test_model.py
|
| 122 |
+
│ └── test_api.py
|
| 123 |
+
├── terraform/ # IaC (AWS ECS)
|
| 124 |
+
│ ├── main.tf
|
| 125 |
+
│ └── variables.tf
|
| 126 |
+
├── .github/workflows/ # CI/CD
|
| 127 |
+
│ └── deploy.yml
|
| 128 |
+
├── docker-compose.yml
|
| 129 |
+
├── Dockerfile
|
| 130 |
+
├── dvc.yaml
|
| 131 |
+
├── params.yaml
|
| 132 |
+
└── requirements.txt
|
| 133 |
+
```
|
| 134 |
+
|
| 135 |
+
## 🧠 Models
|
| 136 |
+
|
| 137 |
+
| Model | CV AUC | F1 | Notes |
|
| 138 |
+
|---|---|---|---|
|
| 139 |
+
| Logistic Regression | ~0.72 | ~0.65 | Baseline |
|
| 140 |
+
| LightGBM | ~0.83 | ~0.74 | Fast, good default |
|
| 141 |
+
| **XGBoost (tuned)** | **~0.86** | **~0.77** | **Production model** |
|
| 142 |
+
|
| 143 |
+
Target: predict whether a flight will be **≥15 minutes late** (FAA standard).
|
| 144 |
+
|
| 145 |
+
## 🔑 Key Features Used
|
| 146 |
+
|
| 147 |
+
- `dep_hour`, `dep_dayofweek`, `dep_month`, `is_weekend`, `is_peak_hour`
|
| 148 |
+
- `carrier_enc`, `origin_enc`, `dest_enc`
|
| 149 |
+
- `crs_elapsed_time`, `distance`
|
| 150 |
+
- `origin_delay_rate` (rolling 30-day historical delay rate per airport)
|
| 151 |
+
- `weather_wind_speed`, `weather_precip_mm`
|
| 152 |
+
|
| 153 |
+
## 🛠 Tech Stack
|
| 154 |
+
|
| 155 |
+
| Layer | Tools |
|
| 156 |
+
|---|---|
|
| 157 |
+
| Orchestration | Apache Airflow 2.9 |
|
| 158 |
+
| ML | XGBoost, LightGBM, Scikit-learn |
|
| 159 |
+
| Experiment Tracking | MLflow + DVC |
|
| 160 |
+
| Serving | FastAPI + Uvicorn |
|
| 161 |
+
| Containerization | Docker + Docker Compose |
|
| 162 |
+
| Cloud | AWS ECS Fargate + S3 + RDS |
|
| 163 |
+
| IaC | Terraform |
|
| 164 |
+
| Monitoring | Evidently AI + Prometheus + Grafana |
|
| 165 |
+
| CI/CD | GitHub Actions |
|
| 166 |
+
|
| 167 |
+
## 📄 License
|
| 168 |
+
|
| 169 |
+
MIT © 2024
|
app.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app.py — Gradio demo for HF Spaces.
|
| 3 |
+
Loads the trained model (or a stub) and exposes a live prediction UI.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pickle
|
| 10 |
+
import os
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
# ── Load model (falls back to a stub if not yet trained) ──────────────────────
|
| 14 |
+
# Location of the pickled model artifact, relative to the working directory.
MODEL_PATH = Path("models/best_model.pkl")


def load_model():
    """Load the pickled model stored at ``MODEL_PATH``.

    Returns:
        The unpickled object, or ``None`` when no file exists at
        ``MODEL_PATH``. ``None`` puts the app in stub mode: ``predict_delay``
        then answers with its built-in heuristic instead of a model.

    Raises:
        Whatever ``pickle.load`` raises for a corrupt or incompatible file;
        such errors are deliberately not swallowed.
    """
    try:
        # Open directly instead of exists()-then-open: a file that vanishes
        # between the two steps would otherwise crash start-up.
        model_file = MODEL_PATH.open("rb")
    except (FileNotFoundError, NotADirectoryError):
        return None  # stub mode

    with model_file:
        # SECURITY: unpickling runs arbitrary code embedded in the file. Only
        # point MODEL_PATH at artifacts you produced or fully trust.
        return pickle.load(model_file)


# Loaded once at import time; None means "no trained model available".
model = load_model()
|
| 23 |
+
|
| 24 |
+
# Demo-sized label encodings: every code maps to its position in the tuple.
CARRIER_MAP = {
    code: idx
    for idx, code in enumerate(("AA", "DL", "UA", "WN", "B6", "AS", "F9", "NK"))
}
AIRPORT_STUB = {  # small lookup for demo
    code: idx
    for idx, code in enumerate(
        ("ATL", "LAX", "ORD", "DFW", "DEN", "JFK", "SFO", "SEA", "LAS", "MIA")
    )
}


def predict_delay(
    dep_hour, dep_dayofweek, dep_month,
    carrier, origin, dest,
    crs_elapsed_time, distance,
    origin_delay_rate,
    is_weekend, is_peak_hour,
):
    """Estimate the delay probability for one flight and format it as text.

    When the module-level ``model`` is set, its ``predict_proba`` output for
    a one-row feature frame is used. Otherwise a hand-written heuristic plus
    a small random jitter stands in for the model (demo stub).

    Args:
        dep_hour, dep_dayofweek, dep_month: Schedule fields, coerced to int.
        carrier: Airline code looked up in ``CARRIER_MAP`` (unknown -> 0).
        origin, dest: Airport codes looked up in ``AIRPORT_STUB``
            (unknown origin -> 0, unknown dest -> 1).
        crs_elapsed_time, distance, origin_delay_rate: Coerced to float.
        is_weekend, is_peak_hour: ``"Yes"`` counts as true; anything else
            counts as false.

    Returns:
        A four-part text summary: verdict label, delay probability,
        confidence bucket, and a 20-character bar.
    """
    encoded_carrier = CARRIER_MAP.get(carrier, 0)
    encoded_origin = AIRPORT_STUB.get(origin, 0)
    encoded_dest = AIRPORT_STUB.get(dest, 1)

    # The frame is built even in stub mode, so bad inputs fail the same way
    # whether or not a model is loaded.
    row = {
        "dep_hour": int(dep_hour),
        "dep_dayofweek": int(dep_dayofweek),
        "dep_month": int(dep_month),
        "carrier_enc": encoded_carrier,
        "origin_enc": encoded_origin,
        "dest_enc": encoded_dest,
        "crs_elapsed_time": float(crs_elapsed_time),
        "distance": float(distance),
        "origin_delay_rate": float(origin_delay_rate),
        "is_weekend": int(is_weekend == "Yes"),
        "is_peak_hour": int(is_peak_hour == "Yes"),
    }
    features = pd.DataFrame([row])

    if model is None:
        # Demo stub: simple heuristic
        heuristic = origin_delay_rate * 0.5
        heuristic = heuristic + (0.15 if is_peak_hour == "Yes" else 0)
        heuristic = heuristic + (0.1 if is_weekend == "Yes" else 0)
        heuristic = heuristic + (dep_month in [6, 7, 12]) * 0.1
        # Jitter first, then clamp into [0, 1].
        prob = min(1.0, heuristic + np.random.normal(0, 0.05))
        prob = max(0.0, prob)
    else:
        prob = float(model.predict_proba(features)[0, 1])

    if prob >= 0.5:
        label = "🔴 LIKELY DELAYED"
    else:
        label = "🟢 LIKELY ON TIME"

    # Confidence grows with distance from the 0.5 decision boundary.
    margin = abs(prob - 0.5)
    if margin > 0.25:
        confidence = "HIGH"
    elif margin > 0.1:
        confidence = "MEDIUM"
    else:
        confidence = "LOW"

    filled = int(prob * 20)
    bar = "█" * filled + "░" * (20 - filled)

    summary_lines = [
        label,
        "",
        f"Delay probability : {prob:.1%}",
        f"Confidence : {confidence}",
        f"[{bar}] {prob:.1%}",
    ]
    return "\n".join(summary_lines)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# ── UI ────────────────────────────────────────────────────────────────────────
|
| 80 |
+
with gr.Blocks(
    title="✈️ Flight Delay Predictor",
    theme=gr.themes.Base(primary_hue="blue", neutral_hue="slate"),
) as demo:
    # Page header.
    gr.Markdown("""
    # ✈️ Flight Delay Prediction
    Predict whether a flight will be **≥ 15 minutes late** using the trained XGBoost model.
    > Part of the [Flight Delay ML Platform](https://github.com/YOUR_USERNAME/flight-delay-platform)
    """)

    if model is None:
        # No model file was found at start-up, so predict_delay answers with
        # its heuristic; say so instead of implying real model output.
        gr.Markdown(
            "> ⚠️ **Demo mode:** no trained model is loaded — "
            "results come from a simple heuristic."
        )

    with gr.Row():
        with gr.Column():
            gr.Markdown("### ✈️ Flight Details")
            carrier = gr.Dropdown(list(CARRIER_MAP.keys()), value="AA", label="Airline")
            origin = gr.Dropdown(list(AIRPORT_STUB.keys()), value="ATL", label="Origin Airport")
            dest = gr.Dropdown(list(AIRPORT_STUB.keys()), value="LAX", label="Destination Airport")
            distance = gr.Slider(100, 5000, value=1400, step=50, label="Distance (miles)")
            crs_elapsed_time = gr.Slider(30, 600, value=185, step=5, label="Scheduled Duration (min)")

        with gr.Column():
            gr.Markdown("### 🕐 Schedule")
            dep_hour = gr.Slider(0, 23, value=8, step=1, label="Departure Hour (0–23)")
            dep_dayofweek = gr.Slider(0, 6, value=1, step=1, label="Day of Week (0=Mon, 6=Sun)")
            dep_month = gr.Slider(1, 12, value=3, step=1, label="Month")
            is_weekend = gr.Radio(["Yes", "No"], value="No", label="Weekend Flight?")
            is_peak_hour = gr.Radio(["Yes", "No"], value="Yes", label="Peak Hour? (7–9am / 5–8pm)")

        with gr.Column():
            gr.Markdown("### 🌦 Airport History")
            origin_delay_rate = gr.Slider(
                0.0, 1.0, value=0.22, step=0.01,
                label="Origin Airport 30-Day Delay Rate"
            )
            gr.Markdown("### 📊 Prediction")
            output = gr.Textbox(label="Result", lines=5, interactive=False)
            predict_btn = gr.Button("Predict Delay →", variant="primary")

    # Single source of truth for the input wiring. The order must match
    # predict_delay's positional parameters and the columns of each example
    # row below; sharing one list keeps the button and the examples in sync.
    prediction_inputs = [
        dep_hour, dep_dayofweek, dep_month,
        carrier, origin, dest,
        crs_elapsed_time, distance,
        origin_delay_rate, is_weekend, is_peak_hour,
    ]

    predict_btn.click(
        fn=predict_delay,
        inputs=prediction_inputs,
        outputs=output,
    )

    # Clickable presets that fill the form.
    gr.Examples(
        examples=[
            [8, 1, 3, "AA", "ATL", "LAX", 185, 1400, 0.22, "No", "Yes"],
            [18, 4, 7, "UA", "ORD", "JFK", 140, 780, 0.38, "No", "Yes"],
            [6, 6, 1, "WN", "DEN", "SFO", 95, 950, 0.12, "Yes", "No"],
            [14, 3, 12, "DL", "ATL", "MIA", 75, 660, 0.45, "No", "No"],
        ],
        inputs=prediction_inputs,
    )

if __name__ == "__main__":
    demo.launch()
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: "3.9"
|
| 2 |
+
|
| 3 |
+
# Full local development stack
|
| 4 |
+
# Start everything: docker-compose up -d
|
| 5 |
+
# View logs: docker-compose logs -f inference-api
|
| 6 |
+
# Tear down: docker-compose down -v
|
| 7 |
+
|
| 8 |
+
services:
|
| 9 |
+
|
| 10 |
+
# ── Postgres ────────────────────────────────────────────────────────────────
|
| 11 |
+
postgres:
|
| 12 |
+
image: postgres:15-alpine
|
| 13 |
+
environment:
|
| 14 |
+
POSTGRES_DB: flights
|
| 15 |
+
POSTGRES_USER: admin
|
| 16 |
+
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-localpassword}
|
| 17 |
+
ports: ["5432:5432"]
|
| 18 |
+
volumes: [postgres_data:/var/lib/postgresql/data]
|
| 19 |
+
healthcheck:
|
| 20 |
+
test: ["CMD-SHELL", "pg_isready -U admin -d flights"]
|
| 21 |
+
interval: 10s
|
| 22 |
+
timeout: 5s
|
| 23 |
+
retries: 5
|
| 24 |
+
|
| 25 |
+
# ── MLflow tracking server ───────────────────────────────────────────────────
|
| 26 |
+
mlflow:
|
| 27 |
+
image: python:3.11-slim
|
| 28 |
+
depends_on:
|
| 29 |
+
postgres: { condition: service_healthy }
|
| 30 |
+
command: >
|
| 31 |
+
sh -c "pip install -q mlflow psycopg2-binary boto3 &&
|
| 32 |
+
mlflow server
|
| 33 |
+
--host 0.0.0.0
|
| 34 |
+
--port 5000
|
| 35 |
+
--backend-store-uri postgresql://admin:${POSTGRES_PASSWORD:-localpassword}@postgres:5432/flights
|
| 36 |
+
--default-artifact-root /mlruns"
|
| 37 |
+
ports: ["5000:5000"]
|
| 38 |
+
volumes: [mlruns:/mlruns]
|
| 39 |
+
environment:
|
| 40 |
+
MLFLOW_TRACKING_URI: http://localhost:5000
|
| 41 |
+
|
| 42 |
+
# ── Airflow (single-container quickstart) ────────────────────────────────────
|
| 43 |
+
airflow:
|
| 44 |
+
image: apache/airflow:2.9.1-python3.11
|
| 45 |
+
depends_on:
|
| 46 |
+
postgres: { condition: service_healthy }
|
| 47 |
+
environment:
|
| 48 |
+
AIRFLOW__CORE__EXECUTOR: LocalExecutor
|
| 49 |
+
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://admin:${POSTGRES_PASSWORD:-localpassword}@postgres:5432/flights
|
| 50 |
+
AIRFLOW__CORE__FERNET_KEY: ${AIRFLOW_FERNET_KEY:-}
|
| 51 |
+
AIRFLOW__WEBSERVER__SECRET_KEY: ${AIRFLOW_SECRET_KEY:-changeme}
|
| 52 |
+
AIRFLOW__CORE__LOAD_EXAMPLES: "false"
|
| 53 |
+
MLFLOW_TRACKING_URI: http://mlflow:5000
|
| 54 |
+
ports: ["8080:8080"]
|
| 55 |
+
volumes:
|
| 56 |
+
- ./dags:/opt/airflow/dags
|
| 57 |
+
- ./etl:/opt/airflow/etl
|
| 58 |
+
- ./ml:/opt/airflow/ml
|
| 59 |
+
- ./mlops:/opt/airflow/mlops
|
| 60 |
+
- ./data:/opt/airflow/data
|
| 61 |
+
command: >
|
| 62 |
+
bash -c "airflow db init &&
|
| 63 |
+
airflow users create --username admin --password admin
|
| 64 |
+
--firstname Admin --lastname User --role Admin --email admin@example.com &&
|
| 65 |
+
{ airflow webserver & airflow scheduler; }"
|
| 66 |
+
|
| 67 |
+
# ── Inference API ────────────────────────────────────────────────────────────
|
| 68 |
+
inference-api:
|
| 69 |
+
build: .
|
| 70 |
+
depends_on:
|
| 71 |
+
postgres: { condition: service_healthy }
|
| 72 |
+
ports: ["8000:8000"]
|
| 73 |
+
environment:
|
| 74 |
+
DATABASE_URL: postgresql://admin:${POSTGRES_PASSWORD:-localpassword}@postgres:5432/flights
|
| 75 |
+
MLFLOW_TRACKING_URI: http://mlflow:5000
|
| 76 |
+
MODEL_VERSION: local
|
| 77 |
+
volumes:
|
| 78 |
+
- ./models:/app/models # mount trained models
|
| 79 |
+
- ./data:/app/data
|
| 80 |
+
healthcheck:
|
| 81 |
+
# The image is built from python:3.11-slim and never installs curl, so probe
# with the interpreter instead (same check as the Dockerfile HEALTHCHECK).
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
|
| 82 |
+
interval: 30s
|
| 83 |
+
timeout: 10s
|
| 84 |
+
retries: 3
|
| 85 |
+
|
| 86 |
+
# ── Prometheus ───────────────────────────────────────────────────────────────
|
| 87 |
+
prometheus:
|
| 88 |
+
image: prom/prometheus:v2.51.0
|
| 89 |
+
ports: ["9090:9090"]
|
| 90 |
+
volumes:
|
| 91 |
+
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
| 92 |
+
- prometheus_data:/prometheus
|
| 93 |
+
command:
|
| 94 |
+
- '--config.file=/etc/prometheus/prometheus.yml'
|
| 95 |
+
- '--storage.tsdb.retention.time=15d'
|
| 96 |
+
|
| 97 |
+
# ── Grafana ──────────────────────────────────────────────────────────────────
|
| 98 |
+
grafana:
|
| 99 |
+
image: grafana/grafana:10.4.0
|
| 100 |
+
depends_on: [prometheus]
|
| 101 |
+
ports: ["3000:3000"]
|
| 102 |
+
environment:
|
| 103 |
+
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_PASSWORD:-admin}
|
| 104 |
+
volumes:
|
| 105 |
+
- grafana_data:/var/lib/grafana
|
| 106 |
+
|
| 107 |
+
volumes:
|
| 108 |
+
postgres_data:
|
| 109 |
+
mlruns:
|
| 110 |
+
prometheus_data:
|
| 111 |
+
grafana_data:
|
dvc.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
stages:
|
| 2 |
+
extract:
|
| 3 |
+
cmd: python -m etl.extract --start-date ${extract.start_date} --end-date ${extract.end_date}
|
| 4 |
+
params:
|
| 5 |
+
- params.yaml:
|
| 6 |
+
- extract.start_date
|
| 7 |
+
- extract.end_date
|
| 8 |
+
outs:
|
| 9 |
+
- data/raw/flights_raw.parquet
|
| 10 |
+
|
| 11 |
+
transform:
|
| 12 |
+
cmd: python -m etl.transform
|
| 13 |
+
deps:
|
| 14 |
+
- data/raw/flights_raw.parquet
|
| 15 |
+
- etl/transform.py
|
| 16 |
+
outs:
|
| 17 |
+
- data/processed/features.parquet
|
| 18 |
+
- models/label_encoders.pkl
|
| 19 |
+
|
| 20 |
+
train:
|
| 21 |
+
cmd: python -m ml.train --data data/processed/features.parquet
|
| 22 |
+
deps:
|
| 23 |
+
- data/processed/features.parquet
|
| 24 |
+
- ml/train.py
|
| 25 |
+
- ml/config.py
|
| 26 |
+
params:
|
| 27 |
+
- params.yaml:
|
| 28 |
+
- model.n_estimators
|
| 29 |
+
- model.max_depth
|
| 30 |
+
- model.learning_rate
|
| 31 |
+
- model.subsample
|
| 32 |
+
- model.colsample_bytree
|
| 33 |
+
metrics:
|
| 34 |
+
- metrics/scores.json:
|
| 35 |
+
cache: false
|
| 36 |
+
outs:
|
| 37 |
+
- models/best_model.pkl
|
| 38 |
+
|
| 39 |
+
evaluate:
|
| 40 |
+
cmd: python -m ml.evaluate
|
| 41 |
+
deps:
|
| 42 |
+
- models/best_model.pkl
|
| 43 |
+
- data/processed/features.parquet
|
| 44 |
+
- ml/evaluate.py
|
| 45 |
+
outs:
|
| 46 |
+
- reports/roc_curve.png:
|
| 47 |
+
cache: false
|
| 48 |
+
- reports/pr_curve.png:
|
| 49 |
+
cache: false
|
| 50 |
+
- reports/eval_metrics.json:
|
| 51 |
+
cache: false
|
params.yaml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# params.yaml — Single source of truth for pipeline parameters.
|
| 2 |
+
# DVC reads this file to detect when stages need to re-run.
|
| 3 |
+
# MLflow logs these automatically during training.
|
| 4 |
+
|
| 5 |
+
extract:
|
| 6 |
+
start_date: "2023-01-01"
|
| 7 |
+
end_date: "2024-01-01"
|
| 8 |
+
airline: null # null = all airlines
|
| 9 |
+
|
| 10 |
+
model:
|
| 11 |
+
n_estimators: 500
|
| 12 |
+
max_depth: 6
|
| 13 |
+
learning_rate: 0.05
|
| 14 |
+
subsample: 0.8
|
| 15 |
+
colsample_bytree: 0.8
|
| 16 |
+
scale_pos_weight: 2.5
|
| 17 |
+
cv_folds: 5
|
| 18 |
+
random_state: 42
|
| 19 |
+
|
| 20 |
+
evaluate:
|
| 21 |
+
threshold: 0.5
|
| 22 |
+
|
| 23 |
+
monitoring:
|
| 24 |
+
drift_share_threshold: 0.30
|
requirements.txt
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── Core ──────────────────────────────────────────────────────────────────────
|
| 2 |
+
pandas==2.2.2
|
| 3 |
+
numpy==1.26.4
|
| 4 |
+
scikit-learn==1.4.2
|
| 5 |
+
|
| 6 |
+
# ── ML Models ─────────────────────────────────────────────────────────────────
|
| 7 |
+
xgboost==2.0.3
|
| 8 |
+
lightgbm==4.3.0
|
| 9 |
+
hyperopt==0.2.7
|
| 10 |
+
shap==0.45.0
|
| 11 |
+
|
| 12 |
+
# ── MLOps ─────────────────────────────────────────────────────────────────────
|
| 13 |
+
mlflow==2.13.0
|
| 14 |
+
dvc[s3]==3.50.1
|
| 15 |
+
|
| 16 |
+
# ── API ───────────────────────────────────────────────────────────────────────
|
| 17 |
+
fastapi==0.111.0
|
| 18 |
+
uvicorn[standard]==0.29.0
|
| 19 |
+
pydantic==2.7.1
|
| 20 |
+
python-multipart==0.0.9
|
| 21 |
+
|
| 22 |
+
# ── ETL / Data ────────────────────────────────────────────────────────────────
|
| 23 |
+
requests==2.32.2
|
| 24 |
+
sqlalchemy==2.0.30
|
| 25 |
+
psycopg2-binary==2.9.9
|
| 26 |
+
pyarrow==16.0.0
|
| 27 |
+
great-expectations==0.18.19
|
| 28 |
+
apache-airflow==2.9.1
|
| 29 |
+
|
| 30 |
+
# ── Cloud ─────────────────────────────────────────────────────────────────────
|
| 31 |
+
boto3==1.34.106
|
| 32 |
+
|
| 33 |
+
# ── Monitoring ────────────────────────────────────────────────────────────────
|
| 34 |
+
evidently==0.4.30
|
| 35 |
+
prometheus-client==0.20.0
|
| 36 |
+
|
| 37 |
+
# ── UI (HF Spaces) ────────────────────────────────────────────────────────────
|
| 38 |
+
gradio==4.31.0
|
| 39 |
+
|
| 40 |
+
# ── Dev / Testing ─────────────────────────────────────────────────────────────
|
| 41 |
+
pytest==8.2.0
|
| 42 |
+
pytest-cov==5.0.0
|
| 43 |
+
httpx==0.27.0 # FastAPI TestClient
|
| 44 |
+
ruff==0.4.4
|
| 45 |
+
mypy==1.10.0
|
| 46 |
+
loguru==0.7.2
|
| 47 |
+
python-dotenv==1.0.1
|