Spaces:
Sleeping
Sleeping
Deploy app with models and artifacts (force update)
Browse files- .env +7 -0
- .gitattributes +1 -0
- Dockerfile +25 -0
- README.md +47 -0
- app/__init__.py +6 -0
- app/__pycache__/__init__.cpython-312.pyc +0 -0
- app/__pycache__/config.cpython-312.pyc +0 -0
- app/__pycache__/model_loader.cpython-312.pyc +0 -0
- app/__pycache__/routes.cpython-312.pyc +0 -0
- app/__pycache__/schemas.cpython-312.pyc +0 -0
- app/config.py +19 -0
- app/model_loader.py +127 -0
- app/routes.py +244 -0
- app/schemas.py +80 -0
- artifacts/champion_meta.json +49 -0
- artifacts/classification_report.json +27 -0
- artifacts/confusion_matrix.png +0 -0
- artifacts/cv_metrics.json +161 -0
- artifacts/distribution_baselines.json +239 -0
- artifacts/dropped_columns.json +11 -0
- artifacts/fairness_group_metrics.json +5 -0
- artifacts/fairness_group_outcome_only.json +62 -0
- artifacts/fairness_summary.md +10 -0
- artifacts/feature_contract.json +89 -0
- artifacts/feature_importance.json +166 -0
- artifacts/feature_name_map.json +43 -0
- artifacts/feature_rules.json +53 -0
- artifacts/feature_schema.json +242 -0
- artifacts/mte_mappings.json +76 -0
- artifacts/pr_curve.json +0 -0
- artifacts/roc_curve.json +0 -0
- artifacts/shap_importance_bar.png +0 -0
- artifacts/shap_summary.png +3 -0
- artifacts/shap_values_sample.json +152 -0
- artifacts/threshold_sweep.csv +102 -0
- artifacts/value_domains.json +284 -0
- main.py +21 -0
- requirements.txt +10 -0
- src/__init__.py +1 -0
- src/__pycache__/__init__.cpython-312.pyc +0 -0
- src/__pycache__/preprocessing.cpython-312.pyc +0 -0
- src/preprocessing.py +240 -0
.env
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Production configuration
|
| 2 |
+
MODEL_PATH=models/
|
| 3 |
+
PREPROCESSOR_PATH=models/preprocessor.pkl
|
| 4 |
+
ARTIFACT_DIR=artifacts
|
| 5 |
+
LOCAL_MODEL_PATH=models/champion_model.pkl
|
| 6 |
+
LOCAL_PREPROCESSOR_PATH=models/preprocessor.pkl
|
| 7 |
+
ALLOW_START_WITHOUT_MODEL=false
|
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
artifacts/shap_summary.png filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install system dependencies
|
| 6 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 7 |
+
build-essential \
|
| 8 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 9 |
+
|
| 10 |
+
# Copy requirements and install
|
| 11 |
+
COPY requirements.txt .
|
| 12 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
| 13 |
+
pip install --no-cache-dir -r requirements.txt
|
| 14 |
+
|
| 15 |
+
# Copy application files
|
| 16 |
+
COPY . .
|
| 17 |
+
|
| 18 |
+
# Expose port 7860 (Hugging Face Spaces default)
|
| 19 |
+
EXPOSE 7860
|
| 20 |
+
|
| 21 |
+
# Set environment variable for Hugging Face Spaces
|
| 22 |
+
ENV PORT=7860
|
| 23 |
+
|
| 24 |
+
# Run the application
|
| 25 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Hotel Booking Cancellation Prediction API
|
| 3 |
+
emoji: 🏨
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Hotel Booking Cancellation Prediction API
|
| 12 |
+
|
| 13 |
+
This is a FastAPI-based prediction service that estimates the probability of hotel booking cancellations.
|
| 14 |
+
|
| 15 |
+
## Features
|
| 16 |
+
|
| 17 |
+
- **POST /predict** - Predict cancellation probability for a single booking
|
| 18 |
+
- **GET /health** - Health check endpoint
|
| 19 |
+
- **GET /** - API information
|
| 20 |
+
|
| 21 |
+
## Example Usage
|
| 22 |
+
|
| 23 |
+
```python
|
| 24 |
+
import requests
|
| 25 |
+
|
| 26 |
+
payload = {
|
| 27 |
+
"lead_time": 30,
|
| 28 |
+
"arrival_month": 7,
|
| 29 |
+
"adults": 2,
|
| 30 |
+
"children": 0,
|
| 31 |
+
"adr": 120.0
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
response = requests.post("https://j2damax-boking-cancelation-api.hf.space/predict", json=payload)
|
| 35 |
+
print(response.json())
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## Model Information
|
| 39 |
+
|
| 40 |
+
The API uses a machine learning model trained on hotel booking data with features like:
|
| 41 |
+
- Lead time (days before arrival)
|
| 42 |
+
- Guest composition (adults, children)
|
| 43 |
+
- Pricing (average daily rate)
|
| 44 |
+
- Stay duration
|
| 45 |
+
- And more...
|
| 46 |
+
|
| 47 |
+
Check `/docs` for the interactive API documentation.
|
app/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Lightweight modular FastAPI application components for hotel cancellation prediction.
|
| 2 |
+
|
| 3 |
+
This package isolates configuration, model loading, schemas, and routes so that
|
| 4 |
+
`main.py` can remain minimal. Only essential functionality (health + predict)
|
| 5 |
+
is kept per the user's simplification request.
|
| 6 |
+
"""
|
app/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (481 Bytes). View file
|
|
|
app/__pycache__/config.cpython-312.pyc
ADDED
|
Binary file (1.14 kB). View file
|
|
|
app/__pycache__/model_loader.cpython-312.pyc
ADDED
|
Binary file (6.97 kB). View file
|
|
|
app/__pycache__/routes.cpython-312.pyc
ADDED
|
Binary file (14 kB). View file
|
|
|
app/__pycache__/schemas.cpython-312.pyc
ADDED
|
Binary file (3.14 kB). View file
|
|
|
app/config.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Application configuration and environment variable management."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
import os
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
|
| 6 |
+
load_dotenv()
|
| 7 |
+
|
| 8 |
+
# AWS / S3 model fetching removed – artifacts now sourced from local paths or Hugging Face Hub only.
|
| 9 |
+
# Optional model tag; retained for potential future tagging (not used for HF snapshot).
MODEL_VERSION = os.environ.get("MODEL_VERSION", "latest")
# Raw (string) decision-threshold override; None when the variable is unset.
DECISION_THRESHOLD_ENV = os.environ.get("DECISION_THRESHOLD")
# True only when the variable equals "true" (case-insensitive).
ALLOW_START_WITHOUT_MODEL = "true" == os.environ.get("ALLOW_START_WITHOUT_MODEL", "false").lower()
# Directory holding the JSON/PNG training artifacts.
ARTIFACT_DIR = os.environ.get("ARTIFACT_DIR", "artifacts")
# Hugging Face Hub repo to pull artifacts from, e.g. j2damax/hotel-cancel-model.
HF_MODEL_REPO = os.environ.get("HF_MODEL_REPO")

# Local fallback paths (used if artifacts baked into image or mounted)
LOCAL_MODEL_PATH = os.environ.get("LOCAL_MODEL_PATH", "models/champion_model.pkl")
LOCAL_PREPROCESSOR_PATH = os.environ.get("LOCAL_PREPROCESSOR_PATH", "models/preprocessor.pkl")

APP_VERSION = "1.0.0"
|
app/model_loader.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model + preprocessor loading utilities (local + Hugging Face Hub).
|
| 2 |
+
|
| 3 |
+
S3 support removed as project no longer uses AWS. Loading order:
|
| 4 |
+
1. Local baked artifacts (if present)
|
| 5 |
+
2. Hugging Face Hub (HF_MODEL_REPO)
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
import os, json, time
|
| 9 |
+
import joblib
|
| 10 |
+
from typing import Optional, Tuple
|
| 11 |
+
|
| 12 |
+
from . import config
|
| 13 |
+
from src.preprocessing import PreprocessingPipeline
|
| 14 |
+
|
| 15 |
+
model = None
|
| 16 |
+
preprocessor: Optional[PreprocessingPipeline] = None
|
| 17 |
+
model_version: Optional[str] = None
|
| 18 |
+
champion_meta_threshold: Optional[float] = None
|
| 19 |
+
_last_reload_time: float | None = None
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _resolve_git_sha() -> str | None:
|
| 23 |
+
git_sha = os.getenv('GIT_SHA')
|
| 24 |
+
if git_sha:
|
| 25 |
+
return git_sha[:12]
|
| 26 |
+
head_path = os.path.join('.git','HEAD')
|
| 27 |
+
try:
|
| 28 |
+
if os.path.exists(head_path):
|
| 29 |
+
with open(head_path) as hf:
|
| 30 |
+
ref = hf.read().strip()
|
| 31 |
+
if ref.startswith('ref:'):
|
| 32 |
+
ref_file = ref.split(' ',1)[1]
|
| 33 |
+
ref_path = os.path.join('.git', ref_file)
|
| 34 |
+
if os.path.exists(ref_path):
|
| 35 |
+
with open(ref_path) as rf:
|
| 36 |
+
return rf.read().strip()[:12]
|
| 37 |
+
else:
|
| 38 |
+
return ref[:12]
|
| 39 |
+
except Exception:
|
| 40 |
+
return None
|
| 41 |
+
return None
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _read_meta_threshold(meta_path: str) -> Optional[float]:
    """Return ``decision_threshold`` from a champion-meta JSON file, else None."""
    if not os.path.exists(meta_path):
        return None
    try:
        with open(meta_path, encoding='utf-8') as mf:
            meta = json.load(mf)
    except (OSError, ValueError) as e:  # json.JSONDecodeError is a ValueError
        print(f"Champion meta read failed ({meta_path}): {e}")
        return None
    if isinstance(meta, dict):
        return meta.get('decision_threshold')
    return None


def load_model() -> None:
    """Idempotent loading routine (local first, then HF Hub).

    Populates the module-level ``model``, ``preprocessor``, ``model_version``
    and ``champion_meta_threshold``. Artifacts that are already loaded are
    left untouched, so calling this repeatedly is safe.

    Order:
      1. Local files at ``config.LOCAL_MODEL_PATH`` /
         ``config.LOCAL_PREPROCESSOR_PATH``.
      2. Hugging Face Hub snapshot of ``config.HF_MODEL_REPO`` for whatever is
         still missing (plus the repo's ``champion_meta.json`` threshold).
      3. If the model came from disk and no threshold was found yet, the
         ``champion_meta.json`` in ``config.ARTIFACT_DIR`` supplies it, so the
         served threshold matches what the interpretability endpoint reports.

    Load failures are logged with ``print`` and never raised; callers decide
    what to do when ``model`` is still ``None`` afterwards.
    """
    global model, preprocessor, model_version, champion_meta_threshold, _last_reload_time

    # 1. Local artifacts (preferred source)
    if model is None and os.path.exists(config.LOCAL_MODEL_PATH):
        try:
            model_candidate = joblib.load(config.LOCAL_MODEL_PATH)
            if hasattr(model_candidate, 'predict'):
                model = model_candidate
                # pseudo version from mtime
                mtime = int(os.path.getmtime(config.LOCAL_MODEL_PATH))
                model_version = f"local_{mtime}"
        except Exception as e:
            print(f"Local model load failed: {e}")
    if preprocessor is None and os.path.exists(config.LOCAL_PREPROCESSOR_PATH):
        try:
            preprocessor = PreprocessingPipeline.load(config.LOCAL_PREPROCESSOR_PATH)
        except Exception as e:
            preprocessor = None
            print(f"Local preprocessor load failed: {e}")

    # 2. Hugging Face Hub fallback if HF_MODEL_REPO is set
    if (model is None or preprocessor is None) and getattr(config, 'HF_MODEL_REPO', None):
        try:
            from huggingface_hub import snapshot_download
            repo_id = config.HF_MODEL_REPO
            cache_dir = os.path.join('models', 'hf', repo_id.replace('/', '__'))
            local_dir = snapshot_download(repo_id=repo_id, local_dir=cache_dir, local_dir_use_symlinks=False)
            model_path = os.path.join(local_dir, 'champion_model.pkl')
            preproc_path = os.path.join(local_dir, 'preprocessor.pkl')
            meta_path = os.path.join(local_dir, 'champion_meta.json')
            loaded_from_hub = False
            if model is None and os.path.exists(model_path):
                m_candidate = joblib.load(model_path)
                if hasattr(m_candidate, 'predict'):
                    model = m_candidate
                    model_version = f"hf_{os.path.getmtime(model_path):.0f}"
                    loaded_from_hub = True
            if preprocessor is None and os.path.exists(preproc_path):
                try:
                    preprocessor = PreprocessingPipeline.load(preproc_path)
                except Exception as e:
                    preprocessor = None
                    print(f"HF preprocessor load failed: {e}")
            hub_threshold = _read_meta_threshold(meta_path)
            if hub_threshold is not None:
                champion_meta_threshold = hub_threshold
            if loaded_from_hub:
                print(f"Loaded model (HF) repo={repo_id} version={model_version}")
        except Exception as e:
            print(f"HF load failed: {e}")

    # 3. A locally baked model ships its tuned threshold in the artifacts dir.
    if (
        champion_meta_threshold is None
        and model_version is not None
        and model_version.startswith('local_')
    ):
        local_threshold = _read_meta_threshold(
            os.path.join(config.ARTIFACT_DIR, 'champion_meta.json')
        )
        if local_threshold is not None:
            champion_meta_threshold = local_threshold

    if model is None:
        print("No model loaded (checked local + HF). API will report model_not_loaded.")
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def resolve_threshold() -> tuple[float, str]:
    """Return the decision threshold to apply and the name of its source.

    Precedence: ``config.DECISION_THRESHOLD_ENV`` (``'env'``), then the
    module-level ``champion_meta_threshold`` (``'champion_meta'``), then the
    constant 0.5 (``'default'``). A candidate that cannot be converted with
    ``float()`` is skipped rather than raised.
    """
    if config.DECISION_THRESHOLD_ENV is not None:
        try:
            return float(config.DECISION_THRESHOLD_ENV), 'env'
        except (TypeError, ValueError):
            pass
    if champion_meta_threshold is not None:
        try:
            return float(champion_meta_threshold), 'champion_meta'
        except (TypeError, ValueError):
            # JSON may hold a non-scalar (list/dict), which float() rejects
            # with TypeError rather than ValueError.
            pass
    return 0.5, 'default'
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def load_model_and_preprocessor():
    """Make sure artifacts are loaded, then hand them back with basic metadata.

    Returns:
        ``(model, preprocessor, metadata)`` where ``metadata`` has the keys
        ``version``, ``threshold`` and ``threshold_source``. ``model`` and
        ``preprocessor`` may still be ``None`` if loading did not succeed.
    """
    artifacts_ready = model is not None and preprocessor is not None
    if not artifacts_ready:
        load_model()
    threshold, threshold_source = resolve_threshold()
    return model, preprocessor, {
        'version': model_version,
        'threshold': threshold,
        'threshold_source': threshold_source,
    }
|
app/routes.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI routes (health + predict) using simplified pipeline."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
from fastapi import APIRouter, HTTPException
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import numpy as np
|
| 6 |
+
from .schemas import BookingFeatures, PredictionResponse, HealthResponse
|
| 7 |
+
import json, os
|
| 8 |
+
from . import config
|
| 9 |
+
from . import model_loader
|
| 10 |
+
|
| 11 |
+
router = APIRouter()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@router.get('/health', response_model=HealthResponse)
async def health():
    # Report whether a model is loaded; the threshold is only exposed when it is.
    threshold, _ = model_loader.resolve_threshold()
    is_loaded = model_loader.model is not None
    if is_loaded:
        status, reported_threshold = 'healthy', threshold
    else:
        status, reported_threshold = 'model_not_loaded', None
    return HealthResponse(
        status=status,
        model_loaded=is_loaded,
        model_version=model_loader.model_version,
        decision_threshold=reported_threshold,
    )
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _prepare(df: pd.DataFrame) -> pd.DataFrame:
    """Minimal inference-time feature alignment.

    Injects placeholder raw & engineered columns so the persisted preprocessor
    (trained with target encoding on several categorical columns) can operate.

    We intentionally provide conservative defaults for fields not exposed via
    the public API schema. These defaults should be business-plausible and
    neutral (e.g., zeros, most-common style fallbacks) while allowing the
    preprocessor to apply target encodings and scaling without missing-column
    errors.

    Args:
        df: One row per booking, using the public payload field names.

    Returns:
        A copy of ``df`` with the training-schema aliases, placeholder columns
        and engineered features added. The input frame is not modified.
    """
    df = df.copy()

    def _count(col: str) -> pd.Series:
        """Numeric view of a count column; missing column or null value -> 0."""
        if col not in df.columns:
            return pd.Series(0, index=df.index)
        return pd.to_numeric(df[col], errors='coerce').fillna(0)

    # 1. Rename incoming simplified fields to training schema equivalents
    for public_name, training_name in (
        ('arrival_month', 'arrival_date_month'),
        ('stays_weekend_nights', 'stays_in_weekend_nights'),
        ('stays_week_nights', 'stays_in_week_nights'),
    ):
        if public_name in df.columns:
            df[training_name] = df[public_name]

    # 2. Add placeholder raw columns expected by feature contract / preprocessor
    placeholder_defaults = {
        'hotel': 0,
        'arrival_date_year': 2025,
        'arrival_date_week_number': 1,
        'arrival_date_day_of_month': 1,
        'babies': 0,
        'meal': 0,
        'country': 'UNK',
        'market_segment': 0,
        'distribution_channel': 0,
        'previous_bookings_not_canceled': 0,
        'reserved_room_type': 0,
        'assigned_room_type': 0,
        'deposit_type': 0,
        'days_in_waiting_list': 0,
        'customer_type': 0,
    }
    for col, default in placeholder_defaults.items():
        if col not in df.columns:
            df[col] = default

    # 3. Engineered features reproduced (subset)
    if {'stays_in_weekend_nights', 'stays_in_week_nights'}.issubset(df.columns):
        df['total_stay_duration'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']
    if {'adults', 'children', 'babies'}.issubset(df.columns):
        df['total_guests'] = df['adults'] + _count('children') + df['babies']
    else:
        df['total_guests'] = df.get('adults', 1)
    # is_family heuristic (children or babies); null counts are treated as 0 so
    # an explicit JSON null for an optional field cannot break the comparison.
    if {'children', 'babies'}.issubset(df.columns):
        df['is_family'] = ((_count('children') > 0) | (_count('babies') > 0)).astype(int)
    else:
        df['is_family'] = 0

    # guest_type: first matching rule wins (babies, children, then adult count)
    def _guest_type(n_adults, n_children, n_babies) -> str:
        if n_babies > 0:
            return 'family_with_babies'
        if n_children > 0:
            return 'family_with_children'
        if n_adults == 1:
            return 'solo_traveler'
        if n_adults == 2:
            return 'couple'
        return 'group'

    if 'guest_type' not in df.columns:
        df['guest_type'] = [
            _guest_type(a, c, b)
            for a, c, b in zip(_count('adults'), _count('children'), _count('babies'))
        ]

    # 4. Seasonal & temporal flags
    if 'arrival_date_month' in df.columns:
        months = pd.to_numeric(df['arrival_date_month'], errors='coerce')
        # Months are expected as 1-12. Only treat the input as 0-based (and
        # shift by +1) when a 0 is actually present and nothing exceeds 11;
        # values 1..11 alone are valid 1-based months and must not be shifted.
        zero_based = bool((months == 0).any()) and not bool((months > 11).any())
        m_norm = months + 1 if zero_based else months
        season_map = {
            12: 'winter', 1: 'winter', 2: 'winter',
            3: 'spring', 4: 'spring', 5: 'spring',
            6: 'summer', 7: 'summer', 8: 'summer',
            9: 'autumn', 10: 'autumn', 11: 'autumn',
        }
        df['arrival_season'] = m_norm.map(season_map)
        df['is_peak_season'] = m_norm.isin([5, 6, 7, 8, 9]).astype(int)

        # Quarter flag and additional temporal flags
        def _quarter(x):
            if pd.isna(x):
                return None
            return f"Q{int((int(x)-1)//3)+1}"

        df['arrival_quarter'] = m_norm.apply(_quarter)
        df['is_summer_peak'] = m_norm.isin([7, 8]).astype(int)
        df['is_holiday_season'] = m_norm.isin([12, 1]).astype(int)
    else:
        for col, default in {
            'arrival_season': 'winter',
            'is_peak_season': 0,
            'arrival_quarter': 'Q1',
            'is_summer_peak': 0,
            'is_holiday_season': 0,
        }.items():
            if col not in df.columns:
                df[col] = default

    # 5. Ensure columns required for target encoding exist (placeholders already above)
    for te_col in ['country', 'guest_type', 'arrival_season', 'arrival_quarter']:
        if te_col not in df.columns:
            df[te_col] = 'UNK'

    return df
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
@router.post('/predict', response_model=PredictionResponse)
async def predict(booking: BookingFeatures):
    # Score one booking: 503 while artifacts are missing, 500 if the
    # preprocessor rejects the prepared frame.
    estimator = model_loader.model
    if estimator is None:
        raise HTTPException(status_code=503, detail='Model not loaded')
    pipeline = model_loader.preprocessor
    if pipeline is None:
        raise HTTPException(status_code=503, detail='Preprocessor not loaded')

    features = _prepare(pd.DataFrame([booking.model_dump()]))
    try:
        processed = pipeline.transform(features)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f'Preprocessor transform failed: {e}')

    # Prefer the positive-class probability; otherwise use the raw prediction.
    if hasattr(estimator, 'predict_proba'):
        prob = float(estimator.predict_proba(processed)[0, 1])
    else:
        prob = float(estimator.predict(processed)[0])

    thr, src = model_loader.resolve_threshold()
    return PredictionResponse(
        prediction=int(prob >= thr),
        probability=prob,
        model_version=model_loader.model_version,
        applied_threshold=thr,
        threshold_source=src,
    )
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
@router.post('/predict/batch', response_model=list[PredictionResponse])
async def predict_batch(bookings: list[BookingFeatures]):
    # Score several bookings in one call; results keep the request order.
    # Raises 503 while artifacts are missing and 500 if the preprocessor
    # rejects the prepared frame.
    if model_loader.model is None:
        raise HTTPException(status_code=503, detail='Model not loaded')
    if model_loader.preprocessor is None:
        raise HTTPException(status_code=503, detail='Preprocessor not loaded')
    if not bookings:
        # An empty batch has nothing to score; building a column-less frame
        # and pushing it through the pipeline would only produce an error.
        return []
    raw_df = pd.DataFrame([b.model_dump() for b in bookings])
    prep_df = _prepare(raw_df)
    try:
        processed = model_loader.preprocessor.transform(prep_df)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f'Preprocessor transform failed: {e}')
    # Prefer positive-class probabilities; otherwise use raw predictions.
    if hasattr(model_loader.model, 'predict_proba'):
        probs = model_loader.model.predict_proba(processed)[:, 1]
    else:
        probs = model_loader.model.predict(processed).astype(float)
    thr, src = model_loader.resolve_threshold()
    preds = (probs >= thr).astype(int)
    return [
        PredictionResponse(
            prediction=int(p),
            probability=float(pr),
            model_version=model_loader.model_version,
            applied_threshold=thr,
            threshold_source=src,
        )
        for p, pr in zip(preds, probs)
    ]
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def startup_load():
    """Load artifacts at application startup.

    Raises:
        RuntimeError: if no model could be loaded and
            ``config.ALLOW_START_WITHOUT_MODEL`` is false.
    """
    model_loader.load_model()
    if model_loader.model is None and not config.ALLOW_START_WITHOUT_MODEL:
        # The loader only checks local paths and the Hugging Face Hub, so the
        # message points at those rather than the removed S3 source.
        raise RuntimeError(
            'Model not loaded at startup. Mount local artifacts (LOCAL_MODEL_PATH) '
            'or set HF_MODEL_REPO.'
        )
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
@router.get('/model/interpretability')
async def interpretability(top_k: int = 10):
    """Serve precomputed interpretability artifacts, when present.

    Reads ``feature_importance.json``, ``champion_meta.json`` and
    ``shap_values_sample.json`` from ``config.ARTIFACT_DIR``. Each file is
    optional: a missing or unreadable file leaves its part of the response at
    the empty/None default instead of failing the request.
    """
    base_dir = config.ARTIFACT_DIR
    importance_file = os.path.join(base_dir, 'feature_importance.json')
    meta_file = os.path.join(base_dir, 'champion_meta.json')
    shap_file = os.path.join(base_dir, 'shap_values_sample.json')

    # Global importance: first ``top_k`` entries of the stored list.
    top_features = []
    if os.path.exists(importance_file):
        try:
            with open(importance_file) as fh:
                top_features = json.load(fh)[:top_k]
        except Exception:
            top_features = []

    # Champion metadata.
    champion_model = None
    decision_threshold = None
    shap_generated = False
    if os.path.exists(meta_file):
        try:
            with open(meta_file) as fh:
                meta = json.load(fh)
            champion_model = meta.get('model_name')
            decision_threshold = meta.get('decision_threshold')
            shap_generated = bool(meta.get('shap_generated'))
        except Exception:
            pass

    # Local explanations: first three samples, five strongest contributors per sign.
    local_examples = []
    if os.path.exists(shap_file):
        try:
            with open(shap_file) as fh:
                samples = json.load(fh)[:3]
            examples = []
            for sample in samples:
                contributions = sample.get('shap_values', {})
                pushes_up = sorted(
                    [(name, val) for name, val in contributions.items() if val > 0],
                    key=lambda pair: pair[1],
                    reverse=True,
                )[:5]
                pushes_down = sorted(
                    [(name, val) for name, val in contributions.items() if val < 0],
                    key=lambda pair: pair[1],
                )[:5]
                examples.append({
                    'category': sample.get('category', 'sample'),
                    'probability': sample.get('probability'),
                    'top_positive_contributors': [
                        {'feature': name, 'shap': val} for name, val in pushes_up
                    ],
                    'top_negative_contributors': [
                        {'feature': name, 'shap': val} for name, val in pushes_down
                    ],
                })
            local_examples = examples
        except Exception:
            local_examples = []

    return {
        'champion_model': champion_model,
        'shap_generated': shap_generated and bool(top_features),
        'shap_timestamp': None,
        'decision_threshold': decision_threshold,
        'top_features': top_features,
        'local_examples': local_examples,
        'feature_name_map': {},
        'artifacts_available': [],
    }
|
app/schemas.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic request/response models."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
from pydantic import BaseModel, Field, ConfigDict
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class BookingFeatures(BaseModel):
    """Public prediction payload schema.

    Many training-time features are internal or engineered; to keep the public
    contract lightweight we make several fields optional with neutral defaults.
    This enables a *minimal* JSON payload such as:
    {"lead_time": 30, "arrival_month": 7, "adults": 2, "children": 0, "adr": 120.0}
    The `_prepare` function supplements / engineers the remaining columns.
    """

    lead_time: int = Field(..., ge=0)
    arrival_month: int = Field(..., ge=1, le=12)
    # Stay details (optional, defaulting to a short weekday stay)
    stays_weekend_nights: int | None = Field(0, ge=0)
    stays_week_nights: int | None = Field(1, ge=0)
    # Guest composition
    adults: int = Field(..., ge=1)
    children: int | None = Field(0, ge=0)
    # Historical / behavioral signals
    is_repeated_guest: int | None = Field(0, ge=0, le=1)
    previous_cancellations: int | None = Field(0, ge=0)
    booking_changes: int | None = Field(0, ge=0)
    # Pricing
    adr: float = Field(..., ge=0, description="Average daily rate (numeric, required)")
    # Amenities / request counts
    required_car_parking_spaces: int | None = Field(0, ge=0)
    total_of_special_requests: int | None = Field(0, ge=0)

    # JSON Schema "examples" must be a list of example *payloads*. The
    # {"summary": ..., "value": ...} wrapper is not part of JSON Schema, so
    # wrapped entries would be shown in the docs as the request body itself.
    model_config = ConfigDict(json_schema_extra={
        "examples": [
            # Minimal payload: required fields plus children.
            {
                "lead_time": 30,
                "arrival_month": 7,
                "adults": 2,
                "children": 0,
                "adr": 120.0,
            },
            # Extended payload: every public field supplied.
            {
                "lead_time": 120,
                "arrival_month": 7,
                "stays_weekend_nights": 2,
                "stays_week_nights": 3,
                "adults": 2,
                "children": 1,
                "is_repeated_guest": 0,
                "previous_cancellations": 0,
                "booking_changes": 1,
                "adr": 95.5,
                "required_car_parking_spaces": 0,
                "total_of_special_requests": 2,
            },
        ]
    })
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class PredictionResponse(BaseModel):
    # Response body for the prediction endpoints.
    prediction: int  # result of comparing probability against applied_threshold
    probability: float  # model score for the booking
    model_version: Optional[str] = None  # version tag of the loaded model
    applied_threshold: Optional[float] = None  # threshold used for `prediction`
    threshold_source: Optional[str] = None  # where that threshold came from
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class HealthResponse(BaseModel):
    # Response body for the health endpoint.
    status: str  # textual service state
    model_loaded: bool  # whether a model is currently loaded
    model_version: str | None = None  # version tag of the loaded model, if any
    decision_threshold: float | None = None  # active threshold, if reported
|
artifacts/champion_meta.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"selection_metric": "f1_score_mean",
|
| 3 |
+
"tie_breaker": "roc_auc_mean",
|
| 4 |
+
"model_name": "XGBoost",
|
| 5 |
+
"aggregate": {
|
| 6 |
+
"accuracy_mean": 0.8612111567132926,
|
| 7 |
+
"accuracy_std": 0.001627924593816666,
|
| 8 |
+
"precision_mean": 0.8386227780432687,
|
| 9 |
+
"precision_std": 0.002350895530949529,
|
| 10 |
+
"recall_mean": 0.7743306030104373,
|
| 11 |
+
"recall_std": 0.005137556231553914,
|
| 12 |
+
"f1_score_mean": 0.8051857268662059,
|
| 13 |
+
"f1_score_std": 0.0027598960622244877,
|
| 14 |
+
"roc_auc_mean": 0.9376353669900619,
|
| 15 |
+
"roc_auc_std": 0.0010218562950094605
|
| 16 |
+
},
|
| 17 |
+
"cv_folds": 5,
|
| 18 |
+
"timestamp": "2025-10-05T12:26:19.676359+00:00",
|
| 19 |
+
"notes": "Model will be (re)trained on training split below; final persisted champion artifact occurs after training.",
|
| 20 |
+
"persisted_path": "models/champion_model.pkl",
|
| 21 |
+
"holdout_metrics": {
|
| 22 |
+
"accuracy": 0.8613786749308987,
|
| 23 |
+
"precision": 0.841708852944808,
|
| 24 |
+
"recall": 0.7707179197286602,
|
| 25 |
+
"f1_score": 0.8046506137865911,
|
| 26 |
+
"roc_auc": 0.9384035807110922
|
| 27 |
+
},
|
| 28 |
+
"holdout_timestamp": "2025-10-05T16:43:04.352749+00:00",
|
| 29 |
+
"decision_threshold": 0.35000000000000003,
|
| 30 |
+
"decision_threshold_metrics": {
|
| 31 |
+
"precision": 0.7663852030558906,
|
| 32 |
+
"recall": 0.8619559072922555,
|
| 33 |
+
"f1_score": 0.811365934124408
|
| 34 |
+
},
|
| 35 |
+
"diagnostics_generated": "2025-10-05T16:43:06.126491+00:00",
|
| 36 |
+
"shap_generated": true,
|
| 37 |
+
"shap_timestamp": "2025-10-05T16:43:10.217872+00:00",
|
| 38 |
+
"library_versions": {
|
| 39 |
+
"python": "3.12.7",
|
| 40 |
+
"pandas": "2.3.3",
|
| 41 |
+
"numpy": "2.3.3",
|
| 42 |
+
"sklearn": "1.7.2",
|
| 43 |
+
"xgboost": "3.0.5",
|
| 44 |
+
"torch": "2.8.0",
|
| 45 |
+
"mlflow": "3.4.0",
|
| 46 |
+
"shap": "0.48.0",
|
| 47 |
+
"fastapi": "0.118.0"
|
| 48 |
+
}
|
| 49 |
+
}
|
artifacts/classification_report.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"0": {
|
| 3 |
+
"precision": 0.8714747449141264,
|
| 4 |
+
"recall": 0.9147209472493847,
|
| 5 |
+
"f1-score": 0.8925743216928469,
|
| 6 |
+
"support": 15033.0
|
| 7 |
+
},
|
| 8 |
+
"1": {
|
| 9 |
+
"precision": 0.841708852944808,
|
| 10 |
+
"recall": 0.7707179197286602,
|
| 11 |
+
"f1-score": 0.8046506137865911,
|
| 12 |
+
"support": 8845.0
|
| 13 |
+
},
|
| 14 |
+
"accuracy": 0.8613786749308987,
|
| 15 |
+
"macro avg": {
|
| 16 |
+
"precision": 0.8565917989294671,
|
| 17 |
+
"recall": 0.8427194334890225,
|
| 18 |
+
"f1-score": 0.848612467739719,
|
| 19 |
+
"support": 23878.0
|
| 20 |
+
},
|
| 21 |
+
"weighted avg": {
|
| 22 |
+
"precision": 0.8604487245410373,
|
| 23 |
+
"recall": 0.8613786749308987,
|
| 24 |
+
"f1-score": 0.8600052122016485,
|
| 25 |
+
"support": 23878.0
|
| 26 |
+
}
|
| 27 |
+
}
|
artifacts/confusion_matrix.png
ADDED
|
artifacts/cv_metrics.json
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"folds": 5,
|
| 3 |
+
"categorical_strategy": "target",
|
| 4 |
+
"include_mlp": false,
|
| 5 |
+
"results": {
|
| 6 |
+
"LogisticRegression": {
|
| 7 |
+
"folds": [
|
| 8 |
+
{
|
| 9 |
+
"accuracy": 0.8070190133176983,
|
| 10 |
+
"precision": 0.796804932735426,
|
| 11 |
+
"recall": 0.6429217548620534,
|
| 12 |
+
"f1_score": 0.711639549436796,
|
| 13 |
+
"roc_auc": 0.8867040527525519
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"accuracy": 0.8130496691515202,
|
| 17 |
+
"precision": 0.8030994880309948,
|
| 18 |
+
"recall": 0.6561899378179763,
|
| 19 |
+
"f1_score": 0.7222498755599801,
|
| 20 |
+
"roc_auc": 0.890793004589075
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"accuracy": 0.8100762207890108,
|
| 24 |
+
"precision": 0.8030089988751407,
|
| 25 |
+
"recall": 0.6456755228942905,
|
| 26 |
+
"f1_score": 0.7157987090305196,
|
| 27 |
+
"roc_auc": 0.889186762553699
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"accuracy": 0.8098249434626016,
|
| 31 |
+
"precision": 0.7980609418282548,
|
| 32 |
+
"recall": 0.6514414923685699,
|
| 33 |
+
"f1_score": 0.7173358232181761,
|
| 34 |
+
"roc_auc": 0.8909674615600719
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"accuracy": 0.8133009464779295,
|
| 38 |
+
"precision": 0.8094230497954578,
|
| 39 |
+
"recall": 0.648728094968909,
|
| 40 |
+
"f1_score": 0.7202209112589432,
|
| 41 |
+
"roc_auc": 0.8904796972569523
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"aggregate": {
|
| 45 |
+
"accuracy_mean": 0.8106541586397521,
|
| 46 |
+
"accuracy_std": 0.0025971064646111972,
|
| 47 |
+
"precision_mean": 0.8020794822530547,
|
| 48 |
+
"precision_std": 0.0049950459694398566,
|
| 49 |
+
"recall_mean": 0.6489913605823598,
|
| 50 |
+
"recall_std": 0.005141178414132198,
|
| 51 |
+
"f1_score_mean": 0.717448973700883,
|
| 52 |
+
"f1_score_std": 0.004099325654318532,
|
| 53 |
+
"roc_auc_mean": 0.88962619574247,
|
| 54 |
+
"roc_auc_std": 0.0017762969131121652
|
| 55 |
+
}
|
| 56 |
+
},
|
| 57 |
+
"RandomForest": {
|
| 58 |
+
"folds": [
|
| 59 |
+
{
|
| 60 |
+
"accuracy": 0.8455063238127146,
|
| 61 |
+
"precision": 0.8715583105088655,
|
| 62 |
+
"recall": 0.6836273179556762,
|
| 63 |
+
"f1_score": 0.7662378809961345,
|
| 64 |
+
"roc_auc": 0.926749180073486
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"accuracy": 0.8513275818745288,
|
| 68 |
+
"precision": 0.8743107592252227,
|
| 69 |
+
"recall": 0.699152063312606,
|
| 70 |
+
"f1_score": 0.7769820329187084,
|
| 71 |
+
"roc_auc": 0.9279393624961584
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"accuracy": 0.8490242063824441,
|
| 75 |
+
"precision": 0.8785033227390927,
|
| 76 |
+
"recall": 0.6875070661390617,
|
| 77 |
+
"f1_score": 0.7713578994101604,
|
| 78 |
+
"roc_auc": 0.9265822313578301
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"accuracy": 0.8504062316776949,
|
| 82 |
+
"precision": 0.8647115783649191,
|
| 83 |
+
"recall": 0.7067269643866592,
|
| 84 |
+
"f1_score": 0.7777777777777778,
|
| 85 |
+
"roc_auc": 0.9282198571471385
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"accuracy": 0.8515369796465365,
|
| 89 |
+
"precision": 0.8821747908854918,
|
| 90 |
+
"recall": 0.6915771622385528,
|
| 91 |
+
"f1_score": 0.7753343050890424,
|
| 92 |
+
"roc_auc": 0.9290067109566416
|
| 93 |
+
}
|
| 94 |
+
],
|
| 95 |
+
"aggregate": {
|
| 96 |
+
"accuracy_mean": 0.8495602646787838,
|
| 97 |
+
"accuracy_std": 0.002473270473483476,
|
| 98 |
+
"precision_mean": 0.8742517523447184,
|
| 99 |
+
"precision_std": 0.006691849909345141,
|
| 100 |
+
"recall_mean": 0.6937181148065112,
|
| 101 |
+
"recall_std": 0.009270153023255628,
|
| 102 |
+
"f1_score_mean": 0.7735379792383648,
|
| 103 |
+
"f1_score_std": 0.004772535132748273,
|
| 104 |
+
"roc_auc_mean": 0.9276994684062508,
|
| 105 |
+
"roc_auc_std": 0.0010232916543342055
|
| 106 |
+
}
|
| 107 |
+
},
|
| 108 |
+
"XGBoost": {
|
| 109 |
+
"folds": [
|
| 110 |
+
{
|
| 111 |
+
"accuracy": 0.859703492754837,
|
| 112 |
+
"precision": 0.8373863915499877,
|
| 113 |
+
"recall": 0.7709181365897784,
|
| 114 |
+
"f1_score": 0.8027787589779819,
|
| 115 |
+
"roc_auc": 0.9366735339592387
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"accuracy": 0.8613367953764972,
|
| 119 |
+
"precision": 0.8363724775103331,
|
| 120 |
+
"recall": 0.7778405879027699,
|
| 121 |
+
"f1_score": 0.8060453400503779,
|
| 122 |
+
"roc_auc": 0.9374002707516237
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"accuracy": 0.8604992042884664,
|
| 126 |
+
"precision": 0.841635687732342,
|
| 127 |
+
"recall": 0.7678914641040135,
|
| 128 |
+
"f1_score": 0.8030741945019214,
|
| 129 |
+
"roc_auc": 0.9368807090577477
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"accuracy": 0.8605829633972695,
|
| 133 |
+
"precision": 0.8370813981911513,
|
| 134 |
+
"recall": 0.774335782928208,
|
| 135 |
+
"f1_score": 0.8044869912491924,
|
| 136 |
+
"roc_auc": 0.9380101143228257
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"accuracy": 0.8639333277493928,
|
| 140 |
+
"precision": 0.8406379352325298,
|
| 141 |
+
"recall": 0.7806670435274167,
|
| 142 |
+
"f1_score": 0.8095433495515564,
|
| 143 |
+
"roc_auc": 0.9392122068588733
|
| 144 |
+
}
|
| 145 |
+
],
|
| 146 |
+
"aggregate": {
|
| 147 |
+
"accuracy_mean": 0.8612111567132926,
|
| 148 |
+
"accuracy_std": 0.001627924593816666,
|
| 149 |
+
"precision_mean": 0.8386227780432687,
|
| 150 |
+
"precision_std": 0.002350895530949529,
|
| 151 |
+
"recall_mean": 0.7743306030104373,
|
| 152 |
+
"recall_std": 0.005137556231553914,
|
| 153 |
+
"f1_score_mean": 0.8051857268662059,
|
| 154 |
+
"f1_score_std": 0.0027598960622244877,
|
| 155 |
+
"roc_auc_mean": 0.9376353669900619,
|
| 156 |
+
"roc_auc_std": 0.0010218562950094605
|
| 157 |
+
}
|
| 158 |
+
}
|
| 159 |
+
},
|
| 160 |
+
"timestamp": "2025-10-05T12:26:19.675852+00:00"
|
| 161 |
+
}
|
artifacts/distribution_baselines.json
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"created_utc": "2025-10-04T14:18:28.568003",
|
| 3 |
+
"columns": {
|
| 4 |
+
"hotel": {
|
| 5 |
+
"top_value_proportions": {
|
| 6 |
+
"0": 0.664461,
|
| 7 |
+
"1": 0.335539
|
| 8 |
+
},
|
| 9 |
+
"n_unique": 2
|
| 10 |
+
},
|
| 11 |
+
"market_segment": {
|
| 12 |
+
"top_value_proportions": {
|
| 13 |
+
"6": 0.473046,
|
| 14 |
+
"5": 0.202856,
|
| 15 |
+
"4": 0.165935,
|
| 16 |
+
"3": 0.105587,
|
| 17 |
+
"2": 0.04435,
|
| 18 |
+
"1": 0.006223,
|
| 19 |
+
"0": 0.001985,
|
| 20 |
+
"7": 1.7e-05
|
| 21 |
+
},
|
| 22 |
+
"n_unique": 8
|
| 23 |
+
},
|
| 24 |
+
"distribution_channel": {
|
| 25 |
+
"top_value_proportions": {
|
| 26 |
+
"3": 0.81975,
|
| 27 |
+
"1": 0.122665,
|
| 28 |
+
"0": 0.055926,
|
| 29 |
+
"2": 0.001617,
|
| 30 |
+
"4": 4.2e-05
|
| 31 |
+
},
|
| 32 |
+
"n_unique": 5
|
| 33 |
+
},
|
| 34 |
+
"reserved_room_type": {
|
| 35 |
+
"top_value_proportions": {
|
| 36 |
+
"0": 0.720278,
|
| 37 |
+
"3": 0.160826,
|
| 38 |
+
"4": 0.054737,
|
| 39 |
+
"5": 0.024265,
|
| 40 |
+
"6": 0.017539,
|
| 41 |
+
"1": 0.009364,
|
| 42 |
+
"2": 0.007806,
|
| 43 |
+
"7": 0.005034,
|
| 44 |
+
"9": 0.000101,
|
| 45 |
+
"8": 5e-05
|
| 46 |
+
},
|
| 47 |
+
"n_unique": 10
|
| 48 |
+
},
|
| 49 |
+
"customer_type": {
|
| 50 |
+
"top_value_proportions": {
|
| 51 |
+
"2": 0.750591,
|
| 52 |
+
"3": 0.210436,
|
| 53 |
+
"0": 0.03414,
|
| 54 |
+
"1": 0.004833
|
| 55 |
+
},
|
| 56 |
+
"n_unique": 4
|
| 57 |
+
},
|
| 58 |
+
"guest_type": {
|
| 59 |
+
"top_value_proportions": {
|
| 60 |
+
"couple": 0.683114,
|
| 61 |
+
"solo_traveler": 0.189103,
|
| 62 |
+
"family_with_children": 0.070517,
|
| 63 |
+
"group": 0.049585,
|
| 64 |
+
"family_with_babies": 0.007681
|
| 65 |
+
},
|
| 66 |
+
"n_unique": 5
|
| 67 |
+
},
|
| 68 |
+
"arrival_season": {
|
| 69 |
+
"top_value_proportions": {
|
| 70 |
+
"winter": 0.297127,
|
| 71 |
+
"summer": 0.279705,
|
| 72 |
+
"autumn": 0.249141,
|
| 73 |
+
"spring": 0.174026
|
| 74 |
+
},
|
| 75 |
+
"n_unique": 4
|
| 76 |
+
},
|
| 77 |
+
"hotel_target_encoded": {
|
| 78 |
+
"top_value_proportions": {
|
| 79 |
+
"0.41777805836582016": 0.133772,
|
| 80 |
+
"0.41785133877151737": 0.133604,
|
| 81 |
+
"0.41697259993382385": 0.132867,
|
| 82 |
+
"0.41866796764344844": 0.132239,
|
| 83 |
+
"0.41508187438063326": 0.131979,
|
| 84 |
+
"0.2775603494160744": 0.068021,
|
| 85 |
+
"0.27735376915858617": 0.067761,
|
| 86 |
+
"0.27948197846777967": 0.067133,
|
| 87 |
+
"0.27516882955217375": 0.066396,
|
| 88 |
+
"0.2786054178459242": 0.066228
|
| 89 |
+
},
|
| 90 |
+
"n_unique": 10
|
| 91 |
+
},
|
| 92 |
+
"market_segment_target_encoded": {
|
| 93 |
+
"top_value_proportions": {
|
| 94 |
+
"0.36711726709728976": 0.095703,
|
| 95 |
+
"0.368653079308817": 0.094958,
|
| 96 |
+
"0.36769683428091055": 0.09443,
|
| 97 |
+
"0.3667654736096978": 0.094405,
|
| 98 |
+
"0.365829434095524": 0.093551,
|
| 99 |
+
"0.34480257021453": 0.041218,
|
| 100 |
+
"0.3420074349442379": 0.040632,
|
| 101 |
+
"0.3410017021715583": 0.040472,
|
| 102 |
+
"0.34453261877769764": 0.040313,
|
| 103 |
+
"0.343461914816913": 0.040221,
|
| 104 |
+
"0.6071020925808497": 0.033847,
|
| 105 |
+
"0.6109952606635071": 0.033386,
|
| 106 |
+
"0.614203576167309": 0.03337,
|
| 107 |
+
"0.6104215235334888": 0.033001,
|
| 108 |
+
"0.6103692558460284": 0.032331,
|
| 109 |
+
"0.15239520958083833": 0.02166,
|
| 110 |
+
"0.1561072492552135": 0.021241,
|
| 111 |
+
"0.15370902248192533": 0.021015,
|
| 112 |
+
"0.1535950944515874": 0.020898,
|
| 113 |
+
"0.15129369938771478": 0.020772,
|
| 114 |
+
"0.1863398381722989": 0.009155,
|
| 115 |
+
"0.1911100546707868": 0.009113,
|
| 116 |
+
"0.18545712932259592": 0.008987,
|
| 117 |
+
"0.18583862620559868": 0.008744,
|
| 118 |
+
"0.18799441600744532": 0.008351,
|
| 119 |
+
"0.125": 0.001332,
|
| 120 |
+
"0.14188034188034188": 0.001323,
|
| 121 |
+
"0.1218274111675127": 0.001273,
|
| 122 |
+
"0.13210702341137123": 0.001215,
|
| 123 |
+
"0.13192182410423453": 0.00108,
|
| 124 |
+
"0.22043010891237144": 0.000427,
|
| 125 |
+
"0.2287234053269401": 0.00041,
|
| 126 |
+
"0.2116402127267593": 0.000402,
|
| 127 |
+
"0.20942408467164003": 0.000385,
|
| 128 |
+
"0.22680412430742453": 0.00036,
|
| 129 |
+
"0.6852081413853757": 1.7e-05
|
| 130 |
+
},
|
| 131 |
+
"n_unique": 36
|
| 132 |
+
},
|
| 133 |
+
"distribution_channel_target_encoded": {
|
| 134 |
+
"top_value_proportions": {
|
| 135 |
+
"0.4109862288541946": 0.164696,
|
| 136 |
+
"0.41108456516462033": 0.164159,
|
| 137 |
+
"0.4109083479127026": 0.163858,
|
| 138 |
+
"0.4100920593981026": 0.163757,
|
| 139 |
+
"0.4082244564662652": 0.16328,
|
| 140 |
+
"0.1772357723577236": 0.024793,
|
| 141 |
+
"0.17429622657653804": 0.024776,
|
| 142 |
+
"0.17191097467382963": 0.024441,
|
| 143 |
+
"0.17558813501534265": 0.024399,
|
| 144 |
+
"0.17397225295769853": 0.024257,
|
| 145 |
+
"0.21781804454511136": 0.01155,
|
| 146 |
+
"0.2185617469879518": 0.011433,
|
| 147 |
+
"0.2206432311144353": 0.011132,
|
| 148 |
+
"0.22444402915342926": 0.011106,
|
| 149 |
+
"0.22226338210779775": 0.010704,
|
| 150 |
+
"0.19463093816188887": 0.000369,
|
| 151 |
+
"0.18000006437483895": 0.00036,
|
| 152 |
+
"0.19736846895085886": 0.000343,
|
| 153 |
+
"0.20512823579560657": 0.00031,
|
| 154 |
+
"0.18181819604499994": 0.000235,
|
| 155 |
+
"0.43844503735226303": 2.5e-05,
|
| 156 |
+
"0.7320759378395608": 8e-06,
|
| 157 |
+
"0.588465308636646": 8e-06
|
| 158 |
+
},
|
| 159 |
+
"n_unique": 23
|
| 160 |
+
},
|
| 161 |
+
"reserved_room_type_target_encoded": {
|
| 162 |
+
"top_value_proportions": {
|
| 163 |
+
"0.3896345901878003": 0.144937,
|
| 164 |
+
"0.3905292722324898": 0.144711,
|
| 165 |
+
"0.3910365366456018": 0.143722,
|
| 166 |
+
"0.39163580919189717": 0.143471,
|
| 167 |
+
"0.39252784271588087": 0.143437,
|
| 168 |
+
"0.3161205766710354": 0.033009,
|
| 169 |
+
"0.31908924843423797": 0.03244,
|
| 170 |
+
"0.31619084865076263": 0.032323,
|
| 171 |
+
"0.3189817520618222": 0.031845,
|
| 172 |
+
"0.318578352180937": 0.031209,
|
| 173 |
+
"0.2947429454967143": 0.0114,
|
| 174 |
+
"0.2954064962521622": 0.011157,
|
| 175 |
+
"0.29563227160022887": 0.010822,
|
| 176 |
+
"0.28764773160503243": 0.010797,
|
| 177 |
+
"0.2910504361016306": 0.010562,
|
| 178 |
+
"0.30065934065934063": 0.00521,
|
| 179 |
+
"0.3119861531804414": 0.004908,
|
| 180 |
+
"0.2967602591792657": 0.004875,
|
| 181 |
+
"0.3099742046431642": 0.004783,
|
| 182 |
+
"0.29944938585345193": 0.004489,
|
| 183 |
+
"0.37227602905569007": 0.003702,
|
| 184 |
+
"0.35645355850422195": 0.003652,
|
| 185 |
+
"0.3641826923076923": 0.003602,
|
| 186 |
+
"0.3662551440329218": 0.003292,
|
| 187 |
+
"0.3627278071722516": 0.003292,
|
| 188 |
+
"0.3403019744483159": 0.002153,
|
| 189 |
+
"0.3164983164983165": 0.001901,
|
| 190 |
+
"0.32297447280799113": 0.001818,
|
| 191 |
+
"0.33074361820199777": 0.001818,
|
| 192 |
+
"0.3355119825708061": 0.001675,
|
| 193 |
+
"0.32103825136612024": 0.001675,
|
| 194 |
+
"0.33694181326116374": 0.001617,
|
| 195 |
+
"0.3247978436657682": 0.001591,
|
| 196 |
+
"0.32754010695187163": 0.001541,
|
| 197 |
+
"0.34159061277705344": 0.001382,
|
| 198 |
+
"0.40425531914893614": 0.001097,
|
| 199 |
+
"0.41226215644820297": 0.001072,
|
| 200 |
+
"0.40794979079497906": 0.00103,
|
| 201 |
+
"0.4086242299794661": 0.000955,
|
| 202 |
+
"0.40524193548387094": 0.000879,
|
| 203 |
+
"0.8048129813923957": 5e-05,
|
| 204 |
+
"0.8180185133956868": 5e-05,
|
| 205 |
+
"0.3012440502308165": 1.7e-05,
|
| 206 |
+
"0.44485467943373125": 1.7e-05,
|
| 207 |
+
"0.3881276892161587": 8e-06,
|
| 208 |
+
"0.26839015719366827": 8e-06
|
| 209 |
+
},
|
| 210 |
+
"n_unique": 46
|
| 211 |
+
},
|
| 212 |
+
"customer_type_target_encoded": {
|
| 213 |
+
"top_value_proportions": {
|
| 214 |
+
"0.4066526151226253": 0.150532,
|
| 215 |
+
"0.40595648472499407": 0.150431,
|
| 216 |
+
"0.4087548475295037": 0.150155,
|
| 217 |
+
"0.4077947430802482": 0.149912,
|
| 218 |
+
"0.40815530192176375": 0.14956,
|
| 219 |
+
"0.2551534814075368": 0.042625,
|
| 220 |
+
"0.2534256813991729": 0.04234,
|
| 221 |
+
"0.2539493293591654": 0.041829,
|
| 222 |
+
"0.2543964232488823": 0.041829,
|
| 223 |
+
"0.25456983906218955": 0.041813,
|
| 224 |
+
"0.30964939497362703": 0.007145,
|
| 225 |
+
"0.3082614056720099": 0.006969,
|
| 226 |
+
"0.3088235294117647": 0.006801,
|
| 227 |
+
"0.31187061336588345": 0.006692,
|
| 228 |
+
"0.3094660194174757": 0.006533,
|
| 229 |
+
"0.10398230088495575": 0.001047,
|
| 230 |
+
"0.09740259740259741": 0.000963,
|
| 231 |
+
"0.10367170626349892": 0.000955,
|
| 232 |
+
"0.0967741935483871": 0.000938,
|
| 233 |
+
"0.10944206008583691": 0.00093
|
| 234 |
+
},
|
| 235 |
+
"n_unique": 20
|
| 236 |
+
}
|
| 237 |
+
},
|
| 238 |
+
"target_mean": 0.37041628277075134
|
| 239 |
+
}
|
artifacts/dropped_columns.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"timestamp": "2025-10-05T02:04:46.609259+00:00",
|
| 3 |
+
"categorical_strategy": "drop",
|
| 4 |
+
"dropped_columns": [
|
| 5 |
+
"country",
|
| 6 |
+
"guest_type",
|
| 7 |
+
"arrival_season",
|
| 8 |
+
"arrival_quarter"
|
| 9 |
+
],
|
| 10 |
+
"remaining_feature_count": 37
|
| 11 |
+
}
|
artifacts/fairness_group_metrics.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"error": "feature_shape_mismatch",
|
| 3 |
+
"message": "Feature shape mismatch, expected: 41, got 28",
|
| 4 |
+
"note": "Full encoded feature space not reconstructed; run full pipeline-based fairness later."
|
| 5 |
+
}
|
artifacts/fairness_group_outcome_only.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"group": "lead_time_bucket",
|
| 4 |
+
"value": "LT_180+",
|
| 5 |
+
"cancellation_rate": 0.5684,
|
| 6 |
+
"support": 24962
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"group": "lead_time_bucket",
|
| 10 |
+
"value": "LT_30_89",
|
| 11 |
+
"cancellation_rate": 0.3779,
|
| 12 |
+
"support": 29919
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"group": "lead_time_bucket",
|
| 16 |
+
"value": "LT_90_179",
|
| 17 |
+
"cancellation_rate": 0.4455,
|
| 18 |
+
"support": 26462
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"group": "lead_time_bucket",
|
| 22 |
+
"value": "LT_<30",
|
| 23 |
+
"cancellation_rate": 0.1825,
|
| 24 |
+
"support": 38047
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"group": "special_requests_bucket",
|
| 28 |
+
"value": "SR_0",
|
| 29 |
+
"cancellation_rate": 0.4772,
|
| 30 |
+
"support": 70318
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"group": "special_requests_bucket",
|
| 34 |
+
"value": "SR_1",
|
| 35 |
+
"cancellation_rate": 0.2202,
|
| 36 |
+
"support": 33226
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"group": "special_requests_bucket",
|
| 40 |
+
"value": "SR_2_3",
|
| 41 |
+
"cancellation_rate": 0.2141,
|
| 42 |
+
"support": 15466
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"group": "special_requests_bucket",
|
| 46 |
+
"value": "SR_4+",
|
| 47 |
+
"cancellation_rate": 0.1,
|
| 48 |
+
"support": 380
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"group": "is_repeated_guest_str",
|
| 52 |
+
"value": "0",
|
| 53 |
+
"cancellation_rate": 0.3779,
|
| 54 |
+
"support": 115580
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"group": "is_repeated_guest_str",
|
| 58 |
+
"value": "1",
|
| 59 |
+
"cancellation_rate": 0.1449,
|
| 60 |
+
"support": 3810
|
| 61 |
+
}
|
| 62 |
+
]
|
artifacts/fairness_summary.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Fairness Analysis
|
| 2 |
+
|
| 3 |
+
Fairness analysis artifacts will be generated here when running `scripts/fairness_analysis.py`.
|
| 4 |
+
|
| 5 |
+
Run with encoded features:
|
| 6 |
+
```bash
|
| 7 |
+
python scripts/fairness_analysis.py
|
| 8 |
+
```
|
| 9 |
+
|
| 10 |
+
This will generate subgroup performance metrics and fairness evaluations.
|
artifacts/feature_contract.json
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"created_utc": "2025-10-04T14:18:28.557575",
|
| 3 |
+
"feature_order": [
|
| 4 |
+
"hotel",
|
| 5 |
+
"lead_time",
|
| 6 |
+
"arrival_date_year",
|
| 7 |
+
"arrival_date_month",
|
| 8 |
+
"arrival_date_week_number",
|
| 9 |
+
"arrival_date_day_of_month",
|
| 10 |
+
"stays_in_weekend_nights",
|
| 11 |
+
"stays_in_week_nights",
|
| 12 |
+
"adults",
|
| 13 |
+
"children",
|
| 14 |
+
"babies",
|
| 15 |
+
"meal",
|
| 16 |
+
"country",
|
| 17 |
+
"market_segment",
|
| 18 |
+
"distribution_channel",
|
| 19 |
+
"is_repeated_guest",
|
| 20 |
+
"previous_cancellations",
|
| 21 |
+
"previous_bookings_not_canceled",
|
| 22 |
+
"reserved_room_type",
|
| 23 |
+
"assigned_room_type",
|
| 24 |
+
"booking_changes",
|
| 25 |
+
"deposit_type",
|
| 26 |
+
"days_in_waiting_list",
|
| 27 |
+
"customer_type",
|
| 28 |
+
"adr",
|
| 29 |
+
"required_car_parking_spaces",
|
| 30 |
+
"total_of_special_requests",
|
| 31 |
+
"total_stay_duration",
|
| 32 |
+
"total_guests",
|
| 33 |
+
"is_family",
|
| 34 |
+
"guest_type",
|
| 35 |
+
"arrival_season",
|
| 36 |
+
"is_peak_season",
|
| 37 |
+
"arrival_quarter",
|
| 38 |
+
"is_summer_peak",
|
| 39 |
+
"is_holiday_season",
|
| 40 |
+
"hotel_target_encoded",
|
| 41 |
+
"market_segment_target_encoded",
|
| 42 |
+
"distribution_channel_target_encoded",
|
| 43 |
+
"reserved_room_type_target_encoded",
|
| 44 |
+
"customer_type_target_encoded"
|
| 45 |
+
],
|
| 46 |
+
"dtypes": {
|
| 47 |
+
"hotel": "int64",
|
| 48 |
+
"lead_time": "int64",
|
| 49 |
+
"arrival_date_year": "int64",
|
| 50 |
+
"arrival_date_month": "int64",
|
| 51 |
+
"arrival_date_week_number": "int64",
|
| 52 |
+
"arrival_date_day_of_month": "int64",
|
| 53 |
+
"stays_in_weekend_nights": "int64",
|
| 54 |
+
"stays_in_week_nights": "int64",
|
| 55 |
+
"adults": "int64",
|
| 56 |
+
"children": "float64",
|
| 57 |
+
"babies": "int64",
|
| 58 |
+
"meal": "int64",
|
| 59 |
+
"country": "object",
|
| 60 |
+
"market_segment": "int64",
|
| 61 |
+
"distribution_channel": "int64",
|
| 62 |
+
"is_repeated_guest": "int64",
|
| 63 |
+
"previous_cancellations": "int64",
|
| 64 |
+
"previous_bookings_not_canceled": "int64",
|
| 65 |
+
"reserved_room_type": "int64",
|
| 66 |
+
"assigned_room_type": "int64",
|
| 67 |
+
"booking_changes": "int64",
|
| 68 |
+
"deposit_type": "int64",
|
| 69 |
+
"days_in_waiting_list": "int64",
|
| 70 |
+
"customer_type": "int64",
|
| 71 |
+
"adr": "float64",
|
| 72 |
+
"required_car_parking_spaces": "int64",
|
| 73 |
+
"total_of_special_requests": "int64",
|
| 74 |
+
"total_stay_duration": "int64",
|
| 75 |
+
"total_guests": "float64",
|
| 76 |
+
"is_family": "int64",
|
| 77 |
+
"guest_type": "object",
|
| 78 |
+
"arrival_season": "object",
|
| 79 |
+
"is_peak_season": "int64",
|
| 80 |
+
"arrival_quarter": "object",
|
| 81 |
+
"is_summer_peak": "int64",
|
| 82 |
+
"is_holiday_season": "int64",
|
| 83 |
+
"hotel_target_encoded": "float64",
|
| 84 |
+
"market_segment_target_encoded": "float64",
|
| 85 |
+
"distribution_channel_target_encoded": "float64",
|
| 86 |
+
"reserved_room_type_target_encoded": "float64",
|
| 87 |
+
"customer_type_target_encoded": "float64"
|
| 88 |
+
}
|
| 89 |
+
}
|
artifacts/feature_importance.json
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"feature": "deposit_type",
|
| 4 |
+
"mean_abs_shap": 1.004747748374939
|
| 5 |
+
},
|
| 6 |
+
{
|
| 7 |
+
"feature": "country__te",
|
| 8 |
+
"mean_abs_shap": 0.8516273498535156
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"feature": "market_segment",
|
| 12 |
+
"mean_abs_shap": 0.43541011214256287
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"feature": "total_of_special_requests",
|
| 16 |
+
"mean_abs_shap": 0.4210052192211151
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"feature": "lead_time",
|
| 20 |
+
"mean_abs_shap": 0.41456905007362366
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"feature": "required_car_parking_spaces",
|
| 24 |
+
"mean_abs_shap": 0.4020047187805176
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"feature": "assigned_room_type",
|
| 28 |
+
"mean_abs_shap": 0.3292023837566376
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"feature": "customer_type_target_encoded",
|
| 32 |
+
"mean_abs_shap": 0.2506164312362671
|
| 33 |
+
},
|
| 34 |
+
{
|
| 35 |
+
"feature": "reserved_room_type",
|
| 36 |
+
"mean_abs_shap": 0.23714518547058105
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"feature": "previous_cancellations",
|
| 40 |
+
"mean_abs_shap": 0.21544909477233887
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"feature": "arrival_date_year",
|
| 44 |
+
"mean_abs_shap": 0.2018701285123825
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"feature": "adr",
|
| 48 |
+
"mean_abs_shap": 0.1720850169658661
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"feature": "booking_changes",
|
| 52 |
+
"mean_abs_shap": 0.13707901537418365
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"feature": "market_segment_target_encoded",
|
| 56 |
+
"mean_abs_shap": 0.12096284329891205
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"feature": "hotel",
|
| 60 |
+
"mean_abs_shap": 0.08043359220027924
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"feature": "previous_bookings_not_canceled",
|
| 64 |
+
"mean_abs_shap": 0.07711290568113327
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"feature": "arrival_date_week_number",
|
| 68 |
+
"mean_abs_shap": 0.053753212094306946
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"feature": "total_stay_duration",
|
| 72 |
+
"mean_abs_shap": 0.04918520152568817
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"feature": "distribution_channel_target_encoded",
|
| 76 |
+
"mean_abs_shap": 0.046840302646160126
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"feature": "meal",
|
| 80 |
+
"mean_abs_shap": 0.02777845785021782
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"feature": "stays_in_weekend_nights",
|
| 84 |
+
"mean_abs_shap": 0.02747640572488308
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"feature": "is_peak_season",
|
| 88 |
+
"mean_abs_shap": 0.026596231386065483
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"feature": "adults",
|
| 92 |
+
"mean_abs_shap": 0.02620122581720352
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
"feature": "customer_type",
|
| 96 |
+
"mean_abs_shap": 0.024083152413368225
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"feature": "arrival_season__te",
|
| 100 |
+
"mean_abs_shap": 0.021409466862678528
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"feature": "stays_in_week_nights",
|
| 104 |
+
"mean_abs_shap": 0.020597653463482857
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"feature": "arrival_date_month",
|
| 108 |
+
"mean_abs_shap": 0.020358875393867493
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"feature": "reserved_room_type_target_encoded",
|
| 112 |
+
"mean_abs_shap": 0.017543498426675797
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
"feature": "arrival_date_day_of_month",
|
| 116 |
+
"mean_abs_shap": 0.016908343881368637
|
| 117 |
+
},
|
| 118 |
+
{
|
| 119 |
+
"feature": "hotel_target_encoded",
|
| 120 |
+
"mean_abs_shap": 0.014303297735750675
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"feature": "distribution_channel",
|
| 124 |
+
"mean_abs_shap": 0.012937864288687706
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"feature": "is_repeated_guest",
|
| 128 |
+
"mean_abs_shap": 0.012019594199955463
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"feature": "children",
|
| 132 |
+
"mean_abs_shap": 0.011608750559389591
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"feature": "days_in_waiting_list",
|
| 136 |
+
"mean_abs_shap": 0.008652577176690102
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"feature": "guest_type__te",
|
| 140 |
+
"mean_abs_shap": 0.008242463693022728
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"feature": "total_guests",
|
| 144 |
+
"mean_abs_shap": 0.006184790749102831
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"feature": "is_family",
|
| 148 |
+
"mean_abs_shap": 0.005206487141549587
|
| 149 |
+
},
|
| 150 |
+
{
|
| 151 |
+
"feature": "is_summer_peak",
|
| 152 |
+
"mean_abs_shap": 0.002421196084469557
|
| 153 |
+
},
|
| 154 |
+
{
|
| 155 |
+
"feature": "is_holiday_season",
|
| 156 |
+
"mean_abs_shap": 0.0011103155557066202
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"feature": "arrival_quarter__te",
|
| 160 |
+
"mean_abs_shap": 0.0009324172860942781
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"feature": "babies",
|
| 164 |
+
"mean_abs_shap": 0.00015189241094049066
|
| 165 |
+
}
|
| 166 |
+
]
|
artifacts/feature_name_map.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"hotel": "Hotel",
|
| 3 |
+
"lead_time": "Lead Time",
|
| 4 |
+
"arrival_date_year": "Arrival Date Year",
|
| 5 |
+
"arrival_date_month": "Arrival Date Month",
|
| 6 |
+
"arrival_date_week_number": "Arrival Date Week Number",
|
| 7 |
+
"arrival_date_day_of_month": "Arrival Date Day Of Month",
|
| 8 |
+
"stays_in_weekend_nights": "Stays In Weekend Nights",
|
| 9 |
+
"stays_in_week_nights": "Stays In Week Nights",
|
| 10 |
+
"adults": "Adults",
|
| 11 |
+
"children": "Children",
|
| 12 |
+
"babies": "Babies",
|
| 13 |
+
"meal": "Meal",
|
| 14 |
+
"market_segment": "Market Segment",
|
| 15 |
+
"distribution_channel": "Distribution Channel",
|
| 16 |
+
"is_repeated_guest": "Is Repeated Guest",
|
| 17 |
+
"previous_cancellations": "Previous Cancellations",
|
| 18 |
+
"previous_bookings_not_canceled": "Previous Bookings Not Canceled",
|
| 19 |
+
"reserved_room_type": "Reserved Room Type",
|
| 20 |
+
"assigned_room_type": "Assigned Room Type",
|
| 21 |
+
"booking_changes": "Booking Changes",
|
| 22 |
+
"deposit_type": "Deposit Type",
|
| 23 |
+
"days_in_waiting_list": "Days In Waiting List",
|
| 24 |
+
"customer_type": "Customer Type",
|
| 25 |
+
"adr": "Adr",
|
| 26 |
+
"required_car_parking_spaces": "Required Car Parking Spaces",
|
| 27 |
+
"total_of_special_requests": "Total Of Special Requests",
|
| 28 |
+
"total_stay_duration": "Total stay duration (nights)",
|
| 29 |
+
"total_guests": "Total guests (adults + children + babies)",
|
| 30 |
+
"is_family": "Family booking flag",
|
| 31 |
+
"is_peak_season": "Peak season flag",
|
| 32 |
+
"is_summer_peak": "Summer peak season flag",
|
| 33 |
+
"is_holiday_season": "Holiday season flag",
|
| 34 |
+
"hotel_target_encoded": "Hotel (target encoded)",
|
| 35 |
+
"market_segment_target_encoded": "Market Segment (target encoded)",
|
| 36 |
+
"distribution_channel_target_encoded": "Distribution Channel (target encoded)",
|
| 37 |
+
"reserved_room_type_target_encoded": "Reserved Room Type (target encoded)",
|
| 38 |
+
"customer_type_target_encoded": "Customer Type (target encoded)",
|
| 39 |
+
"country__te": "Country (target encoded)",
|
| 40 |
+
"guest_type__te": "Guest Type (target encoded)",
|
| 41 |
+
"arrival_season__te": "Arrival Season (target encoded)",
|
| 42 |
+
"arrival_quarter__te": "Arrival Quarter (target encoded)"
|
| 43 |
+
}
|
artifacts/feature_rules.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"created_utc": "2025-10-04T14:18:28.567902",
|
| 3 |
+
"rules": {
|
| 4 |
+
"guest_type_rule": [
|
| 5 |
+
{
|
| 6 |
+
"if": "babies>0",
|
| 7 |
+
"return": "family_with_babies"
|
| 8 |
+
},
|
| 9 |
+
{
|
| 10 |
+
"elif": "children>0",
|
| 11 |
+
"return": "family_with_children"
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"elif": "adults==1",
|
| 15 |
+
"return": "solo_traveler"
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"elif": "adults==2",
|
| 19 |
+
"return": "couple"
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"else": true,
|
| 23 |
+
"return": "group"
|
| 24 |
+
}
|
| 25 |
+
],
|
| 26 |
+
"season_mapping_numeric": {
|
| 27 |
+
"1": "winter",
|
| 28 |
+
"2": "winter",
|
| 29 |
+
"3": "spring",
|
| 30 |
+
"4": "spring",
|
| 31 |
+
"5": "spring",
|
| 32 |
+
"6": "summer",
|
| 33 |
+
"7": "summer",
|
| 34 |
+
"8": "summer",
|
| 35 |
+
"9": "autumn",
|
| 36 |
+
"10": "autumn",
|
| 37 |
+
"11": "autumn",
|
| 38 |
+
"12": "winter"
|
| 39 |
+
},
|
| 40 |
+
"peak_months_numeric": [
|
| 41 |
+
5,
|
| 42 |
+
6,
|
| 43 |
+
7,
|
| 44 |
+
8,
|
| 45 |
+
9
|
| 46 |
+
],
|
| 47 |
+
"temporal_flags": {
|
| 48 |
+
"arrival_quarter": "Q{((month-1)//3)+1}",
|
| 49 |
+
"is_summer_peak": "[7,8]",
|
| 50 |
+
"is_holiday_season": "[12,1]"
|
| 51 |
+
}
|
| 52 |
+
}
|
| 53 |
+
}
|
artifacts/feature_schema.json
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"created_utc": "2025-10-04T14:18:28.567462",
|
| 3 |
+
"schema": {
|
| 4 |
+
"hotel": {
|
| 5 |
+
"dtype": "int64",
|
| 6 |
+
"nullable": false,
|
| 7 |
+
"constraints": {}
|
| 8 |
+
},
|
| 9 |
+
"lead_time": {
|
| 10 |
+
"dtype": "int64",
|
| 11 |
+
"nullable": false,
|
| 12 |
+
"constraints": {}
|
| 13 |
+
},
|
| 14 |
+
"arrival_date_year": {
|
| 15 |
+
"dtype": "int64",
|
| 16 |
+
"nullable": false,
|
| 17 |
+
"constraints": {}
|
| 18 |
+
},
|
| 19 |
+
"arrival_date_month": {
|
| 20 |
+
"dtype": "int64",
|
| 21 |
+
"nullable": false,
|
| 22 |
+
"constraints": {}
|
| 23 |
+
},
|
| 24 |
+
"arrival_date_week_number": {
|
| 25 |
+
"dtype": "int64",
|
| 26 |
+
"nullable": false,
|
| 27 |
+
"constraints": {}
|
| 28 |
+
},
|
| 29 |
+
"arrival_date_day_of_month": {
|
| 30 |
+
"dtype": "int64",
|
| 31 |
+
"nullable": false,
|
| 32 |
+
"constraints": {}
|
| 33 |
+
},
|
| 34 |
+
"stays_in_weekend_nights": {
|
| 35 |
+
"dtype": "int64",
|
| 36 |
+
"nullable": false,
|
| 37 |
+
"constraints": {
|
| 38 |
+
"min": 0
|
| 39 |
+
}
|
| 40 |
+
},
|
| 41 |
+
"stays_in_week_nights": {
|
| 42 |
+
"dtype": "int64",
|
| 43 |
+
"nullable": false,
|
| 44 |
+
"constraints": {
|
| 45 |
+
"min": 0
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"adults": {
|
| 49 |
+
"dtype": "int64",
|
| 50 |
+
"nullable": false,
|
| 51 |
+
"constraints": {
|
| 52 |
+
"min": 1
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"children": {
|
| 56 |
+
"dtype": "float64",
|
| 57 |
+
"nullable": false,
|
| 58 |
+
"constraints": {
|
| 59 |
+
"min": 0
|
| 60 |
+
}
|
| 61 |
+
},
|
| 62 |
+
"babies": {
|
| 63 |
+
"dtype": "int64",
|
| 64 |
+
"nullable": false,
|
| 65 |
+
"constraints": {
|
| 66 |
+
"min": 0
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"meal": {
|
| 70 |
+
"dtype": "int64",
|
| 71 |
+
"nullable": false,
|
| 72 |
+
"constraints": {}
|
| 73 |
+
},
|
| 74 |
+
"country": {
|
| 75 |
+
"dtype": "object",
|
| 76 |
+
"nullable": false,
|
| 77 |
+
"constraints": {}
|
| 78 |
+
},
|
| 79 |
+
"market_segment": {
|
| 80 |
+
"dtype": "int64",
|
| 81 |
+
"nullable": false,
|
| 82 |
+
"constraints": {}
|
| 83 |
+
},
|
| 84 |
+
"distribution_channel": {
|
| 85 |
+
"dtype": "int64",
|
| 86 |
+
"nullable": false,
|
| 87 |
+
"constraints": {}
|
| 88 |
+
},
|
| 89 |
+
"is_repeated_guest": {
|
| 90 |
+
"dtype": "int64",
|
| 91 |
+
"nullable": false,
|
| 92 |
+
"constraints": {}
|
| 93 |
+
},
|
| 94 |
+
"previous_cancellations": {
|
| 95 |
+
"dtype": "int64",
|
| 96 |
+
"nullable": false,
|
| 97 |
+
"constraints": {}
|
| 98 |
+
},
|
| 99 |
+
"previous_bookings_not_canceled": {
|
| 100 |
+
"dtype": "int64",
|
| 101 |
+
"nullable": false,
|
| 102 |
+
"constraints": {}
|
| 103 |
+
},
|
| 104 |
+
"reserved_room_type": {
|
| 105 |
+
"dtype": "int64",
|
| 106 |
+
"nullable": false,
|
| 107 |
+
"constraints": {}
|
| 108 |
+
},
|
| 109 |
+
"assigned_room_type": {
|
| 110 |
+
"dtype": "int64",
|
| 111 |
+
"nullable": false,
|
| 112 |
+
"constraints": {}
|
| 113 |
+
},
|
| 114 |
+
"booking_changes": {
|
| 115 |
+
"dtype": "int64",
|
| 116 |
+
"nullable": false,
|
| 117 |
+
"constraints": {}
|
| 118 |
+
},
|
| 119 |
+
"deposit_type": {
|
| 120 |
+
"dtype": "int64",
|
| 121 |
+
"nullable": false,
|
| 122 |
+
"constraints": {}
|
| 123 |
+
},
|
| 124 |
+
"days_in_waiting_list": {
|
| 125 |
+
"dtype": "int64",
|
| 126 |
+
"nullable": false,
|
| 127 |
+
"constraints": {}
|
| 128 |
+
},
|
| 129 |
+
"customer_type": {
|
| 130 |
+
"dtype": "int64",
|
| 131 |
+
"nullable": false,
|
| 132 |
+
"constraints": {}
|
| 133 |
+
},
|
| 134 |
+
"adr": {
|
| 135 |
+
"dtype": "float64",
|
| 136 |
+
"nullable": false,
|
| 137 |
+
"constraints": {}
|
| 138 |
+
},
|
| 139 |
+
"required_car_parking_spaces": {
|
| 140 |
+
"dtype": "int64",
|
| 141 |
+
"nullable": false,
|
| 142 |
+
"constraints": {}
|
| 143 |
+
},
|
| 144 |
+
"total_of_special_requests": {
|
| 145 |
+
"dtype": "int64",
|
| 146 |
+
"nullable": false,
|
| 147 |
+
"constraints": {}
|
| 148 |
+
},
|
| 149 |
+
"total_stay_duration": {
|
| 150 |
+
"dtype": "int64",
|
| 151 |
+
"nullable": false,
|
| 152 |
+
"constraints": {
|
| 153 |
+
"min": 0
|
| 154 |
+
}
|
| 155 |
+
},
|
| 156 |
+
"total_guests": {
|
| 157 |
+
"dtype": "float64",
|
| 158 |
+
"nullable": false,
|
| 159 |
+
"constraints": {}
|
| 160 |
+
},
|
| 161 |
+
"is_family": {
|
| 162 |
+
"dtype": "int64",
|
| 163 |
+
"nullable": false,
|
| 164 |
+
"constraints": {
|
| 165 |
+
"values": [
|
| 166 |
+
0,
|
| 167 |
+
1
|
| 168 |
+
]
|
| 169 |
+
}
|
| 170 |
+
},
|
| 171 |
+
"guest_type": {
|
| 172 |
+
"dtype": "object",
|
| 173 |
+
"nullable": false,
|
| 174 |
+
"constraints": {}
|
| 175 |
+
},
|
| 176 |
+
"arrival_season": {
|
| 177 |
+
"dtype": "object",
|
| 178 |
+
"nullable": false,
|
| 179 |
+
"constraints": {}
|
| 180 |
+
},
|
| 181 |
+
"is_peak_season": {
|
| 182 |
+
"dtype": "int64",
|
| 183 |
+
"nullable": false,
|
| 184 |
+
"constraints": {
|
| 185 |
+
"values": [
|
| 186 |
+
0,
|
| 187 |
+
1
|
| 188 |
+
]
|
| 189 |
+
}
|
| 190 |
+
},
|
| 191 |
+
"arrival_quarter": {
|
| 192 |
+
"dtype": "object",
|
| 193 |
+
"nullable": false,
|
| 194 |
+
"constraints": {}
|
| 195 |
+
},
|
| 196 |
+
"is_summer_peak": {
|
| 197 |
+
"dtype": "int64",
|
| 198 |
+
"nullable": false,
|
| 199 |
+
"constraints": {
|
| 200 |
+
"values": [
|
| 201 |
+
0,
|
| 202 |
+
1
|
| 203 |
+
]
|
| 204 |
+
}
|
| 205 |
+
},
|
| 206 |
+
"is_holiday_season": {
|
| 207 |
+
"dtype": "int64",
|
| 208 |
+
"nullable": false,
|
| 209 |
+
"constraints": {
|
| 210 |
+
"values": [
|
| 211 |
+
0,
|
| 212 |
+
1
|
| 213 |
+
]
|
| 214 |
+
}
|
| 215 |
+
},
|
| 216 |
+
"hotel_target_encoded": {
|
| 217 |
+
"dtype": "float64",
|
| 218 |
+
"nullable": false,
|
| 219 |
+
"constraints": {}
|
| 220 |
+
},
|
| 221 |
+
"market_segment_target_encoded": {
|
| 222 |
+
"dtype": "float64",
|
| 223 |
+
"nullable": false,
|
| 224 |
+
"constraints": {}
|
| 225 |
+
},
|
| 226 |
+
"distribution_channel_target_encoded": {
|
| 227 |
+
"dtype": "float64",
|
| 228 |
+
"nullable": false,
|
| 229 |
+
"constraints": {}
|
| 230 |
+
},
|
| 231 |
+
"reserved_room_type_target_encoded": {
|
| 232 |
+
"dtype": "float64",
|
| 233 |
+
"nullable": false,
|
| 234 |
+
"constraints": {}
|
| 235 |
+
},
|
| 236 |
+
"customer_type_target_encoded": {
|
| 237 |
+
"dtype": "float64",
|
| 238 |
+
"nullable": false,
|
| 239 |
+
"constraints": {}
|
| 240 |
+
}
|
| 241 |
+
}
|
| 242 |
+
}
|
artifacts/mte_mappings.json
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"created_utc": "2025-10-04T14:18:28.557206",
|
| 3 |
+
"target": "is_canceled",
|
| 4 |
+
"n_mappings": 5,
|
| 5 |
+
"encodings": {
|
| 6 |
+
"hotel": {
|
| 7 |
+
"encoded_column": "hotel_target_encoded",
|
| 8 |
+
"global_mean": 0.37041628277075134,
|
| 9 |
+
"categories": {
|
| 10 |
+
"0": 0.4172733063837846,
|
| 11 |
+
"1": 0.2776361457385911
|
| 12 |
+
},
|
| 13 |
+
"unique_categories": 2,
|
| 14 |
+
"correlation_with_target": 0.1362919003417029
|
| 15 |
+
},
|
| 16 |
+
"market_segment": {
|
| 17 |
+
"encoded_column": "market_segment_target_encoded",
|
| 18 |
+
"global_mean": 0.37041628277075134,
|
| 19 |
+
"categories": {
|
| 20 |
+
"0": 0.2193848011246842,
|
| 21 |
+
"1": 0.1305293327303513,
|
| 22 |
+
"2": 0.18735385096984933,
|
| 23 |
+
"3": 0.1534242478686866,
|
| 24 |
+
"4": 0.6106102590915212,
|
| 25 |
+
"5": 0.3431649262546367,
|
| 26 |
+
"6": 0.36721636358391,
|
| 27 |
+
"7": 0.6852081413853757
|
| 28 |
+
},
|
| 29 |
+
"unique_categories": 8,
|
| 30 |
+
"correlation_with_target": 0.26658119698812255
|
| 31 |
+
},
|
| 32 |
+
"distribution_channel": {
|
| 33 |
+
"encoded_column": "distribution_channel_target_encoded",
|
| 34 |
+
"global_mean": 0.37041628277075134,
|
| 35 |
+
"categories": {
|
| 36 |
+
"0": 0.22069913242602368,
|
| 37 |
+
"1": 0.1746080048040754,
|
| 38 |
+
"2": 0.19210634967836185,
|
| 39 |
+
"3": 0.41026163306920715,
|
| 40 |
+
"4": 0.5271752717065992
|
| 41 |
+
},
|
| 42 |
+
"unique_categories": 5,
|
| 43 |
+
"correlation_with_target": 0.17684471269279609
|
| 44 |
+
},
|
| 45 |
+
"reserved_room_type": {
|
| 46 |
+
"encoded_column": "reserved_room_type_target_encoded",
|
| 47 |
+
"global_mean": 0.37041628277075134,
|
| 48 |
+
"categories": {
|
| 49 |
+
"0": 0.3910688675887826,
|
| 50 |
+
"1": 0.329393513006855,
|
| 51 |
+
"2": 0.33002021600480985,
|
| 52 |
+
"3": 0.31777698548798844,
|
| 53 |
+
"4": 0.2929420080641847,
|
| 54 |
+
"5": 0.30377929173796275,
|
| 55 |
+
"6": 0.3643976188473718,
|
| 56 |
+
"7": 0.40771779125645363,
|
| 57 |
+
"8": 0.35811921762315374,
|
| 58 |
+
"9": 0.8114157473940412
|
| 59 |
+
},
|
| 60 |
+
"unique_categories": 10,
|
| 61 |
+
"correlation_with_target": 0.07241322555323557
|
| 62 |
+
},
|
| 63 |
+
"customer_type": {
|
| 64 |
+
"encoded_column": "customer_type_target_encoded",
|
| 65 |
+
"global_mean": 0.37041628277075134,
|
| 66 |
+
"categories": {
|
| 67 |
+
"0": 0.3096018755002309,
|
| 68 |
+
"1": 0.10226072646950485,
|
| 69 |
+
"2": 0.40746118084509414,
|
| 70 |
+
"3": 0.2543000402594656
|
| 71 |
+
},
|
| 72 |
+
"unique_categories": 4,
|
| 73 |
+
"correlation_with_target": 0.1362960500064263
|
| 74 |
+
}
|
| 75 |
+
}
|
| 76 |
+
}
|
artifacts/pr_curve.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
artifacts/roc_curve.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
artifacts/shap_importance_bar.png
ADDED
|
artifacts/shap_summary.png
ADDED
|
Git LFS Details
|
artifacts/shap_values_sample.json
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"category": "true_positive",
|
| 4 |
+
"index": 2,
|
| 5 |
+
"y_true": 1,
|
| 6 |
+
"prediction": 1,
|
| 7 |
+
"probability": 0.7679175138473511,
|
| 8 |
+
"shap_values": {
|
| 9 |
+
"hotel": -0.013177326880395412,
|
| 10 |
+
"lead_time": 0.27093467116355896,
|
| 11 |
+
"arrival_date_year": 0.10717716068029404,
|
| 12 |
+
"arrival_date_month": 0.046636875718832016,
|
| 13 |
+
"arrival_date_week_number": 0.007057845126837492,
|
| 14 |
+
"arrival_date_day_of_month": 0.015515242703258991,
|
| 15 |
+
"stays_in_weekend_nights": -0.003432020079344511,
|
| 16 |
+
"stays_in_week_nights": -0.006210292223840952,
|
| 17 |
+
"adults": 0.015318267978727818,
|
| 18 |
+
"children": -0.007357093971222639,
|
| 19 |
+
"babies": -6.629295239690691e-05,
|
| 20 |
+
"meal": 0.06084809452295303,
|
| 21 |
+
"market_segment": 0.5826431512832642,
|
| 22 |
+
"distribution_channel": -0.005265130195766687,
|
| 23 |
+
"is_repeated_guest": 0.0023761300835758448,
|
| 24 |
+
"previous_cancellations": -0.048768918961286545,
|
| 25 |
+
"previous_bookings_not_canceled": 0.0238479096442461,
|
| 26 |
+
"reserved_room_type": -0.17490211129188538,
|
| 27 |
+
"assigned_room_type": 0.3624641001224518,
|
| 28 |
+
"booking_changes": 0.09503821283578873,
|
| 29 |
+
"deposit_type": -0.38260650634765625,
|
| 30 |
+
"days_in_waiting_list": 0.00499193649739027,
|
| 31 |
+
"customer_type": 0.005631973035633564,
|
| 32 |
+
"adr": 0.22705571353435516,
|
| 33 |
+
"required_car_parking_spaces": 0.10856841504573822,
|
| 34 |
+
"total_of_special_requests": 0.736638605594635,
|
| 35 |
+
"total_stay_duration": 0.03156501054763794,
|
| 36 |
+
"total_guests": -0.0028324569575488567,
|
| 37 |
+
"is_family": -0.004709186032414436,
|
| 38 |
+
"is_peak_season": 0.03317419812083244,
|
| 39 |
+
"is_summer_peak": 0.0013722111470997334,
|
| 40 |
+
"is_holiday_season": -0.017516281455755234,
|
| 41 |
+
"hotel_target_encoded": -0.008734573610126972,
|
| 42 |
+
"market_segment_target_encoded": 0.15508981049060822,
|
| 43 |
+
"distribution_channel_target_encoded": -0.03044990263879299,
|
| 44 |
+
"reserved_room_type_target_encoded": 0.0011005409760400653,
|
| 45 |
+
"customer_type_target_encoded": 0.1587882786989212,
|
| 46 |
+
"country__te": -0.5817746520042419,
|
| 47 |
+
"guest_type__te": 0.002932904753834009,
|
| 48 |
+
"arrival_season__te": -0.02166694961488247,
|
| 49 |
+
"arrival_quarter__te": -0.001362183946184814
|
| 50 |
+
}
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"category": "false_positive",
|
| 54 |
+
"index": 7,
|
| 55 |
+
"y_true": 0,
|
| 56 |
+
"prediction": 1,
|
| 57 |
+
"probability": 0.7768429517745972,
|
| 58 |
+
"shap_values": {
|
| 59 |
+
"hotel": -0.04107680171728134,
|
| 60 |
+
"lead_time": 0.20630843937397003,
|
| 61 |
+
"arrival_date_year": 0.12643642723560333,
|
| 62 |
+
"arrival_date_month": -0.03379317745566368,
|
| 63 |
+
"arrival_date_week_number": -0.01691223680973053,
|
| 64 |
+
"arrival_date_day_of_month": -0.0038236246909946203,
|
| 65 |
+
"stays_in_weekend_nights": -0.020141033455729485,
|
| 66 |
+
"stays_in_week_nights": 0.005030508618801832,
|
| 67 |
+
"adults": 0.012447455897927284,
|
| 68 |
+
"children": 0.08788547664880753,
|
| 69 |
+
"babies": -6.629295239690691e-05,
|
| 70 |
+
"meal": 0.00710188876837492,
|
| 71 |
+
"market_segment": 0.5442723035812378,
|
| 72 |
+
"distribution_channel": -0.006446031853556633,
|
| 73 |
+
"is_repeated_guest": 0.0023962713312357664,
|
| 74 |
+
"previous_cancellations": -0.054328884929418564,
|
| 75 |
+
"previous_bookings_not_canceled": 0.027283739298582077,
|
| 76 |
+
"reserved_room_type": -0.28937697410583496,
|
| 77 |
+
"assigned_room_type": 0.37023118138313293,
|
| 78 |
+
"booking_changes": 0.07717075943946838,
|
| 79 |
+
"deposit_type": -0.37375608086586,
|
| 80 |
+
"days_in_waiting_list": 0.003258473239839077,
|
| 81 |
+
"customer_type": 0.0052125826478004456,
|
| 82 |
+
"adr": 0.2486179769039154,
|
| 83 |
+
"required_car_parking_spaces": 0.11153995245695114,
|
| 84 |
+
"total_of_special_requests": 0.7757676243782043,
|
| 85 |
+
"total_stay_duration": 0.038464903831481934,
|
| 86 |
+
"total_guests": 0.002308598253875971,
|
| 87 |
+
"is_family": 0.01916220597922802,
|
| 88 |
+
"is_peak_season": -0.023274041712284088,
|
| 89 |
+
"is_summer_peak": -0.0020165895111858845,
|
| 90 |
+
"is_holiday_season": 4.689610796049237e-05,
|
| 91 |
+
"hotel_target_encoded": -0.007977521046996117,
|
| 92 |
+
"market_segment_target_encoded": 0.1509910523891449,
|
| 93 |
+
"distribution_channel_target_encoded": -0.03097727708518505,
|
| 94 |
+
"reserved_room_type_target_encoded": -0.036723002791404724,
|
| 95 |
+
"customer_type_target_encoded": 0.1517724245786667,
|
| 96 |
+
"country__te": -0.22035948932170868,
|
| 97 |
+
"guest_type__te": -0.00904961209744215,
|
| 98 |
+
"arrival_season__te": -0.007437328342348337,
|
| 99 |
+
"arrival_quarter__te": 0.0005368555430322886
|
| 100 |
+
}
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"category": "false_negative",
|
| 104 |
+
"index": 10,
|
| 105 |
+
"y_true": 1,
|
| 106 |
+
"prediction": 0,
|
| 107 |
+
"probability": 0.36954548954963684,
|
| 108 |
+
"shap_values": {
|
| 109 |
+
"hotel": -0.05511629208922386,
|
| 110 |
+
"lead_time": 0.18352645635604858,
|
| 111 |
+
"arrival_date_year": 0.1316099613904953,
|
| 112 |
+
"arrival_date_month": 0.02599601075053215,
|
| 113 |
+
"arrival_date_week_number": -0.07640720903873444,
|
| 114 |
+
"arrival_date_day_of_month": -0.005574983078986406,
|
| 115 |
+
"stays_in_weekend_nights": 0.03126369044184685,
|
| 116 |
+
"stays_in_week_nights": 0.019554395228624344,
|
| 117 |
+
"adults": 0.014410095289349556,
|
| 118 |
+
"children": -0.0004327027127146721,
|
| 119 |
+
"babies": -2.9597815228044055e-05,
|
| 120 |
+
"meal": 0.005594600923359394,
|
| 121 |
+
"market_segment": 0.35903194546699524,
|
| 122 |
+
"distribution_channel": -0.00748800253495574,
|
| 123 |
+
"is_repeated_guest": 0.001656562671996653,
|
| 124 |
+
"previous_cancellations": -0.0757187083363533,
|
| 125 |
+
"previous_bookings_not_canceled": 0.027033040300011635,
|
| 126 |
+
"reserved_room_type": -0.20557740330696106,
|
| 127 |
+
"assigned_room_type": 0.19640541076660156,
|
| 128 |
+
"booking_changes": 0.07736871391534805,
|
| 129 |
+
"deposit_type": -0.39705631136894226,
|
| 130 |
+
"days_in_waiting_list": 0.0031135703902691603,
|
| 131 |
+
"customer_type": 0.007273601833730936,
|
| 132 |
+
"adr": 0.2758876085281372,
|
| 133 |
+
"required_car_parking_spaces": 0.12637829780578613,
|
| 134 |
+
"total_of_special_requests": -1.018623948097229,
|
| 135 |
+
"total_stay_duration": 0.12197940051555634,
|
| 136 |
+
"total_guests": -0.0008809716673567891,
|
| 137 |
+
"is_family": -0.0023323686327785254,
|
| 138 |
+
"is_peak_season": -0.015776723623275757,
|
| 139 |
+
"is_summer_peak": 0.0005290449480526149,
|
| 140 |
+
"is_holiday_season": 4.4685075408779085e-05,
|
| 141 |
+
"hotel_target_encoded": -0.011267091147601604,
|
| 142 |
+
"market_segment_target_encoded": 0.12202320247888565,
|
| 143 |
+
"distribution_channel_target_encoded": -0.027583105489611626,
|
| 144 |
+
"reserved_room_type_target_encoded": -0.060819387435913086,
|
| 145 |
+
"customer_type_target_encoded": 0.11991596966981888,
|
| 146 |
+
"country__te": 0.09951554983854294,
|
| 147 |
+
"guest_type__te": 0.0032427667174488306,
|
| 148 |
+
"arrival_season__te": -0.004380077589303255,
|
| 149 |
+
"arrival_quarter__te": 0.026889480650424957
|
| 150 |
+
}
|
| 151 |
+
}
|
| 152 |
+
]
|
artifacts/threshold_sweep.csv
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
threshold,precision,recall,f1_score
|
| 2 |
+
0.0,0.3704246586816316,1.0,0.5405983558964642
|
| 3 |
+
0.01,0.43009340338587276,0.9995477671000566,0.6014081153702255
|
| 4 |
+
0.02,0.46293383663625876,0.998304126625212,0.6325441455639529
|
| 5 |
+
0.03,0.48265864332603936,0.9975127190503109,0.6505437788018433
|
| 6 |
+
0.04,0.4968429360694554,0.9963821368004522,0.6630553361170674
|
| 7 |
+
0.05,0.5095191250506337,0.9954776710005653,0.6740411850264105
|
| 8 |
+
0.06,0.5218087947302831,0.9941209723007349,0.6843866749688667
|
| 9 |
+
0.07,0.532233610285645,0.9921989824759752,0.6928238730559723
|
| 10 |
+
0.08,0.5413241021202942,0.9900508762012437,0.6999440492366718
|
| 11 |
+
0.09,0.5502205419029615,0.987224420576597,0.7066154157394295
|
| 12 |
+
0.1,0.5585106382978723,0.9853024307518372,0.7129125935621089
|
| 13 |
+
0.11,0.5663670704702358,0.9831543244771057,0.7187073846026696
|
| 14 |
+
0.12,0.574332825640686,0.9805539853024308,0.7243798546730142
|
| 15 |
+
0.13,0.5835020926150938,0.9772752967778406,0.730715583921552
|
| 16 |
+
0.14,0.5936724565756824,0.9737704918032787,0.7376354215732454
|
| 17 |
+
0.15,0.6033598088142265,0.9704918032786886,0.7441054091539528
|
| 18 |
+
0.16,0.6137064944384643,0.9668739400791407,0.7508340649692713
|
| 19 |
+
0.17,0.6232636350343618,0.9638213680045223,0.7570039515162279
|
| 20 |
+
0.18,0.6308150353554149,0.958168456755229,0.7607719928186715
|
| 21 |
+
0.19,0.6401883353584447,0.9530808366308648,0.765911052559851
|
| 22 |
+
0.2,0.6512295399891397,0.9491237987563595,0.7724512329775488
|
| 23 |
+
0.21,0.6593293885601578,0.9448275862068966,0.7766728624535316
|
| 24 |
+
0.22,0.6668272705372199,0.9388355002826455,0.7797915297211006
|
| 25 |
+
0.23,0.6737409191086442,0.9331825890333522,0.7825180128934395
|
| 26 |
+
0.24,0.6818445716181472,0.9294516676088186,0.7866232896373553
|
| 27 |
+
0.25,0.6908783783783784,0.924816280384398,0.7909112883732173
|
| 28 |
+
0.26,0.6998452810727179,0.920520067834935,0.7951560134772205
|
| 29 |
+
0.27,0.7078189300411523,0.9139626907857547,0.7977894009671371
|
| 30 |
+
0.28,0.716501738432736,0.9086489542114189,0.8012162296879674
|
| 31 |
+
0.29,0.723057417134577,0.9026568682871678,0.80293659174335
|
| 32 |
+
0.3,0.7301850317591826,0.8967778405879028,0.8049523036330424
|
| 33 |
+
0.31,0.7381466528964417,0.8888637648388921,0.8065244152646697
|
| 34 |
+
0.32,0.7462929302592557,0.8819672131147541,0.8084775624417038
|
| 35 |
+
0.33,0.7527445836976586,0.8759751271905031,0.8096979830703313
|
| 36 |
+
0.34,0.7594222969631022,0.8679479932165065,0.8100664767331434
|
| 37 |
+
0.35000000000000003,0.7663852030558906,0.8619559072922555,0.811365934124408
|
| 38 |
+
0.36,0.7718635063342869,0.8541548897682306,0.8109268502119895
|
| 39 |
+
0.37,0.777870043595599,0.8472583380440927,0.8110828508036149
|
| 40 |
+
0.38,0.7824395373291272,0.8412662521198417,0.810787251430128
|
| 41 |
+
0.39,0.7876427275637605,0.8344827586206897,0.8103864734299517
|
| 42 |
+
0.4,0.7927937675827743,0.8283776144714528,0.8101951678000774
|
| 43 |
+
0.41000000000000003,0.797544667324345,0.8226116449971735,0.8098842386464826
|
| 44 |
+
0.42,0.8013728963684676,0.8183154324477105,0.809755551826369
|
| 45 |
+
0.43,0.8074848280512474,0.8123233465234596,0.8098968607338105
|
| 46 |
+
0.44,0.8116897884921538,0.8070096099491239,0.8093429332728612
|
| 47 |
+
0.45,0.8169972324723247,0.8010175240248728,0.8089284694867843
|
| 48 |
+
0.46,0.8224561403508772,0.7950254381006219,0.808508192009198
|
| 49 |
+
0.47000000000000003,0.8271766243021736,0.7873374788015828,0.8067655236329935
|
| 50 |
+
0.48,0.8318690576483332,0.7814584511023177,0.8058761804826863
|
| 51 |
+
0.49,0.8362794099719615,0.7755794234030525,0.8047864852182074
|
| 52 |
+
0.5,0.841708852944808,0.7707179197286602,0.8046506137865911
|
| 53 |
+
0.51,0.8476214384335383,0.7634821933295647,0.8033547466095646
|
| 54 |
+
0.52,0.8530878404296126,0.7543244771057095,0.8006720268810752
|
| 55 |
+
0.53,0.8584599402674977,0.7474279253815715,0.7991055239937145
|
| 56 |
+
0.54,0.8627683392598446,0.7406444318824195,0.7970556028713955
|
| 57 |
+
0.55,0.8673988748995446,0.7321650650084793,0.7940653546686285
|
| 58 |
+
0.56,0.8710996048507971,0.7227812323346523,0.7900395452298566
|
| 59 |
+
0.5700000000000001,0.8760536133757082,0.7167891464104014,0.7884591468722796
|
| 60 |
+
0.58,0.8804103428892637,0.7083097795364612,0.7850385314203371
|
| 61 |
+
0.59,0.8839272935451553,0.698247597512719,0.7801920161697827
|
| 62 |
+
0.6,0.8901995048784039,0.6911249293386094,0.7781313645621182
|
| 63 |
+
0.61,0.8951971538689594,0.6827586206896552,0.774677698672311
|
| 64 |
+
0.62,0.9002879224124868,0.6716789146410401,0.7693602693602694
|
| 65 |
+
0.63,0.9062887236679058,0.6615036743923121,0.764786615253905
|
| 66 |
+
0.64,0.9127879269261319,0.6496325607687959,0.7590488771466314
|
| 67 |
+
0.65,0.9175877763328999,0.6382136800452233,0.7528172301126892
|
| 68 |
+
0.66,0.923923923923924,0.6261164499717354,0.7464114832535885
|
| 69 |
+
0.67,0.93071000855432,0.6150367439231204,0.7406398910823689
|
| 70 |
+
0.68,0.9360955056179775,0.6028264556246467,0.7333745959700159
|
| 71 |
+
0.6900000000000001,0.9403686302855078,0.5883550028264556,0.7238333681062661
|
| 72 |
+
0.7000000000000001,0.9439407955596669,0.5768230638778972,0.7160701754385965
|
| 73 |
+
0.71,0.9473384030418252,0.5633691351045789,0.7065579581708614
|
| 74 |
+
0.72,0.9505066250974279,0.5514980214810628,0.6980038634900193
|
| 75 |
+
0.73,0.9555288461538461,0.539287733182589,0.6894558068945581
|
| 76 |
+
0.74,0.9596375617792422,0.5268513284341436,0.6802423180789723
|
| 77 |
+
0.75,0.965042372881356,0.5149802148106275,0.671581275340951
|
| 78 |
+
0.76,0.9696639022261021,0.5023176936122102,0.6618008490355254
|
| 79 |
+
0.77,0.9747007002484752,0.4878462408140192,0.6502411091018686
|
| 80 |
+
0.78,0.9780783582089553,0.4741661955907292,0.6386964136145588
|
| 81 |
+
0.79,0.9814547206165704,0.46071226681741095,0.6270677848734323
|
| 82 |
+
0.8,0.9849233811171527,0.4505370265686829,0.6182608021099992
|
| 83 |
+
0.81,0.9865448083269865,0.43934426229508194,0.6079474342928661
|
| 84 |
+
0.8200000000000001,0.9881013967925505,0.4318824194460147,0.6010542050192746
|
| 85 |
+
0.8300000000000001,0.9899391051098756,0.42272470322215944,0.5924576136903819
|
| 86 |
+
0.84,0.991887506760411,0.4146975692481628,0.5848680538946026
|
| 87 |
+
0.85,0.9934084042845372,0.40893159977388355,0.5793688931603396
|
| 88 |
+
0.86,0.9958088851634534,0.40293951384963256,0.5737282678686413
|
| 89 |
+
0.87,0.9968820861678005,0.3976257772752968,0.5684959185322881
|
| 90 |
+
0.88,0.9971305595408895,0.39287733182589035,0.5636658556366586
|
| 91 |
+
0.89,0.9982507288629737,0.3871113623516111,0.5578818737270875
|
| 92 |
+
0.9,0.9988197108291531,0.38270209157716223,0.5533758378290011
|
| 93 |
+
0.91,0.9991074085093722,0.37964951950254383,0.550221202687203
|
| 94 |
+
0.92,0.999400299850075,0.3768230638778971,0.5472906403940887
|
| 95 |
+
0.93,0.9996982498491249,0.37456189937817974,0.5449461304383584
|
| 96 |
+
0.9400000000000001,0.9996969696969698,0.3729790842283776,0.5432688349114863
|
| 97 |
+
0.9500000000000001,1.0,0.3723007348784624,0.5425935079914319
|
| 98 |
+
0.96,1.0,0.3716223855285472,0.541872733267392
|
| 99 |
+
0.97,1.0,0.36936122102882984,0.5394649933949802
|
| 100 |
+
0.98,1.0,0.36551724137931035,0.5353535353535354
|
| 101 |
+
0.99,1.0,0.34980214810627475,0.5183013652734735
|
| 102 |
+
1.0,0.0,0.0,0.0
|
artifacts/value_domains.json
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"categorical": {
|
| 3 |
+
"deposit_type": [
|
| 4 |
+
"No Deposit",
|
| 5 |
+
"Non Refund",
|
| 6 |
+
"Refundable"
|
| 7 |
+
],
|
| 8 |
+
"country": [
|
| 9 |
+
"ABW",
|
| 10 |
+
"AGO",
|
| 11 |
+
"AIA",
|
| 12 |
+
"ALB",
|
| 13 |
+
"AND",
|
| 14 |
+
"ARE",
|
| 15 |
+
"ARG",
|
| 16 |
+
"ARM",
|
| 17 |
+
"ASM",
|
| 18 |
+
"ATA",
|
| 19 |
+
"ATF",
|
| 20 |
+
"AUS",
|
| 21 |
+
"AUT",
|
| 22 |
+
"AZE",
|
| 23 |
+
"BDI",
|
| 24 |
+
"BEL",
|
| 25 |
+
"BEN",
|
| 26 |
+
"BFA",
|
| 27 |
+
"BGD",
|
| 28 |
+
"BGR",
|
| 29 |
+
"BHR",
|
| 30 |
+
"BHS",
|
| 31 |
+
"BIH",
|
| 32 |
+
"BLR",
|
| 33 |
+
"BOL",
|
| 34 |
+
"BRA",
|
| 35 |
+
"BRB",
|
| 36 |
+
"BWA",
|
| 37 |
+
"CAF",
|
| 38 |
+
"CHE",
|
| 39 |
+
"CHL",
|
| 40 |
+
"CHN",
|
| 41 |
+
"CIV",
|
| 42 |
+
"CMR",
|
| 43 |
+
"CN",
|
| 44 |
+
"COL",
|
| 45 |
+
"COM",
|
| 46 |
+
"CPV",
|
| 47 |
+
"CRI",
|
| 48 |
+
"CUB",
|
| 49 |
+
"CYM",
|
| 50 |
+
"CYP",
|
| 51 |
+
"CZE",
|
| 52 |
+
"DEU",
|
| 53 |
+
"DJI",
|
| 54 |
+
"DMA",
|
| 55 |
+
"DNK",
|
| 56 |
+
"DOM",
|
| 57 |
+
"DZA",
|
| 58 |
+
"ECU",
|
| 59 |
+
"EGY",
|
| 60 |
+
"ESP",
|
| 61 |
+
"EST",
|
| 62 |
+
"ETH",
|
| 63 |
+
"FIN",
|
| 64 |
+
"FJI",
|
| 65 |
+
"FRA",
|
| 66 |
+
"FRO",
|
| 67 |
+
"GAB",
|
| 68 |
+
"GBR",
|
| 69 |
+
"GEO",
|
| 70 |
+
"GGY",
|
| 71 |
+
"GHA",
|
| 72 |
+
"GIB",
|
| 73 |
+
"GLP",
|
| 74 |
+
"GNB",
|
| 75 |
+
"GRC",
|
| 76 |
+
"GTM",
|
| 77 |
+
"GUY",
|
| 78 |
+
"HKG",
|
| 79 |
+
"HND",
|
| 80 |
+
"HRV",
|
| 81 |
+
"HUN",
|
| 82 |
+
"IDN",
|
| 83 |
+
"IMN",
|
| 84 |
+
"IND",
|
| 85 |
+
"IRL",
|
| 86 |
+
"IRN",
|
| 87 |
+
"IRQ",
|
| 88 |
+
"ISL",
|
| 89 |
+
"ISR",
|
| 90 |
+
"ITA",
|
| 91 |
+
"JAM",
|
| 92 |
+
"JEY",
|
| 93 |
+
"JOR",
|
| 94 |
+
"JPN",
|
| 95 |
+
"KAZ",
|
| 96 |
+
"KEN",
|
| 97 |
+
"KHM",
|
| 98 |
+
"KIR",
|
| 99 |
+
"KNA",
|
| 100 |
+
"KOR",
|
| 101 |
+
"KWT",
|
| 102 |
+
"LAO",
|
| 103 |
+
"LBN",
|
| 104 |
+
"LBY",
|
| 105 |
+
"LCA",
|
| 106 |
+
"LIE",
|
| 107 |
+
"LKA",
|
| 108 |
+
"LTU",
|
| 109 |
+
"LUX",
|
| 110 |
+
"LVA",
|
| 111 |
+
"MAC",
|
| 112 |
+
"MAR",
|
| 113 |
+
"MCO",
|
| 114 |
+
"MDG",
|
| 115 |
+
"MDV",
|
| 116 |
+
"MEX",
|
| 117 |
+
"MKD",
|
| 118 |
+
"MLI",
|
| 119 |
+
"MLT",
|
| 120 |
+
"MMR",
|
| 121 |
+
"MNE",
|
| 122 |
+
"MOZ",
|
| 123 |
+
"MRT",
|
| 124 |
+
"MUS",
|
| 125 |
+
"MWI",
|
| 126 |
+
"MYS",
|
| 127 |
+
"MYT",
|
| 128 |
+
"NAM",
|
| 129 |
+
"NCL",
|
| 130 |
+
"NGA",
|
| 131 |
+
"NIC",
|
| 132 |
+
"NLD",
|
| 133 |
+
"NOR",
|
| 134 |
+
"NPL",
|
| 135 |
+
"NZL",
|
| 136 |
+
"OMN",
|
| 137 |
+
"PAK",
|
| 138 |
+
"PAN",
|
| 139 |
+
"PER",
|
| 140 |
+
"PHL",
|
| 141 |
+
"PLW",
|
| 142 |
+
"POL",
|
| 143 |
+
"PRI",
|
| 144 |
+
"PRT",
|
| 145 |
+
"PRY",
|
| 146 |
+
"PYF",
|
| 147 |
+
"QAT",
|
| 148 |
+
"ROU",
|
| 149 |
+
"RUS",
|
| 150 |
+
"RWA",
|
| 151 |
+
"SAU",
|
| 152 |
+
"SDN",
|
| 153 |
+
"SEN",
|
| 154 |
+
"SGP",
|
| 155 |
+
"SLE",
|
| 156 |
+
"SLV",
|
| 157 |
+
"SMR",
|
| 158 |
+
"SRB",
|
| 159 |
+
"STP",
|
| 160 |
+
"SUR",
|
| 161 |
+
"SVK",
|
| 162 |
+
"SVN",
|
| 163 |
+
"SWE",
|
| 164 |
+
"SYC",
|
| 165 |
+
"SYR",
|
| 166 |
+
"TGO",
|
| 167 |
+
"THA",
|
| 168 |
+
"TJK",
|
| 169 |
+
"TMP",
|
| 170 |
+
"TUN",
|
| 171 |
+
"TUR",
|
| 172 |
+
"TWN",
|
| 173 |
+
"TZA",
|
| 174 |
+
"UGA",
|
| 175 |
+
"UKR",
|
| 176 |
+
"UMI",
|
| 177 |
+
"URY",
|
| 178 |
+
"USA",
|
| 179 |
+
"UZB",
|
| 180 |
+
"VEN",
|
| 181 |
+
"VGB",
|
| 182 |
+
"VNM",
|
| 183 |
+
"ZAF",
|
| 184 |
+
"ZMB",
|
| 185 |
+
"ZWE"
|
| 186 |
+
],
|
| 187 |
+
"market_segment": [
|
| 188 |
+
"Aviation",
|
| 189 |
+
"Complementary",
|
| 190 |
+
"Corporate",
|
| 191 |
+
"Direct",
|
| 192 |
+
"Groups",
|
| 193 |
+
"Offline TA/TO",
|
| 194 |
+
"Online TA",
|
| 195 |
+
"Undefined"
|
| 196 |
+
],
|
| 197 |
+
"reserved_room_type": [
|
| 198 |
+
"A",
|
| 199 |
+
"B",
|
| 200 |
+
"C",
|
| 201 |
+
"D",
|
| 202 |
+
"E",
|
| 203 |
+
"F",
|
| 204 |
+
"G",
|
| 205 |
+
"H",
|
| 206 |
+
"L",
|
| 207 |
+
"P"
|
| 208 |
+
],
|
| 209 |
+
"assigned_room_type": [
|
| 210 |
+
"A",
|
| 211 |
+
"B",
|
| 212 |
+
"C",
|
| 213 |
+
"D",
|
| 214 |
+
"E",
|
| 215 |
+
"F",
|
| 216 |
+
"G",
|
| 217 |
+
"H",
|
| 218 |
+
"I",
|
| 219 |
+
"K",
|
| 220 |
+
"L",
|
| 221 |
+
"P"
|
| 222 |
+
],
|
| 223 |
+
"customer_type": [
|
| 224 |
+
"Contract",
|
| 225 |
+
"Group",
|
| 226 |
+
"Transient",
|
| 227 |
+
"Transient-Party"
|
| 228 |
+
],
|
| 229 |
+
"distribution_channel": [
|
| 230 |
+
"Corporate",
|
| 231 |
+
"Direct",
|
| 232 |
+
"GDS",
|
| 233 |
+
"TA/TO",
|
| 234 |
+
"Undefined"
|
| 235 |
+
],
|
| 236 |
+
"meal": [
|
| 237 |
+
"BB",
|
| 238 |
+
"FB",
|
| 239 |
+
"HB",
|
| 240 |
+
"SC",
|
| 241 |
+
"Undefined"
|
| 242 |
+
],
|
| 243 |
+
"room_type_code_set": [
|
| 244 |
+
"A",
|
| 245 |
+
"B",
|
| 246 |
+
"C",
|
| 247 |
+
"D",
|
| 248 |
+
"E",
|
| 249 |
+
"F",
|
| 250 |
+
"G",
|
| 251 |
+
"H",
|
| 252 |
+
"I",
|
| 253 |
+
"K",
|
| 254 |
+
"L",
|
| 255 |
+
"P"
|
| 256 |
+
]
|
| 257 |
+
},
|
| 258 |
+
"numeric": {
|
| 259 |
+
"lead_time": {
|
| 260 |
+
"min": 0.0,
|
| 261 |
+
"max": 737.0
|
| 262 |
+
},
|
| 263 |
+
"required_car_parking_spaces": {
|
| 264 |
+
"min": 0.0,
|
| 265 |
+
"max": 8.0
|
| 266 |
+
},
|
| 267 |
+
"previous_cancellations": {
|
| 268 |
+
"min": 0.0,
|
| 269 |
+
"max": 26.0
|
| 270 |
+
},
|
| 271 |
+
"adr": {
|
| 272 |
+
"min": -6.38,
|
| 273 |
+
"max": 5400.0
|
| 274 |
+
},
|
| 275 |
+
"booking_changes": {
|
| 276 |
+
"min": 0.0,
|
| 277 |
+
"max": 21.0
|
| 278 |
+
},
|
| 279 |
+
"total_of_special_requests": {
|
| 280 |
+
"min": 0.0,
|
| 281 |
+
"max": 5.0
|
| 282 |
+
}
|
| 283 |
+
}
|
| 284 |
+
}
|
main.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Minimal FastAPI bootstrap that wires modular routes and startup load."""
|
| 2 |
+
from fastapi import FastAPI
|
| 3 |
+
import os
|
| 4 |
+
from app.routes import router, startup_load
|
| 5 |
+
from app import config
|
| 6 |
+
|
| 7 |
+
# ASGI application object; the advertised version comes from app.config.
app = FastAPI(
    title="Hotel Cancellation Prediction API",
    version=config.APP_VERSION,
)


@app.on_event("startup")
async def _load():
    # Delegate to the loader exported by app.routes when the server starts.
    startup_load()


@app.get("/", response_model=dict)
async def root():
    # Index document: service name, version and the main endpoint paths.
    endpoints = {"health": "/health", "predict": "/predict", "docs": "/docs"}
    return {
        "message": "Hotel Cancellation Prediction API",
        "version": config.APP_VERSION,
        "endpoints": endpoints,
    }


# Mount the modular routes after the root route has been registered.
app.include_router(router)

if __name__ == "__main__":
    import uvicorn

    # Listen on all interfaces; the port comes from $PORT (default 8000).
    port = int(os.getenv("PORT", "8000"))
    uvicorn.run(app, host="0.0.0.0", port=port)
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.104.0
|
| 2 |
+
uvicorn[standard]>=0.24.0
|
| 3 |
+
pydantic>=2.0.0
|
| 4 |
+
pandas>=2.0.0
|
| 5 |
+
scikit-learn==1.7.2
|
| 6 |
+
xgboost>=2.0.0
|
| 7 |
+
joblib>=1.3.0
|
| 8 |
+
numpy>=1.24.0
|
| 9 |
+
python-dotenv>=1.0.0
|
| 10 |
+
huggingface_hub>=0.23.0
|
src/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# This file makes src a Python package
|
src/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (169 Bytes). View file
|
|
|
src/__pycache__/preprocessing.cpython-312.pyc
ADDED
|
Binary file (15.2 kB). View file
|
|
|
src/preprocessing.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Centralized preprocessing pipeline for hotel cancellation prediction.
|
| 2 |
+
|
| 3 |
+
Provides a reusable class that encapsulates:
|
| 4 |
+
- Categorical handling strategy ('drop', 'onehot' or 'target')
|
| 5 |
+
- Numeric scaling (StandardScaler)
|
| 6 |
+
- Feature ordering preservation
|
| 7 |
+
- Artifact persistence / loading
|
| 8 |
+
|
| 9 |
+
Future extension points:
|
| 10 |
+
- hybrid categorical strategies (onehot and target are already implemented)
|
| 11 |
+
- numeric imputation strategies
|
| 12 |
+
- feature selection masks
|
| 13 |
+
|
| 14 |
+
Usage:
|
| 15 |
+
pipeline = PreprocessingPipeline(categorical_strategy='drop', scale=True)
|
| 16 |
+
X_train_proc = pipeline.fit_transform(X_train)
|
| 17 |
+
X_test_proc = pipeline.transform(X_test)
|
| 18 |
+
pipeline.save('models/preprocessor.pkl')
|
| 19 |
+
|
| 20 |
+
# Later / inference
|
| 21 |
+
pipeline = PreprocessingPipeline.load('models/preprocessor.pkl')
|
| 22 |
+
X_new = pipeline.transform(X_incoming)
|
| 23 |
+
"""
|
| 24 |
+
from __future__ import annotations
|
| 25 |
+
from dataclasses import dataclass, asdict
|
| 26 |
+
from typing import List, Optional, Dict, Any, Tuple
|
| 27 |
+
import pandas as pd
|
| 28 |
+
import joblib
|
| 29 |
+
from sklearn.preprocessing import StandardScaler
|
| 30 |
+
import os
|
| 31 |
+
import numpy as np
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@dataclass
class PreprocessingState:
    """Serializable snapshot of what ``PreprocessingPipeline.fit`` learned.

    The instance is converted with ``dataclasses.asdict`` when the pipeline
    is saved and rebuilt from that dict when it is loaded, so field names
    are part of the on-disk format.
    """

    # Strategy name used at fit time ('drop', 'onehot' or 'target').
    categorical_strategy: str
    # Columns the scaler was fitted on (empty when scaling is disabled).
    scaled_numeric: List[str]
    # Non-numeric columns removed by the 'drop' strategy.
    dropped_columns: List[str]
    # Output column order produced by fit; transform re-aligns to it.
    feature_order: List[str]
    scale: bool
    # One-hot specific: source column -> sorted category labels (as str).
    onehot_categories: Optional[Dict[str, List[str]]] = None
    # Target encoding specific: source column -> {category: smoothed mean}.
    target_mappings: Optional[Dict[str, Dict[str, float]]] = None
    target_global_mean: Optional[float] = None
    target_encoded_columns: Optional[List[str]] = None


class PreprocessingPipeline:
    """Fit/transform wrapper around categorical handling and numeric scaling.

    Supported ``categorical_strategy`` values:

    - ``'drop'``:   remove every non-numeric column.
    - ``'onehot'``: expand categorical columns into ``<col>__<value>`` 0/1 columns.
    - ``'target'``: replace categorical columns by a smoothed target mean
      stored in ``<col>__te``.

    When ``scale`` is true, all numeric columns present after the categorical
    step are standardized with ``StandardScaler``.

    Note: ``target_min_samples`` is stored and persisted, but the current
    encoding does not use it; only ``target_smoothing`` affects the result.
    """

    def __init__(self, categorical_strategy: str = 'drop', scale: bool = True, target_min_samples: int = 5, target_smoothing: float = 10.0):
        self.categorical_strategy = categorical_strategy
        self.scale = scale
        self._scaler: Optional[StandardScaler] = None
        self.state: Optional[PreprocessingState] = None
        # target encoding hyperparams
        self.target_min_samples = target_min_samples
        self.target_smoothing = target_smoothing

    @staticmethod
    def _categorical_columns(X: pd.DataFrame) -> List[str]:
        """Return the columns treated as categorical at fit time.

        Covers object, pandas string and categorical dtypes. The dtype
        ``isinstance`` check replaces the deprecated
        ``pd.api.types.is_categorical_dtype``.
        """
        return [
            c for c in X.columns
            if pd.api.types.is_object_dtype(X[c])
            or isinstance(X[c].dtype, (pd.CategoricalDtype, pd.StringDtype))
        ]

    def _apply_onehot_fit(self, X: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
        """One-hot encode ``X`` and return ``(encoded_frame, categories)``.

        Non-categorical columns are kept first, in their original order,
        followed by one ``<col>__<value>`` indicator per observed non-null
        category (sorted by string value).
        """
        cat_cols = self._categorical_columns(X)
        categories: Dict[str, List[str]] = {}
        parts = [X[[c]] for c in X.columns if c not in cat_cols]
        for c in cat_cols:
            cats = sorted(str(v) for v in X[c].dropna().unique())
            categories[c] = cats
            as_text = X[c].astype(str)  # hoisted: identical for every category
            for val in cats:
                parts.append((as_text == val).astype(int).to_frame(f"{c}__{val}"))
        if not parts:
            # pd.concat rejects an empty list; keep the index, no columns.
            return pd.DataFrame(index=X.index), categories
        return pd.concat(parts, axis=1), categories

    def _apply_onehot_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the fitted one-hot schema to ``X``.

        Categories not seen at fit time produce all-zero indicators. Any
        column of the fitted feature order that cannot be rebuilt from ``X``
        (including a missing numeric column) is filled with 0.

        Raises:
            RuntimeError: if the pipeline holds no one-hot schema.
        """
        # Explicit check instead of ``assert``: an empty schema ({}) is valid
        # when the training data had no categorical columns.
        if self.state is None or self.state.onehot_categories is None:
            raise RuntimeError("Pipeline was not fitted with the 'onehot' strategy.")
        cat_schema = self.state.onehot_categories
        expected = self.state.feature_order

        out_parts = []
        generated = set()
        # Reconstruct expected indicator columns deterministically
        for base_col, cats in cat_schema.items():
            if base_col in X.columns:
                series = X[base_col].astype(str)
            else:
                series = pd.Series([None] * len(X), index=X.index)
            for val in cats:
                col_name = f"{base_col}__{val}"
                generated.add(col_name)
                out_parts.append((series == val).astype(int).rename(col_name))

        # Pass through numeric columns that are not categorical sources.
        for c in X.columns:
            if c in cat_schema or c in generated:
                # raw categorical source, or would duplicate an indicator
                continue
            if c not in expected and any(c.startswith(f"{k}__") for k in cat_schema):
                # skip inadvertent collision with the indicator naming scheme
                continue
            if pd.api.types.is_numeric_dtype(X[c]):
                out_parts.append(X[c])

        X_new = pd.concat(out_parts, axis=1) if out_parts else pd.DataFrame(index=X.index)
        # Align to stored feature order
        for m in [c for c in expected if c not in X_new.columns]:
            X_new[m] = 0
        return X_new[expected]

    def _compute_target_encoding(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, Dict[str, Dict[str,float]], float, List[str]]:
        """Fit smoothed target-mean encodings on ``X`` / ``y``.

        Returns ``(encoded_frame, mappings, global_mean, encoded_columns)``.
        Each categorical column ``c`` is replaced by ``c__te`` holding
        ``(count*mean + smoothing*global_mean) / (count + smoothing)``.
        Values without a mapping receive ``global_mean``.
        """
        cat_cols = self._categorical_columns(X)
        mappings: Dict[str, Dict[str, float]] = {}
        global_mean = float(y.mean())
        X_encoded = X.copy()
        encoded_cols: List[str] = []
        for c in cat_cols:
            # Work on plain objects so categorical dtype neither adds
            # unobserved groups nor makes map()/fillna() fail.
            keys = X[c].astype(object)
            stats = y.groupby(keys).agg(['mean', 'count'])
            # smoothing: (count*mean + smoothing*global) / (count + smoothing)
            smooth = (stats['count'] * stats['mean'] + self.target_smoothing * global_mean) / (stats['count'] + self.target_smoothing)
            mapping = smooth.to_dict()
            mappings[c] = mapping
            new_col = f"{c}__te"
            encoded_cols.append(new_col)
            X_encoded[new_col] = keys.map(mapping).fillna(global_mean).astype('float64')
        # Drop original categorical columns
        X_encoded = X_encoded.drop(columns=cat_cols)
        return X_encoded, mappings, global_mean, encoded_cols

    def _apply_target_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the fitted target-mean mappings to ``X``.

        Unknown categories, and categorical columns absent from ``X``, are
        encoded as the global target mean. Other columns of the fitted
        feature order that are missing from ``X`` are filled with 0.0.

        Raises:
            RuntimeError: if the pipeline holds no target mappings.
        """
        if self.state is None or self.state.target_mappings is None:
            raise RuntimeError("Pipeline was not fitted with the 'target' strategy.")
        global_mean = self.state.target_global_mean
        mappings = self.state.target_mappings
        X_new = X.copy()
        # For each mapping, create encoded column
        for col, mapping in mappings.items():
            if col in X_new.columns:
                series = X_new[col].astype(object)
            else:
                series = pd.Series([None] * len(X_new), index=X_new.index)
            X_new[f"{col}__te"] = series.map(mapping).fillna(global_mean).astype('float64')
        # Drop raw categorical cols; a source column may be absent from X,
        # which the loop above already tolerated.
        X_new = X_new.drop(columns=list(mappings.keys()), errors='ignore')
        # Align order / add any missing
        for m in [c for c in self.state.feature_order if c not in X_new.columns]:
            X_new[m] = 0.0
        return X_new[self.state.feature_order]

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'PreprocessingPipeline':
        """Learn the categorical handling and (optionally) the scaler from ``X``.

        Args:
            X: training features.
            y: target series; required only for the 'target' strategy.

        Returns:
            ``self``, to allow chaining.

        Raises:
            ValueError: 'target' strategy without ``y``.
            NotImplementedError: unknown ``categorical_strategy``.
        """
        X = X.copy()
        dropped: List[str] = []
        onehot_categories: Optional[Dict[str, List[str]]] = None
        target_mappings: Optional[Dict[str, Dict[str, float]]] = None
        target_global_mean: Optional[float] = None
        target_encoded_cols: Optional[List[str]] = None

        if self.categorical_strategy == 'drop':
            non_numeric = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]
            if non_numeric:
                X = X.drop(columns=non_numeric)
                dropped = non_numeric
        elif self.categorical_strategy == 'onehot':
            X, onehot_categories = self._apply_onehot_fit(X)
        elif self.categorical_strategy == 'target':
            if y is None:
                raise ValueError("Target series y must be provided for target encoding strategy.")
            X, target_mappings, target_global_mean, target_encoded_cols = self._compute_target_encoding(X, y)
        else:
            raise NotImplementedError(f"Categorical strategy '{self.categorical_strategy}' not implemented.")

        numeric_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
        # Build the scaler locally and assign it together with the state so
        # a refit never keeps a scaler from an earlier fit.
        scaler = None
        if self.scale and numeric_cols:
            scaler = StandardScaler()
            scaler.fit(X[numeric_cols])
        self._scaler = scaler
        self.state = PreprocessingState(
            categorical_strategy=self.categorical_strategy,
            scaled_numeric=numeric_cols if self.scale else [],
            dropped_columns=dropped,
            feature_order=list(X.columns),
            scale=self.scale,
            onehot_categories=onehot_categories,
            target_mappings=target_mappings,
            target_global_mean=target_global_mean,
            target_encoded_columns=target_encoded_cols
        )
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the fitted preprocessing to ``X`` and return a new frame.

        Raises:
            RuntimeError: the pipeline has not been fitted or loaded.
            ValueError: 'drop' strategy and ``X`` lacks a fitted column.
            NotImplementedError: the stored strategy is unknown.
        """
        if self.state is None:
            raise RuntimeError("Pipeline not fitted.")
        X = X.copy()
        strategy = self.state.categorical_strategy
        if strategy == 'drop':
            present = [col for col in self.state.dropped_columns if col in X.columns]
            if present:
                X = X.drop(columns=present)
            missing = [c for c in self.state.feature_order if c not in X.columns]
            if missing:
                raise ValueError(f"Incoming data missing columns required by preprocessor: {missing}")
            X = X[self.state.feature_order]
        elif strategy == 'onehot':
            X = self._apply_onehot_transform(X)
        elif strategy == 'target':
            X = self._apply_target_transform(X)
        else:
            raise NotImplementedError(f"Unknown strategy {self.state.categorical_strategy}")

        scaled_cols = self.state.scaled_numeric
        if self.scale and self._scaler is not None and scaled_cols:
            # Ensure float dtype prior to scaling assignment to avoid pandas FutureWarning
            for col in scaled_cols:
                if not pd.api.types.is_float_dtype(X[col]):
                    X[col] = X[col].astype('float64')
            X.loc[:, scaled_cols] = self._scaler.transform(X[scaled_cols])
        return X

    def fit_transform(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> pd.DataFrame:
        """Fit on ``X`` (and ``y`` when needed) and return the transformed ``X``."""
        return self.fit(X, y).transform(X)

    def save(self, path: str):
        """Persist state, hyperparameters and scaler to ``path`` with joblib.

        The parent directory is created when ``path`` has one; a bare
        filename is written to the current working directory.
        """
        directory = os.path.dirname(path)
        if directory:  # os.makedirs('') raises FileNotFoundError
            os.makedirs(directory, exist_ok=True)
        payload: Dict[str, Any] = {
            'state': asdict(self.state) if self.state else None,
            'categorical_strategy': self.categorical_strategy,
            'scale': self.scale,
            'scaler': self._scaler,
            'target_min_samples': self.target_min_samples,
            'target_smoothing': self.target_smoothing
        }
        joblib.dump(payload, path)

    @classmethod
    def load(cls, path: str) -> 'PreprocessingPipeline':
        """Rebuild a pipeline from a file written by ``save``.

        NOTE(security): joblib files are pickle-based and can execute
        arbitrary code on load; only load artifacts from a trusted source.
        """
        payload = joblib.load(path)
        pipe = cls(
            categorical_strategy=payload.get('categorical_strategy', 'drop'),
            scale=payload.get('scale', True),
            target_min_samples=payload.get('target_min_samples', 5),
            target_smoothing=payload.get('target_smoothing', 10.0)
        )
        state_dict = payload.get('state')
        if state_dict:
            pipe.state = PreprocessingState(**state_dict)
        pipe._scaler = payload.get('scaler')
        return pipe

    def to_metadata(self) -> Dict[str, Any]:
        """Return the fitted state as a plain dict, or ``{}`` when unfitted."""
        return asdict(self.state) if self.state else {}
|
| 238 |
+
|
| 239 |
+
"""Helper for future extension: registration of new categorical strategies.
|
| 240 |
+
Currently omitted for brevity."""
|