GitHub Actions committed on
Commit
ffdb9be
Β·
0 Parent(s):

Deploy selected files

Browse files
Dockerfile ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# =============================================================
# TrueNest - MLflow + FastAPI Inference Image (GeoPandas-ready)
# =============================================================

# Full (non-slim) base: the GeoPandas stack needs the GDAL/GEOS/PROJ
# toolchain below plus build tools for any package without a wheel.
FROM python:3.10

ARG DEBIAN_FRONTEND=noninteractive

# ------------------------------------------------
# Install system deps for GeoPandas stack
# ------------------------------------------------
RUN apt-get update && apt-get install -y \
    gdal-bin \
    libgdal-dev \
    libgeos-dev \
    proj-bin \
    proj-data \
    libproj-dev \
    build-essential \
    curl \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Point GDAL/PROJ at the data files installed by the packages above.
ENV GDAL_DATA=/usr/share/gdal
ENV PROJ_LIB=/usr/share/proj
ENV PYTHONUNBUFFERED=1 PYTHONDONTWRITEBYTECODE=1 LANG=C.UTF-8

WORKDIR /app

# ------------------------------------------------
# Copy inference code & metadata
# ------------------------------------------------
COPY ./main.py /app/main.py
COPY ./handler.py /app/handler.py
COPY ./reports /app/reports
COPY ./requirements.txt /app/requirements.txt

# ------------------------------------------------
# Copy source code for MLflow unpickling
# (the pickled pyfunc model re-imports src.* on load)
# ------------------------------------------------
COPY ./src /app/src
ENV PYTHONPATH="/app"

# ------------------------------------------------
# Install Python deps
# ------------------------------------------------
RUN pip install --no-cache-dir --upgrade pip setuptools wheel
RUN pip install --no-cache-dir uvicorn fastapi python-dotenv
RUN pip install --no-cache-dir -r /app/requirements.txt

EXPOSE 7860

# ------------------------------------------------
# Startup script with GCP credential fix
# (each single-quoted printf argument becomes one line of /app/start.sh)
# ------------------------------------------------
RUN printf '%s\n' \
    '#!/bin/bash' \
    'set -e' \
    'echo "🚀 Starting TrueNest GeoPandas-enabled inference container"' \
    '' \
    '# -------------------------------' \
    '# Configure GCP credentials' \
    '# -------------------------------' \
    'if [ -n "$GOOGLE_APPLICATION_CREDENTIALS_JSON" ]; then' \
    '  echo "$GOOGLE_APPLICATION_CREDENTIALS_JSON" > /tmp/creds.json' \
    '  export GOOGLE_APPLICATION_CREDENTIALS=/tmp/creds.json' \
    '  export CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE=/tmp/creds.json' \
    '  echo "✅ GCP credentials written to /tmp/creds.json"' \
    'else' \
    '  echo "⚠️ No GOOGLE_APPLICATION_CREDENTIALS_JSON provided. GCS access will fail."' \
    'fi' \
    '' \
    '# -------------------------------' \
    '# Force GCP credential load BEFORE MLflow loads model' \
    'echo "🔍 Verifying GCP credentials..."' \
    'python <<EOF' \
    'import os' \
    'from google.oauth2 import service_account' \
    'from google.cloud import storage' \
    'p = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")' \
    'if not p or not os.path.exists(p):' \
    '    print("❌ No valid Google credentials found:", p)' \
    'else:' \
    '    creds = service_account.Credentials.from_service_account_file(p)' \
    '    client = storage.Client(credentials=creds)' \
    '    try:' \
    '        client.list_buckets(max_results=1)' \
    '        print("🔐 GCP credentials verified successfully")' \
    '    except Exception as e:' \
    '        print("❌ GCP credential test failed:", e)' \
    'EOF' \
    '' \
    '# -------------------------------' \
    '# Preload MLflow pipeline model' \
    '# -------------------------------' \
    'python <<EOF' \
    'import mlflow, json' \
    'info = json.load(open("/app/reports/last_run_info.json"))' \
    'uri = info.get("pipeline_model_uri") or info.get("model_uri")' \
    'print(f"📦 Loading MLflow model: {uri}")' \
    'mlflow.pyfunc.load_model(uri)' \
    'print("✅ MLflow model loaded successfully")' \
    'EOF' \
    '' \
    'echo "🚀 Launching FastAPI server"' \
    'exec uvicorn main:app --host 0.0.0.0 --port 7860' \
    > /app/start.sh && chmod +x /app/start.sh

CMD ["/app/start.sh"]
README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: TrueNest API
3
+ emoji: πŸ“‘
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ app_file: Dockerfile
8
+ pinned: false
9
+ ---
handler.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys
2
+ import json
3
+ import mlflow
4
+ import mlflow.pyfunc
5
+ import pandas as pd
6
+ from dotenv import load_dotenv
7
+
8
+ # Load .env BEFORE anything else
9
+ load_dotenv()
10
+
11
+ # Ensure the project root (which contains 'src') is in sys.path
12
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
13
+ if project_root not in sys.path:
14
+ sys.path.insert(0, project_root)
15
+
16
+
17
class FastApiHandler:
    """Serve rent-price predictions from the MLflow pipeline model.

    Construction wires up Google Cloud credentials and eagerly loads the
    model referenced by ``reports/last_run_info.json``, so every request
    afterwards only pays for inference.
    """

    def __init__(self, run_info_path: str = "reports/last_run_info.json"):
        self.run_info_path = run_info_path
        self.model = None
        self.run_id = None
        self.model_uri = None
        self._configure_gcp_credentials()
        self.load_model()  # eager load: one model download per process

    def _configure_gcp_credentials(self):
        """Wire up GCP credentials from a JSON secret or the environment."""
        raw_json = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
        if raw_json:
            # Hugging Face Spaces style: the whole service-account JSON
            # arrives as a single secret; persist it to a key file.
            print("🔐 Configuring GCP credentials from ENV JSON...")
            with open("/tmp/gcp_creds.json", "w") as fh:
                fh.write(raw_json)
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/tmp/gcp_creds.json"
        elif os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
            # Local dev / Docker with a .env pointing at a key file.
            print("🔐 Using GOOGLE_APPLICATION_CREDENTIALS from environment")
        else:
            print("⚠️ WARNING: No GCP credentials provided! GCS model loading may fail.")

    def load_model(self):
        """Read the last-run metadata and pull the pyfunc model via MLflow."""
        if not os.path.exists(self.run_info_path):
            raise FileNotFoundError(
                f"❌ {self.run_info_path} not found — train the model first."
            )
        with open(self.run_info_path) as fh:
            info = json.load(fh)
        self.run_id = info.get("run_id")
        self.model_uri = info.get("pipeline_model_uri")
        print(f"🔗 Loading MLflow model: {self.model_uri}")
        # MLflow resolves the GCS artifact location from the URI itself.
        self.model = mlflow.pyfunc.load_model(self.model_uri)
        print(f"✅ Model loaded successfully (run_id={self.run_id})")

    def predict(self, model_params: dict) -> float:
        """Predict the rent for a single property described by *model_params*."""
        if self.model is None:
            raise RuntimeError("Model not loaded")
        frame = pd.DataFrame([model_params])
        return float(self.model.predict(frame)[0])

    def explain_prediction(self, model_params: dict) -> dict:
        """Return the SHAP explanation produced by the wrapped pipeline."""
        if self.model is None:
            raise RuntimeError("Model not loaded")
        frame = pd.DataFrame([model_params])
        # Unwrap the custom RentPricePipeline to reach explain_predictions().
        return self.model.unwrap_python_model().explain_predictions(frame)

    def handle(self, params: dict) -> dict:
        """FastAPI-facing entry point: validate, predict, package the reply."""
        if "model_params" not in params:
            return {"error": "Missing 'model_params' in request"}
        try:
            prediction = self.predict(params["model_params"])
        except Exception as e:
            return {"error": str(e)}
        return {
            "prediction": prediction,
            "inputs": params["model_params"],
            "run_id": self.run_id,
        }
main.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, ConfigDict, Field
from handler import FastApiHandler

app = FastAPI(title="TrueNest Rent Prediction API")
handler = None  # populated once by the startup hook


# ---------- Request schema with example ----------

class PredictRequest(BaseModel):
    """Request body: a flat dict of raw listing attributes."""

    # "model_params" starts with "model_", which collides with pydantic v2's
    # protected namespace and triggers a field-name warning; opt out of it.
    model_config = ConfigDict(protected_namespaces=())

    model_params: dict = Field(
        ...,
        json_schema_extra={
            "example": {
                "bathrooms": 1,
                "bedrooms": 2,
                "propertyType": "Flat",
                "deposit": False,
                "letType": "Long term",
                "furnishType": "Furnished",
                "latitude": 51.49199,
                "longitude": -0.17134,
            }
        },
    )


# ---------- Startup: load model once ----------
@app.on_event("startup")
def load_model_once():
    """Load the MLflow model a single time when the server boots."""
    global handler
    handler = FastApiHandler()
    print("✅ MLflow model loaded at startup")


# ---------- Routes ----------
@app.get("/")
def root():
    """Liveness endpoint; echoes the MLflow run id of the loaded model."""
    return {"message": "🏡 Rent Prediction API is running", "run_id": handler.run_id}


@app.post("/predict")
def predict(req: PredictRequest):
    """Predict a rent price; responds 400 with the handler's error message."""
    # model_dump() is the pydantic-v2 replacement for the deprecated .dict().
    result = handler.handle(req.model_dump())
    if "error" in result:
        raise HTTPException(status_code=400, detail=result["error"])
    return result


@app.post("/explain")
def explain(req: PredictRequest):
    """Return a SHAP explanation for one prediction; 400 on any failure."""
    try:
        return handler.explain_prediction(req.model_params)
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
+
reports/eval_metrics.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "r2": 0.8474,
3
+ "mae": 427.68,
4
+ "mape": 0.1372
5
+ }
reports/last_run_id.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ d198679a3542411ca2082e4d9832038d
reports/last_run_info.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_id": "d198679a3542411ca2082e4d9832038d",
3
+ "pipeline_model_uri": "gs://rent_price_bucket/artifacts/8/models/m-344397ce2d7344b9b143d7e0049bb907/artifacts",
4
+ "timestamp": "2025-11-18T10:42:44.464536Z",
5
+ "mlflow_experiment": "Rent_Price_Pipeline",
6
+ "mlflow_ui_link": "http://127.0.0.1:5000/#/experiments/8/runs/d198679a3542411ca2082e4d9832038d"
7
+ }
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ mlflow==3.6.0
2
+ catboost==1.2.3
3
+ google-cloud-storage==2.13.0 # Required for GCS access
4
+ numpy==1.26.4
5
+ pandas==2.3.3
6
+ pyarrow==15.0.2
7
+ aiohttp==3.9.1
8
+ psutil==5.9.6
9
+ geopandas==1.0.1
10
+ geopy==2.4.1
11
+ scikit-learn==1.7.2
12
+ scipy==1.15.3
13
+ cloudpickle==3.1.2
14
+ fastapi==0.104.0
15
+ uvicorn==0.24.0
16
+ pydantic==2.5.0
17
+ shap==0.49.1
src/app/README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: TrueNest API
3
+ emoji: πŸ“‘
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ app_file: Dockerfile
8
+ pinned: false
9
+ ---
src/app/handler.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys
2
+ import json
3
+ import mlflow
4
+ import mlflow.pyfunc
5
+ import pandas as pd
6
+ from dotenv import load_dotenv
7
+
8
+ # Load .env BEFORE anything else
9
+ load_dotenv()
10
+
11
+ # Ensure the project root (which contains 'src') is in sys.path
12
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
13
+ if project_root not in sys.path:
14
+ sys.path.insert(0, project_root)
15
+
16
+
17
class FastApiHandler:
    """Serve rent-price predictions from the MLflow pipeline model.

    Construction wires up Google Cloud credentials and eagerly loads the
    model referenced by ``reports/last_run_info.json``, so every request
    afterwards only pays for inference.
    """

    def __init__(self, run_info_path: str = "reports/last_run_info.json"):
        self.run_info_path = run_info_path
        self.model = None
        self.run_id = None
        self.model_uri = None
        self._configure_gcp_credentials()
        self.load_model()  # eager load: one model download per process

    def _configure_gcp_credentials(self):
        """Wire up GCP credentials from a JSON secret or the environment."""
        raw_json = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
        if raw_json:
            # Hugging Face Spaces style: the whole service-account JSON
            # arrives as a single secret; persist it to a key file.
            print("🔐 Configuring GCP credentials from ENV JSON...")
            with open("/tmp/gcp_creds.json", "w") as fh:
                fh.write(raw_json)
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/tmp/gcp_creds.json"
        elif os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
            # Local dev / Docker with a .env pointing at a key file.
            print("🔐 Using GOOGLE_APPLICATION_CREDENTIALS from environment")
        else:
            print("⚠️ WARNING: No GCP credentials provided! GCS model loading may fail.")

    def load_model(self):
        """Read the last-run metadata and pull the pyfunc model via MLflow."""
        if not os.path.exists(self.run_info_path):
            raise FileNotFoundError(
                f"❌ {self.run_info_path} not found — train the model first."
            )
        with open(self.run_info_path) as fh:
            info = json.load(fh)
        self.run_id = info.get("run_id")
        self.model_uri = info.get("pipeline_model_uri")
        print(f"🔗 Loading MLflow model: {self.model_uri}")
        # MLflow resolves the GCS artifact location from the URI itself.
        self.model = mlflow.pyfunc.load_model(self.model_uri)
        print(f"✅ Model loaded successfully (run_id={self.run_id})")

    def predict(self, model_params: dict) -> float:
        """Predict the rent for a single property described by *model_params*."""
        if self.model is None:
            raise RuntimeError("Model not loaded")
        frame = pd.DataFrame([model_params])
        return float(self.model.predict(frame)[0])

    def explain_prediction(self, model_params: dict) -> dict:
        """Return the SHAP explanation produced by the wrapped pipeline."""
        if self.model is None:
            raise RuntimeError("Model not loaded")
        frame = pd.DataFrame([model_params])
        # Unwrap the custom RentPricePipeline to reach explain_predictions().
        return self.model.unwrap_python_model().explain_predictions(frame)

    def handle(self, params: dict) -> dict:
        """FastAPI-facing entry point: validate, predict, package the reply."""
        if "model_params" not in params:
            return {"error": "Missing 'model_params' in request"}
        try:
            prediction = self.predict(params["model_params"])
        except Exception as e:
            return {"error": str(e)}
        return {
            "prediction": prediction,
            "inputs": params["model_params"],
            "run_id": self.run_id,
        }
src/app/main.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, ConfigDict, Field
from handler import FastApiHandler

app = FastAPI(title="TrueNest Rent Prediction API")
handler = None  # populated once by the startup hook


# ---------- Request schema with example ----------

class PredictRequest(BaseModel):
    """Request body: a flat dict of raw listing attributes."""

    # "model_params" starts with "model_", which collides with pydantic v2's
    # protected namespace and triggers a field-name warning; opt out of it.
    model_config = ConfigDict(protected_namespaces=())

    model_params: dict = Field(
        ...,
        json_schema_extra={
            "example": {
                "bathrooms": 1,
                "bedrooms": 2,
                "propertyType": "Flat",
                "deposit": False,
                "letType": "Long term",
                "furnishType": "Furnished",
                "latitude": 51.49199,
                "longitude": -0.17134,
            }
        },
    )


# ---------- Startup: load model once ----------
@app.on_event("startup")
def load_model_once():
    """Load the MLflow model a single time when the server boots."""
    global handler
    handler = FastApiHandler()
    print("✅ MLflow model loaded at startup")


# ---------- Routes ----------
@app.get("/")
def root():
    """Liveness endpoint; echoes the MLflow run id of the loaded model."""
    return {"message": "🏡 Rent Prediction API is running", "run_id": handler.run_id}


@app.post("/predict")
def predict(req: PredictRequest):
    """Predict a rent price; responds 400 with the handler's error message."""
    # model_dump() is the pydantic-v2 replacement for the deprecated .dict().
    result = handler.handle(req.model_dump())
    if "error" in result:
        raise HTTPException(status_code=400, detail=result["error"])
    return result


@app.post("/explain")
def explain(req: PredictRequest):
    """Return a SHAP explanation for one prediction; 400 on any failure."""
    try:
        return handler.explain_prediction(req.model_params)
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
src/app/requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ mlflow==3.6.0
2
+ catboost==1.2.3
3
+ google-cloud-storage==2.13.0 # Required for GCS access
4
+ numpy==1.26.4
5
+ pandas==2.3.3
6
+ pyarrow==15.0.2
7
+ aiohttp==3.9.1
8
+ psutil==5.9.6
9
+ geopandas==1.0.1
10
+ geopy==2.4.1
11
+ scikit-learn==1.7.2
12
+ scipy==1.15.3
13
+ cloudpickle==3.1.2
14
+ fastapi==0.104.0
15
+ uvicorn==0.24.0
16
+ pydantic==2.5.0
17
+ shap==0.49.1
src/evaluate.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys
2
+ import json
3
+ import mlflow
4
+ import pandas as pd
5
+ import requests
6
+ from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error
7
+ from dotenv import load_dotenv
8
+
9
+ load_dotenv()
10
+ # Add project root to sys.path for imports
11
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
12
+
13
+ from src.utils import Timer
14
+
15
+ # ------------------------------------------------------------------
16
+ # Constants
17
+ # ------------------------------------------------------------------
18
+ TEST_PATH = "data/processed/test.parquet"
19
+ RUN_INFO_PATH = "reports/last_run_info.json"
20
+ METRICS_PATH = "reports/eval_metrics.json"
21
+
22
+ # Tracking server info (should match training script)
23
+ TRACKING_SERVER_HOST = "127.0.0.1"
24
+ TRACKING_SERVER_PORT = 5000
25
+
26
+
27
def main():
    """End-to-end offline evaluation of the latest trained pipeline.

    Reads the last-run metadata, loads the pyfunc model through the MLflow
    tracking server, scores the held-out test set, logs the metrics back
    onto the original training run, and mirrors them to disk for DVC.

    Raises:
        FileNotFoundError: missing run metadata or GCP credentials.
        ConnectionError: tracking server unreachable.
    """
    # ---------------------------------------------------------------
    # 1. Load metadata produced by the training step
    # ---------------------------------------------------------------
    if not os.path.exists(RUN_INFO_PATH):
        raise FileNotFoundError(f"❌ {RUN_INFO_PATH} not found. Run training first.")

    print("📄 Loading last MLflow run info...")
    with open(RUN_INFO_PATH) as f:
        run_info = json.load(f)

    run_id = run_info["run_id"]
    model_uri = run_info["pipeline_model_uri"]

    print(f"🔐 Run ID: {run_id}")
    print(f"🔄 Model URI: {model_uri}")
    print()

    # ---------------------------------------------------------------
    # Ensure Google Cloud credentials are set — MLflow needs them to
    # download model artifacts from GCS.
    # ---------------------------------------------------------------
    GOOGLE_CREDS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if not GOOGLE_CREDS or not os.path.isfile(GOOGLE_CREDS):
        raise FileNotFoundError(
            f"❌ GOOGLE_APPLICATION_CREDENTIALS not set or invalid: {GOOGLE_CREDS}"
        )
    print(f"🔐 Using Google credentials: {GOOGLE_CREDS}")

    # ---------------------------------------------------------------
    # 2. Connect to MLflow tracking server (fail fast if unreachable)
    # ---------------------------------------------------------------
    try:
        r = requests.get(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}", timeout=3)
        if r.status_code != 200:
            raise requests.exceptions.RequestException
    except requests.exceptions.RequestException:
        raise ConnectionError(
            f"❌ MLflow tracking server not reachable at "
            f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}. "
            f"Start the server before evaluation."
        )

    mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
    mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

    print(f"🔗 Connected to MLflow: {mlflow.get_tracking_uri()}")
    print(f"   Using run ID: {run_id}")
    print()

    # ---------------------------------------------------------------
    # 3. Load model from GCS
    # ---------------------------------------------------------------
    with Timer("Load MLflow model"):
        model = mlflow.pyfunc.load_model(model_uri)

    # ---------------------------------------------------------------
    # 4. Load test data
    # ---------------------------------------------------------------
    print("📦 Loading test data...")
    df_test = pd.read_parquet(TEST_PATH)
    X_test = df_test.drop(columns=["price"])
    y_test = df_test["price"]

    # ---------------------------------------------------------------
    # 5. Run inference
    # ---------------------------------------------------------------
    print("⚙️ Running inference on test set...")
    with Timer("Model inference"):
        preds = model.predict(X_test)

    # ---------------------------------------------------------------
    # 6. Compute metrics
    # ---------------------------------------------------------------
    print("📊 Computing metrics...")
    metrics = {
        "r2": round(r2_score(y_test, preds), 4),
        "mae": round(mean_absolute_error(y_test, preds), 2),
        "mape": round(mean_absolute_percentage_error(y_test, preds), 4),
    }

    # ---------------------------------------------------------------
    # 7. Log metrics onto the SAME training run. The context manager
    #    (instead of bare start_run/end_run) guarantees the run is
    #    closed even if log_metrics raises.
    # ---------------------------------------------------------------
    print("📝 Logging metrics to MLflow...")
    with mlflow.start_run(run_id=run_id):
        mlflow.log_metrics(metrics)

    # ---------------------------------------------------------------
    # 8. Save metrics locally so DVC can track them
    # ---------------------------------------------------------------
    os.makedirs(os.path.dirname(METRICS_PATH), exist_ok=True)
    with open(METRICS_PATH, "w") as f:
        json.dump(metrics, f, indent=2)

    print("✅ Evaluation complete!")
    print(json.dumps(metrics, indent=2))
    print(f"🔗 MLflow UI: {run_info['mlflow_ui_link']}")


if __name__ == "__main__":
    main()
src/features/build_features.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from src.features.geo_features import LondonPropertyGeoFeatures
3
+ # Optional:
4
+ # from src.features.feature_engineering import add_non_geo_features
5
+ # from src.features.nlp_features import PropertyTextEncoder
6
+
7
def is_numeric_and_true(value):
    """Return True only for numeric, truthy, non-NaN values.

    Normalises the raw ``deposit`` field (which may hold an amount,
    0, None, NaN, or free text) into a clean boolean flag.
    """
    if not isinstance(value, (int, float)):
        return False
    # NaN is a float and truthy, so the original check wrongly mapped a
    # missing deposit (NaN) to True; NaN != NaN filters it out.
    if value != value:
        return False
    return bool(value)
11
def build_features(df: pd.DataFrame, geo_dir: str = "data/geo") -> pd.DataFrame:
    """
    Compute all features used by the rent price model.

    Combines the geospatial features derived from latitude/longitude with
    a normalised ``deposit`` flag. The non-geo and NLP feature steps are
    present but currently disabled.
    """
    # 1. Geospatial engineered features (borough, zone, noise, stations, ...)
    geo_extractor = LondonPropertyGeoFeatures(geo_dir)
    df = geo_extractor.add_features_to_df(df)

    # 2. Other engineered features (optional)
    # df = add_non_geo_features(df)

    # 3. NLP / embeddings (optional)
    # encoder = PropertyTextEncoder()
    # df = encoder.add_nlp_embeddings(df, text_column="description")

    # 4. Collapse the raw deposit value into a clean boolean flag.
    df["deposit"] = df["deposit"].map(is_numeric_and_true)

    return df
src/features/geo_features.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --- Imports ---
2
+ import numpy as np
3
+ import geopandas as gpd
4
+ from shapely.geometry import Point
5
+ from geopy.distance import geodesic
6
+ from sklearn.neighbors import BallTree
7
+ import pandas as pd
8
+
9
+ # --- Constants ---
10
+ CITY_CENTER = (51.5072, -0.1276) # London
11
+ EPSG = "EPSG:4326"
12
+
13
+
14
class LondonPropertyGeoFeatures:
    """Extract London property geo features for model inference.

    Loads the borough-boundary, noise-hex, fare-zone and station datasets
    from *geo_dir* once at construction, then answers per-point (lat, lon)
    queries: containing borough, noise class, fare zone, distance/bearing
    from the city centre, and nearest-station features.
    """

    def __init__(self, geo_dir):
        self.CITY_CENTER = CITY_CENTER
        self.EPSG = EPSG
        self.geo_dir = geo_dir
        self.load_datasets()
        self.prepare_station_tree()

    def load_datasets(self):
        """Load all geographic datasets, re-projected to WGS84."""
        self.london_boundaries = gpd.read_file(f"{self.geo_dir}/london_boroughs.geojson").to_crs(self.EPSG)
        self.hex_gdf = gpd.read_parquet(f"{self.geo_dir}/noize.parquet").to_crs(self.EPSG)
        self.zone_fares = gpd.read_parquet(f"{self.geo_dir}/zone_fares.parquet").to_crs(self.EPSG)
        self.stations = gpd.read_parquet(f"{self.geo_dir}/rail_tfl.parquet").to_crs(self.EPSG)

    def prepare_station_tree(self):
        """Build a BallTree over station coordinates for fast k-NN queries.

        Stations are re-projected to the local UTM zone so the tree's
        Euclidean distances are in metres.
        """
        self.stations_utm = self.stations.to_crs(self.stations.estimate_utm_crs())
        station_coords = np.array([[p.x, p.y] for p in self.stations_utm.geometry])
        self.station_tree = BallTree(station_coords, leaf_size=15, metric='euclidean')
        self.station_names = self.stations_utm['CommonName'].values
        # NOTE(review): TFL/RAIL appear to be per-station service flags —
        # confirm against the rail_tfl.parquet schema.
        self.station_tfl = self.stations_utm['TFL'].values
        self.station_rail = self.stations_utm['RAIL'].values

    def _create_point_gdf(self, lat, lon):
        """Create a single-point GeoDataFrame in WGS84 (internal helper)."""
        point = Point(lon, lat)
        return gpd.GeoDataFrame(geometry=[point], crs=self.EPSG)

    def borough(self, lat, lon):
        """Return the London borough name containing the given coordinates."""
        prop_gdf = self._create_point_gdf(lat, lon)
        joined = gpd.sjoin(prop_gdf, self.london_boundaries, how="left", predicate="within")
        return joined.iloc[0].get("name", None)

    def compute_angle(self, lat, lon):
        """Compute the bearing (radians) of a point relative to London's centre."""
        lat1, lon1 = np.radians(self.CITY_CENTER[0]), np.radians(self.CITY_CENTER[1])
        lat2, lon2 = np.radians(lat), np.radians(lon)

        dlon = lon2 - lon1
        x = np.cos(lat2) * np.sin(dlon)
        y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
        return np.arctan2(x, y)

    def distance_to_center(self, lat, lon):
        """Return geodesic distance from the city centre, in miles."""
        return geodesic((lat, lon), self.CITY_CENTER).miles

    def noize_class(self, lat, lon):
        """Return the noise class of the hex containing the coordinates."""
        prop_gdf = self._create_point_gdf(lat, lon)
        joined = gpd.sjoin(prop_gdf, self.hex_gdf, how="left", predicate="within")
        return joined.iloc[0].get("NoiseClass", None)

    def zone_fare(self, lat, lon):
        """Return the transport fare zone for the given coordinates.

        A point outside every zone polygon yields the left-join's missing
        value (NaN), returned unchanged.
        """
        prop_gdf = self._create_point_gdf(lat, lon)
        joined = gpd.sjoin(prop_gdf, self.zone_fares, how="left", predicate="within")
        zone_name = joined.iloc[0].get("Name", None)
        # BUGFIX: a failed sjoin yields NaN — a *truthy* float — so the
        # original `zone_name and "Zone" in zone_name` raised TypeError.
        # Only attempt the "Zone X" split on actual strings.
        if isinstance(zone_name, str) and "Zone" in zone_name:
            return zone_name.split(" ")[-1]
        return zone_name

    def find_nearest_stations(self, lat, lon, k=3, max_distance_meters=50000):
        """
        Find the k nearest stations within *max_distance_meters*.

        Returns a nearest-first list of dicts with the distance in miles,
        the station name, and the TFL/RAIL flags.
        """
        prop_gdf = self._create_point_gdf(lat, lon)
        prop_utm = prop_gdf.to_crs(self.stations_utm.crs)

        # Query the BallTree (coordinates in metres, UTM).
        prop_coords = np.array([[p.x, p.y] for p in prop_utm.geometry])
        distances_m, indices = self.station_tree.query(prop_coords, k=k)

        results = []
        for dist_m, idx in zip(distances_m[0], indices[0]):
            if dist_m <= max_distance_meters:
                results.append({
                    'distance_miles': dist_m / 1609.34,  # metres -> miles
                    'name': self.station_names[idx],
                    'TFL': bool(self.station_tfl[idx]),
                    'RAIL': bool(self.station_rail[idx]),
                })
        return results

    def extract_geo_features(self, lat, lon):
        """
        Extract all GEO features for model inference in the required format.
        """
        # Geographic features
        borough_name = self.borough(lat, lon)
        angle = self.compute_angle(lat, lon)
        center_distance = self.distance_to_center(lat, lon)
        noise_class = self.noize_class(lat, lon)
        zone = self.zone_fare(lat, lon)

        # Station features
        nearest_stations = self.find_nearest_stations(lat, lon, k=3)

        station_features = {}
        for i, station in enumerate(nearest_stations[:3], 1):
            station_features[f'distance_to_station{i}'] = round(station['distance_miles'], 6)
            station_features[f'TFL{i}'] = station['TFL']
            station_features[f'RAIL{i}'] = station['RAIL']

        # Pad missing slots so the model always sees the same columns.
        for i in range(len(nearest_stations) + 1, 4):
            station_features[f'distance_to_station{i}'] = None
            station_features[f'TFL{i}'] = False
            station_features[f'RAIL{i}'] = False

        return {
            "distance_to_center": round(center_distance, 6),
            "angle_from_center": round(angle, 6),
            "zone": zone,
            "borough": borough_name,
            "NoiseClass": noise_class,
            **station_features,
        }

    def add_features_to_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """Row-wise feature extraction, appended as new columns to *df*."""
        features = df.apply(
            lambda row: pd.Series(self.extract_geo_features(row["latitude"], row["longitude"])),
            axis=1,
        )
        return pd.concat([df, features], axis=1)
src/rent_price_pipeline.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mlflow.pyfunc
2
+ import pandas as pd
3
+ import shap
4
+ from catboost import CatBoostRegressor
5
+ from src.features.build_features import build_features
6
+
7
+
8
class RentPricePipeline(mlflow.pyfunc.PythonModel):
    """
    MLflow-wrapped pipeline that:
    - loads the CatBoost model and geo artifacts (in load_context)
    - uses build_features() to compute all model features
    - predicts rent prices and can explain them with SHAP
    """

    def __init__(self, cb_model_path=None, geo_dir=None):
        self.cb_model_path = cb_model_path
        self.geo_dir = geo_dir
        self.model = None      # loaded in load_context()
        self.explainer = None  # built in load_context()
        # Feature order is fixed at training time and must match CatBoost.
        self.numerical_features = [
            "latitude", "longitude",
            "distance_to_center", "angle_from_center",
            "distance_to_station1", "distance_to_station2", "distance_to_station3"
        ]
        self.categorical_features = [
            "bedrooms", "bathrooms", "deposit", "zone", "borough", "propertyType",
            "furnishType", "NoiseClass", "letType", "TFL1", "TFL2", "TFL3",
            "RAIL1", "RAIL2", "RAIL3"
        ]
        self.features = self.numerical_features + self.categorical_features

    def load_context(self, context):
        """Load the CatBoost model and geo datasets from MLflow artifacts."""
        model_path = context.artifacts.get("catboost_model", self.cb_model_path)
        self.model = CatBoostRegressor()
        self.model.load_model(model_path)

        # The SHAP explainer can only be built once the model exists.
        self.explainer = shap.TreeExplainer(self.model)

        # Prefer the MLflow artifact path when one was logged with the model.
        self.geo_dir = context.artifacts.get("geo_dir", self.geo_dir or "data/geo")

    def _enriched_features(self, model_input):
        """Coerce raw input to a DataFrame and return the model's feature matrix."""
        if not isinstance(model_input, pd.DataFrame):
            model_input = pd.DataFrame(model_input)
        enriched = build_features(model_input, geo_dir=self.geo_dir)
        return enriched[self.features]

    def predict(self, context, model_input):
        """Compute features and predict rent price(s) for *model_input*."""
        return self.model.predict(self._enriched_features(model_input))

    def explain_predictions(self, model_input):
        """Return prediction plus SHAP attribution for a (single-row) input."""
        if self.explainer is None:
            # Fail with a clear message instead of an AttributeError when
            # load_context() has not been called yet.
            raise RuntimeError("Model not loaded — call load_context() first")

        feats = self._enriched_features(model_input)
        shap_values = self.explainer(feats)
        preds = self.model.predict(feats)

        return {
            "prediction": float(preds[0]),
            "base_value": float(self.explainer.expected_value),
            "shap_values": shap_values.values.tolist(),  # numpy -> list
            "feature_names": self.features,
            "data": feats.to_dict(orient="records"),  # DataFrame -> JSON
        }
src/train_and_log_pipeline.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os
import sys
from datetime import datetime, timezone
from time import perf_counter

import mlflow
import pandas as pd
import requests
import yaml
from catboost import CatBoostRegressor
from dotenv import load_dotenv
from mlflow.models import infer_signature

# Add project root to sys.path for imports (must precede the src imports)
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from src.features.build_features import build_features
from src.rent_price_pipeline import RentPricePipeline
from src.utils import Timer
19
+
20
+
21
# ============================================================
# 0. Setup: environment, credentials, MLflow connection
# ============================================================
overall_start = perf_counter()
load_dotenv()

# Fail fast if the GCP service-account key is missing or mis-pointed:
# artifact storage below depends on it.
GOOGLE_CREDS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
if not GOOGLE_CREDS or not os.path.isfile(GOOGLE_CREDS):
    raise FileNotFoundError(f"❌ GOOGLE_APPLICATION_CREDENTIALS invalid: {GOOGLE_CREDS}")

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_CREDS

TRACKING_SERVER_HOST = "127.0.0.1"  # or external IP if remote
TRACKING_SERVER_PORT = 5000
EXPERIMENT_NAME = "Rent_Price_Pipeline"

# ---- Verify MLflow server ----
# Fix: the original raised the bare RequestException *class* as control
# flow (no message, status code lost) and re-raised ConnectionError
# without chaining the cause. Raise a populated exception and chain it
# so the underlying network error stays visible in the traceback.
try:
    response = requests.get(
        f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}", timeout=3
    )
    if response.status_code != 200:
        raise requests.exceptions.RequestException(
            f"unexpected HTTP status {response.status_code}"
        )
except requests.exceptions.RequestException as exc:
    raise ConnectionError(
        f"❌ MLflow tracking server not reachable at http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}. "
        f"Please start it before running this script."
    ) from exc

# ---- Configure MLflow ----
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_experiment(EXPERIMENT_NAME)

print(f"πŸ”— Connected to MLflow tracking server: {mlflow.get_tracking_uri()}")
print(f"Experiment: {EXPERIMENT_NAME}")
print()
56
+
57
# ============================================================
# 1. Load parameters and prepare data
# ============================================================
with Timer("Load parameters"):
    with open("params.yaml") as cfg:
        params = yaml.safe_load(cfg)

# Pull the model contract out of params.yaml once; training, signature
# inference, and the wrapper all reuse these names.
train_params = params["train"]
model_meta = params["model"]
TARGET = model_meta["target"]
NUMERIC = model_meta["numerical_features"]
CATEGORICAL = model_meta["categorical_features"]
FEATURES = NUMERIC + CATEGORICAL
NOT_USED_COLUMNS = model_meta["not_used_features"]

print("πŸ“¦ Loading training data...")
with Timer("Load training data"):
    train_df = pd.read_parquet("data/processed/train.parquet").drop(
        columns=NOT_USED_COLUMNS
    )

print("🧩 Feature engineering...")
with Timer("Feature engineering"):
    train_df = build_features(train_df, geo_dir="data/geo")
    X_train = train_df[FEATURES]
    y_train = train_df[TARGET]
81
+
82
# ============================================================
# 2. Train CatBoost model
# ============================================================
print("πŸš€ Training CatBoost model...")
with Timer("Training CatBoost"):
    # Hyperparameters come straight from the `train:` section of params.yaml.
    hyperparams = dict(
        iterations=train_params["iterations"],
        depth=train_params["depth"],
        learning_rate=train_params["learning_rate"],
        l2_leaf_reg=train_params["l2_leaf_reg"],
        bagging_temperature=train_params["bagging_temperature"],
    )
    model = CatBoostRegressor(cat_features=CATEGORICAL, verbose=False, **hyperparams)
    model.fit(X_train, y_train)
97
+
98
+ # ============================================================
99
+ # 3. Save model and prepare signatures
100
+ # ============================================================
101
+ with Timer("Save base model"):
102
+ os.makedirs("models", exist_ok=True)
103
+ cbm_path = "models/catboost_model_v1.cbm"
104
+ model.save_model(cbm_path)
105
+
106
+ with Timer("Infer signature for CatBoost"):
107
+ signature_catboost = infer_signature(X_train, model.predict(X_train[:5]))
108
+
109
+ print("🧠 Preparing wrapper pipeline...")
110
+ with Timer("Prepare wrapper and raw input example"):
111
+ wrapped = RentPricePipeline(cb_model_path=cbm_path, geo_dir="data/geo")
112
+ wrapped.model = CatBoostRegressor()
113
+ wrapped.model.load_model(cbm_path)
114
+
115
+ raw_df = pd.read_parquet("data/processed/train.parquet")
116
+ raw_df = raw_df.drop(columns=NOT_USED_COLUMNS)
117
+ input_example = raw_df.sample(1, random_state=42).drop(columns=[TARGET])
118
+
119
+ with Timer("Infer wrapper signature"):
120
+ pred_example = wrapped.predict(None, input_example)
121
+ signature_pipeline = infer_signature(input_example, pred_example)
122
+
123
# ============================================================
# 4. Log everything to MLflow
# ============================================================
print("πŸ“ Logging models to MLflow...")
with Timer("Log to MLflow"):
    with mlflow.start_run(run_name=f"{model_meta['type']}_v1") as run:
        # ---- Log CatBoost base model ----
        mlflow.catboost.log_model(
            cb_model=model,
            name="catboost_model",
            input_example=X_train.sample(1, random_state=42),
            signature=signature_catboost,
        )
        base_uri = f"runs:/{run.info.run_id}/catboost_model"

        # ---- Log wrapper pipeline ----
        # code_paths ships the feature/pipeline source so MLflow can
        # unpickle the wrapper in a fresh environment.
        logged = mlflow.pyfunc.log_model(
            artifact_path="pipeline_model",
            python_model=wrapped,
            code_paths=[
                "src/features/geo_features.py",
                "src/features/build_features.py",
                "src/rent_price_pipeline.py",
            ],
            artifacts={
                "catboost_model": cbm_path,
                "geo_dir": "data/geo",
            },
            signature=signature_pipeline,
            input_example=input_example,
        )

        # ---- Tags for traceability ----
        mlflow.set_tags({
            "type": "rent_price_pipeline",
            "base_model_uri": base_uri,
            "features_version": "v1",
            "input_schema": "raw_property_data",
        })

        # ---- Save run metadata for DVC ----
        print("🧩 Saving MLflow run metadata for DVC linkage...")
        reports_dir = "reports"
        os.makedirs(reports_dir, exist_ok=True)

        # Capture run ID (MLflow-native)
        run_id = run.info.run_id
        experiment_id = run.info.experiment_id

        # NOTE(review): this rewrites the MLflow model URI into a
        # hard-coded GCS bucket layout — verify it matches the tracking
        # server's artifact store configuration before relying on it.
        model_registry_uri = logged.model_uri
        model_id = model_registry_uri.replace("models:/", "")
        pipeline_model_uri = f"gs://rent_price_bucket/artifacts/{experiment_id}/models/{model_id}/artifacts"
        print("saved link:", pipeline_model_uri)

        # Save metadata to JSON β€” clean and portable.
        # Fix: datetime.utcnow() is deprecated (naive timestamp); use an
        # aware UTC datetime and keep the same "...Z" string format.
        run_info = {
            "run_id": run_id,
            "pipeline_model_uri": pipeline_model_uri,
            "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            "mlflow_experiment": mlflow.get_experiment(run.info.experiment_id).name,
            "mlflow_ui_link": (
                f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}/#/experiments/"
                f"{run.info.experiment_id}/runs/{run_id}"
            ),
        }

        with open(os.path.join(reports_dir, "last_run_info.json"), "w") as f:
            json.dump(run_info, f, indent=2)

        with open(os.path.join(reports_dir, "last_run_id.txt"), "w") as f:
            f.write(run_id)

        print(" πŸ“„ Saved run metadata to reports/last_run_info.json")
        print(" Run ID:", run_id)
        print(" Pipeline model URI:", pipeline_model_uri)
        print(" MLflow UI:", run_info["mlflow_ui_link"])

# ============================================================
# 5. Completion
# ============================================================
print("βœ… Training and remote logging completed!")
print(" Base model URI:", base_uri)
print(" Wrapper pipeline logged as: pipeline_model/")
print(f"🏁 Total script time: {perf_counter() - overall_start:.2f}s")
src/utils.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from time import perf_counter
2
+
3
class Timer:
    """Context manager that reports the wall-clock duration of a block.

    Usage::

        with Timer("load data"):
            ...

    Prints the elapsed time on exit. The measurement is also kept on
    ``self.elapsed`` (seconds) after the block finishes, so callers can
    use the number instead of only seeing it printed.
    """

    def __init__(self, label: str):
        self.label = label
        self.elapsed = None  # set to a float (seconds) on context exit

    def __enter__(self):
        self.start = perf_counter()
        return self

    def __exit__(self, exc_type, exc, tb):
        # Report timing even when the block raised; returning None lets
        # any exception propagate unchanged.
        self.elapsed = perf_counter() - self.start
        print(f"⏱ {self.label} took {self.elapsed:.2f}s")