Spaces:

servisgas
/

pricing

Sleeping

App Files Files Community

GitHub Actions committed on Nov 18, 2025

Commit

ffdb9be

0 Parent(s):

Deploy selected files

Browse files

Files changed (18) hide show

Dockerfile +109 -0
README.md +9 -0
handler.py +116 -0
main.py +59 -0
reports/eval_metrics.json +5 -0
reports/last_run_id.txt +1 -0
reports/last_run_info.json +7 -0
requirements.txt +17 -0
src/app/README.md +9 -0
src/app/handler.py +116 -0
src/app/main.py +59 -0
src/app/requirements.txt +17 -0
src/evaluate.py +129 -0
src/features/build_features.py +32 -0
src/features/geo_features.py +158 -0
src/rent_price_pipeline.py +73 -0
src/train_and_log_pipeline.py +210 -0
src/utils.py +13 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,109 @@

+# =============================================================
+# TrueNest - MLflow + FastAPI Inference Image (GeoPandas-ready)
+# =============================================================
+FROM python:3.10
+ARG DEBIAN_FRONTEND=noninteractive
+# ------------------------------------------------
+# Install system deps for GeoPandas stack
+# ------------------------------------------------
+# --no-install-recommends keeps optional "recommended" packages out of the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gdal-bin \
+    libgdal-dev \
+    libgeos-dev \
+    proj-bin \
+    proj-data \
+    libproj-dev \
+    build-essential \
+    curl \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+ENV GDAL_DATA=/usr/share/gdal
+ENV PROJ_LIB=/usr/share/proj
+ENV PYTHONUNBUFFERED=1 PYTHONDONTWRITEBYTECODE=1 LANG=C.UTF-8
+WORKDIR /app
+# ------------------------------------------------
+# Install Python deps (before copying code, so this layer is
+# reused by the build cache when only source files change)
+# ------------------------------------------------
+COPY ./requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir --upgrade pip setuptools wheel
+# fastapi and uvicorn are pinned in requirements.txt; python-dotenv is
+# imported by handler.py but not listed there, so it is added explicitly.
+RUN pip install --no-cache-dir -r /app/requirements.txt python-dotenv
+# ------------------------------------------------
+# Copy inference code & metadata
+# ------------------------------------------------
+COPY ./main.py /app/main.py
+COPY ./handler.py /app/handler.py
+COPY ./reports /app/reports
+# ------------------------------------------------
+# Copy source code for MLflow unpickling
+# ------------------------------------------------
+COPY ./src /app/src
+ENV PYTHONPATH="/app"
+EXPOSE 7860
+# ------------------------------------------------
+# Startup script with GCP credential fix
+# ------------------------------------------------
+# Generates /app/start.sh one line per printf argument.
+# - The credentials file is created under umask 077 because it holds a service-account key.
+# - The GCS check consumes the list_buckets() iterator so a request is actually sent.
+RUN printf '%s\n' \
+  '#!/bin/bash' \
+  'set -e' \
+  'echo "🚀 Starting TrueNest GeoPandas-enabled inference container"' \
+  '' \
+  '# -------------------------------' \
+  '# Configure GCP credentials' \
+  '# -------------------------------' \
+  'if [ -n "$GOOGLE_APPLICATION_CREDENTIALS_JSON" ]; then' \
+  '  (umask 077 && printf "%s\n" "$GOOGLE_APPLICATION_CREDENTIALS_JSON" > /tmp/creds.json)' \
+  '  export GOOGLE_APPLICATION_CREDENTIALS=/tmp/creds.json' \
+  '  export CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE=/tmp/creds.json' \
+  '  echo "✅ GCP credentials written to /tmp/creds.json"' \
+  'else' \
+  '  echo "⚠️ No GOOGLE_APPLICATION_CREDENTIALS_JSON provided. GCS access will fail."' \
+  'fi' \
+  '' \
+  '# -------------------------------' \
+  '# Force GCP credential load BEFORE MLflow loads model' \
+  'echo "🔐 Verifying GCP credentials..."' \
+  'python <<EOF' \
+  'import os' \
+  'from google.oauth2 import service_account' \
+  'from google.cloud import storage' \
+  'p = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")' \
+  'if not p or not os.path.exists(p):' \
+  '    print("❌ No valid Google credentials found:", p)' \
+  'else:' \
+  '    creds = service_account.Credentials.from_service_account_file(p)' \
+  '    client = storage.Client(credentials=creds)' \
+  '    try:' \
+  '        next(iter(client.list_buckets(max_results=1)), None)' \
+  '        print("🔐 GCP credentials verified successfully")' \
+  '    except Exception as e:' \
+  '        print("❌ GCP credential test failed:", e)' \
+  'EOF' \
+  '' \
+  '# -------------------------------' \
+  '# Preload MLflow pipeline model' \
+  '# -------------------------------' \
+  'python <<EOF' \
+  'import mlflow, json' \
+  'info = json.load(open("/app/reports/last_run_info.json"))' \
+  'uri = info.get("pipeline_model_uri") or info.get("model_uri")' \
+  'print(f"📦 Loading MLflow model: {uri}")' \
+  'mlflow.pyfunc.load_model(uri)' \
+  'print("✅ MLflow model loaded successfully")' \
+  'EOF' \
+  '' \
+  'echo "🚀 Launching FastAPI server"' \
+  'exec uvicorn main:app --host 0.0.0.0 --port 7860' \
+  > /app/start.sh && chmod +x /app/start.sh
+CMD ["/app/start.sh"]

README.md ADDED Viewed

	@@ -0,0 +1,9 @@

+---
+title: TrueNest API
+emoji: 📡
+colorFrom: blue
+colorTo: green
+sdk: docker
+app_file: Dockerfile
+pinned: false
+---

handler.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import os, sys
+import json
+import mlflow
+import mlflow.pyfunc
+import pandas as pd
+from dotenv import load_dotenv
+# Load .env BEFORE anything else
+load_dotenv()
+# Ensure the project root (which contains 'src') is in sys.path
+project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+if project_root not in sys.path:
+    sys.path.insert(0, project_root)
+class FastApiHandler:
+    """Handler for rent price prediction using MLflow pipeline model."""
+    def __init__(
+        self,
+        run_info_path: str = "reports/last_run_info.json",
+    ):
+        self.run_info_path = run_info_path
+        self.model = None
+        self.run_id = None
+        self.model_uri = None
+        self._configure_gcp_credentials()
+        self.load_model()  # Load once at startup
+    # -----------------------------------------------------------
+    # Configure Google Cloud authentication
+    # -----------------------------------------------------------
+    def _configure_gcp_credentials(self):
+        """Loads GCP credentials from HF ENV or system ENV."""
+        # Hugging Face Spaces: JSON secret
+        creds_json = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
+        if creds_json:
+            print("🔐 Configuring GCP credentials from ENV JSON...")
+            with open("/tmp/gcp_creds.json", "w") as f:
+                f.write(creds_json)
+            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/tmp/gcp_creds.json"
+        # Local dev or Docker with .env
+        elif os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
+            print("🔐 Using GOOGLE_APPLICATION_CREDENTIALS from environment")
+        else:
+            print("⚠️ WARNING: No GCP credentials provided! GCS model loading may fail.")
+    # -----------------------------------------------------------
+    # Load the MLflow model
+    # -----------------------------------------------------------
+    def load_model(self):
+        if not os.path.exists(self.run_info_path):
+            raise FileNotFoundError(
+                f"❌ {self.run_info_path} not found — train the model first."
+            )
+        with open(self.run_info_path) as f:
+            info = json.load(f)
+        self.run_id = info.get("run_id")
+        self.model_uri = info.get("pipeline_model_uri")
+        print(f"🔗 Loading MLflow model: {self.model_uri}")
+        # MLflow resolves GCS path automatically from runs:/ URI
+        self.model = mlflow.pyfunc.load_model(self.model_uri)
+        print(f"✅ Model loaded successfully (run_id={self.run_id})")
+    # -----------------------------------------------------------
+    # Predict
+    # -----------------------------------------------------------
+    def predict(self, model_params: dict) -> float:
+        if self.model is None:
+            raise RuntimeError("Model not loaded")
+        df = pd.DataFrame([model_params])
+        preds = self.model.predict(df)
+        return float(preds[0])
+    def explain_prediction(self, model_params: dict) -> dict:
+        if self.model is None:
+            raise RuntimeError("Model not loaded")
+        df = pd.DataFrame([model_params])
+        # 🔥 Unwrap the custom RentPricePipeline
+        python_model = self.model.unwrap_python_model()
+        explanation = python_model.explain_predictions(df)
+        return explanation
+    # -----------------------------------------------------------
+    # FastAPI-compatible handler
+    # -----------------------------------------------------------
+    def handle(self, params: dict) -> dict:
+        if "model_params" not in params:
+            return {"error": "Missing 'model_params' in request"}
+        try:
+            prediction = self.predict(params["model_params"])
+        except Exception as e:
+            return {"error": str(e)}
+        return {
+            "prediction": prediction,
+            "inputs": params["model_params"],
+            "run_id": self.run_id,
+        }

main.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel, Field
+from handler import FastApiHandler
+app = FastAPI(title="TrueNest Rent Prediction API")
+handler = None
+# ---------- Request schema with example ----------
+class PredictRequest(BaseModel):
+    model_params: dict = Field(
+        ...,
+        json_schema_extra={
+            "example": {
+                "bathrooms": 1,
+                "bedrooms": 2,
+                "propertyType": "Flat",
+                "deposit": False,
+                "letType": "Long term",
+                "furnishType": "Furnished",
+                "latitude": 51.49199,
+                "longitude": -0.17134
+            }
+        },
+    )
+# ---------- Startup: load model once ----------
+@app.on_event("startup")
+def load_model_once():
+    global handler
+    handler = FastApiHandler()
+    print("✅ MLflow model loaded at startup")
+# ---------- Routes ----------
+@app.get("/")
+def root():
+    return {"message": "🏡 Rent Prediction API is running", "run_id": handler.run_id}
+@app.post("/predict")
+def predict(req: PredictRequest):
+    result = handler.handle(req.dict())
+    if "error" in result:
+        raise HTTPException(status_code=400, detail=result["error"])
+    return result
+@app.post("/explain")
+def explain(req: PredictRequest):
+    try:
+        explanation = handler.explain_prediction(req.model_params)
+        return explanation
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=str(e))

reports/eval_metrics.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "r2": 0.8474,
+  "mae": 427.68,
+  "mape": 0.1372
+}

reports/last_run_id.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ d198679a3542411ca2082e4d9832038d

reports/last_run_info.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "run_id": "d198679a3542411ca2082e4d9832038d",
+  "pipeline_model_uri": "gs://rent_price_bucket/artifacts/8/models/m-344397ce2d7344b9b143d7e0049bb907/artifacts",
+  "timestamp": "2025-11-18T10:42:44.464536Z",
+  "mlflow_experiment": "Rent_Price_Pipeline",
+  "mlflow_ui_link": "http://127.0.0.1:5000/#/experiments/8/runs/d198679a3542411ca2082e4d9832038d"
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+mlflow==3.6.0
+catboost==1.2.3
+google-cloud-storage==2.13.0  # Required for GCS access
+numpy==1.26.4
+pandas==2.3.3
+pyarrow==15.0.2
+aiohttp==3.9.1
+psutil==5.9.6
+geopandas==1.0.1
+geopy==2.4.1
+scikit-learn==1.7.2
+scipy==1.15.3
+cloudpickle==3.1.2
+fastapi==0.104.0
+uvicorn==0.24.0
+pydantic==2.5.0
+shap==0.49.1

src/app/README.md ADDED Viewed

	@@ -0,0 +1,9 @@

+---
+title: TrueNest API
+emoji: 📡
+colorFrom: blue
+colorTo: green
+sdk: docker
+app_file: Dockerfile
+pinned: false
+---

src/app/handler.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import os, sys
+import json
+import mlflow
+import mlflow.pyfunc
+import pandas as pd
+from dotenv import load_dotenv
+# Load .env BEFORE anything else
+load_dotenv()
+# Ensure the project root (which contains 'src') is in sys.path
+project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+if project_root not in sys.path:
+    sys.path.insert(0, project_root)
+class FastApiHandler:
+    """Handler for rent price prediction using MLflow pipeline model."""
+    def __init__(
+        self,
+        run_info_path: str = "reports/last_run_info.json",
+    ):
+        self.run_info_path = run_info_path
+        self.model = None
+        self.run_id = None
+        self.model_uri = None
+        self._configure_gcp_credentials()
+        self.load_model()  # Load once at startup
+    # -----------------------------------------------------------
+    # Configure Google Cloud authentication
+    # -----------------------------------------------------------
+    def _configure_gcp_credentials(self):
+        """Loads GCP credentials from HF ENV or system ENV."""
+        # Hugging Face Spaces: JSON secret
+        creds_json = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
+        if creds_json:
+            print("🔐 Configuring GCP credentials from ENV JSON...")
+            with open("/tmp/gcp_creds.json", "w") as f:
+                f.write(creds_json)
+            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/tmp/gcp_creds.json"
+        # Local dev or Docker with .env
+        elif os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
+            print("🔐 Using GOOGLE_APPLICATION_CREDENTIALS from environment")
+        else:
+            print("⚠️ WARNING: No GCP credentials provided! GCS model loading may fail.")
+    # -----------------------------------------------------------
+    # Load the MLflow model
+    # -----------------------------------------------------------
+    def load_model(self):
+        if not os.path.exists(self.run_info_path):
+            raise FileNotFoundError(
+                f"❌ {self.run_info_path} not found — train the model first."
+            )
+        with open(self.run_info_path) as f:
+            info = json.load(f)
+        self.run_id = info.get("run_id")
+        self.model_uri = info.get("pipeline_model_uri")
+        print(f"🔗 Loading MLflow model: {self.model_uri}")
+        # MLflow resolves GCS path automatically from runs:/ URI
+        self.model = mlflow.pyfunc.load_model(self.model_uri)
+        print(f"✅ Model loaded successfully (run_id={self.run_id})")
+    # -----------------------------------------------------------
+    # Predict
+    # -----------------------------------------------------------
+    def predict(self, model_params: dict) -> float:
+        if self.model is None:
+            raise RuntimeError("Model not loaded")
+        df = pd.DataFrame([model_params])
+        preds = self.model.predict(df)
+        return float(preds[0])
+    def explain_prediction(self, model_params: dict) -> dict:
+        if self.model is None:
+            raise RuntimeError("Model not loaded")
+        df = pd.DataFrame([model_params])
+        # 🔥 Unwrap the custom RentPricePipeline
+        python_model = self.model.unwrap_python_model()
+        explanation = python_model.explain_predictions(df)
+        return explanation
+    # -----------------------------------------------------------
+    # FastAPI-compatible handler
+    # -----------------------------------------------------------
+    def handle(self, params: dict) -> dict:
+        if "model_params" not in params:
+            return {"error": "Missing 'model_params' in request"}
+        try:
+            prediction = self.predict(params["model_params"])
+        except Exception as e:
+            return {"error": str(e)}
+        return {
+            "prediction": prediction,
+            "inputs": params["model_params"],
+            "run_id": self.run_id,
+        }

src/app/main.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel, Field
+from handler import FastApiHandler
+app = FastAPI(title="TrueNest Rent Prediction API")
+handler = None
+# ---------- Request schema with example ----------
+class PredictRequest(BaseModel):
+    model_params: dict = Field(
+        ...,
+        json_schema_extra={
+            "example": {
+                "bathrooms": 1,
+                "bedrooms": 2,
+                "propertyType": "Flat",
+                "deposit": False,
+                "letType": "Long term",
+                "furnishType": "Furnished",
+                "latitude": 51.49199,
+                "longitude": -0.17134
+            }
+        },
+    )
+# ---------- Startup: load model once ----------
+@app.on_event("startup")
+def load_model_once():
+    global handler
+    handler = FastApiHandler()
+    print("✅ MLflow model loaded at startup")
+# ---------- Routes ----------
+@app.get("/")
+def root():
+    return {"message": "🏡 Rent Prediction API is running", "run_id": handler.run_id}
+@app.post("/predict")
+def predict(req: PredictRequest):
+    result = handler.handle(req.dict())
+    if "error" in result:
+        raise HTTPException(status_code=400, detail=result["error"])
+    return result
+@app.post("/explain")
+def explain(req: PredictRequest):
+    try:
+        explanation = handler.explain_prediction(req.model_params)
+        return explanation
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=str(e))

src/app/requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+mlflow==3.6.0
+catboost==1.2.3
+google-cloud-storage==2.13.0  # Required for GCS access
+numpy==1.26.4
+pandas==2.3.3
+pyarrow==15.0.2
+aiohttp==3.9.1
+psutil==5.9.6
+geopandas==1.0.1
+geopy==2.4.1
+scikit-learn==1.7.2
+scipy==1.15.3
+cloudpickle==3.1.2
+fastapi==0.104.0
+uvicorn==0.24.0
+pydantic==2.5.0
+shap==0.49.1

src/evaluate.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import os, sys
+import json
+import mlflow
+import pandas as pd
+import requests
+from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error
+from dotenv import load_dotenv
+load_dotenv()
+# Add project root to sys.path for imports
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+from src.utils import Timer
+# ------------------------------------------------------------------
+# Constants
+# ------------------------------------------------------------------
+TEST_PATH = "data/processed/test.parquet"
+RUN_INFO_PATH = "reports/last_run_info.json"
+METRICS_PATH = "reports/eval_metrics.json"
+# Tracking server info (should match training script)
+TRACKING_SERVER_HOST = "127.0.0.1"
+TRACKING_SERVER_PORT = 5000
+def main():
+    # ---------------------------------------------------------------
+    # 1. Load metadata
+    # ---------------------------------------------------------------
+    if not os.path.exists(RUN_INFO_PATH):
+        raise FileNotFoundError(f"❌ {RUN_INFO_PATH} not found. Run training first.")
+    print("📄 Loading last MLflow run info...")
+    with open(RUN_INFO_PATH) as f:
+        run_info = json.load(f)
+    run_id = run_info["run_id"]
+    model_uri = run_info["pipeline_model_uri"]
+    print(f"🔍 Run ID: {run_id}")
+    print(f"🔄 Model URI: {model_uri}")
+    print()
+    # ---------------------------------------------------------------
+    # Ensure Google Cloud credentials are set
+    # ---------------------------------------------------------------
+    GOOGLE_CREDS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+    if not GOOGLE_CREDS or not os.path.isfile(GOOGLE_CREDS):
+        raise FileNotFoundError(
+            f"❌ GOOGLE_APPLICATION_CREDENTIALS not set or invalid: {GOOGLE_CREDS}"
+        )
+    print(f"🔐 Using Google credentials: {GOOGLE_CREDS}")
+    # ---------------------------------------------------------------
+    # 2. Connect to MLflow tracking server
+    # ---------------------------------------------------------------
+    try:
+        r = requests.get(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}", timeout=3)
+        if r.status_code != 200:
+            raise requests.exceptions.RequestException
+    except requests.exceptions.RequestException:
+        raise ConnectionError(
+            f"❌ MLflow tracking server not reachable at "
+            f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}. "
+            f"Start the server before evaluation."
+        )
+    mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
+    mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
+    print(f"🔗 Connected to MLflow: {mlflow.get_tracking_uri()}")
+    print(f"   Using run ID: {run_id}")
+    print()
+    # ---------------------------------------------------------------
+    # 3. Load model from GCS
+    # ---------------------------------------------------------------
+    with Timer("Load MLflow model"):
+        model = mlflow.pyfunc.load_model(model_uri)
+    # ---------------------------------------------------------------
+    # 4. Load test data
+    # ---------------------------------------------------------------
+    print("📦 Loading test data...")
+    df_test = pd.read_parquet(TEST_PATH)
+    X_test = df_test.drop(columns=["price"])
+    y_test = df_test["price"]
+    # ---------------------------------------------------------------
+    # 5. Run inference
+    # ---------------------------------------------------------------
+    print("⚙️ Running inference on test set...")
+    with Timer("Model inference"):
+        preds = model.predict(X_test)
+    # ---------------------------------------------------------------
+    # 6. Compute metrics
+    # ---------------------------------------------------------------
+    print("📊 Computing metrics...")
+    metrics = {
+        "r2": round(r2_score(y_test, preds), 4),
+        "mae": round(mean_absolute_error(y_test, preds), 2),
+        "mape": round(mean_absolute_percentage_error(y_test, preds), 4),
+    }
+    # ---------------------------------------------------------------
+    # 7. Log metrics to MLflow (same run)
+    # ---------------------------------------------------------------
+    print("📝 Logging metrics to MLflow...")
+    mlflow.start_run(run_id=run_id)
+    mlflow.log_metrics(metrics)
+    mlflow.end_run()
+    # ---------------------------------------------------------------
+    # 8. Save metrics locally for DVC
+    # ---------------------------------------------------------------
+    os.makedirs(os.path.dirname(METRICS_PATH), exist_ok=True)
+    with open(METRICS_PATH, "w") as f:
+        json.dump(metrics, f, indent=2)
+    print("✅ Evaluation complete!")
+    print(json.dumps(metrics, indent=2))
+    print(f"🔗 MLflow UI: {run_info['mlflow_ui_link']}")
+if __name__ == "__main__":
+    main()

src/features/build_features.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import pandas as pd
+from src.features.geo_features import LondonPropertyGeoFeatures
+# Optional:
+# from src.features.feature_engineering import add_non_geo_features
+# from src.features.nlp_features import PropertyTextEncoder
+def is_numeric_and_true(value):
+    return isinstance(value, (int, float)) and bool(value)
+def build_features(df: pd.DataFrame, geo_dir: str = "data/geo") -> pd.DataFrame:
+    """
+    Compute all features used by the rent price model.
+    Combines geospatial, engineered, and NLP-derived features.
+    """
+    # 1. Geospatial engineered features
+    geo = LondonPropertyGeoFeatures(geo_dir)
+    df = geo.add_features_to_df(df)
+    # 2. Other engineered features (optional)
+    # df = add_non_geo_features(df)
+    # 3. NLP / embeddings (optional)
+    # encoder = PropertyTextEncoder()
+    # df = encoder.add_nlp_embeddings(df, text_column="description")
+    df["deposit"] = df["deposit"].apply(is_numeric_and_true)
+    return df

src/features/geo_features.py ADDED Viewed

	@@ -0,0 +1,158 @@

+# --- Imports ---
+import numpy as np
+import geopandas as gpd
+from shapely.geometry import Point
+from geopy.distance import geodesic
+from sklearn.neighbors import BallTree
+import pandas as pd
+# --- Constants ---
+CITY_CENTER = (51.5072, -0.1276)  # London
+EPSG = "EPSG:4326"
+class LondonPropertyGeoFeatures:
+    """Extract London property geo features for model inference."""
+    def __init__(self, geo_dir):
+        self.CITY_CENTER = CITY_CENTER
+        self.EPSG = EPSG
+        self.geo_dir = geo_dir
+        self.load_datasets()
+        self.prepare_station_tree()
+    def load_datasets(self):
+        """Load and prepare geographic datasets."""
+        self.london_boundaries = gpd.read_file(f"{self.geo_dir}/london_boroughs.geojson").to_crs(self.EPSG)
+        self.hex_gdf = gpd.read_parquet(f"{self.geo_dir}/noize.parquet").to_crs(self.EPSG)
+        self.zone_fares = gpd.read_parquet(f"{self.geo_dir}/zone_fares.parquet").to_crs(self.EPSG)
+        self.stations = gpd.read_parquet(f"{self.geo_dir}/rail_tfl.parquet").to_crs(self.EPSG)
+    def prepare_station_tree(self):
+        """Prepare BallTree for fast station distance queries."""
+        # Convert stations to UTM for accurate distance calculations
+        self.stations_utm = self.stations.to_crs(self.stations.estimate_utm_crs())
+        station_coords = np.array([[p.x, p.y] for p in self.stations_utm.geometry])
+        self.station_tree = BallTree(station_coords, leaf_size=15, metric='euclidean')
+        self.station_names = self.stations_utm['CommonName'].values
+        self.station_tfl = self.stations_utm['TFL'].values
+        self.station_rail = self.stations_utm['RAIL'].values
+    def _create_point_gdf(self, lat, lon):
+        """Create a GeoDataFrame for the point (internal helper)."""
+        point = Point(lon, lat)
+        return gpd.GeoDataFrame(geometry=[point], crs=self.EPSG)
+    def borough(self, lat, lon):
+        """Return the London borough name containing the given coordinates."""
+        prop_gdf = self._create_point_gdf(lat, lon)
+        joined = gpd.sjoin(prop_gdf, self.london_boundaries, how="left", predicate="within")
+        return joined.iloc[0].get("name", None)
+    def compute_angle(self, lat, lon):
+        """Compute angle (in radians) of a point relative to London center."""
+        lat1, lon1 = np.radians(self.CITY_CENTER[0]), np.radians(self.CITY_CENTER[1])
+        lat2, lon2 = np.radians(lat), np.radians(lon)
+        dlon = lon2 - lon1
+        x = np.cos(lat2) * np.sin(dlon)
+        y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
+        return np.arctan2(x, y)
+    def distance_to_center(self, lat, lon):
+        """Return distance from city center (in miles)."""
+        return geodesic((lat, lon), self.CITY_CENTER).miles
+    def noize_class(self, lat, lon):
+        """Return noise class for given coordinates."""
+        prop_gdf = self._create_point_gdf(lat, lon)
+        joined = gpd.sjoin(prop_gdf, self.hex_gdf, how="left", predicate="within")
+        return joined.iloc[0].get("NoiseClass", None)
+    def zone_fare(self, lat, lon):
+        """Return transport fare zone for given coordinates."""
+        prop_gdf = self._create_point_gdf(lat, lon)
+        joined = gpd.sjoin(prop_gdf, self.zone_fares, how="left", predicate="within")
+        zone_name = joined.iloc[0].get("Name", None)
+        # Extract just the zone number if format is "Zone X"
+        if zone_name and "Zone" in zone_name:
+            return zone_name.split(" ")[-1]
+        return zone_name
+    def find_nearest_stations(self, lat, lon, k=3, max_distance_meters=50000):
+        """
+        Find k nearest stations with distances and TFL/RAIL flags.
+        Returns distances in miles.
+        """
+        prop_gdf = self._create_point_gdf(lat, lon)
+        prop_utm = prop_gdf.to_crs(self.stations_utm.crs)
+        # Query the BallTree
+        prop_coords = np.array([[p.x, p.y] for p in prop_utm.geometry])
+        distances_m, indices = self.station_tree.query(prop_coords, k=k)
+        results = []
+        for dist_m, idx in zip(distances_m[0], indices[0]):
+            if dist_m <= max_distance_meters:
+                station_data = {
+                    'distance_miles': dist_m / 1609.34,
+                    'name': self.station_names[idx],
+                    'TFL': bool(self.station_tfl[idx]),
+                    'RAIL': bool(self.station_rail[idx])
+                }
+                results.append(station_data)
+        return results
+    def extract_geo_features(self, lat, lon):
+        """
+        Extract all GEO features for model inference in the required format.
+        """
+        # Geographic features
+        borough_name = self.borough(lat, lon)
+        angle = self.compute_angle(lat, lon)
+        center_distance = self.distance_to_center(lat, lon)
+        noise_class = self.noize_class(lat, lon)
+        zone = self.zone_fare(lat, lon)
+        # Station features
+        nearest_stations = self.find_nearest_stations(lat, lon, k=3)
+        # Prepare station features with proper naming
+        station_features = {}
+        for i, station in enumerate(nearest_stations[:3], 1):
+            station_features[f'distance_to_station{i}'] = round(station['distance_miles'], 6)
+            station_features[f'TFL{i}'] = station['TFL']
+            station_features[f'RAIL{i}'] = station['RAIL']
+        # Fill missing stations with default values
+        for i in range(len(nearest_stations) + 1, 4):
+            station_features[f'distance_to_station{i}'] = None
+            station_features[f'TFL{i}'] = False
+            station_features[f'RAIL{i}'] = False
+        geo_features = {
+                "distance_to_center": round(center_distance, 6),
+                "angle_from_center": round(angle, 6),
+                "zone": zone,
+                "borough": borough_name,
+                "NoiseClass": noise_class,
+                **station_features
+            }
+        return geo_features
+    def add_features_to_df(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Vectorized feature extraction for a full DataFrame."""
+        features = df.apply(
+            lambda row: pd.Series(self.extract_geo_features(row["latitude"], row["longitude"])),
+            axis=1
+        )
+        return pd.concat([df, features], axis=1)

src/rent_price_pipeline.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import mlflow.pyfunc
+import pandas as pd
+import shap
+from catboost import CatBoostRegressor
+from src.features.build_features import build_features
+class RentPricePipeline(mlflow.pyfunc.PythonModel):
+    """
+    MLflow pyfunc wrapper around a CatBoost rent-price regressor.
+    The pipeline:
+    - loads the CatBoost model and the geo data directory from MLflow artifacts
+    - uses build_features() to compute all model features from raw input
+    - predicts rent prices and can explain a prediction with SHAP values
+    """
+    def __init__(self, cb_model_path=None, geo_dir=None):
+        """
+        Store fallback paths and the fixed feature lists.
+        Args:
+            cb_model_path: Fallback path to a saved CatBoost model, used by
+                load_context() when no "catboost_model" artifact is present.
+            geo_dir: Fallback geo data directory handed to build_features().
+        """
+        self.cb_model_path = cb_model_path
+        self.geo_dir = geo_dir
+        # Both are populated by load_context(); code that bypasses MLflow
+        # loading may assign `model` directly, and the explainer is then
+        # created lazily by explain_predictions().
+        self.model = None
+        self.explainer = None
+        # Define feature list for consistency
+        self.numerical_features = [
+            "latitude", "longitude",
+            "distance_to_center", "angle_from_center",
+            "distance_to_station1", "distance_to_station2", "distance_to_station3"
+        ]
+        self.categorical_features = [
+            "bedrooms", "bathrooms", "deposit", "zone", "borough", "propertyType",
+            "furnishType", "NoiseClass", "letType", "TFL1", "TFL2", "TFL3",
+            "RAIL1", "RAIL2", "RAIL3"
+        ]
+        # Column order used when selecting model inputs: numerical first, then categorical.
+        self.features = self.numerical_features + self.categorical_features
+    def load_context(self, context):
+        """
+        Load the CatBoost model and resolve the geo directory from MLflow artifacts.
+        Artifact entries take precedence; the constructor arguments are fallbacks,
+        and "data/geo" is the last-resort geo directory.
+        """
+        model_path = context.artifacts.get("catboost_model", self.cb_model_path)
+        self.model = CatBoostRegressor()
+        self.model.load_model(model_path)
+        # Initialize SHAP explainer after model is loaded
+        self.explainer = shap.TreeExplainer(self.model)
+        # Prefer MLflow artifact path if available
+        self.geo_dir = context.artifacts.get("geo_dir", self.geo_dir or "data/geo")
+    def _prepare_features(self, model_input):
+        """Coerce input to a DataFrame, run build_features(), and select the model columns."""
+        if not isinstance(model_input, pd.DataFrame):
+            model_input = pd.DataFrame(model_input)
+        enriched = build_features(model_input, geo_dir=self.geo_dir)
+        return enriched[self.features]
+    def predict(self, context, model_input):
+        """
+        Compute features and predict rent price.
+        Args:
+            context: MLflow context; not used by this method.
+            model_input: A DataFrame, or anything pd.DataFrame() accepts.
+        Returns:
+            The output of the CatBoost model's predict() for the enriched rows.
+        """
+        return self.model.predict(self._prepare_features(model_input))
+    def explain_predictions(self, model_input):
+        """
+        Predict and return SHAP explanation data as JSON-friendly Python types.
+        Args:
+            model_input: A DataFrame, or anything pd.DataFrame() accepts.
+        Returns:
+            A dict with keys "prediction", "base_value", "shap_values",
+            "feature_names" and "data". Note that "prediction" holds the value
+            for the first input row only, while "shap_values" and "data" cover
+            every input row.
+        """
+        enriched_features = self._prepare_features(model_input)
+        if self.explainer is None:
+            # load_context() has not run (e.g. the model was attached manually),
+            # so build the explainer on first use instead of calling None.
+            self.explainer = shap.TreeExplainer(self.model)
+        shap_values = self.explainer(enriched_features)
+        preds = self.model.predict(enriched_features)
+        return {
+            "prediction": float(preds[0]),
+            "base_value": float(self.explainer.expected_value),
+            "shap_values": shap_values.values.tolist(),  # numpy → list
+            "feature_names": self.features,
+            "data": enriched_features.to_dict(orient="records"),  # DataFrame → JSON
+        }

src/train_and_log_pipeline.py ADDED Viewed

	@@ -0,0 +1,210 @@

+import os, sys
+import yaml
+import mlflow
+import pandas as pd
+from catboost import CatBoostRegressor
+from mlflow.models import infer_signature
+from time import perf_counter
+from datetime import datetime
+import json
+import requests
+from dotenv import load_dotenv
+# Add project root to sys.path for imports
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+from src.features.build_features import build_features
+from src.rent_price_pipeline import RentPricePipeline
+from src.utils import Timer
+# ============================================================
+# 0. Setup: environment, credentials, MLflow connection
+# ============================================================
+overall_start = perf_counter()
+load_dotenv()
+# Abort early when the credentials file named by the environment is missing.
+GOOGLE_CREDS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+if not GOOGLE_CREDS or not os.path.isfile(GOOGLE_CREDS):
+    raise FileNotFoundError(f"❌ GOOGLE_APPLICATION_CREDENTIALS invalid: {GOOGLE_CREDS}")
+os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_CREDS
+TRACKING_SERVER_HOST = "127.0.0.1"   # or external IP if remote
+TRACKING_SERVER_PORT = 5000
+# Single source for the server URL used by the health check and the MLflow client.
+TRACKING_URI = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
+EXPERIMENT_NAME = "Rent_Price_Pipeline"
+# ---- Verify MLflow server ----
+try:
+    r = requests.get(TRACKING_URI, timeout=3)
+    if r.status_code != 200:
+        # Carry the actual status so the chained traceback shows why the check failed.
+        raise requests.exceptions.RequestException(
+            f"unexpected HTTP status {r.status_code} from {TRACKING_URI}"
+        )
+except requests.exceptions.RequestException as exc:
+    raise ConnectionError(
+        f"❌ MLflow tracking server not reachable at {TRACKING_URI}. "
+        f"Please start it before running this script."
+    ) from exc
+# ---- Configure MLflow ----
+mlflow.set_tracking_uri(TRACKING_URI)
+mlflow.set_registry_uri(TRACKING_URI)
+mlflow.set_experiment(EXPERIMENT_NAME)
+print(f"🔗 Connected to MLflow tracking server: {mlflow.get_tracking_uri()}")
+print(f"Experiment: {EXPERIMENT_NAME}")
+print()
+# ============================================================
+# 1. Load parameters and prepare data
+# ============================================================
+# params.yaml supplies both the training hyperparameters ("train") and the
+# model metadata ("model": target, feature lists, columns to drop).
+with Timer("Load parameters"), open("params.yaml") as params_file:
+    params = yaml.safe_load(params_file)
+train_params = params["train"]
+model_meta = params["model"]
+TARGET = model_meta["target"]
+NUMERIC = model_meta["numerical_features"]
+CATEGORICAL = model_meta["categorical_features"]
+FEATURES = NUMERIC + CATEGORICAL
+NOT_USED_COLUMNS = model_meta["not_used_features"]
+print("📦 Loading training data...")
+with Timer("Load training data"):
+    train_df = pd.read_parquet("data/processed/train.parquet").drop(
+        columns=NOT_USED_COLUMNS
+    )
+print("🧩 Feature engineering...")
+with Timer("Feature engineering"):
+    train_df = build_features(train_df, geo_dir="data/geo")
+    X_train = train_df[FEATURES]
+    y_train = train_df[TARGET]
+# ============================================================
+# 2. Train CatBoost model
+# ============================================================
+print("🚀 Training CatBoost model...")
+with Timer("Training CatBoost"):
+    # Hyperparameters taken verbatim from the "train" section of params.yaml.
+    tuned_hyperparams = {
+        key: train_params[key]
+        for key in (
+            "iterations",
+            "depth",
+            "learning_rate",
+            "l2_leaf_reg",
+            "bagging_temperature",
+        )
+    }
+    model = CatBoostRegressor(
+        cat_features=CATEGORICAL,
+        verbose=False,
+        **tuned_hyperparams,
+    )
+    model.fit(X_train, y_train)
+# ============================================================
+# 3. Save model and prepare signatures
+# ============================================================
+cbm_path = "models/catboost_model_v1.cbm"
+with Timer("Save base model"):
+    os.makedirs("models", exist_ok=True)
+    model.save_model(cbm_path)
+with Timer("Infer signature for CatBoost"):
+    # Signature of the bare model: engineered features in, predictions out.
+    sample_predictions = model.predict(X_train[:5])
+    signature_catboost = infer_signature(X_train, sample_predictions)
+print("🧠 Preparing wrapper pipeline...")
+with Timer("Prepare wrapper and raw input example"):
+    wrapped = RentPricePipeline(cb_model_path=cbm_path, geo_dir="data/geo")
+    # Attach the model reloaded from the saved file so predict() can run here
+    # without going through load_context().
+    reloaded_model = CatBoostRegressor()
+    wrapped.model = reloaded_model
+    reloaded_model.load_model(cbm_path)
+    # The wrapper takes raw rows, so the example comes from the un-engineered
+    # training file rather than from X_train.
+    raw_df = pd.read_parquet("data/processed/train.parquet").drop(
+        columns=NOT_USED_COLUMNS
+    )
+    input_example = raw_df.sample(1, random_state=42).drop(columns=[TARGET])
+with Timer("Infer wrapper signature"):
+    pred_example = wrapped.predict(None, input_example)
+    signature_pipeline = infer_signature(input_example, pred_example)
+# ============================================================
+# 4. Log everything to MLflow
+# ============================================================
+from datetime import timezone  # needed for the timezone-aware timestamp below
+print("📝 Logging models to MLflow...")
+with Timer("Log to MLflow"):
+    with mlflow.start_run(run_name=f"{model_meta['type']}_v1") as run:
+        run_id = run.info.run_id
+        experiment_id = run.info.experiment_id
+        # ---- Log CatBoost base model ----
+        mlflow.catboost.log_model(
+            cb_model=model,
+            name="catboost_model",
+            input_example=X_train.sample(1, random_state=42),
+            signature=signature_catboost,
+        )
+        base_uri = f"runs:/{run_id}/catboost_model"
+        # ---- Log wrapper pipeline ----
+        # `name=` matches the catboost call above; `artifact_path=` is the
+        # deprecated spelling of the same argument.
+        logged = mlflow.pyfunc.log_model(
+            name="pipeline_model",
+            python_model=wrapped,
+            code_paths=[
+                "src/features/geo_features.py",
+                "src/features/build_features.py",
+                "src/rent_price_pipeline.py",
+            ],
+            artifacts={
+                "catboost_model": cbm_path,
+                "geo_dir": "data/geo",
+            },
+            signature=signature_pipeline,
+            input_example=input_example,
+        )
+        # ---- Tags for traceability ----
+        mlflow.set_tags({
+            "type": "rent_price_pipeline",
+            "base_model_uri": base_uri,
+            "features_version": "v1",
+            "input_schema": "raw_property_data",
+        })
+        # ---- Save run metadata for DVC ----
+        print("🧩 Saving MLflow run metadata for DVC linkage...")
+        reports_dir = "reports"
+        os.makedirs(reports_dir, exist_ok=True)
+        # NOTE(review): the storage URI is rebuilt by hand against a hard-coded
+        # bucket instead of using the MLflow-native `logged.model_uri`. This
+        # assumes `logged.model_uri` has the form "models:/<model_id>" and that
+        # artifacts live under this bucket layout — TODO confirm.
+        model_registry_uri = logged.model_uri
+        model_id = model_registry_uri.replace("models:/", "")
+        pipeline_model_uri = f"gs://rent_price_bucket/artifacts/{experiment_id}/models/{model_id}/artifacts"
+        print("saved link:", pipeline_model_uri)
+        # ============================================================
+        # Save metadata to JSON — clean and portable
+        # ============================================================
+        run_info = {
+            "run_id": run_id,
+            "pipeline_model_uri": pipeline_model_uri,
+            # Timezone-aware UTC time rendered with a "Z" suffix, e.g.
+            # 2025-11-18T12:34:56.789012Z (same format as before).
+            "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
+            "mlflow_experiment": mlflow.get_experiment(experiment_id).name,
+            "mlflow_ui_link": (
+                f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}/#/experiments/"
+                f"{experiment_id}/runs/{run_id}"
+            ),
+        }
+        with open(os.path.join(reports_dir, "last_run_info.json"), "w", encoding="utf-8") as f:
+            json.dump(run_info, f, indent=2)
+        with open(os.path.join(reports_dir, "last_run_id.txt"), "w", encoding="utf-8") as f:
+            f.write(run_id)
+        print("   📄 Saved run metadata to reports/last_run_info.json")
+        print("   Run ID:", run_id)
+        print("   Pipeline model URI:", pipeline_model_uri)
+        print("   MLflow UI:", run_info["mlflow_ui_link"])
+# ============================================================
+# 5. Completion
+# ============================================================
+print("✅ Training and remote logging completed!")
+print("   Base model URI:", base_uri)
+print("   Wrapper pipeline logged as: pipeline_model/")
+print(f"🏁 Total script time: {perf_counter() - overall_start:.2f}s")

src/utils.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from time import perf_counter
+class Timer:
+    """
+    Context manager that prints how long its `with` block took.
+    After the block exits, the measured duration in seconds is also available
+    on `elapsed`, so callers can reuse it without parsing the printed line.
+    Exceptions raised inside the block are not suppressed; the timing line is
+    printed either way.
+    """
+    def __init__(self, label: str):
+        self.label = label
+        self.start = None  # perf_counter() reading taken in __enter__
+        self.elapsed = None  # seconds; set in __exit__
+    def __enter__(self):
+        self.start = perf_counter()
+        return self
+    def __exit__(self, exc_type, exc, tb):
+        self.elapsed = perf_counter() - self.start
+        print(f"⏱ {self.label} took {self.elapsed:.2f}s")
+        return False  # never swallow exceptions from the timed block