# HappySardines / predictor.py
# AxelHolst's picture
# feat: add high-res heatmap v3 with Hopsworks storage
# 0fd960e
"""
Model loading and prediction logic for HappySardines.
Loads the XGBoost model from Hopsworks Model Registry and makes predictions.
"""
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
load_dotenv()
# Process-wide cache used by load_model(): the flag records whether a model
# has already been loaded, and _model holds the loaded classifier itself.
_model_loaded = False
_model = None
# Display metadata for each occupancy class, keyed by the integer class index
# (0-6). Every entry carries a short label, a user-facing message, a color
# name and an emoji icon.
# NOTE(review): the seven classes look like the GTFS-realtime OccupancyStatus
# values - confirm against the training pipeline.
OCCUPANCY_LABELS = {
    level: {"label": label, "message": message, "color": color, "icon": icon}
    for level, (label, message, color, icon) in enumerate([
        ("Empty", "Plenty of room - pick any seat!", "green", "🟢"),
        ("Many seats available", "Lots of seats to choose from.", "green", "🟢"),
        ("Few seats available", "Some seats left - you might need to look around.", "yellow", "🟡"),
        ("Standing room only", "Expect to stand - pack your patience!", "orange", "🟠"),
        ("Crushed standing", "Very crowded - consider waiting for the next one.", "red", "🔴"),
        ("Full", "Bus is full - you may not get on.", "red", "🔴"),
        ("Not accepting passengers", "Bus is not accepting passengers.", "gray", "⚫"),
    ])
}
# Column order the model expects (occupancy_xgboost_model_new v4).
# It has to match the training pipeline exactly; the prediction functions
# select DataFrame columns in this order before calling the model.
FEATURE_ORDER = [
    # Trip / vehicle identifiers and motion summary
    *("trip_id", "vehicle_id", "max_speed", "n_positions"),
    # Latitude and longitude bounds
    *("lat_min", "lat_max", "lat_mean"),
    *("lon_min", "lon_max", "lon_mean"),
    # Bearing range
    *("bearing_min", "bearing_max"),
    # Time
    *("hour", "day_of_week"),
    # Weather
    *("temperature_2m", "precipitation", "cloud_cover", "wind_speed_10m", "rain", "snowfall"),
    # Holiday flags
    *("is_work_free", "is_red_day", "is_day_before_holiday"),
]
# Stand-in values for the per-vehicle features, since no real-time vehicle
# data is available at prediction time. They are approximate averages from
# the training data.
DEFAULT_VEHICLE_FEATURES = dict(
    max_speed=45.0,     # typical max speed
    n_positions=30,     # typical number of GPS points per trip window
    bearing_min=0.0,    # neutral bearing
    bearing_max=360.0,  # full range (stationary / unknown direction)
)
def load_model():
    """
    Return the XGBoost occupancy classifier, fetching it on first use.

    The first successful call logs in to Hopsworks, downloads version 4 of
    "occupancy_xgboost_model_new" from the Model Registry and loads its
    "model.json" into an XGBClassifier. The result is stored in the
    module-level cache and returned directly by later calls.

    Reads the HOPSWORKS_API_KEY and HOPSWORKS_PROJECT environment variables.

    Returns:
        The loaded XGBClassifier.

    Raises:
        ValueError: If HOPSWORKS_API_KEY is unset or empty.
        Exception: Any error raised while connecting, downloading or
            loading is printed and then re-raised unchanged.
    """
    global _model, _model_loaded

    # Fast path: an earlier call already populated the cache
    if _model_loaded:
        return _model

    # Validate credentials before attempting any connection
    api_key = os.environ.get("HOPSWORKS_API_KEY")
    project_name = os.environ.get("HOPSWORKS_PROJECT")
    if not api_key:
        raise ValueError("HOPSWORKS_API_KEY environment variable not set. Please add it in Space settings.")

    try:
        # Imported lazily so the heavy dependencies are only needed when a
        # model is actually loaded
        import hopsworks
        from xgboost import XGBClassifier

        print("Connecting to Hopsworks...")
        registry = hopsworks.login(
            project=project_name,
            api_key_value=api_key,
        ).get_model_registry()

        print("Fetching model from registry...")
        # Version 4 is pinned explicitly (the model trained with 23 features)
        entry = registry.get_model("occupancy_xgboost_model_new", version=4)

        print(f"Downloading model version {entry.version}...")
        download_dir = entry.download()

        print("Loading XGBoost model...")
        classifier = XGBClassifier()
        classifier.load_model(os.path.join(download_dir, "model.json"))

        # Publish to the cache only after the model loaded successfully
        _model = classifier
        _model_loaded = True
        print("Model loaded successfully!")
        return classifier
    except Exception as e:
        print(f"Error loading model: {e}")
        raise
def predict_occupancy(lat, lon, hour, day_of_week, weather, holidays):
    """
    Predict the occupancy class for a single location and time.

    Vehicle-level features are filled from DEFAULT_VEHICLE_FEATURES, the
    trip and vehicle ids are zero placeholders, and the min/max/mean
    latitude and longitude features are all set to the given point.

    Args:
        lat: Latitude
        lon: Longitude
        hour: Hour of day (0-23)
        day_of_week: Day of week (0=Monday, 6=Sunday)
        weather: Dict that may contain temperature_2m, precipitation,
            cloud_cover, wind_speed_10m, rain and snowfall. Missing keys
            fall back to 10.0, 0.0, 50.0, 5.0, 0.0 and 0.0 respectively.
        holidays: Dict that may contain is_work_free, is_red_day and
            is_day_before_holiday. Missing keys count as False.

    Returns:
        Tuple of (predicted_class, confidence, all_probabilities), where
        predicted_class is the index of the highest probability,
        confidence is that probability as a float, and all_probabilities
        is the full per-class probability list.
    """
    model = load_model()

    # Placeholder identifiers: no real trip or vehicle is known here
    row = {"trip_id": 0, "vehicle_id": 0}

    # Vehicle features come from the module defaults
    for name in ("max_speed", "n_positions", "bearing_min", "bearing_max"):
        row[name] = DEFAULT_VEHICLE_FEATURES[name]

    # A single point has identical min, max and mean coordinates
    row.update(dict.fromkeys(("lat_min", "lat_max", "lat_mean"), lat))
    row.update(dict.fromkeys(("lon_min", "lon_max", "lon_mean"), lon))

    # Time
    row["hour"] = hour
    row["day_of_week"] = day_of_week

    # Weather, with a fallback for each missing reading
    weather_fallbacks = (
        ("temperature_2m", 10.0),
        ("precipitation", 0.0),
        ("cloud_cover", 50.0),
        ("wind_speed_10m", 5.0),
        ("rain", 0.0),
        ("snowfall", 0.0),
    )
    for name, fallback in weather_fallbacks:
        row[name] = weather.get(name, fallback)

    # Holiday flags are passed to the model as 0/1 integers
    for flag in ("is_work_free", "is_red_day", "is_day_before_holiday"):
        row[flag] = int(holidays.get(flag, False))

    # One-row frame with columns in the order the model expects
    frame = pd.DataFrame([row])[FEATURE_ORDER]
    class_probs = model.predict_proba(frame)[0]

    # The predicted class is the one with the highest probability
    best = int(class_probs.argmax())
    return best, float(class_probs[best]), class_probs.tolist()
def predict_occupancy_batch(locations, hour, day_of_week, weather, holidays):
    """
    Predict occupancy for multiple locations in a single batch.

    Much faster than calling predict_occupancy() in a loop, because all rows
    go through one DataFrame and one predict_proba call.

    Args:
        locations: Iterable of (lat, lon) pairs. Lists, tuples and one-shot
            iterators/generators are all accepted.
        hour: Hour of day (0-23)
        day_of_week: Day of week (0=Monday, 6=Sunday)
        weather: Dict that may contain temperature_2m, precipitation,
            cloud_cover, wind_speed_10m, rain and snowfall. Missing keys
            fall back to 10.0, 0.0, 50.0, 5.0, 0.0 and 0.0 respectively.
        holidays: Dict that may contain is_work_free, is_red_day and
            is_day_before_holiday. Missing keys count as False.

    Returns:
        List of (predicted_class, confidence) tuples, one per location in
        input order. An empty input returns an empty list without loading
        the model.
    """
    # Materialize once so a generator is not exhausted before results are
    # built, and so emptiness can be checked up front.
    locations = list(locations)
    if not locations:
        # An empty frame has no columns, so selecting FEATURE_ORDER would
        # raise KeyError; there is nothing to predict anyway.
        return []

    model = load_model()

    # Features that are identical for every row are computed once
    shared = {
        "trip_id": 0,      # placeholder
        "vehicle_id": 0,   # placeholder
        "max_speed": DEFAULT_VEHICLE_FEATURES["max_speed"],
        "n_positions": DEFAULT_VEHICLE_FEATURES["n_positions"],
        "bearing_min": DEFAULT_VEHICLE_FEATURES["bearing_min"],
        "bearing_max": DEFAULT_VEHICLE_FEATURES["bearing_max"],
        "hour": hour,
        "day_of_week": day_of_week,
        "temperature_2m": weather.get("temperature_2m", 10.0),
        "precipitation": weather.get("precipitation", 0.0),
        "cloud_cover": weather.get("cloud_cover", 50.0),
        "wind_speed_10m": weather.get("wind_speed_10m", 5.0),
        "rain": weather.get("rain", 0.0),
        "snowfall": weather.get("snowfall", 0.0),
        "is_work_free": int(holidays.get("is_work_free", False)),
        "is_red_day": int(holidays.get("is_red_day", False)),
        "is_day_before_holiday": int(holidays.get("is_day_before_holiday", False)),
    }

    # Only the coordinates vary per row; a single point has identical
    # min, max and mean values.
    rows = [
        {
            **shared,
            "lat_min": lat,
            "lat_max": lat,
            "lat_mean": lat,
            "lon_min": lon,
            "lon_max": lon,
            "lon_mean": lon,
        }
        for lat, lon in locations
    ]

    # Single DataFrame, single predict call; column selection enforces the
    # order the model expects regardless of dict key order.
    X = pd.DataFrame(rows)[FEATURE_ORDER]
    probabilities = model.predict_proba(X)

    # One (class, confidence) pair per probability row
    results = []
    for probs in probabilities:
        predicted_class = int(np.argmax(probs))
        results.append((predicted_class, float(probs[predicted_class])))
    return results