Spaces:

AxelHolst
/

HappySardines

Sleeping

AxelHolst committed on Jan 8

Commit

764eb6f

1 Parent(s): 37b8251

feat: add monitoring dashboard and improve prediction speed

- Add Monitoring page (pages/Monitoring.py) showing model performance metrics
- Key metrics: accuracy, F1, precision, MAE
- Accuracy trend chart over time
- Per-class performance breakdown for all 7 occupancy classes
- Model version filter with v4 as default
- Alert banner when accuracy drops below threshold

- Improve real-time prediction speed in app.py
- Make trip info loading optional (was causing 2+ min delays)
- Add "Load nearby bus info" button for on-demand loading
- Trip forecast display when available from forecast_fg

- Update trip_info.py with haversine distance functions

Files changed (3) hide show

app.py +122 -22
pages/Monitoring.py +400 -0
trip_info.py +133 -28

app.py CHANGED Viewed

@@ -27,7 +27,9 @@ import hopsworks
 from predictor import predict_occupancy, load_model, OCCUPANCY_LABELS
 from weather import get_weather_for_prediction
 from holidays import get_holiday_features
-from trip_info import load_static_trip_info, find_nearest_trip, load_static_stops_info, find_closest_stop
 from contours import load_contours_from_file, grid_to_cells_geojson
 # Constants
@@ -75,6 +77,12 @@ def get_static_stops_df():
         return None
 @st.cache_resource
 def get_model():
     """Load model once and cache it."""
@@ -90,6 +98,33 @@ def cached_predict_occupancy(lat, lon, hour, day_of_week, weather, holidays):
     return predict_occupancy(lat, lon, hour, day_of_week, weather, holidays)
 @st.cache_resource
 def fetch_heatmaps_from_hopsworks():
     """
@@ -292,8 +327,12 @@ def create_map(selected_lat=None, selected_lon=None, show_heatmap=False,
     return m
-def make_prediction(lat, lon, selected_datetime):
-    """Make prediction and return formatted result."""
     if lat is None or lon is None:
         return None, None, None
@@ -314,16 +353,44 @@ def make_prediction(lat, lon, selected_datetime):
             holidays=holidays
         )
         trip_info = None
-        static_trip_df = get_static_trip_df()
-        if static_trip_df is not None:
-            trip_info = find_nearest_trip(lat, lon, selected_datetime, static_trip_df)
         return pred_class, confidence, {
             "weather": weather,
             "holidays": holidays,
             "datetime": selected_datetime,
-            "trip_info": trip_info
         }
     except Exception as e:
         return None, None, str(e)
@@ -446,12 +513,16 @@ with col2:
     # Show selected coordinates
     st.markdown(f"**Location:** {st.session_state.selected_lat:.4f}, {st.session_state.selected_lon:.4f}")
-    # Make prediction
     with st.spinner("Fetching prediction..."):
         pred_class, confidence, result = make_prediction(
             st.session_state.selected_lat,
             st.session_state.selected_lon,
-            selected_datetime
         )
     if pred_class is not None:
@@ -507,22 +578,51 @@ with col2:
                 if route_desc:
                     info_lines.append(f"Type: {route_desc}")
-                # Trip ID
-                trip_id = trip_info.get("trip_id")
-                if trip_id is not None:
-                    # Closest stop
-                    static_stops_df = get_static_stops_df()
-                    closest_stop = find_closest_stop(
-                        st.session_state.selected_lat,
-                        st.session_state.selected_lon,
-                        trip_id,
-                        static_stops_df
-                    )
-                    if closest_stop:
-                        info_lines.append(f"Nearest stop: {closest_stop}")
                 if info_lines:
                     st.markdown("**Bus Info:**\n- " + "\n- ".join(info_lines))
             # Weather conditions
             conditions = []

 from predictor import predict_occupancy, load_model, OCCUPANCY_LABELS
 from weather import get_weather_for_prediction
 from holidays import get_holiday_features
+from trip_info import (
+    load_static_trip_info, find_nearest_trip, load_static_stops_info
+)
 from contours import load_contours_from_file, grid_to_cells_geojson
 # Constants
         return None
+def is_stops_data_cached():
+    """Report whether this session has already flagged the stops data as loaded.
+    Only the ``stops_data_loaded`` session-state entry is read, so calling this
+    never triggers the data load itself.
+    """
+    # A missing flag reads as False; a present flag is returned as stored
+    return st.session_state.get("stops_data_loaded", False)
 @st.cache_resource
 def get_model():
     """Load model once and cache it."""
     return predict_occupancy(lat, lon, hour, day_of_week, weather, holidays)
+@st.cache_data(ttl=3600)
+def fetch_trip_forecasts_from_hopsworks():
+    """
+    Fetch trip forecasts from the Hopsworks ``forecast_fg`` feature group.
+    Versions are tried newest first (v2, then v1); the first non-empty read wins.
+    Returns:
+        The forecast DataFrame (expected columns: trip_id, hour, weekday,
+        predicted_occupancy, confidence), or None if login fails or no version
+        yields any rows.
+    """
+    try:
+        project = hopsworks.login()
+        fs = project.get_feature_store()
+    except Exception as e:
+        print(f"Could not load trip forecasts: {e}")
+        return None
+    # Try v2 (new schema with hour/weekday), fall back to v1
+    for version in (2, 1):
+        try:
+            forecast_fg = fs.get_feature_group("forecast_fg", version=version)
+            df = forecast_fg.read()
+        except Exception as e:
+            # Still best-effort, but say why this version was skipped instead of
+            # swallowing the error silently
+            print(f"Could not read forecast_fg v{version}: {e}")
+            continue
+        if df is not None and not df.empty:
+            print(f"Loaded {len(df)} trip forecasts from Hopsworks v{version}")
+            return df
+    return None
 @st.cache_resource
 def fetch_heatmaps_from_hopsworks():
     """
     return m
+def make_prediction(lat, lon, selected_datetime, skip_trip_info=False):
+    """Make prediction and return formatted result.
+    Args:
+        skip_trip_info: If True, skip the slow trip info lookup
+    """
     if lat is None or lon is None:
         return None, None, None
             holidays=holidays
         )
+        # Find nearest trip from static data (only if not skipping)
         trip_info = None
+        trip_forecast = None
+        if not skip_trip_info:
+            static_stops_df = get_static_stops_df()
+            # Mark that we've loaded the data (for future quick checks)
+            st.session_state.stops_data_loaded = True
+            if static_stops_df is not None:
+                trip_info = find_nearest_trip(lat, lon, selected_datetime, static_stops_df)
+            # Try to get trip forecast if available
+            if trip_info and trip_info.get("trip_id"):
+                forecasts_df = fetch_trip_forecasts_from_hopsworks()
+                if forecasts_df is not None:
+                    trip_id = trip_info["trip_id"]
+                    hour = selected_datetime.hour
+                    weekday = selected_datetime.weekday()
+                    # Find matching forecast
+                    match = forecasts_df[
+                        (forecasts_df["trip_id"] == trip_id) &
+                        (forecasts_df["hour"] == hour) &
+                        (forecasts_df["weekday"] == weekday)
+                    ]
+                    if not match.empty:
+                        row = match.iloc[0]
+                        trip_forecast = {
+                            "predicted_occupancy": int(row.get("predicted_occupancy", 0)),
+                            "confidence": float(row.get("confidence", 0)),
+                        }
         return pred_class, confidence, {
             "weather": weather,
             "holidays": holidays,
             "datetime": selected_datetime,
+            "trip_info": trip_info,
+            "trip_forecast": trip_forecast
         }
     except Exception as e:
         return None, None, str(e)
     # Show selected coordinates
     st.markdown(f"**Location:** {st.session_state.selected_lat:.4f}, {st.session_state.selected_lon:.4f}")
+    # Check if stops data is already cached (fast check)
+    stops_already_loaded = st.session_state.get("stops_data_loaded", False)
+    # Make prediction (skip trip info on first load to be fast)
     with st.spinner("Fetching prediction..."):
         pred_class, confidence, result = make_prediction(
             st.session_state.selected_lat,
             st.session_state.selected_lon,
+            selected_datetime,
+            skip_trip_info=not stops_already_loaded
         )
     if pred_class is not None:
                 if route_desc:
                     info_lines.append(f"Type: {route_desc}")
+                # Closest stop from trip info (already computed)
+                closest_stop = trip_info.get("closest_stop")
+                if closest_stop:
+                    info_lines.append(f"Nearest stop: {closest_stop}")
+                # Distance to stop
+                distance = trip_info.get("distance_m")
+                if distance is not None:
+                    info_lines.append(f"Distance: {distance}m")
                 if info_lines:
                     st.markdown("**Bus Info:**\n- " + "\n- ".join(info_lines))
+            elif not stops_already_loaded:
+                # Offer to load trip info (it's slow on first load)
+                if st.button("Load nearby bus info", help="First load takes ~1-2 minutes"):
+                    with st.spinner("Loading trip data from Hopsworks (this may take a minute)..."):
+                        # Trigger the load and rerun
+                        get_static_stops_df()
+                        st.session_state.stops_data_loaded = True
+                        st.rerun()
+            # Show trip-specific forecast if available
+            trip_forecast = result.get("trip_forecast")
+            if trip_forecast:
+                forecast_class = trip_forecast["predicted_occupancy"]
+                forecast_conf = trip_forecast["confidence"]
+                forecast_label = OCCUPANCY_LABELS.get(forecast_class, OCCUPANCY_LABELS[0])
+                forecast_color = OCCUPANCY_COLORS.get(forecast_class, "#6b7280")
+                st.markdown(f"""
+                <div style="
+                    background: {forecast_color}11;
+                    border: 1px solid {forecast_color}44;
+                    border-radius: 8px;
+                    padding: 12px;
+                    margin: 8px 0;
+                ">
+                    <div style="font-size: 0.85em; color: #6b7280; margin-bottom: 4px;">
+                        Trip-specific forecast:
+                    </div>
+                    <div style="font-weight: 600; color: {forecast_color};">
+                        {forecast_label['icon']} {forecast_label['label']} ({forecast_conf:.0%})
+                    </div>
+                </div>
+                """, unsafe_allow_html=True)
             # Weather conditions
             conditions = []

pages/Monitoring.py ADDED Viewed

	@@ -0,0 +1,400 @@

+"""
+HappySardines - Model Monitoring Dashboard
+Displays model performance metrics from hindcast analysis:
+- Accuracy trends over time
+- Actual vs predicted occupancy comparison
+- Per-class performance breakdown
+- Alerts for model drift
+"""
+import streamlit as st
+import pandas as pd
+import numpy as np
+import hopsworks
+from datetime import datetime, timedelta
+# Page config
+st.set_page_config(
+    page_title="HappySardines - Monitoring",
+    page_icon="📊",
+    layout="wide"
+)
+# Constants
+ALERT_THRESHOLD = 0.65  # Alert if the latest day's accuracy drops below this
+MODEL_NAME = "occupancy_xgboost_model_new"  # NOTE(review): not referenced in the code shown here
+CURRENT_MODEL_VERSION = 4  # Current production model version; preselected in the version filter
+# Occupancy class labels, keyed by the integer occupancy class (0-6)
+OCCUPANCY_LABELS = {
+    0: "Empty",
+    1: "Many seats",
+    2: "Few seats",
+    3: "Standing",
+    4: "Crowded",
+    5: "Full",
+    6: "Not accepting",
+}
+# Colors for occupancy levels (hex strings), keyed by the same class integers
+OCCUPANCY_COLORS = {
+    0: "#22c55e",  # Green
+    1: "#84cc16",  # Lime
+    2: "#eab308",  # Yellow
+    3: "#f97316",  # Orange
+    4: "#ef4444",  # Red
+    5: "#ef4444",  # Red (same shade as class 4)
+    6: "#6b7280",  # Gray
+}
+@st.cache_data(ttl=3600)
+def fetch_monitoring_data():
+    """
+    Read the ``monitor_fg`` feature group (version 1) from Hopsworks.
+    Expected columns of the result:
+    - window_start, trip_id
+    - actual_occupancy_mode, predicted_occupancy_mode
+    - accuracy, precision, recall, f1_weighted, mae
+    - model_version, generated_at
+    Returns an empty DataFrame when nothing was read or when loading failed.
+    """
+    try:
+        feature_store = hopsworks.login().get_feature_store()
+        records = feature_store.get_feature_group("monitor_fg", version=1).read()
+        if records is None or records.empty:
+            return pd.DataFrame()
+        # Ensure datetime columns are properly typed
+        for column in ("window_start", "generated_at"):
+            if column in records.columns:
+                records[column] = pd.to_datetime(records[column])
+        print(f"Loaded {len(records)} monitoring records from Hopsworks")
+        return records
+    except Exception as e:
+        print(f"Error loading monitoring data: {e}")
+        return pd.DataFrame()
+def get_daily_metrics(df: pd.DataFrame) -> pd.DataFrame:
+    """Collapse monitoring records to one row of metrics per calendar day.
+    Args:
+        df: Monitoring records with a datetime ``window_start`` column plus the
+            accuracy, precision, recall, f1_weighted, mae and model_version columns.
+    Returns:
+        DataFrame with a datetime ``date`` column and the first value of each metric
+        for that day, sorted by date; an empty DataFrame if ``df`` is empty.
+        The input frame is not modified.
+    """
+    if df.empty:
+        return pd.DataFrame()
+    metric_columns = ["accuracy", "precision", "recall", "f1_weighted", "mae", "model_version"]
+    dated = df.assign(date=df["window_start"].dt.date)
+    # Get first record per day (metrics are already daily aggregates)
+    per_day = dated.groupby("date").agg(dict.fromkeys(metric_columns, "first")).reset_index()
+    per_day["date"] = pd.to_datetime(per_day["date"])
+    return per_day.sort_values("date")
+def get_hourly_comparison(df: pd.DataFrame) -> pd.DataFrame:
+    """Average actual and predicted occupancy per clock hour.
+    Each record's ``window_start`` is floored to the hour, then the mean of
+    ``actual_occupancy_mode`` and ``predicted_occupancy_mode`` is taken per hour.
+    Args:
+        df: Monitoring records with a datetime ``window_start`` column.
+    Returns:
+        DataFrame with columns hour, actual_occupancy_mode and
+        predicted_occupancy_mode, sorted by hour; an empty DataFrame if ``df``
+        is empty. The input frame is not modified.
+    """
+    if df.empty:
+        return pd.DataFrame()
+    df = df.copy()
+    # Lowercase "h": the uppercase "H" alias is deprecated since pandas 2.2
+    df["hour"] = df["window_start"].dt.floor("h")
+    hourly = df.groupby("hour").agg({
+        "actual_occupancy_mode": "mean",
+        "predicted_occupancy_mode": "mean",
+    }).reset_index()
+    return hourly.sort_values("hour")
+def get_per_class_metrics(df: pd.DataFrame) -> pd.DataFrame:
+    """Compute sample count and hit rate for each of the 7 occupancy classes.
+    A record counts towards class ``c`` when its ``actual_occupancy_mode`` equals
+    ``c``; it counts as correct when ``predicted_occupancy_mode`` matches as well.
+    Returns:
+        DataFrame with one row per class 0-6 and columns class, label, count,
+        correct, accuracy (None for a class without samples); an empty DataFrame
+        if ``df`` is empty.
+    """
+    if df.empty:
+        return pd.DataFrame()
+    actual = df["actual_occupancy_mode"]
+    is_hit = actual == df["predicted_occupancy_mode"]
+    rows = []
+    # Always show all 7 classes (0-6), even if some have no data
+    for cls in range(7):
+        in_class = actual == cls
+        total = int(in_class.sum())
+        if total:
+            correct = (is_hit & in_class).sum()
+            accuracy = correct / total
+        else:
+            correct, accuracy = 0, None  # No data for this class
+        rows.append({
+            "class": cls,
+            "label": OCCUPANCY_LABELS.get(cls, f"Class {cls}"),
+            "count": total,
+            "correct": correct,
+            "accuracy": accuracy,
+        })
+    return pd.DataFrame(rows)
+def render_metric_card(label: str, value: float, format_str: str = "{:.1%}",
+                       threshold_low: float = None, threshold_high: float = None,
+                       *, lower_is_better: bool = False):
+    """Render a metric card whose color reflects the value's position against thresholds.
+    Args:
+        label: Caption shown above the value.
+        value: Metric value; None or NaN is rendered as "N/A" in the neutral color.
+        format_str: ``str.format`` template applied to the value.
+        threshold_low: Lower threshold, or None to disable it.
+        threshold_high: Upper threshold, or None to disable it.
+        lower_is_better: If True (error-style metrics), values at or below
+            ``threshold_low`` are green and values above ``threshold_high`` are red.
+            The default (False) keeps the original coloring: below ``threshold_low``
+            is red, at or above ``threshold_high`` is green.
+    """
+    red, green, yellow = "#ef4444", "#22c55e", "#eab308"
+    missing = value is None or pd.isna(value)
+    formatted = "N/A" if missing else format_str.format(value)
+    # Determine color; a missing value is never compared, because
+    # `None < threshold` raises TypeError
+    if missing:
+        color = yellow
+    elif lower_is_better:
+        if threshold_high is not None and value > threshold_high:
+            color = red
+        elif threshold_low is not None and value <= threshold_low:
+            color = green
+        else:
+            color = yellow
+    elif threshold_low is not None and value < threshold_low:
+        color = red
+    elif threshold_high is not None and value >= threshold_high:
+        color = green
+    else:
+        color = yellow
+    st.markdown(f"""
+    <div style="
+        background: {color}11;
+        border: 1px solid {color}44;
+        border-radius: 8px;
+        padding: 16px;
+        text-align: center;
+    ">
+        <div style="font-size: 0.9em; color: #6b7280; margin-bottom: 4px;">
+            {label}
+        </div>
+        <div style="font-size: 1.8em; font-weight: 600; color: {color};">
+            {formatted}
+        </div>
+    </div>
+    """, unsafe_allow_html=True)
+# Main page content
+st.title("📊 Model Monitoring")
+st.markdown("Track model performance over time using hindcast analysis.")
+# Load data
+with st.spinner("Loading monitoring data..."):
+    monitor_df = fetch_monitoring_data()
+# fetch_monitoring_data() returns an empty DataFrame both when the feature group
+# has no rows and when loading fails, so this branch covers both situations
+if monitor_df.empty:
+    st.warning("""
+    **No monitoring data available yet.**
+    Monitoring data is generated daily by the inference pipeline, which compares
+    yesterday's predictions to actual observed occupancy.
+    The inference pipeline runs at 09:00 UTC. Check back after it has run at least once.
+    """)
+    st.stop()
+# Model version filter (the selector is only shown when the data holds more than one version)
+available_versions = sorted(monitor_df["model_version"].dropna().unique())
+if len(available_versions) > 1:
+    st.sidebar.subheader("Filter")
+    # Default to current model version if available, otherwise to the highest version present
+    default_idx = available_versions.index(CURRENT_MODEL_VERSION) if CURRENT_MODEL_VERSION in available_versions else len(available_versions) - 1
+    selected_version = st.sidebar.selectbox(
+        "Model Version",
+        options=available_versions,
+        index=default_idx,
+        format_func=lambda x: f"v{int(x)}" + (" (current)" if x == CURRENT_MODEL_VERSION else "")
+    )
+    # Filter data by selected version
+    monitor_df = monitor_df[monitor_df["model_version"] == selected_version]
+    if monitor_df.empty:
+        st.warning(f"No monitoring data available for model v{int(selected_version)}.")
+        st.stop()
+else:
+    # Zero or one version present: nothing to choose, and monitor_df is left unfiltered
+    selected_version = available_versions[0] if available_versions else None
+# Show warning if viewing old model data
+if selected_version is not None and selected_version != CURRENT_MODEL_VERSION:
+    st.info(f"""
+    **Viewing historical data from model v{int(selected_version)}.**
+    The current production model is v{CURRENT_MODEL_VERSION}.
+    Data for v{CURRENT_MODEL_VERSION} will appear after the inference pipeline runs.
+    """)
+# Calculate aggregates
+daily_metrics = get_daily_metrics(monitor_df)
+hourly_comparison = get_hourly_comparison(monitor_df)
+per_class = get_per_class_metrics(monitor_df)
+# Get latest metrics (daily_metrics is sorted by date, so the last row is the newest day)
+latest = daily_metrics.iloc[-1] if not daily_metrics.empty else None
+# Header with model info
+if latest is not None:
+    col1, col2 = st.columns([3, 1])
+    with col1:
+        # NOTE(review): int() would fail if model_version is missing for the latest day — confirm it is always set
+        st.markdown(f"**Model Version:** v{int(latest['model_version'])}")
+    with col2:
+        last_date = latest["date"].strftime("%Y-%m-%d")
+        st.markdown(f"**Last Updated:** {last_date}")
+# Alert banner: shown when the newest day's accuracy is below ALERT_THRESHOLD
+if latest is not None and latest["accuracy"] < ALERT_THRESHOLD:
+    st.error(f"""
+    ⚠️ **Model Performance Alert**
+    Accuracy ({latest['accuracy']:.1%}) is below the threshold ({ALERT_THRESHOLD:.0%}).
+    Consider investigating recent data quality or retraining the model.
+    """)
+st.divider()
+# Key metrics cards: one card per headline metric of the newest day
+st.subheader("Latest Performance")
+if latest is not None:
+    col1, col2, col3, col4 = st.columns(4)
+    with col1:
+        render_metric_card(
+            "Accuracy",
+            latest["accuracy"],
+            threshold_low=0.60,
+            threshold_high=0.70
+        )
+    with col2:
+        render_metric_card(
+            "F1 Score (Weighted)",
+            latest["f1_weighted"],
+            threshold_low=0.55,
+            threshold_high=0.65
+        )
+    with col3:
+        render_metric_card(
+            "Precision",
+            latest["precision"],
+            threshold_low=0.55,
+            threshold_high=0.70
+        )
+    with col4:
+        # NOTE(review): render_metric_card colors values below threshold_low red and
+        # values at or above threshold_high green, so a low (good) MAE is shown red
+        # and a high (bad) MAE green — the coloring is inverted for this card.
+        render_metric_card(
+            "MAE",
+            latest["mae"],
+            format_str="{:.2f}",
+            threshold_low=0.3,  # Lower is better for MAE
+            threshold_high=0.6
+        )
+st.divider()
+# Accuracy trend chart (needs at least two daily rows)
+st.subheader("Accuracy Over Time")
+if daily_metrics.empty or len(daily_metrics) <= 1:
+    st.info("Need at least 2 days of data to show trend chart.")
+else:
+    accuracy_by_date = daily_metrics.set_index("date")[["accuracy"]]
+    st.line_chart(accuracy_by_date, use_container_width=True)
+    # Show trend: change between the two most recent days
+    accuracy_series = daily_metrics["accuracy"]
+    delta = accuracy_series.iloc[-1] - accuracy_series.iloc[-2]
+    if delta > 0:
+        trend = "📈"
+    elif delta < 0:
+        trend = "📉"
+    else:
+        trend = "➡️"
+    st.caption(f"{trend} Change from previous: {delta:+.1%}")
+st.divider()
+# Actual vs Predicted comparison: hourly means of the actual and predicted class
+st.subheader("Actual vs Predicted Occupancy")
+if not hourly_comparison.empty:
+    # Rename columns for display
+    chart_df = hourly_comparison.rename(columns={
+        "actual_occupancy_mode": "Actual",
+        "predicted_occupancy_mode": "Predicted"
+    }).set_index("hour")
+    st.line_chart(chart_df, use_container_width=True)
+    st.caption("Hourly average occupancy levels (0=Empty, 3=Standing)")
+else:
+    st.info("No hourly comparison data available.")
+st.divider()
+# Per-class performance: one row (label, sample count, accuracy bar) per occupancy class
+st.subheader("Per-Class Performance")
+if not per_class.empty:
+    # Color-coded display for all 7 classes
+    for _, row in per_class.iterrows():
+        cls = int(row["class"])
+        label = row["label"]
+        accuracy = row["accuracy"]
+        count = int(row["count"])
+        # NOTE(review): color is looked up but not used further down in this loop
+        color = OCCUPANCY_COLORS.get(cls, "#6b7280")
+        col1, col2, col3 = st.columns([2, 1, 3])
+        with col1:
+            st.markdown(f"**{cls}** - {label}")
+        with col2:
+            if count > 0:
+                st.markdown(f"{count:,} samples")
+            else:
+                st.markdown("No data", help="No samples of this class in the monitoring data")
+        with col3:
+            # get_per_class_metrics leaves accuracy unset exactly when count is 0,
+            # so the count check also covers a missing accuracy value
+            if accuracy is not None and count > 0:
+                # Progress bar with color
+                st.progress(accuracy, text=f"{accuracy:.1%}")
+            else:
+                st.markdown("—", help="No accuracy data for this class")
+    # Explanation
+    st.caption("""
+    Per-class accuracy (recall) shows how well the model predicts each occupancy level.
+    Classes 4-6 are rare in Swedish transit data. Lower accuracy for rare classes is expected.
+    """)
+else:
+    st.info("No per-class metrics available.")
+st.divider()
+# Raw data expander: newest records first, capped at 100 rows
+with st.expander("View Raw Monitoring Data"):
+    if not monitor_df.empty:
+        st.dataframe(
+            monitor_df.sort_values("window_start", ascending=False).head(100),
+            use_container_width=True
+        )
+        st.caption(f"Showing latest 100 of {len(monitor_df):,} total records")
+    else:
+        st.info("No raw data available.")
+# Footer
+st.divider()
+st.markdown(
+    "<div style='text-align: center; opacity: 0.6;'>Model monitoring powered by Hopsworks Feature Store</div>",
+    unsafe_allow_html=True
+)

trip_info.py CHANGED Viewed

@@ -1,65 +1,170 @@
 import hopsworks
 import os
 import numpy as np
 def load_static_trip_info():
     api_key = os.environ.get("HOPSWORKS_API_KEY")
     project_name = os.environ.get("HOPSWORKS_PROJECT")
     project = hopsworks.login(project=project_name, api_key_value=api_key)
     fs = project.get_feature_store()
-    fg = fs.get_feature_group("static_trip_info_fg", version=1)  # adjust version
     df = fg.read()
     return df
 def load_static_stops_info():
     api_key = os.environ.get("HOPSWORKS_API_KEY")
     project_name = os.environ.get("HOPSWORKS_PROJECT")
     project = hopsworks.login(project=project_name, api_key_value=api_key)
     fs = project.get_feature_store()
-    fg = fs.get_feature_group("static_trip_and_stops_info_fg", version=1)  # adjust version
     df = fg.read()
     return df
-def find_nearest_trip(lat, lon, datetime_obj, static_trip_df):
     """
-    Return the trip closest to the requested location/time.
-    Currently just filters static trips; could be enhanced with stops & routing.
     """
-    # For static data, we can only match by service_id/date/time if available
-    # Here we just pick a random trip as placeholder
-    if static_trip_df is None or len(static_trip_df) == 0:
         return None
-    trip = static_trip_df.sample(1).iloc[0]  # pick 1 random trip for demo
     return {
-        "trip_id": trip["trip_id"],
-        "route_short_name": trip["route_short_name"],
-        "route_long_name": trip["route_long_name"],
-        "trip_headsign": trip.get("trip_headsign", None)
     }
 def find_closest_stop(lat, lon, trip_id, stops_df):
     """
     Returns the closest stop to a given lat/lon for the specified trip_id.
     """
-    if stops_df is None:
-        return None
     # Filter stops for this trip
     trip_stops = stops_df[stops_df["trip_id"] == trip_id]
     if trip_stops.empty:
-        return None
-    # Compute distances
-    lat_array = trip_stops["stop_lat"].to_numpy()
-    lon_array = trip_stops["stop_lon"].to_numpy()
-    distances = np.sqrt((lat_array - lat)**2 + (lon_array - lon)**2)
-    idx_min = distances.argmin()
-    closest_stop = trip_stops.iloc[idx_min]
-    return closest_stop["stop_name"]

 import hopsworks
 import os
 import numpy as np
+from math import radians, sin, cos, sqrt, atan2
+def haversine_distance(lat1, lon1, lat2, lon2):
+    """Calculate distance in meters between two points using haversine formula.
+    Args:
+        lat1, lon1: First point, in degrees.
+        lat2, lon2: Second point, in degrees.
+    Returns:
+        Great-circle distance in meters on a sphere of radius 6,371,000 m.
+        NaN inputs yield NaN.
+    """
+    R = 6371000  # Earth's radius in meters
+    dlat = radians(lat2 - lat1)
+    dlon = radians(lon2 - lon1)
+    a = sin(dlat/2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon/2)**2
+    # Rounding can leave `a` a hair above 1 for near-antipodal points, which would
+    # make sqrt(1-a) raise ValueError. NaN is left untouched (NaN > 1 is False).
+    if a > 1.0:
+        a = 1.0
+    c = 2 * atan2(sqrt(a), sqrt(1-a))
+    return R * c
 def load_static_trip_info():
+    """Read the ``static_trip_info_fg`` feature group (version 1) from Hopsworks.
+    Logs in with the HOPSWORKS_API_KEY and HOPSWORKS_PROJECT environment variables.
+    Returns:
+        The result of the feature group's ``read()`` (presumably a DataFrame).
+    """
+    credentials = {
+        "project": os.environ.get("HOPSWORKS_PROJECT"),
+        "api_key_value": os.environ.get("HOPSWORKS_API_KEY"),
+    }
+    feature_store = hopsworks.login(**credentials).get_feature_store()
+    trip_info_fg = feature_store.get_feature_group("static_trip_info_fg", version=1)
+    return trip_info_fg.read()
 def load_static_stops_info():
+    """Read the ``static_trip_and_stops_info_fg`` feature group (version 1) from Hopsworks.
+    Logs in with the HOPSWORKS_API_KEY and HOPSWORKS_PROJECT environment variables.
+    Returns:
+        The result of the feature group's ``read()`` (presumably a DataFrame).
+    """
+    credentials = {
+        "project": os.environ.get("HOPSWORKS_PROJECT"),
+        "api_key_value": os.environ.get("HOPSWORKS_API_KEY"),
+    }
+    feature_store = hopsworks.login(**credentials).get_feature_store()
+    stops_info_fg = feature_store.get_feature_group("static_trip_and_stops_info_fg", version=1)
+    return stops_info_fg.read()
+def time_to_seconds(t):
+    """Turn an ``HH:MM:SS`` value into seconds since midnight.
+    The value is stringified first, so any object whose ``str()`` has that form
+    works. Hours are not capped at 23.
+    Returns:
+        Integer seconds, or None when ``t`` is None or is not exactly three
+        colon-separated integers.
+    """
+    if t is None:
+        return None
+    try:
+        hours, minutes, seconds = (int(part) for part in str(t).split(":"))
+    except (ValueError, AttributeError):
+        return None
+    return hours * 3600 + minutes * 60 + seconds
+def _stop_distances_m(lat, lon, stop_lats, stop_lons):
+    """Haversine distance in meters from (lat, lon) to each stop, vectorized with NumPy."""
+    earth_radius_m = 6371000
+    lat1 = np.radians(lat)
+    lat2 = np.radians(stop_lats)
+    dlon = np.radians(stop_lons - lon)
+    a = np.sin((lat2 - lat1) / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
+    # Clip keeps rounding error from pushing `a` outside arcsin's domain; NaN stays NaN
+    return 2 * earth_radius_m * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))
+def find_nearest_trip(lat, lon, datetime_obj, static_trip_and_stops_df, max_radius_m=500):
+    """
+    Find the nearest trip to a given location and time.
+    Uses haversine distance and filters by time window.
+    Args:
+        lat, lon: Location to search near, in degrees
+        datetime_obj: Target datetime (only hour and minute are used)
+        static_trip_and_stops_df: DataFrame with trip and stop info; needs
+            stop_lat and stop_lon columns. It is not modified.
+        max_radius_m: Maximum search radius in meters (default 500m); if no stop
+            lies inside it, the search is retried once with twice the radius
+    Returns:
+        Dict with keys trip_id, route_short_name, route_long_name, trip_headsign,
+        closest_stop, closest_stop_headsign and distance_m (int, meters), or None
+        if no nearby stop is found. Keys whose column is absent are None.
+    """
+    stops = static_trip_and_stops_df
+    if stops is None or stops.empty:
+        return None
+    # Check if required columns exist
+    if "stop_lat" not in stops.columns or "stop_lon" not in stops.columns:
+        return None
+    # One vectorized pass instead of a Python-level haversine call per row
+    distances = _stop_distances_m(
+        lat, lon,
+        stops["stop_lat"].to_numpy(dtype=float),
+        stops["stop_lon"].to_numpy(dtype=float),
+    )
+    # Geographic filter (stops with NaN coordinates never pass the comparison)
+    within = distances <= max_radius_m
+    if not within.any():
+        # Try with larger radius
+        within = distances <= max_radius_m * 2
+        if not within.any():
+            return None
+    candidates = np.flatnonzero(within)
+    # Prefer stops scheduled within an hour of the target time, when arrival
+    # times are available; otherwise keep all geographic candidates
+    if "arrival_time" in stops.columns:
+        seconds_per_day = 86400
+        target_s = datetime_obj.hour * 3600 + datetime_obj.minute * 60
+        arrival_values = stops["arrival_time"].to_numpy()[candidates]
+        arr_s = np.array(
+            [np.nan if s is None else s for s in map(time_to_seconds, arrival_values)],
+            dtype=float,
+        )
+        # Compare on a 24h circle so 23:50 vs 00:10, and schedules that write
+        # after-midnight stops as 24:xx:xx, still count as close in time
+        gap = np.abs(arr_s - target_s) % seconds_per_day
+        gap = np.minimum(gap, seconds_per_day - gap)
+        on_time = gap <= 3600  # unparsable times are NaN and compare False
+        if on_time.any():
+            candidates = candidates[on_time]
+    # Choose the one whose stop is closest to the click
+    best_pos = candidates[np.argmin(distances[candidates])]
+    best = stops.iloc[best_pos]
+    return {
+        "trip_id": best.get("trip_id"),
+        "route_short_name": best.get("route_short_name"),
+        "route_long_name": best.get("route_long_name"),
+        "trip_headsign": best.get("trip_headsign"),
+        "closest_stop": best.get("stop_name"),
+        "closest_stop_headsign": best.get("stop_headsign"),
+        "distance_m": int(round(float(distances[best_pos]))),
+    }
 def find_closest_stop(lat, lon, trip_id, stops_df):
+    """
+    Look up the stop of one trip that lies nearest to a location.
+    Distances are computed per stop with haversine_distance.
+    Returns tuple of (stop_name, stop_headsign), or (None, None) when ``stops_df``
+    is None or empty or holds no rows for ``trip_id``.
+    """
+    not_found = (None, None)
+    if stops_df is None or stops_df.empty:
+        return not_found
+    # Filter stops for this trip
+    stops_on_trip = stops_df.loc[stops_df["trip_id"] == trip_id]
+    if stops_on_trip.empty:
+        return not_found
+    def meters_from_point(stop):
+        return haversine_distance(lat, lon, stop["stop_lat"], stop["stop_lon"])
+    # Label of the row with the smallest haversine distance
+    nearest_label = stops_on_trip.apply(meters_from_point, axis=1).idxmin()
+    nearest = stops_on_trip.loc[nearest_label]
+    return nearest.get("stop_name"), nearest.get("stop_headsign")
+def load_trip_forecasts(fs, hour, weekday):
+    """
+    Load trip forecasts from forecast_fg (version 1) for a specific hour and weekday.
+    Args:
+        fs: Feature store handle exposing ``get_feature_group(name, version=...)``.
+        hour: Hour of day to match against each row's ``window_start``.
+        weekday: Weekday to match, as given by ``Series.dt.weekday`` (Monday is 0).
+    Returns:
+        The matching rows, with derived ``hour`` and ``weekday`` columns added, or an
+        empty DataFrame if the feature group is empty or cannot be read.
+    """
+    # Local import: pandas is not among this module's top-level imports, so the
+    # bare ``pd`` used below raised NameError (even inside the except handler)
+    import pandas as pd
+    try:
+        forecast_fg = fs.get_feature_group("forecast_fg", version=1)
+        df = forecast_fg.read()
+        if df is None:
+            return pd.DataFrame()
+        if df.empty:
+            return df
+        # Filter to matching hour and weekday
+        df["window_start"] = pd.to_datetime(df["window_start"])
+        df["hour"] = df["window_start"].dt.hour
+        df["weekday"] = df["window_start"].dt.weekday
+        filtered = df[(df["hour"] == hour) & (df["weekday"] == weekday)]
+        return filtered
+    except Exception as e:
+        print(f"Could not load trip forecasts: {e}")
+        return pd.DataFrame()