Spaces:
Sleeping
Sleeping
| """Feature preprocessing pipeline shared by training and inference.""" | |
| from __future__ import annotations | |
| import numpy as np | |
| import pandas as pd | |
| from typing import Optional | |
| from src.utils.config import settings | |
| from src.utils.logging_config import get_logger | |
| log = get_logger(__name__) | |
| FEATURE_COLS: list[str] = settings.data.features | |
| TARGET_COL: str = settings.data.target | |
| _CLIP_RANGES: dict[str, tuple[float, float]] = { | |
| "trip_distance": (0.1, 50.0), | |
| "passenger_count": (1.0, 6.0), | |
| "pickup_hour": (0.0, 23.0), | |
| "pickup_dow": (0.0, 6.0), | |
| "pickup_month": (1.0, 12.0), | |
| } | |
| _CATEGORICAL_COLS: set[str] = { | |
| "vendor_id", | |
| "rate_code_id", | |
| "payment_type", | |
| "pu_location_zone", | |
| "do_location_zone", | |
| "pickup_is_weekend", | |
| } | |
| class Preprocessor: | |
| """Stateless preprocessing utility.""" | |
| def transform(self, df: pd.DataFrame) -> pd.DataFrame: | |
| """Return a clean feature DataFrame aligned to FEATURE_COLS.""" | |
| df = df.copy() | |
| for col, (lo, hi) in _CLIP_RANGES.items(): | |
| if col in df.columns: | |
| df[col] = df[col].clip(lo, hi) | |
| for col in _CATEGORICAL_COLS: | |
| if col in df.columns: | |
| df[col] = df[col].astype(int) | |
| for col in FEATURE_COLS: | |
| if col not in df.columns: | |
| log.warning("Missing feature '%s' — filling with 0", col) | |
| df[col] = 0 | |
| return df[FEATURE_COLS] | |
| def transform_with_target( | |
| self, df: pd.DataFrame | |
| ) -> tuple[pd.DataFrame, Optional[pd.Series]]: | |
| """Return (X, y). y is None if target not in df.""" | |
| y = df[TARGET_COL].copy() if TARGET_COL in df.columns else None | |
| X = self.transform(df) | |
| return X, y | |
| def feature_names(self) -> list[str]: | |
| return list(FEATURE_COLS) | |