Spaces:
Sleeping
Sleeping
File size: 1,814 Bytes
1aa566a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | """Feature preprocessing pipeline shared by training and inference."""
from __future__ import annotations
import numpy as np
import pandas as pd
from typing import Optional
from src.utils.config import settings
from src.utils.logging_config import get_logger
# Module-level logger, obtained from the project's logging helper under this
# module's name.
log = get_logger(__name__)

# Names of the model input columns, read from project settings. The order of
# this list is the column order of every frame returned by
# Preprocessor.transform.
FEATURE_COLS: list[str] = settings.data.features

# Name of the label column, read from project settings.
TARGET_COL: str = settings.data.target
# Per-column (lower, upper) bounds. Preprocessor.transform passes each pair to
# Series.clip, so values outside the range are clamped to the nearest bound.
# Columns listed here but absent from the input frame are skipped.
_CLIP_RANGES: dict[str, tuple[float, float]] = {
    "trip_distance": (0.1, 50.0),
    "passenger_count": (1.0, 6.0),
    "pickup_hour": (0.0, 23.0),
    "pickup_dow": (0.0, 6.0),
    "pickup_month": (1.0, 12.0),
}
# Columns that Preprocessor.transform casts with ``astype(int)`` when they are
# present in the input frame.
# NOTE(review): astype(int) raises on NaN or non-numeric values — presumably
# upstream guarantees integer-like codes in these columns; confirm.
_CATEGORICAL_COLS: set[str] = {
    "vendor_id",
    "rate_code_id",
    "payment_type",
    "pu_location_zone",
    "do_location_zone",
    "pickup_is_weekend",
}
class Preprocessor:
    """Preprocessing helper that holds no instance state.

    Every method works only from its arguments and the module-level
    configuration (``FEATURE_COLS``, ``TARGET_COL``, ``_CLIP_RANGES``,
    ``_CATEGORICAL_COLS``).
    """

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build the model input frame from *df*.

        Works on a copy of *df*, so the caller's frame is left untouched.
        Steps, in order:

        1. Clamp each column named in ``_CLIP_RANGES`` to its bounds.
        2. Cast each column named in ``_CATEGORICAL_COLS`` to ``int``.
        3. Add any column of ``FEATURE_COLS`` that is missing, filled
           with 0, logging a warning for each one.

        Columns named in steps 1 and 2 that are absent from *df* are skipped.

        Returns:
            A frame containing exactly the ``FEATURE_COLS`` columns, in
            that order.
        """
        frame = df.copy()

        # Step 1: clamp numeric columns to their configured range.
        for name, bounds in _CLIP_RANGES.items():
            if name not in frame.columns:
                continue
            lower, upper = bounds
            frame[name] = frame[name].clip(lower, upper)

        # Step 2: integer-encode the categorical columns.
        for name in _CATEGORICAL_COLS:
            if name not in frame.columns:
                continue
            frame[name] = frame[name].astype(int)

        # Step 3: back-fill absent features. Membership is re-checked on every
        # iteration because a column added here counts as present afterwards.
        for name in FEATURE_COLS:
            if name in frame.columns:
                continue
            log.warning("Missing feature '%s' — filling with 0", name)
            frame[name] = 0

        return frame[FEATURE_COLS]

    def transform_with_target(
        self, df: pd.DataFrame
    ) -> tuple[pd.DataFrame, Optional[pd.Series]]:
        """Split *df* into features and target.

        Returns:
            ``(X, y)`` where ``X`` is ``self.transform(df)`` and ``y`` is a
            copy of the ``TARGET_COL`` column, or ``None`` when *df* has no
            such column.
        """
        if TARGET_COL in df.columns:
            target = df[TARGET_COL].copy()
        else:
            target = None
        features = self.transform(df)
        return features, target

    def feature_names(self) -> list[str]:
        """Return a new list holding the names in ``FEATURE_COLS``."""
        return [*FEATURE_COLS]
|