argus-mlops / src /data /preprocessing.py
hodfa840's picture
Fix scroll reset for HF Spaces double-iframe context
1aa566a
"""Feature preprocessing pipeline shared by training and inference."""
from __future__ import annotations
import numpy as np
import pandas as pd
from typing import Optional
from src.utils.config import settings
from src.utils.logging_config import get_logger
# Logger for this module, obtained from the project's logging helper.
log = get_logger(__name__)
# Feature column names taken from project settings. Preprocessor.transform
# returns exactly these columns, in this order.
FEATURE_COLS: list[str] = settings.data.features
# Name of the target (label) column, taken from project settings.
TARGET_COL: str = settings.data.target
# Per-column (lower, upper) clipping bounds. During preprocessing, values of
# these columns that fall outside the range are clipped to the nearest bound.
_CLIP_RANGES: dict[str, tuple[float, float]] = {
    "trip_distance": (0.1, 50.0),
    "passenger_count": (1.0, 6.0),
    "pickup_hour": (0.0, 23.0),
    "pickup_dow": (0.0, 6.0),
    "pickup_month": (1.0, 12.0),
}

# Columns that preprocessing casts to ``int`` whenever they are present in the
# incoming frame.
_CATEGORICAL_COLS: set[str] = {
    "vendor_id",
    "rate_code_id",
    "payment_type",
    "pu_location_zone",
    "do_location_zone",
    "pickup_is_weekend",
}
class Preprocessor:
    """Stateless helper that turns a raw frame into model-ready features.

    The class keeps no instance attributes; each method depends only on its
    arguments and on the module-level column configuration.
    """

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build the feature frame for *df* without modifying the input.

        Columns named in ``_CLIP_RANGES`` are clipped to their bounds, columns
        named in ``_CATEGORICAL_COLS`` are cast to ``int``, and every entry of
        ``FEATURE_COLS`` that is absent is added as a constant ``0`` column
        (a warning is logged for each). Columns outside ``FEATURE_COLS`` are
        not part of the result.

        Returns:
            A DataFrame whose columns are ``FEATURE_COLS``, in that order.
        """
        # Operate on a copy so the caller's frame is left untouched.
        frame = df.copy()

        for name, (lower, upper) in _CLIP_RANGES.items():
            if name not in frame.columns:
                continue
            frame[name] = frame[name].clip(lower, upper)

        for name in _CATEGORICAL_COLS:
            if name not in frame.columns:
                continue
            frame[name] = frame[name].astype(int)

        # Membership is re-checked on every iteration, against the frame as
        # it grows, rather than computed once up front.
        for name in FEATURE_COLS:
            if name in frame.columns:
                continue
            log.warning("Missing feature '%s' — filling with 0", name)
            frame[name] = 0

        return frame[FEATURE_COLS]

    def transform_with_target(
        self, df: pd.DataFrame
    ) -> tuple[pd.DataFrame, Optional[pd.Series]]:
        """Split *df* into a feature frame and its target column.

        Returns:
            ``(X, y)`` where ``X`` is ``transform(df)`` and ``y`` is a copy of
            the ``TARGET_COL`` column, or ``None`` when *df* has no such
            column.
        """
        # The target is extracted before the features are computed.
        if TARGET_COL in df.columns:
            target = df[TARGET_COL].copy()
        else:
            target = None

        features = self.transform(df)
        return features, target

    def feature_names(self) -> list[str]:
        """Return a new list holding the configured feature column names."""
        return [*FEATURE_COLS]