File size: 1,814 Bytes
1aa566a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""Feature preprocessing pipeline shared by training and inference."""
from __future__ import annotations

import numpy as np
import pandas as pd
from typing import Optional

from src.utils.config import settings
from src.utils.logging_config import get_logger

# Module-level logger obtained from the project's logging helper.
log = get_logger(__name__)

# Feature column names read from project settings. Preprocessor.transform
# returns exactly these columns, in this order.
# NOTE(review): this binds the settings object itself rather than a copy —
# presumably nothing mutates it at runtime; confirm.
FEATURE_COLS: list[str] = settings.data.features
# Name of the target column read from project settings. Used by
# Preprocessor.transform_with_target to pull y out of the input frame.
TARGET_COL: str = settings.data.target

# Per-column (lower, upper) bounds that Preprocessor.transform applies with
# Series.clip: values outside the range are pulled to the nearest bound.
# A column listed here but absent from the input frame is simply skipped.
_CLIP_RANGES: dict[str, tuple[float, float]] = {
    "trip_distance": (0.1, 50.0),
    "passenger_count": (1.0, 6.0),
    "pickup_hour": (0.0, 23.0),
    "pickup_dow": (0.0, 6.0),
    "pickup_month": (1.0, 12.0),
}

# Columns that Preprocessor.transform casts to int when they are present in
# the input frame. Stored as a set, so it carries no defined order.
_CATEGORICAL_COLS: set[str] = {
    "vendor_id",
    "rate_code_id",
    "payment_type",
    "pu_location_zone",
    "do_location_zone",
    "pickup_is_weekend",
}


class Preprocessor:
    """Stateless preprocessing utility.

    Instances hold no state: every method works only from its arguments and
    the module-level column configuration (FEATURE_COLS, TARGET_COL,
    _CLIP_RANGES, _CATEGORICAL_COLS).
    """

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a clean feature DataFrame aligned to FEATURE_COLS.

        Works on a copy, so the caller's frame is never modified. Steps:

        1. Clip every column listed in ``_CLIP_RANGES`` to its bounds.
        2. Cast every column listed in ``_CATEGORICAL_COLS`` to ``int``.
           Missing values are replaced with 0 first (and a warning is
           logged), because an integer cast cannot represent NaN and would
           otherwise abort the whole batch.
        3. Add any feature column that is absent from ``df`` as a constant
           0 (and log a warning).

        Args:
            df: Input rows. Columns not in FEATURE_COLS are dropped from
                the result.

        Returns:
            A new DataFrame holding exactly FEATURE_COLS, in that order.
        """
        out = df.copy()

        for col, (lo, hi) in _CLIP_RANGES.items():
            if col in out.columns:
                out[col] = out[col].clip(lo, hi)

        # Sorted so warnings appear in a stable order; a set has none.
        for col in sorted(_CATEGORICAL_COLS):
            if col not in out.columns:
                continue
            values = out[col]
            # Count via the raw array so this also works if the label
            # selects more than one column.
            n_missing = int(values.isna().to_numpy().sum())
            if n_missing:
                # Same policy as for wholly absent features: impute 0 and
                # say so, rather than failing on the integer cast.
                log.warning(
                    "Feature '%s' has %d missing value(s) — filling with 0",
                    col,
                    n_missing,
                )
                values = values.fillna(0)
            out[col] = values.astype(int)

        for col in FEATURE_COLS:
            if col not in out.columns:
                log.warning("Missing feature '%s' — filling with 0", col)
                out[col] = 0

        # Selecting by the configured list both drops extra columns and
        # fixes the column order.
        return out[FEATURE_COLS]

    def transform_with_target(
        self, df: pd.DataFrame
    ) -> tuple[pd.DataFrame, Optional[pd.Series]]:
        """Return ``(X, y)`` for ``df``.

        Args:
            df: Input rows, optionally containing the TARGET_COL column.

        Returns:
            ``X`` is the result of :meth:`transform`. ``y`` is a copy of the
            TARGET_COL column, or ``None`` when ``df`` has no such column.
            ``transform`` never drops rows, so ``X`` and ``y`` share the
            index of ``df``.
        """
        y = df[TARGET_COL].copy() if TARGET_COL in df.columns else None
        X = self.transform(df)
        return X, y

    def feature_names(self) -> list[str]:
        """Return a fresh list of the feature column names, in output order."""
        return list(FEATURE_COLS)