File size: 3,379 Bytes
97b9bf6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""
preprocessing.py — Feature engineering and data preprocessing pipeline.
"""

import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from config import COLUMNS_TO_DROP_BASE, SKEW_THRESHOLD


def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with derived housing features added.

    Adds: TotalSF, TotalBath, HasPool, HasGarage, HasBsmt, IsRemodeled.
    Missing source columns are treated as all-zero (for sums) or all-NaN
    (for presence flags), so the function works on partial frames.
    """
    out = df.copy()

    def _zeros() -> pd.Series:
        # All-zero placeholder aligned to the frame's index.
        return pd.Series(0, index=out.index)

    def _num(name: str) -> pd.Series:
        # Numeric column with NaN treated as 0; zeros if the column is absent.
        return out.get(name, _zeros()).fillna(0)

    def _raw(name: str) -> pd.Series:
        # Column as-is; an all-NaN placeholder if the column is absent,
        # so the derived presence flag comes out 0 everywhere.
        return out.get(name, pd.Series(np.nan, index=out.index))

    # Combined above-ground + basement square footage.
    out["TotalSF"] = _num("TotalBsmtSF") + _num("1stFlrSF") + _num("2ndFlrSF")

    # Half baths count as 0.5 of a full bath.
    out["TotalBath"] = (
        _num("FullBath")
        + 0.5 * _num("HalfBath")
        + _num("BsmtFullBath")
        + 0.5 * _num("BsmtHalfBath")
    )

    # Binary presence flags. Pool is area > 0; garage/basement are
    # "value recorded at all" (non-null), matching the original logic.
    out["HasPool"] = (out.get("PoolArea", _zeros()) > 0).astype(int)
    out["HasGarage"] = _raw("GarageArea").notnull().astype(int)
    out["HasBsmt"] = _raw("TotalBsmtSF").notnull().astype(int)

    # Remodeled iff the remodel year differs from the build year.
    out["IsRemodeled"] = (
        out.get("YearRemodAdd", _zeros()) != out.get("YearBuilt", _zeros())
    ).astype(int)

    return out


def preprocess_combined(df: pd.DataFrame) -> pd.DataFrame:
    """Run the full preprocessing pipeline on a (train+test) combined frame.

    Steps: drop configured/high-NaN columns, engineer features, coerce
    mostly-numeric object columns, log-transform skewed numerics, and
    fill remaining missing values. Returns a new DataFrame; the input is
    not modified (every step reassigns or operates on fresh objects).
    """
    # Step 1 — Drop unwanted columns plus any column that is >50% missing.
    to_drop = [c for c in COLUMNS_TO_DROP_BASE if c in df.columns]
    missing_pct = df.isnull().sum() * 100 / len(df)
    to_drop += missing_pct[missing_pct > 50].index.tolist()
    df = df.drop(columns=list(set(to_drop)), errors="ignore")

    # Step 2 — Feature engineering (TotalSF, TotalBath, presence flags, ...).
    df = engineer_features(df)

    # Step 3 — Coerce object columns that are mostly numeric.
    # FIX: compute the object-column list once instead of re-running
    # select_dtypes inside the loop for every column (was O(cols^2)).
    # Each column is visited at most once, so the result is unchanged.
    for col in df.select_dtypes(include=["object"]).columns:
        converted = pd.to_numeric(df[col], errors="coerce")
        # Convert only when a majority of values parse as numbers;
        # otherwise keep the column as strings.
        if converted.notna().sum() > 0.5 * len(df):
            df[col] = converted

    # Step 4 — Log-transform skewed numerics (log1p; negatives clipped to 0
    # first so log1p never sees values below -1).
    num_cols = df.select_dtypes(include=[np.number]).columns
    skewed = df[num_cols].apply(lambda x: x.dropna().skew())
    for feat in skewed[abs(skewed) > SKEW_THRESHOLD].index:
        df[feat] = np.log1p(df[feat].clip(lower=0))

    # Step 5 — Fill missing values: sentinel string for categoricals,
    # median for numerics.
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].fillna("Missing").astype(str)
    for col in df.select_dtypes(include=[np.number]).columns:
        median = df[col].median()
        # FIX: median of an all-NaN column is NaN, which would leave NaNs
        # behind and break downstream sklearn fitting — fall back to 0.0.
        fill = 0.0 if pd.isna(median) else median
        df[col] = df[col].fillna(fill).astype(float)

    return df


def build_preprocessor(numerical_features: list, categorical_features: list) -> ColumnTransformer:
    """Assemble the column-wise preprocessing transformer.

    Numeric columns: median imputation followed by robust scaling.
    Categorical columns: dense one-hot encoding that ignores categories
    unseen at fit time.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", RobustScaler()),
    ]
    categorical_steps = [
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
    transformers = [
        ("num", Pipeline(numeric_steps), numerical_features),
        ("cat", Pipeline(categorical_steps), categorical_features),
    ]
    return ColumnTransformer(transformers)