# RX12-Housing-Competition / preprocessing.py
# GBDB02's picture
# Upload 8 files
# 97b9bf6 verified
"""
preprocessing.py — Feature engineering and data preprocessing pipeline.
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from config import COLUMNS_TO_DROP_BASE, SKEW_THRESHOLD
def _column_or_default(df: pd.DataFrame, name: str, default) -> pd.Series:
    """Return ``df[name]`` if present, else a constant Series on ``df.index``."""
    if name in df.columns:
        return df[name]
    return pd.Series(default, index=df.index)


def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with derived housing features appended.

    Every source column is optional; an absent column contributes 0 to the
    sums and yields 0 for the indicator flags.

    Added columns:
        TotalSF:     TotalBsmtSF + 1stFlrSF + 2ndFlrSF (NaN counted as 0).
        TotalBath:   FullBath + BsmtFullBath + 0.5 * (HalfBath + BsmtHalfBath),
                     NaN counted as 0.
        HasPool:     1 when PoolArea > 0, else 0.
        HasGarage:   1 when GarageArea > 0, else 0.
        HasBsmt:     1 when TotalBsmtSF > 0, else 0.
        IsRemodeled: 1 when YearRemodAdd and YearBuilt are both known and
                     differ, else 0.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new DataFrame containing the original columns plus the six above.
    """
    df = df.copy()

    def filled(name: str) -> pd.Series:
        # Missing column or missing value both count as zero.
        return _column_or_default(df, name, 0).fillna(0)

    bsmt_sf = filled("TotalBsmtSF")

    df["TotalSF"] = bsmt_sf + filled("1stFlrSF") + filled("2ndFlrSF")
    df["TotalBath"] = (
        filled("FullBath")
        + 0.5 * filled("HalfBath")
        + filled("BsmtFullBath")
        + 0.5 * filled("BsmtHalfBath")
    )

    df["HasPool"] = (_column_or_default(df, "PoolArea", 0) > 0).astype(int)
    # An area of 0 means the structure does not exist, so test "> 0" rather
    # than "not null" (which flagged zero-area rows as having the structure).
    df["HasGarage"] = (filled("GarageArea") > 0).astype(int)
    df["HasBsmt"] = (bsmt_sf > 0).astype(int)

    year_built = _column_or_default(df, "YearBuilt", np.nan)
    year_remod = _column_or_default(df, "YearRemodAdd", np.nan)
    # NaN != x is always True, so only compare rows where both years exist;
    # otherwise an unknown year would be reported as a remodel.
    both_known = year_built.notna() & year_remod.notna()
    df["IsRemodeled"] = (both_known & (year_remod != year_built)).astype(int)

    return df
def preprocess_combined(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a feature frame and return a fully imputed version of it.

    The steps run in this order:
        1. Drop the columns named in ``COLUMNS_TO_DROP_BASE`` plus any column
           with more than 50% missing values.
        2. Add derived columns via ``engineer_features``.
        3. Convert object columns to numeric when more than half of the rows
           parse as numbers; entries that do not parse become NaN.
        4. Apply ``log1p`` to numeric columns whose absolute skewness exceeds
           ``SKEW_THRESHOLD``; negative values are clipped to 0 first.
        5. Fill missing values: ``"Missing"`` for object columns, the column
           median for numeric columns (which are also cast to float).

    Args:
        df: Raw feature frame.

    Returns:
        The processed DataFrame.
    """
    # Step 1 — Drop unwanted / high-NaN columns
    missing_pct = df.isnull().sum() * 100 / len(df)
    to_drop = {c for c in COLUMNS_TO_DROP_BASE if c in df.columns}
    to_drop.update(missing_pct[missing_pct > 50].index)
    df = df.drop(columns=list(to_drop), errors="ignore")

    # Step 2 — Feature engineering
    df = engineer_features(df)

    # Step 3 — Fix dtypes: numeric columns must be float, not object.
    # The set of object columns is computed once; converting one column does
    # not change the dtype of any other, so the result is the same as
    # re-checking inside the loop.
    object_cols = df.select_dtypes(include=["object"]).columns
    min_parsed = 0.5 * len(df)
    for col in object_cols:
        # Try to convert to numeric; if too few values parse, keep as string
        converted = pd.to_numeric(df[col], errors="coerce")
        if converted.notna().sum() > min_parsed:
            df[col] = converted

    # Step 4 — Log-transform skewed numerics
    num_cols = df.select_dtypes(include=[np.number]).columns
    skewed = df[num_cols].apply(lambda x: x.dropna().skew())
    for feat in skewed[abs(skewed) > SKEW_THRESHOLD].index:
        # log1p is undefined below -1, so negatives are clipped to 0 first.
        df[feat] = np.log1p(df[feat].clip(lower=0))

    # Step 5 — Fill missing values
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].fillna("Missing").astype(str)
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].fillna(df[col].median()).astype(float)

    return df
def build_preprocessor(numerical_features: list, categorical_features: list) -> ColumnTransformer:
    """Assemble the column-wise preprocessing transformer.

    Args:
        numerical_features: Columns routed through median imputation followed
            by ``RobustScaler``.
        categorical_features: Columns routed through a dense one-hot encoder
            that ignores categories unseen during fitting.

    Returns:
        An unfitted ``ColumnTransformer`` with a ``"num"`` and a ``"cat"``
        branch, in that order.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", RobustScaler()),
    ]
    categorical_steps = [
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]

    branches = [
        ("num", Pipeline(numeric_steps), numerical_features),
        ("cat", Pipeline(categorical_steps), categorical_features),
    ]
    return ColumnTransformer(branches)