Spaces:

ESCP
/

RX12-Housing-Competition

Sleeping

App Files Files Community

RX12-Housing-Competition / preprocessing.py

GBDB02

Upload 8 files

97b9bf6 verified about 1 month ago

raw

history blame contribute delete

3.38 kB

	"""
	preprocessing.py — Feature engineering and data preprocessing pipeline.
	"""

	import pandas as pd
	import numpy as np
	from sklearn.preprocessing import RobustScaler, OneHotEncoder
	from sklearn.impute import SimpleImputer
	from sklearn.compose import ColumnTransformer
	from sklearn.pipeline import Pipeline

	from config import COLUMNS_TO_DROP_BASE, SKEW_THRESHOLD


	def _filled_column(df: pd.DataFrame, name: str) -> pd.Series:
	    """Return column ``name`` with NaN replaced by 0, or all zeros if it is absent."""
	    if name in df.columns:
	        return df[name].fillna(0)
	    return pd.Series(0, index=df.index)


	def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
	    """Return a copy of ``df`` with derived feature columns added.

	    Added columns:
	        TotalSF: ``TotalBsmtSF + 1stFlrSF + 2ndFlrSF``.
	        TotalBath: full baths plus half of each half bath, above grade and
	            in the basement.
	        HasPool, HasGarage, HasBsmt: 1 when the matching area column
	            (``PoolArea``, ``GarageArea``, ``TotalBsmtSF``) is strictly
	            positive, else 0.
	        IsRemodeled: 1 when ``YearRemodAdd`` and ``YearBuilt`` are both
	            known and differ, else 0.

	    Any source column that is absent, and any NaN inside one, is treated
	    as 0 / "feature not present".

	    Args:
	        df: Input frame; it is not modified.

	    Returns:
	        A new DataFrame holding the original columns plus the derived ones.
	    """
	    df = df.copy()

	    df["TotalSF"] = (
	        _filled_column(df, "TotalBsmtSF")
	        + _filled_column(df, "1stFlrSF")
	        + _filled_column(df, "2ndFlrSF")
	    )
	    df["TotalBath"] = (
	        _filled_column(df, "FullBath")
	        + 0.5 * _filled_column(df, "HalfBath")
	        + _filled_column(df, "BsmtFullBath")
	        + 0.5 * _filled_column(df, "BsmtHalfBath")
	    )

	    # Presence flags test for a positive area. A not-null test would mark a
	    # recorded area of 0 as "present" and disagree with how HasPool is built.
	    df["HasPool"] = (_filled_column(df, "PoolArea") > 0).astype(int)
	    df["HasGarage"] = (_filled_column(df, "GarageArea") > 0).astype(int)
	    df["HasBsmt"] = (_filled_column(df, "TotalBsmtSF") > 0).astype(int)

	    # Only flag a remodel when both years are actually known; comparing
	    # against NaN or a placeholder 0 would mark every such row as remodeled.
	    if "YearRemodAdd" in df.columns and "YearBuilt" in df.columns:
	        remod_year = df["YearRemodAdd"]
	        built_year = df["YearBuilt"]
	        remodeled = (
	            remod_year.notna() & built_year.notna() & (remod_year != built_year)
	        )
	    else:
	        remodeled = pd.Series(False, index=df.index)
	    df["IsRemodeled"] = remodeled.astype(int)

	    return df


	def preprocess_combined(df: pd.DataFrame) -> pd.DataFrame:
	    """Clean a raw frame and return a fully imputed copy.

	    Steps, in order:
	        1. Drop the columns listed in ``COLUMNS_TO_DROP_BASE`` plus every
	           column with more than 50% missing values.
	        2. Add derived columns via ``engineer_features``.
	        3. Convert object columns to numeric when more than half of the
	           rows parse as numbers; entries that do not parse become NaN.
	        4. Replace numeric columns whose absolute skew exceeds
	           ``SKEW_THRESHOLD`` with ``log1p`` of the values clipped at 0.
	        5. Fill missing object values with ``"Missing"`` and missing numeric
	           values with the column median; numeric columns are cast to float.

	    Args:
	        df: Raw input frame.

	    Returns:
	        The processed DataFrame.
	    """
	    # Step 1 — Drop unwanted / high-NaN columns
	    missing_pct = df.isnull().sum() * 100 / len(df)
	    to_drop = {c for c in COLUMNS_TO_DROP_BASE if c in df.columns}
	    to_drop.update(missing_pct[missing_pct > 50].index)
	    df = df.drop(columns=list(to_drop), errors="ignore")

	    # Step 2 — Feature engineering
	    df = engineer_features(df)

	    # Step 3 — Fix dtypes: numeric columns must be float, not object.
	    # The set of object columns is taken once up front; converting one column
	    # never changes the dtype of another, so re-querying it per column is
	    # pure overhead.
	    min_numeric_rows = 0.5 * len(df)
	    for col in df.select_dtypes(include=["object"]).columns:
	        # Try to convert to numeric; keep the column as-is unless most rows parse.
	        converted = pd.to_numeric(df[col], errors="coerce")
	        if converted.notna().sum() > min_numeric_rows:
	            df[col] = converted

	    # Step 4 — Log-transform skewed numerics
	    num_cols = df.select_dtypes(include=[np.number]).columns
	    skewed = df[num_cols].apply(lambda x: x.dropna().skew())
	    for feat in skewed[abs(skewed) > SKEW_THRESHOLD].index:
	        # Negative values are clipped to 0 before the transform.
	        df[feat] = np.log1p(df[feat].clip(lower=0))

	    # Step 5 — Fill missing values
	    for col in df.select_dtypes(include=["object"]).columns:
	        df[col] = df[col].fillna("Missing").astype(str)
	    for col in df.select_dtypes(include=[np.number]).columns:
	        df[col] = df[col].fillna(df[col].median()).astype(float)

	    return df


	def build_preprocessor(numerical_features: list, categorical_features: list) -> ColumnTransformer:
	    """Assemble the column-wise preprocessing transformer.

	    Numerical columns are median-imputed and then scaled with
	    ``RobustScaler``. Categorical columns are one-hot encoded into a dense
	    array, with categories unseen during fitting ignored at transform time.

	    Args:
	        numerical_features: Columns routed to the numeric pipeline.
	        categorical_features: Columns routed to the categorical pipeline.

	    Returns:
	        An unfitted ``ColumnTransformer`` with a ``"num"`` and a ``"cat"`` branch.
	    """
	    numeric_steps = [
	        ("imputer", SimpleImputer(strategy="median")),
	        ("scaler", RobustScaler()),
	    ]
	    categorical_steps = [
	        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
	    ]
	    branches = [
	        ("num", Pipeline(numeric_steps), numerical_features),
	        ("cat", Pipeline(categorical_steps), categorical_features),
	    ]
	    return ColumnTransformer(branches)