telco-churn-predictor / src /features /build_features.py
logan-codes's picture
Add Dockerfile, Gradio app, and core src modules
4ba360f
import pandas as pd
def _map_binary_series(s: pd.Series) -> pd.Series:
"""
Apply deterministic binary encoding to 2-category features.
This function implements the core binary encoding logic that converts
categorical features with exactly 2 values into 0/1 integers. The mappings
are deterministic and must be consistent between training and serving.
"""
# Get unique values and remove NaN
vals = list(pd.Series(s.dropna().unique()).astype(str))
valset = set(vals)
# === DETERMINISTIC BINARY MAPPINGS ===
# CRITICAL: These exact mappings are hardcoded in serving pipeline
# Yes/No mapping (most common pattern in telecom data)
if valset == {"Yes", "No"}:
return s.map({"No": 0, "Yes": 1}).astype("Int64")
# Gender mapping (demographic feature)
if valset == {"Male", "Female"}:
return s.map({"Female": 0, "Male": 1}).astype("Int64")
# === GENERIC BINARY MAPPING ===
# For any other 2-category feature, use stable alphabetical ordering
if len(vals) == 2:
# Sort values to ensure consistent mapping across runs
sorted_vals = sorted(vals)
mapping = {sorted_vals[0]: 0, sorted_vals[1]: 1}
return s.astype(str).map(mapping).astype("Int64")
# === NON-BINARY FEATURES ===
# Return unchanged - will be handled by one-hot encoding
return s
def build_features(df: pd.DataFrame, target_col: str = "Churn") -> pd.DataFrame:
"""
Apply complete feature engineering pipeline for training data.
This is the main feature engineering function that transforms raw customer data
into ML-ready features. The transformations must be exactly replicated in the
serving pipeline to ensure prediction accuracy.
"""
df = df.copy()
print(f"πŸ”§ Starting feature engineering on {df.shape[1]} columns...")
# === STEP 1: Identify Feature Types ===
# Find categorical columns (object dtype) excluding the target variable
obj_cols = [c for c in df.select_dtypes(include=["object"]).columns if c != target_col]
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
print(f" πŸ“Š Found {len(obj_cols)} categorical and {len(numeric_cols)} numeric columns")
# === STEP 2: Split Categorical by Cardinality ===
# Binary features (exactly 2 unique values) get binary encoding
# Multi-category features (>2 unique values) get one-hot encoding
binary_cols = [c for c in obj_cols if df[c].dropna().nunique() == 2]
multi_cols = [c for c in obj_cols if df[c].dropna().nunique() > 2]
print(f" πŸ”’ Binary features: {len(binary_cols)} | Multi-category features: {len(multi_cols)}")
if binary_cols:
print(f" Binary: {binary_cols}")
if multi_cols:
print(f" Multi-category: {multi_cols}")
# === STEP 3: Apply Binary Encoding ===
# Convert 2-category features to 0/1 using deterministic mappings
for c in binary_cols:
original_dtype = df[c].dtype
df[c] = _map_binary_series(df[c].astype(str))
print(f" βœ… {c}: {original_dtype} β†’ binary (0/1)")
# === STEP 4: Convert Boolean Columns ===
# XGBoost requires integer inputs, not boolean
bool_cols = df.select_dtypes(include=["bool"]).columns.tolist()
if bool_cols:
df[bool_cols] = df[bool_cols].astype(int)
print(f" πŸ”„ Converted {len(bool_cols)} boolean columns to int: {bool_cols}")
# === STEP 5: One-Hot Encoding for Multi-Category Features ===
# CRITICAL: drop_first=True prevents multicollinearity
if multi_cols:
print(f" 🌟 Applying one-hot encoding to {len(multi_cols)} multi-category columns...")
original_shape = df.shape
# Apply one-hot encoding with drop_first=True (same as serving)
df = pd.get_dummies(df, columns=multi_cols, drop_first=True)
new_features = df.shape[1] - original_shape[1] + len(multi_cols)
print(f" βœ… Created {new_features} new features from {len(multi_cols)} categorical columns")
# === STEP 6: Data Type Cleanup ===
# Convert nullable integers (Int64) to standard integers for XGBoost
for c in binary_cols:
if pd.api.types.is_integer_dtype(df[c]):
# Fill any NaN values with 0 and convert to int
df[c] = df[c].fillna(0).astype(int)
print(f"βœ… Feature engineering complete: {df.shape[1]} final features")
return df