Spaces:
Sleeping
Sleeping
File size: 4,589 Bytes
4ba360f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 | import pandas as pd
def _map_binary_series(s: pd.Series) -> pd.Series:
    """
    Encode a two-category feature as deterministic 0/1 integers.

    The distinct non-null values of ``s`` are compared in string form.
    Two well-known pairs get a fixed mapping, any other pair is ordered
    alphabetically (first -> 0, second -> 1), and a series that does not
    hold exactly two categories is handed back untouched so that it can be
    one-hot encoded later.

    Args:
        s: The feature column to encode.

    Returns:
        A nullable ``Int64`` series when the column is binary (entries that
        match neither category become ``<NA>``); otherwise ``s`` itself.
    """
    # Distinct categories with missing values removed, as strings.
    distinct = pd.Series(s.dropna().unique()).astype(str).tolist()
    labels = set(distinct)

    # Fixed mappings, checked in order: Yes/No first, then gender.
    # NOTE: per the original comment these exact mappings are also hardcoded
    # in the serving pipeline, so they must not be changed independently.
    fixed_tables = (
        {"No": 0, "Yes": 1},
        {"Female": 0, "Male": 1},
    )
    for table in fixed_tables:
        if labels == set(table):
            return s.map(table).astype("Int64")

    # Anything that is not exactly two categories is left for one-hot encoding.
    if len(distinct) != 2:
        return s

    # Generic binary feature: alphabetical order keeps the mapping stable
    # from one run to the next.
    low, high = sorted(distinct)
    return s.astype(str).map({low: 0, high: 1}).astype("Int64")
def build_features(df: pd.DataFrame, target_col: str = "Churn") -> pd.DataFrame:
    """
    Apply the complete feature engineering pipeline for training data.

    The input frame is copied first, so the caller's ``df`` is not modified.
    Progress messages are printed to stdout.

    Pipeline:
      1. Object-dtype columns other than ``target_col`` are treated as
         categorical features.
      2. Categorical columns with exactly two distinct non-null values are
         encoded to 0/1 via ``_map_binary_series``.
      3. Boolean columns are cast to int.
      4. Categorical columns with more than two distinct non-null values are
         one-hot encoded with ``drop_first=True`` into integer 0/1 columns.
      5. Binary columns held as nullable integers are cast to plain ``int``;
         missing values in those columns become 0.

    Object columns with fewer than two distinct non-null values are neither
    binary nor multi-category and pass through unchanged.

    Args:
        df: Raw input data.
        target_col: Name of the target column; it is excluded from the
            categorical encoding steps.

    Returns:
        A new DataFrame containing the engineered features.
    """
    df = df.copy()
    print(f"Starting feature engineering on {df.shape[1]} columns...")

    # === STEP 1: Identify Feature Types ===
    # Categorical columns are the object-dtype columns, minus the target.
    obj_cols = [c for c in df.select_dtypes(include=["object"]).columns if c != target_col]
    # numeric_cols is only used for the progress message below.
    numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
    print(f" Found {len(obj_cols)} categorical and {len(numeric_cols)} numeric columns")

    # === STEP 2: Split Categorical by Cardinality ===
    # nunique() ignores missing values by default; compute it once per column.
    cardinality = {c: df[c].nunique() for c in obj_cols}
    binary_cols = [c for c in obj_cols if cardinality[c] == 2]
    multi_cols = [c for c in obj_cols if cardinality[c] > 2]
    print(f" Binary features: {len(binary_cols)} | Multi-category features: {len(multi_cols)}")
    if binary_cols:
        print(f" Binary: {binary_cols}")
    if multi_cols:
        print(f" Multi-category: {multi_cols}")

    # === STEP 3: Apply Binary Encoding ===
    for c in binary_cols:
        original_dtype = df[c].dtype
        # Stringify only the non-null entries. A plain astype(str) turns
        # missing values into the literal "nan", which counts as a third
        # category and makes _map_binary_series return the column unencoded.
        as_text = df[c].astype(str).where(df[c].notna())
        df[c] = _map_binary_series(as_text)
        print(f" {c}: {original_dtype} -> binary (0/1)")

    # === STEP 4: Convert Boolean Columns ===
    # The model is expected to receive integer inputs, not booleans.
    bool_cols = df.select_dtypes(include=["bool"]).columns.tolist()
    if bool_cols:
        df[bool_cols] = df[bool_cols].astype(int)
        print(f" Converted {len(bool_cols)} boolean columns to int: {bool_cols}")

    # === STEP 5: One-Hot Encoding for Multi-Category Features ===
    # drop_first=True drops one level per feature to avoid multicollinearity.
    if multi_cols:
        print(f" Applying one-hot encoding to {len(multi_cols)} multi-category columns...")
        original_shape = df.shape
        # dtype=int keeps the dummy columns integer; recent pandas versions
        # would otherwise emit bool columns after the bool cast in step 4.
        df = pd.get_dummies(df, columns=multi_cols, drop_first=True, dtype=int)
        # get_dummies removes the source columns, so add them back to the
        # difference to count only the newly created dummy columns.
        new_features = df.shape[1] - original_shape[1] + len(multi_cols)
        print(f" Created {new_features} new features from {len(multi_cols)} categorical columns")

    # === STEP 6: Data Type Cleanup ===
    # Convert nullable integers (Int64) to plain integers. Missing values are
    # filled with 0, i.e. they are treated as the category mapped to 0.
    for c in binary_cols:
        if pd.api.types.is_integer_dtype(df[c]):
            df[c] = df[c].fillna(0).astype(int)

    print(f"Feature engineering complete: {df.shape[1]} final features")
    return df
|