Spaces:

logan-codes
/

telco-churn-predictor

Sleeping

App Files Files Community

telco-churn-predictor / src /features /build_features.py

logan-codes

Add Dockerfile, Gradio app, and core src modules

4ba360f about 2 months ago

raw

history blame contribute delete

4.59 kB

	import pandas as pd


	def _map_binary_series(s: pd.Series) -> pd.Series:
	"""
	Apply deterministic binary encoding to 2-category features.

	This function implements the core binary encoding logic that converts
	categorical features with exactly 2 values into 0/1 integers. The mappings
	are deterministic and must be consistent between training and serving.

	"""
	# Get unique values and remove NaN
	vals = list(pd.Series(s.dropna().unique()).astype(str))
	valset = set(vals)

	# === DETERMINISTIC BINARY MAPPINGS ===
	# CRITICAL: These exact mappings are hardcoded in serving pipeline

	# Yes/No mapping (most common pattern in telecom data)
	if valset == {"Yes", "No"}:
	return s.map({"No": 0, "Yes": 1}).astype("Int64")

	# Gender mapping (demographic feature)
	if valset == {"Male", "Female"}:
	return s.map({"Female": 0, "Male": 1}).astype("Int64")

	# === GENERIC BINARY MAPPING ===
	# For any other 2-category feature, use stable alphabetical ordering
	if len(vals) == 2:
	# Sort values to ensure consistent mapping across runs
	sorted_vals = sorted(vals)
	mapping = {sorted_vals[0]: 0, sorted_vals[1]: 1}
	return s.astype(str).map(mapping).astype("Int64")

	# === NON-BINARY FEATURES ===
	# Return unchanged - will be handled by one-hot encoding
	return s


	def build_features(df: pd.DataFrame, target_col: str = "Churn") -> pd.DataFrame:
	    """
	    Apply the complete feature engineering pipeline for training data.

	    Object-dtype columns (other than ``target_col``) are split by the number
	    of distinct non-missing values: columns with exactly two are encoded to
	    0/1 via ``_map_binary_series``; columns with more than two are one-hot
	    encoded with ``drop_first=True``. Boolean columns are converted to
	    integers. Progress is reported with ``print``.

	    Per the original notes, these transformations must be replicated exactly
	    in the serving pipeline.

	    Args:
	        df: Raw input data. It is copied; the caller's frame is not modified.
	        target_col: Name of the target column, which is left untouched.

	    Returns:
	        A new DataFrame with the engineered features.
	    """
	    df = df.copy()
	    print(f"🔧 Starting feature engineering on {df.shape[1]} columns...")

	    # === STEP 1: Identify Feature Types ===
	    # Find categorical columns (object dtype) excluding the target variable
	    obj_cols = [c for c in df.select_dtypes(include=["object"]).columns if c != target_col]
	    numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()

	    print(f" 📊 Found {len(obj_cols)} categorical and {len(numeric_cols)} numeric columns")

	    # === STEP 2: Split Categorical by Cardinality ===
	    # Binary features (exactly 2 unique values) get binary encoding
	    # Multi-category features (>2 unique values) get one-hot encoding
	    # nunique() ignores missing values by default; compute it once per column.
	    # NOTE(review): object columns with fewer than 2 distinct values fall in
	    # neither group and are returned as-is.
	    cardinality = {c: df[c].nunique() for c in obj_cols}
	    binary_cols = [c for c in obj_cols if cardinality[c] == 2]
	    multi_cols = [c for c in obj_cols if cardinality[c] > 2]

	    print(f" 🔢 Binary features: {len(binary_cols)} | Multi-category features: {len(multi_cols)}")
	    if binary_cols:
	        print(f" Binary: {binary_cols}")
	    if multi_cols:
	        print(f" Multi-category: {multi_cols}")

	    # === STEP 3: Apply Binary Encoding ===
	    # Convert 2-category features to 0/1 using deterministic mappings
	    for c in binary_cols:
	        original_dtype = df[c].dtype
	        # Stringify only the values that are present. Stringifying missing
	        # values would turn them into a third category ("nan"/"None") and
	        # the column would come back unencoded.
	        as_text = df[c].astype(str).where(df[c].notna())
	        df[c] = _map_binary_series(as_text)
	        print(f" ✅ {c}: {original_dtype} → binary (0/1)")

	    # === STEP 4: Convert Boolean Columns ===
	    # XGBoost requires integer inputs, not boolean
	    bool_cols = df.select_dtypes(include=["bool"]).columns.tolist()
	    if bool_cols:
	        df[bool_cols] = df[bool_cols].astype(int)
	        print(f" 🔄 Converted {len(bool_cols)} boolean columns to int: {bool_cols}")

	    # === STEP 5: One-Hot Encoding for Multi-Category Features ===
	    # CRITICAL: drop_first=True prevents multicollinearity
	    if multi_cols:
	        print(f" 🌟 Applying one-hot encoding to {len(multi_cols)} multi-category columns...")
	        original_shape = df.shape

	        # Apply one-hot encoding with drop_first=True (same as serving).
	        # dtype=int keeps the dummy columns integer: pandas >= 2.0 would
	        # otherwise emit bool columns after Step 4 has already run.
	        df = pd.get_dummies(df, columns=multi_cols, drop_first=True, dtype=int)

	        # get_dummies removes the source columns, so add them back to count
	        # only the newly created dummy columns.
	        new_features = df.shape[1] - original_shape[1] + len(multi_cols)
	        print(f" ✅ Created {new_features} new features from {len(multi_cols)} categorical columns")

	    # === STEP 6: Data Type Cleanup ===
	    # Convert nullable integers (Int64) to standard integers for XGBoost
	    for c in binary_cols:
	        if pd.api.types.is_integer_dtype(df[c]):
	            # Fill any NaN values with 0 and convert to int
	            df[c] = df[c].fillna(0).astype(int)

	    print(f"✅ Feature engineering complete: {df.shape[1]} final features")
	    return df