Spaces:

logan-codes
/

telco-churn-predictor

Sleeping

Add Dockerfile, Gradio app, and core src modules

4ba360f about 1 month ago

1.38 kB

	import pandas as pd


	def preprocess_data(df: pd.DataFrame, target_col: str = "Churn") -> pd.DataFrame:
	"""
	Basic cleaning for Telco churn.
	- trim column names
	- drop obvious ID cols
	- fix TotalCharges to numeric
	- map target Churn to 0/1 if needed
	- simple NA handling
	"""
	# tidy headers
	df.columns = df.columns.str.strip() # Remove leading/trailing whitespace

	# drop ids if present
	for col in ["customerID", "CustomerID", "customer_id"]:
	if col in df.columns:
	df = df.drop(columns=[col])

	# target to 0/1 if it's Yes/No
	if target_col in df.columns and df[target_col].dtype == "object":
	df[target_col] = df[target_col].str.strip().map({"No": 0, "Yes": 1})

	# TotalCharges often has blanks in this dataset -> coerce to float
	if "TotalCharges" in df.columns:
	df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

	# SeniorCitizen should be 0/1 ints if present
	if "SeniorCitizen" in df.columns:
	df["SeniorCitizen"] = df["SeniorCitizen"].fillna(0).astype(int)

	# simple NA strategy:
	# - numeric: fill with 0
	# - others: leave for encoders to handle (get_dummies ignores NaN safely)
	num_cols = df.select_dtypes(include=["number"]).columns
	df[num_cols] = df[num_cols].fillna(0)

	return df