Spaces:
Sleeping
Sleeping
File size: 1,381 Bytes
4ba360f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | import pandas as pd
def preprocess_data(df: pd.DataFrame, target_col: str = "Churn") -> pd.DataFrame:
    """Return a cleaned copy of a Telco-churn style DataFrame.

    The input frame is never modified; all cleaning happens on a copy.

    Steps, in order:

    1. Strip leading/trailing whitespace from column names.
    2. Drop the ID columns ``customerID``, ``CustomerID`` and ``customer_id``
       when present.
    3. If ``target_col`` holds strings, strip them and map ``"No"`` -> 0 and
       ``"Yes"`` -> 1. Any other value (including missing) becomes NaN.
    4. Coerce ``TotalCharges`` to numeric; unparseable entries (for example
       blank strings) become NaN and are zero-filled in step 6.
    5. Fill missing ``SeniorCitizen`` values with 0 and cast to ``int``.
    6. Fill NaN with 0 in every numeric column except ``target_col``.
       Non-numeric columns are left untouched.

    Args:
        df: Raw input data.
        target_col: Name of the label column. It may be absent from ``df``.

    Returns:
        A new DataFrame with the cleaning steps applied. Rows whose target
        label is missing or unrecognised keep NaN in ``target_col`` so the
        caller can decide how to handle them, rather than having them
        silently relabelled as class 0.
    """
    # Work on a copy: both the header rename and the column assignments
    # below would otherwise write into the caller's frame.
    df = df.copy()

    # Tidy headers before any name-based lookup so " customerID " matches.
    df.columns = df.columns.str.strip()

    # errors="ignore" skips whichever ID spellings are not present.
    df = df.drop(
        columns=["customerID", "CustomerID", "customer_id"], errors="ignore"
    )

    if target_col in df.columns:
        target_dtype = df[target_col].dtype
        # Match object columns as well as pandas' dedicated string dtypes;
        # a plain `dtype == "object"` comparison misses the latter.
        if pd.api.types.is_object_dtype(target_dtype) or pd.api.types.is_string_dtype(
            target_dtype
        ):
            df[target_col] = df[target_col].str.strip().map({"No": 0, "Yes": 1})

    # TotalCharges may contain blanks -> coerce them to NaN instead of raising.
    if "TotalCharges" in df.columns:
        df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

    # SeniorCitizen is stored as a 0/1 integer flag; missing is treated as 0.
    if "SeniorCitizen" in df.columns:
        df["SeniorCitizen"] = df["SeniorCitizen"].fillna(0).astype(int)

    # Simple NA strategy: numeric features get 0, other columns are left for
    # downstream encoders. The target is excluded on purpose: zero-filling it
    # would turn unknown labels into class 0.
    fill_cols = [
        col
        for col in df.select_dtypes(include=["number"]).columns
        if col != target_col
    ]
    if fill_cols:
        df[fill_cols] = df[fill_cols].fillna(0)

    return df