Spaces:
Sleeping
Sleeping
File size: 1,164 Bytes
07e37a4 4ba360f 07e37a4 4ba360f 07e37a4 4ba360f 07e37a4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | import os, sys
import pandas as pd
# make src importable
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from src.data_scripts.preprocess import preprocess_data
from src.features.build_features import build_features
# Input and output CSV locations.
# NOTE(review): both are absolute, drive-specific Windows paths, so the script
# only runs on a machine that has exactly this directory layout.
RAW = r"E:\Coding\AIMl\Churn\data\raw\WA_Fn-UseC_-Telco-Customer-Churn.csv"
OUT = r"E:\Coding\AIMl\Churn\data\processed\WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Steps 1-2: read the raw CSV and pass it straight to the project's
# preprocessing routine with "Churn" as the target column. (Per the original
# note, that routine drops the id column and repairs TotalCharges — its
# implementation is not visible from this script.)
df = preprocess_data(pd.read_csv(RAW), target_col="Churn")
# 3) normalize the target column to integer 0/1.
# The labels may arrive as "Yes"/"No" text or already encoded as 0/1 (possibly
# as floats). NOTE(review): which form preprocess_data returns is not visible
# from this script — confirm. Both spellings are accepted here because a
# yes/no-only mapping turns an already-encoded column into all-NaN and aborts
# the run.
_CHURN_LABELS = {
    "no": 0,
    "yes": 1,
    "0": 0,
    "1": 1,
    "0.0": 0,
    "1.0": 1,
}
_churn_text = df["Churn"].astype(str).str.strip().str.lower()
_churn_encoded = _churn_text.map(_CHURN_LABELS)

# sanity check: Series.map leaves NaN for anything outside the label table, so
# one NaN test covers both "missing" and "not 0/1". Raise explicitly rather
# than assert so the check still runs under `python -O`.
_unmapped = _churn_encoded.isna()
if _unmapped.any():
    _unexpected = sorted(_churn_text[_unmapped].unique())
    raise ValueError(
        f"Churn not 0/1 after preprocess; unexpected values: {_unexpected}"
    )

# With no NaNs left, store the column as a plain integer dtype.
df["Churn"] = _churn_encoded.astype("int64")
# Step 4: derive the model-ready feature table from the cleaned frame.
df_processed = build_features(df, target_col="Churn")

# Step 5: make sure the destination folder exists, then write the CSV without
# the index column and report where it went and how large it is.
_out_dir = os.path.dirname(OUT)
os.makedirs(_out_dir, exist_ok=True)
df_processed.to_csv(OUT, index=False)
print(f"✅ Processed dataset saved to {OUT} | Shape: {df_processed.shape}")
|