File size: 1,164 Bytes
07e37a4
 
 
 
 
 
4ba360f
07e37a4
 
4ba360f
 
07e37a4
 
 
 
 
 
 
 
4ba360f
 
 
 
 
 
 
 
07e37a4
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os, sys
import pandas as pd

# make src importable
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from src.data_scripts.preprocess import preprocess_data
from src.features.build_features import build_features

# Absolute, machine-specific locations of the raw input CSV and of the
# processed CSV this script writes.
RAW = r"E:\Coding\AIMl\Churn\data\raw\WA_Fn-UseC_-Telco-Customer-Churn.csv"
OUT = r"E:\Coding\AIMl\Churn\data\processed\WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Accepted spellings of the binary target, compared after str()/strip/lower.
# The numeric and boolean spellings keep this step working when the target
# has already been encoded before it reaches the normalisation below.
_TARGET_LABELS = {
    "no": 0,
    "yes": 1,
    "0": 0,
    "1": 1,
    "0.0": 0,
    "1.0": 1,
    "false": 0,
    "true": 1,
}


def _encode_binary_target(target: pd.Series) -> pd.Series:
    """Return *target* encoded as 0/1.

    Each value is converted to text, stripped and lower-cased, then looked up
    in ``_TARGET_LABELS``.

    Args:
        target: the raw target column.

    Returns:
        A Series with the same index as *target* holding only 0 and 1.

    Raises:
        ValueError: if any value (missing values included) is not one of the
            recognised labels.
    """
    labels = target.astype(str).str.strip().str.lower()
    encoded = labels.map(_TARGET_LABELS)

    # ``map`` yields NaN for every label that is not in the lookup table, so a
    # single NaN check covers both "missing" and "not 0/1".
    unmapped = encoded.isna()
    if unmapped.any():
        bad = sorted(labels[unmapped].unique())
        # Raise explicitly instead of using ``assert``: assertions are removed
        # when Python runs with -O, which would let bad data through silently.
        raise ValueError(f"Churn has unmapped values after preprocess: {bad}")
    return encoded


# 1) load raw
df = pd.read_csv(RAW)

# 2) preprocess (drops id, fixes TotalCharges, etc.)
df = preprocess_data(df, target_col="Churn")

# 3) normalise the target to 0/1 and validate it, whether it is still
#    yes/no text or already numeric
df["Churn"] = _encode_binary_target(df["Churn"])

# 4) features
df_processed = build_features(df, target_col="Churn")

# 5) save (create the output directory first so to_csv cannot fail on it)
os.makedirs(os.path.dirname(OUT), exist_ok=True)
df_processed.to_csv(OUT, index=False)
print(f"✅ Processed dataset saved to {OUT} | Shape: {df_processed.shape}")