import os
import streamlit as st
import pandas as pd
import numpy as np
import requests
import urllib3
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Suppress SSL warnings for local development
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "data")
DATA_PATH = os.path.join(DATA_DIR, "Telco-Customer-Churn.csv")

# Direct download URL from IBM GitHub repository
DATA_URL = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"

CATEGORICAL_COLS = [
    "gender", "Partner", "Dependents", "PhoneService", "MultipleLines",
    "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies", "Contract",
    "PaperlessBilling", "PaymentMethod",
]

# SeniorCitizen is already stored as 0/1 in the source CSV, so it is treated as numeric.
NUMERIC_COLS = ["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]


def download_data_if_needed():
    """Download the Telco Customer Churn dataset from IBM GitHub if it doesn't exist locally."""
    if not os.path.exists(DATA_PATH):
        os.makedirs(DATA_DIR, exist_ok=True)
        
        with st.spinner("📥 Downloading Telco Customer Churn dataset from IBM (one-time only)..."):
            try:
                # Download the CSV from IBM's GitHub repository.
                # verify=False disables TLS certificate verification; keep this
                # only as a local-development workaround for certificate issues.
                response = requests.get(DATA_URL, timeout=60, verify=False)
                response.raise_for_status()
                
                # Save CSV
                with open(DATA_PATH, 'wb') as f:
                    f.write(response.content)
                
                st.success("✅ Dataset downloaded successfully!")
                
            except Exception as e:
                st.error(f"Failed to download dataset: {e}")
                st.info(
                    "**Alternative:** Download manually from "
                    "https://github.com/IBM/telco-customer-churn-on-icp4d/tree/master/data "
                    "and place 'Telco-Customer-Churn.csv' in the data/ folder."
                )
                # Re-raise with context so callers see the original failure chain.
                raise RuntimeError(f"Could not download dataset: {e}") from e


@st.cache_data
def load_raw_data() -> pd.DataFrame:
    """Load and clean the raw dataset: numeric TotalCharges, binary Churn (1 = Yes)."""
    download_data_if_needed()
    df = pd.read_csv(DATA_PATH)
    # TotalCharges is parsed as strings because of blank entries; coerce to numeric
    # and fill the resulting NaNs with the column median.
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
    df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())
    df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})
    return df


@st.cache_data
def get_encoded_data() -> tuple[pd.DataFrame, dict[str, LabelEncoder]]:
    """Return encoded DataFrame and dict of fitted LabelEncoders (for inverse transforms)."""
    df = load_raw_data().copy()
    encoders: dict[str, LabelEncoder] = {}
    for col in CATEGORICAL_COLS:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        encoders[col] = le
    return df, encoders
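
# Illustration (hypothetical snippet, not executed by the app): the returned encoders
# map the integer codes back to the original labels, which is handy for UI display:
#
#     df_enc, encoders = get_encoded_data()
#     encoders["Contract"].inverse_transform([0, 1, 2])
#     # -> array(['Month-to-month', 'One year', 'Two year'], dtype=object)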


@st.cache_data
def get_train_test(test_size: float = 0.2, random_state: int = 42):
    """Stratified train/test split over the label-encoded feature matrix."""
    df, encoders = get_encoded_data()
    feature_cols = CATEGORICAL_COLS + NUMERIC_COLS
    X = df[feature_cols]
    y = df["Churn"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    return X_train, X_test, y_train, y_test, encoders, feature_cols


@st.cache_data
def get_onehot_train_test(test_size: float = 0.2, random_state: int = 42):
    """One-Hot Encoded data for Logistic Regression. Same split indices as get_train_test."""
    df = load_raw_data().copy()
    df_oh = pd.get_dummies(df, columns=CATEGORICAL_COLS, drop_first=True)
    feature_cols_oh = [c for c in df_oh.columns if c not in ["customerID", "Churn"]]
    X = df_oh[feature_cols_oh]
    y = df_oh["Churn"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    return X_train, X_test, y_train, y_test, feature_cols_oh
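
# Note on the resulting schema (a sketch of what pd.get_dummies produces here, for
# readers interpreting coefficients): with drop_first=True each category's first
# level becomes the implicit baseline, and the remaining levels appear as columns
# named like "Contract_One year" or "InternetService_Fiber optic".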


@st.cache_data
def get_scaled_train_test(test_size: float = 0.2, random_state: int = 42):
    """Return scaled features (needed for SGDClassifier, Logistic Regression, and Naive Bayes)."""
    X_train, X_test, y_train, y_test, encoders, feature_cols = get_train_test(
        test_size, random_state
    )
    scaler = StandardScaler()
    X_train_sc = pd.DataFrame(
        scaler.fit_transform(X_train), columns=feature_cols, index=X_train.index
    )
    X_test_sc = pd.DataFrame(
        scaler.transform(X_test), columns=feature_cols, index=X_test.index
    )
    return X_train_sc, X_test_sc, y_train, y_test, encoders, feature_cols, scaler
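

if __name__ == "__main__":
    # Minimal smoke test (an addition for convenience, not part of the Streamlit app):
    # run this module directly with Python to exercise the download and preprocessing
    # paths. Streamlit calls (spinner, success, cache) degrade to warnings outside
    # `streamlit run`, so only the pandas/sklearn logic is verified here.
    X_train, X_test, y_train, y_test, encoders, feature_cols = get_train_test()
    print(f"train: {X_train.shape}, test: {X_test.shape}, n_features: {len(feature_cols)}")
    print(f"churn rate (train): {y_train.mean():.3f}")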