import os

import pandas as pd
import requests
import streamlit as st
import urllib3
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Suppress SSL warnings for local development
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "data")
DATA_PATH = os.path.join(DATA_DIR, "Telco-Customer-Churn.csv")
# Direct download URL from the IBM GitHub repository
DATA_URL = (
    "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/"
    "master/data/Telco-Customer-Churn.csv"
)

CATEGORICAL_COLS = [
    "gender",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
]
NUMERIC_COLS = ["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]


def download_data_if_needed():
    """Download the Telco Customer Churn dataset from IBM's GitHub if it is missing locally."""
    if os.path.exists(DATA_PATH):
        return
    os.makedirs(DATA_DIR, exist_ok=True)
    with st.spinner("📥 Downloading Telco Customer Churn dataset from IBM (one-time only)..."):
        try:
            # verify=False works around local SSL certificate issues on some machines
            response = requests.get(DATA_URL, timeout=60, verify=False)
            response.raise_for_status()
            with open(DATA_PATH, "wb") as f:
                f.write(response.content)
            st.success("✅ Dataset downloaded successfully!")
        except Exception as e:
            st.error(f"Failed to download dataset: {e}")
            st.info(
                "**Alternative:** Download manually from "
                "https://github.com/IBM/telco-customer-churn-on-icp4d/tree/master/data "
                "and place 'Telco-Customer-Churn.csv' in the data/ folder."
            )
            raise RuntimeError(f"Could not download dataset: {e}") from e


@st.cache_data
def load_raw_data() -> pd.DataFrame:
    download_data_if_needed()
    df = pd.read_csv(DATA_PATH)
    # TotalCharges contains blank strings for brand-new customers; coerce to
    # numeric and impute the resulting NaNs with the median
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
    df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())
    df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})
    return df


@st.cache_data
def get_encoded_data() -> tuple[pd.DataFrame, dict[str, LabelEncoder]]:
    """Return the encoded DataFrame and a dict of fitted LabelEncoders (for inverse transforms)."""
    df = load_raw_data().copy()
    encoders: dict[str, LabelEncoder] = {}
    for col in CATEGORICAL_COLS:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        encoders[col] = le
    return df, encoders


@st.cache_data
def get_train_test(test_size: float = 0.2, random_state: int = 42):
    df, encoders = get_encoded_data()
    feature_cols = CATEGORICAL_COLS + NUMERIC_COLS
    X = df[feature_cols]
    y = df["Churn"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    return X_train, X_test, y_train, y_test, encoders, feature_cols

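
# NOTE: the label-encoded features above imply an arbitrary ordering between
# categories, which tree-based models tolerate; the one-hot variant below
# avoids that for linear models such as Logistic Regression. The returned
# encoders support round-trips, e.g.
# encoders["Contract"].inverse_transform([0, 1, 2]) recovers the original
# contract labels for readable display in charts.
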
@st.cache_data
def get_onehot_train_test(test_size: float = 0.2, random_state: int = 42):
    """One-hot encoded data for Logistic Regression.

    Produces the same split indices as get_train_test, because the
    test_size, random_state, and stratify target all match.
    """
    df = load_raw_data().copy()
    df_oh = pd.get_dummies(df, columns=CATEGORICAL_COLS, drop_first=True)
    feature_cols_oh = [c for c in df_oh.columns if c not in ["customerID", "Churn"]]
    X = df_oh[feature_cols_oh]
    y = df_oh["Churn"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    return X_train, X_test, y_train, y_test, feature_cols_oh


@st.cache_data
def get_scaled_train_test(test_size: float = 0.2, random_state: int = 42):
    """Return scaled features (needed for SGDClassifier, Logistic Regression, and Naive Bayes)."""
    X_train, X_test, y_train, y_test, encoders, feature_cols = get_train_test(
        test_size, random_state
    )
    scaler = StandardScaler()
    X_train_sc = pd.DataFrame(
        scaler.fit_transform(X_train), columns=feature_cols, index=X_train.index
    )
    X_test_sc = pd.DataFrame(
        scaler.transform(X_test), columns=feature_cols, index=X_test.index
    )
    return X_train_sc, X_test_sc, y_train, y_test, encoders, feature_cols, scaler
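

# --- Usage sketch ---
# A minimal, hypothetical example of how a page in the app might consume these
# loaders; the model choice (RandomForestClassifier) is an assumption, not part
# of this module. Outside a running Streamlit session the st.* calls log
# warnings instead of rendering, but the functions still execute.
if __name__ == "__main__":
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test, encoders, feature_cols = get_train_test()
    print(f"Train: {X_train.shape}, Test: {X_test.shape}")
    print(f"Churn rate (train): {y_train.mean():.3f}")

    # Label-encoded features are a reasonable fit for a tree-based baseline
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    print(f"Test accuracy: {accuracy_score(y_test, model.predict(X_test)):.3f}")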