"""Data loading and preprocessing utilities for the Telco Customer Churn app."""
| import os | |
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import requests | |
| import urllib3 | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import LabelEncoder, StandardScaler | |
# Suppress SSL warnings for local development
# (pairs with verify=False in download_data_if_needed below)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Local cache location for the dataset, relative to this file's parent dir.
DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "data")
DATA_PATH = os.path.join(DATA_DIR, "Telco-Customer-Churn.csv")

# Direct download URL from IBM GitHub repository
DATA_URL = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"

# Columns treated as categorical features (label- or one-hot-encoded downstream).
CATEGORICAL_COLS = [
    "gender", "Partner", "Dependents", "PhoneService", "MultipleLines",
    "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies", "Contract",
    "PaperlessBilling", "PaymentMethod",
]
# Columns used as-is numerically (TotalCharges is coerced/imputed in load_raw_data).
NUMERIC_COLS = ["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]
def download_data_if_needed():
    """Download the Telco Customer Churn dataset from IBM GitHub if it doesn't exist locally.

    No-op when the file already exists at DATA_PATH.

    Raises:
        RuntimeError: If the download fails for any reason (network error,
            HTTP error status, write failure). The original exception is
            chained as the cause.
    """
    if os.path.exists(DATA_PATH):
        return  # already cached locally — nothing to do
    os.makedirs(DATA_DIR, exist_ok=True)
    with st.spinner("📥 Downloading Telco Customer Churn dataset from IBM (one-time only)..."):
        try:
            # Download CSV from IBM GitHub.
            # SECURITY NOTE: verify=False disables TLS certificate checking;
            # kept deliberately for local SSL certificate issues, but it
            # should not ship to production.
            response = requests.get(DATA_URL, timeout=60, verify=False)
            response.raise_for_status()
            # Save CSV
            with open(DATA_PATH, 'wb') as f:
                f.write(response.content)
            st.success("✅ Dataset downloaded successfully!")
        except Exception as e:
            st.error(f"Failed to download dataset: {str(e)}")
            st.info(
                "**Alternative:** Download manually from "
                "https://github.com/IBM/telco-customer-churn-on-icp4d/tree/master/data "
                "and place 'Telco-Customer-Churn.csv' in the data/ folder."
            )
            # Raise a specific exception type and preserve the causal chain
            # (the original raised a bare Exception without `from e`).
            raise RuntimeError(f"Could not download dataset: {e}") from e
def load_raw_data() -> pd.DataFrame:
    """Read the churn CSV (downloading it first if absent) and clean it.

    Cleaning steps: coerce TotalCharges to numeric (blanks become NaN),
    impute those NaNs with the column median, and map Churn to 1/0.
    """
    download_data_if_needed()
    frame = pd.read_csv(DATA_PATH)
    # Blank strings in TotalCharges become NaN, then get the median value.
    total = pd.to_numeric(frame["TotalCharges"], errors="coerce")
    frame["TotalCharges"] = total.fillna(total.median())
    # Binary target: "Yes" -> 1, "No" -> 0.
    frame["Churn"] = frame["Churn"].map({"Yes": 1, "No": 0})
    return frame
def get_encoded_data() -> tuple[pd.DataFrame, dict[str, LabelEncoder]]:
    """Return encoded DataFrame and dict of fitted LabelEncoders (for inverse transforms)."""
    frame = load_raw_data().copy()
    fitted: dict[str, LabelEncoder] = {}
    for name in CATEGORICAL_COLS:
        encoder = LabelEncoder()
        # Cast to str first so NaN/mixed values encode consistently.
        frame[name] = encoder.fit_transform(frame[name].astype(str))
        fitted[name] = encoder
    return frame, fitted
def get_train_test(test_size: float = 0.2, random_state: int = 42):
    """Split the label-encoded data into stratified train/test sets.

    Returns:
        (X_train, X_test, y_train, y_test, encoders, feature_cols)
    """
    encoded, encoders = get_encoded_data()
    feature_cols = CATEGORICAL_COLS + NUMERIC_COLS
    features, target = encoded[feature_cols], encoded["Churn"]
    # Stratify on the target so both splits keep the same churn rate.
    split = train_test_split(
        features,
        target,
        test_size=test_size,
        stratify=target,
        random_state=random_state,
    )
    X_train, X_test, y_train, y_test = split
    return X_train, X_test, y_train, y_test, encoders, feature_cols
def get_onehot_train_test(test_size: float = 0.2, random_state: int = 42):
    """One-Hot Encoded data for Logistic Regression. Same split indices as get_train_test."""
    raw = load_raw_data().copy()
    # drop_first avoids the dummy-variable trap for linear models.
    onehot = pd.get_dummies(raw, columns=CATEGORICAL_COLS, drop_first=True)
    excluded = {"customerID", "Churn"}
    feature_cols_oh = [name for name in onehot.columns if name not in excluded]
    features = onehot[feature_cols_oh]
    target = onehot["Churn"]
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        stratify=target,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test, feature_cols_oh
def get_scaled_train_test(test_size: float = 0.2, random_state: int = 42):
    """Return scaled features (needed for SGDClassifier, Logistic Regression, and Naive Bayes)."""
    X_train, X_test, y_train, y_test, encoders, feature_cols = get_train_test(
        test_size, random_state
    )
    scaler = StandardScaler()
    # Fit on train only; transform test with the same statistics to avoid leakage.
    def _as_frame(values, template):
        return pd.DataFrame(values, columns=feature_cols, index=template.index)

    X_train_sc = _as_frame(scaler.fit_transform(X_train), X_train)
    X_test_sc = _as_frame(scaler.transform(X_test), X_test)
    return X_train_sc, X_test_sc, y_train, y_test, encoders, feature_cols, scaler