# ml-demo/utils/data_loader.py
# Dataset download, cleaning, encoding, and train/test-split helpers for the
# Telco Customer Churn demo. (Last change: "fixed somethings", commit 99592de.)
import os
import streamlit as st
import pandas as pd
import numpy as np
import requests
import urllib3
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Suppress SSL warnings for local development
# (download_data_if_needed passes verify=False to requests, which would
# otherwise emit an InsecureRequestWarning on every call).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Dataset location: ../data/Telco-Customer-Churn.csv relative to this module.
DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "data")
DATA_PATH = os.path.join(DATA_DIR, "Telco-Customer-Churn.csv")
# Direct download URL from IBM GitHub repository
DATA_URL = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
# Columns label-encoded by get_encoded_data / one-hot encoded by get_onehot_train_test.
CATEGORICAL_COLS = [
"gender", "Partner", "Dependents", "PhoneService", "MultipleLines",
"InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection",
"TechSupport", "StreamingTV", "StreamingMovies", "Contract",
"PaperlessBilling", "PaymentMethod",
]
# Columns used as-is (TotalCharges is coerced to numeric in load_raw_data).
NUMERIC_COLS = ["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]
def download_data_if_needed():
    """Download the Telco Customer Churn dataset from IBM GitHub if it doesn't exist locally.

    Downloads to a temporary ``.part`` file and atomically renames it into
    place, so a failed or interrupted download never leaves a partial CSV at
    DATA_PATH (a partial file would make the existence check pass forever and
    poison every later ``pd.read_csv``).

    Raises:
        RuntimeError: if the download or write fails; the original exception
            is attached as ``__cause__``.
    """
    if os.path.exists(DATA_PATH):
        return
    os.makedirs(DATA_DIR, exist_ok=True)
    tmp_path = DATA_PATH + ".part"
    with st.spinner("📥 Downloading Telco Customer Churn dataset from IBM (one-time only)..."):
        try:
            # SECURITY NOTE: verify=False disables TLS certificate validation;
            # kept deliberately to work around local SSL certificate issues,
            # but it exposes the download to man-in-the-middle tampering.
            response = requests.get(DATA_URL, timeout=60, verify=False)
            response.raise_for_status()
            # Write to a temp file first, then atomically move into place.
            with open(tmp_path, 'wb') as f:
                f.write(response.content)
            os.replace(tmp_path, DATA_PATH)
            st.success("✅ Dataset downloaded successfully!")
        except (requests.RequestException, OSError) as e:
            st.error(f"Failed to download dataset: {str(e)}")
            st.info(
                "**Alternative:** Download manually from "
                "https://github.com/IBM/telco-customer-churn-on-icp4d/tree/master/data "
                "and place 'Telco-Customer-Churn.csv' in the data/ folder."
            )
            # Chain the cause so the full traceback survives for debugging.
            raise RuntimeError(f"Could not download dataset: {e}") from e
@st.cache_data
def load_raw_data() -> pd.DataFrame:
    """Load the churn CSV (downloading it first if absent) and clean it.

    Cleaning: coerce TotalCharges to numeric (blanks become NaN and are
    imputed with the column median) and map the Churn target to 1/0.
    """
    download_data_if_needed()
    frame = pd.read_csv(DATA_PATH)
    # Blank TotalCharges entries parse as NaN under errors="coerce";
    # impute them with the column median.
    total = pd.to_numeric(frame["TotalCharges"], errors="coerce")
    frame["TotalCharges"] = total.fillna(total.median())
    # Binary target: "Yes" -> 1, "No" -> 0.
    frame["Churn"] = frame["Churn"].map({"Yes": 1, "No": 0})
    return frame
@st.cache_data
def get_encoded_data() -> tuple[pd.DataFrame, dict[str, LabelEncoder]]:
    """Return the cleaned frame with categoricals label-encoded, plus the
    fitted LabelEncoder per column (kept for inverse transforms)."""
    frame = load_raw_data().copy()
    fitted: dict[str, LabelEncoder] = {}
    for column in CATEGORICAL_COLS:
        encoder = LabelEncoder()
        # astype(str) guards against mixed/NaN values before encoding.
        frame[column] = encoder.fit_transform(frame[column].astype(str))
        fitted[column] = encoder
    return frame, fitted
@st.cache_data
def get_train_test(test_size: float = 0.2, random_state: int = 42):
    """Stratified train/test split over the label-encoded features.

    Returns (X_train, X_test, y_train, y_test, encoders, feature_cols).
    """
    frame, fitted_encoders = get_encoded_data()
    columns = CATEGORICAL_COLS + NUMERIC_COLS
    features = frame[columns]
    target = frame["Churn"]
    # Stratify on the target so both halves keep the churn class ratio.
    split = train_test_split(
        features, target, test_size=test_size, stratify=target, random_state=random_state
    )
    X_train, X_test, y_train, y_test = split
    return X_train, X_test, y_train, y_test, fitted_encoders, columns
@st.cache_data
def get_onehot_train_test(test_size: float = 0.2, random_state: int = 42):
    """One-Hot Encoded data for Logistic Regression. Same split indices as get_train_test."""
    encoded = pd.get_dummies(
        load_raw_data().copy(), columns=CATEGORICAL_COLS, drop_first=True
    )
    # Everything except the identifier and the target is a feature.
    feature_names = [
        col for col in encoded.columns if col not in ("customerID", "Churn")
    ]
    features = encoded[feature_names]
    target = encoded["Churn"]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, stratify=target, random_state=random_state
    )
    return X_train, X_test, y_train, y_test, feature_names
@st.cache_data
def get_scaled_train_test(test_size: float = 0.2, random_state: int = 42):
    """Return standard-scaled features (needed for SGDClassifier, Logistic
    Regression, and Naive Bayes), fit on the training split only."""
    X_train, X_test, y_train, y_test, encoders, feature_cols = get_train_test(
        test_size, random_state
    )
    # Fit on train only; apply the same transform to test (no leakage).
    scaler = StandardScaler().fit(X_train)

    def _as_frame(values, like):
        # Re-wrap the scaled ndarray with the original columns/index.
        return pd.DataFrame(values, columns=feature_cols, index=like.index)

    X_train_sc = _as_frame(scaler.transform(X_train), X_train)
    X_test_sc = _as_frame(scaler.transform(X_test), X_test)
    return X_train_sc, X_test_sc, y_train, y_test, encoders, feature_cols, scaler