Spaces:

aliarafat-stack-ml
/

ml-demo

Sleeping

App Files Files Community

ml-demo / app.py

aliarafat-stack-ml

first commit

f3a6f24 about 2 months ago

raw

history blame contribute delete

13.7 kB

	import streamlit as st
	from utils.data_loader import load_raw_data
	from utils.rec_data_loader import load_clean_transactions, build_interaction_matrix

	st.set_page_config(
	page_title="Data Science Demo",
	page_icon="📊",
	layout="wide",
	initial_sidebar_state="expanded",
	)

	st.title("Data Science Project Demo")
	st.markdown(
	"Demonstration of two core data science objectives: predicting customer churn "
	"and building a product recommendation system — from raw data to deployed models."
	)
	st.markdown("---")

	# ═══════════════════════════════════════════════════════════════════════════════
	# KPI Dashboard — Both Objectives
	# ═══════════════════════════════════════════════════════════════════════════════
	churn_col, rec_col = st.columns(2)

	with churn_col:
	st.markdown("### Customer Churn Prediction")
	df_churn = load_raw_data()
	total_customers = len(df_churn)
	churn_rate = df_churn["Churn"].mean()
	churned_count = int(df_churn["Churn"].sum())
	revenue_at_risk = df_churn.loc[df_churn["Churn"] == 1, "MonthlyCharges"].sum() * 12

	c1, c2 = st.columns(2)
	c1.metric("Total Customers", f"{total_customers:,}")
	c2.metric("Churn Rate", f"{churn_rate:.1%}", delta=f"-{churned_count:,} lost", delta_color="inverse")

	c3, c4 = st.columns(2)
	c3.metric("Avg Monthly Charge", f"${df_churn['MonthlyCharges'].mean():,.2f}")
	c4.metric("Annual Revenue at Risk", f"${revenue_at_risk:,.0f}")

	st.caption("Dataset: Telco Customer Churn — 7,043 customers, 21 features")

	with rec_col:
	st.markdown("### Product Recommendation")
	df_rec = load_clean_transactions()
	interactions, *_ = build_interaction_matrix()
	n_users, n_items = interactions.shape
	n_transactions = len(df_rec)
	sparsity = 1 - ((interactions > 0).sum().sum() / (n_users * n_items))
	total_revenue = df_rec["TotalPrice"].sum()

	r1, r2 = st.columns(2)
	r1.metric("Unique Customers", f"{n_users:,}")
	r2.metric("Unique Products", f"{n_items:,}")

	r3, r4 = st.columns(2)
	r3.metric("Transactions", f"{n_transactions:,}")
	r4.metric("Matrix Sparsity", f"{sparsity:.1%}")

	st.caption("Dataset: UCI Online Retail — 541K transactions, UK e-commerce 2010–2011")

	st.markdown("---")

	# ═══════════════════════════════════════════════════════════════════════════════
	# Two Objectives Overview
	# ═══════════════════════════════════════════════════════════════════════════════
	st.subheader("Two Objectives — Full Pipeline for Each")

	st.markdown(
	"""
	\| Objective \| Problem Type \| Key Algorithms \| Pages \|
	\|---\|---\|---\|---\|
	\| Customer Churn \| Binary Classification \| Logistic Regression, Naive Bayes, Random Forest, XGBoost, SGDClassifier \| EDA → Preprocessing → Churn Models → Live Updates \|
	\| Product Recommendation \| Matrix Factorization \| SVD, ALS, SGD (Funk SVD), NMF, Item-Based CF \| Rec EDA → Rec Preprocessing → Rec Models → Rec Live \|
	"""
	)

	st.markdown("---")

	# ═══════════════════════════════════════════════════════════════════════════════
	# The ML Journey — Step-by-step pipeline
	# ═══════════════════════════════════════════════════════════════════════════════
	st.subheader("The Machine Learning Journey — What We're Doing and Why")
	st.markdown(
	"Both objectives follow the same well-defined pipeline. "
	"Each step has a purpose and builds on the previous one:"
	)

	st.graphviz_chart("""
	digraph pipeline {
	rankdir=LR
	node [shape=box, style="rounded,filled", fontname="Helvetica", fontsize=11, margin="0.3,0.15"]
	edge [color="#888888", penwidth=1.5]

	data [label="1. Data\\nCollection", fillcolor="#dbeafe", color="#3b82f6"]
	eda [label="2. Exploratory\\nData Analysis", fillcolor="#e0e7ff", color="#6366f1"]
	preproc [label="3. Data\\nPreprocessing", fillcolor="#fce7f3", color="#ec4899"]
	train [label="4. Model\\nTraining", fillcolor="#d1fae5", color="#10b981"]
	eval [label="5. Evaluation\\n& Comparison", fillcolor="#fef3c7", color="#f59e0b"]
	deploy [label="6. Live Updates\\n& Deployment", fillcolor="#fee2e2", color="#ef4444"]

	data -> eda -> preproc -> train -> eval -> deploy
	}
	""")

	steps = [
	{
	"num": "1",
	"title": "Data Collection",
	"churn": "Telco Customer Churn — 7,043 customers with demographics, services, billing, and churn labels.",
	"rec": "UCI Online Retail — 541K purchase transactions from a UK e-commerce store.",
	"why": "You can't build a model without data. The quality and relevance of the data determines everything that follows.",
	},
	{
	"num": "2",
	"title": "Exploratory Data Analysis",
	"churn": "Visualize churn distribution, feature histograms, category-based churn rates, correlation heatmaps, and UMAP customer segments.",
	"rec": "Analyze purchase distributions, product popularity (long tail), user-item matrix sparsity, revenue trends, and geography.",
	"why": "Before building models, we need to understand the problem. EDA reveals patterns, catches data issues, and guides modeling choices.",
	},
	{
	"num": "3",
	"title": "Data Preprocessing",
	"churn": "Handle missing values (median imputation), encode categories (Label Encoding for trees, One-Hot for Logistic Regression), scale features (StandardScaler).",
	"rec": "Clean transactions (remove cancellations, missing IDs), aggregate into a user-item matrix, handle implicit feedback.",
	"why": "ML algorithms need clean, structured input. Raw data has missing values, text, and different scales — all of which confuse models.",
	},
	{
	"num": "4",
	"title": "Model Training",
	"churn": "Train Logistic Regression, Naive Bayes, Random Forest, and XGBoost — each with the encoding that's optimal for it.",
	"rec": "Train SVD, ALS, SGD (Funk SVD), NMF, and Item-Based CF — five approaches to matrix factorization and collaborative filtering.",
	"why": "Different algorithms have different strengths. Training multiple models lets us find the best approach for each problem.",
	},
	{
	"num": "5",
	"title": "Evaluation & Comparison",
	"churn": "Compare using Accuracy, Precision, Recall, F1, AUC. Explain predictions with SHAP.",
	"rec": "Compare using Precision@K, Recall@K, Hit Rate. Inspect individual recommendations for quality.",
	"why": "A model is only useful if it's accurate and we can measure how accurate. Different metrics capture different aspects of quality.",
	},
	{
	"num": "6",
	"title": "Live Updates & Deployment",
	"churn": "Incremental learning with SGDClassifier.partial_fit() — update the model as new customer data streams in.",
	"rec": "Monthly batch retraining of ALS — model quality improves as purchase history accumulates over time.",
	"why": "In the real world, data doesn't stop arriving. Deployed models must adapt to new patterns without expensive full retraining.",
	},
	]

	for step in steps:
	with st.expander(f"Step {step['num']}: {step['title']}", expanded=False):
	st.markdown(f"Why: {step['why']}")
	col_churn, col_rec = st.columns(2)
	with col_churn:
	st.markdown(f"Churn Prediction: \n{step['churn']}")
	with col_rec:
	st.markdown(f"Recommendation: \n{step['rec']}")

	st.markdown("---")

	# ═══════════════════════════════════════════════════════════════════════════════
	# How a model actually "learns" — quick primer
	# ═══════════════════════════════════════════════════════════════════════════════
	st.subheader("How Does a Machine Learning Model Actually \"Learn\"?")
	st.markdown(
	"""
	At its core, every ML model follows the same loop:
	"""
	)

	st.graphviz_chart("""
	digraph learning {
	rankdir=LR
	node [shape=box, style="rounded,filled", fontname="Helvetica", fontsize=11, margin="0.3,0.15"]
	edge [color="#888888", penwidth=1.5]

	input [label="Input Data\\n(features)", fillcolor="#dbeafe", color="#3b82f6"]
	guess [label="Model Makes\\na Prediction", fillcolor="#d1fae5", color="#10b981"]
	check [label="Compare to\\nActual Answer", fillcolor="#fef3c7", color="#f59e0b"]
	adjust [label="Adjust Model\\n(reduce error)", fillcolor="#fee2e2", color="#ef4444"]

	input -> guess -> check -> adjust
	adjust -> guess [style=dashed, label="repeat", fontsize=9]
	}
	""")

	st.markdown(
	"""
	1. Input — The model receives data (customer features for churn, purchase history for recommendations)
	2. Predict — It makes a guess (will they churn? what would they buy?)
	3. Compare — We check the guess against reality
	4. Adjust — The model tweaks its internal parameters to reduce the error
	5. Repeat — This cycle runs until the model gets good

	Different algorithms differ in how they make predictions and how they adjust.
	The algorithm explanation pages cover each one with diagrams.
	"""
	)

	st.markdown("---")

	# ═══════════════════════════════════════════════════════════════════════════════
	# Business Impact
	# ═══════════════════════════════════════════════════════════════════════════════
	st.subheader("Business Impact")

	impact_churn, impact_rec = st.columns(2)
	with impact_churn:
	st.markdown("#### Churn Reduction")
	reduction_pct = st.slider(
	"If we reduce churn by this %",
	min_value=1, max_value=50, value=10, step=1, format="%d%%",
	)
	savings = revenue_at_risk * (reduction_pct / 100)
	st.success(f"{reduction_pct}% churn reduction → ${savings:,.0f}/year saved")

	with impact_rec:
	st.markdown("#### Recommendation Revenue")
	avg_order = df_rec["TotalPrice"].mean()
	conversion_pct = st.slider(
	"If recommendations lift conversion by this %",
	min_value=1, max_value=30, value=5, step=1, format="%d%%",
	)
	additional = total_revenue * (conversion_pct / 100)
	st.success(f"{conversion_pct}% conversion lift → £{additional:,.0f}/year additional revenue")

	st.markdown("---")

	# ═══════════════════════════════════════════════════════════════════════════════
	# Page Directory
	# ═══════════════════════════════════════════════════════════════════════════════
	st.subheader("Page Directory")

	st.markdown(
	"""
	Customer Churn Prediction:

	\| Page \| What You'll See \|
	\|---\|---\|
	\| EDA \| Interactive charts exploring customer demographics, services, and churn patterns \|
	\| Preprocessing \| Step-by-step data cleaning, encoding (Label + One-Hot), scaling with before/after visuals \|
	\| Churn Models \| Algorithm explainers with diagrams, model comparison, SHAP explanations, What-If predictor \|
	\| Live Updates \| All 4 churn models compete — SGD (partial_fit) vs LR, RF, XGBoost (full retrain) \|

	Product Recommendation:

	\| Page \| What You'll See \|
	\|---\|---\|
	\| Rec EDA \| Purchase distributions, product popularity (long tail), sparsity analysis, user-item matrix \|
	\| Rec Preprocessing \| Transaction-to-matrix pipeline, implicit vs explicit feedback, train/test split \|
	\| Rec Models \| Algorithm explainers for SVD/ALS/SGD/NMF/Item-CF, model comparison, interactive recommendations \|
	\| Rec Live \| Monthly data streaming demo — all 5 models compete as purchase history grows \|
	"""
	)

	st.markdown("---")
	st.caption(
	"Data: Telco Customer Churn Dataset + UCI Online Retail Dataset • "
	"Built with Streamlit, scikit-learn, XGBoost, SHAP, implicit, scipy"
	)