ml-demo / app.py
aliarafat-stack-ml's picture
first commit
f3a6f24
import streamlit as st
from utils.data_loader import load_raw_data
from utils.rec_data_loader import load_clean_transactions, build_interaction_matrix
st.set_page_config(
page_title="Data Science Demo",
page_icon="πŸ“Š",
layout="wide",
initial_sidebar_state="expanded",
)
st.title("Data Science Project Demo")
st.markdown(
"Demonstration of two core data science objectives: **predicting customer churn** "
"and **building a product recommendation system** β€” from raw data to deployed models."
)
st.markdown("---")
# ═══════════════════════════════════════════════════════════════════════════════
# KPI Dashboard β€” Both Objectives
# ═══════════════════════════════════════════════════════════════════════════════
churn_col, rec_col = st.columns(2)
with churn_col:
st.markdown("### Customer Churn Prediction")
df_churn = load_raw_data()
total_customers = len(df_churn)
churn_rate = df_churn["Churn"].mean()
churned_count = int(df_churn["Churn"].sum())
revenue_at_risk = df_churn.loc[df_churn["Churn"] == 1, "MonthlyCharges"].sum() * 12
c1, c2 = st.columns(2)
c1.metric("Total Customers", f"{total_customers:,}")
c2.metric("Churn Rate", f"{churn_rate:.1%}", delta=f"-{churned_count:,} lost", delta_color="inverse")
c3, c4 = st.columns(2)
c3.metric("Avg Monthly Charge", f"${df_churn['MonthlyCharges'].mean():,.2f}")
c4.metric("Annual Revenue at Risk", f"${revenue_at_risk:,.0f}")
st.caption("Dataset: Telco Customer Churn β€” 7,043 customers, 21 features")
with rec_col:
st.markdown("### Product Recommendation")
df_rec = load_clean_transactions()
interactions, *_ = build_interaction_matrix()
n_users, n_items = interactions.shape
n_transactions = len(df_rec)
sparsity = 1 - ((interactions > 0).sum().sum() / (n_users * n_items))
total_revenue = df_rec["TotalPrice"].sum()
r1, r2 = st.columns(2)
r1.metric("Unique Customers", f"{n_users:,}")
r2.metric("Unique Products", f"{n_items:,}")
r3, r4 = st.columns(2)
r3.metric("Transactions", f"{n_transactions:,}")
r4.metric("Matrix Sparsity", f"{sparsity:.1%}")
st.caption("Dataset: UCI Online Retail β€” 541K transactions, UK e-commerce 2010–2011")
st.markdown("---")
# ═══════════════════════════════════════════════════════════════════════════════
# Two Objectives Overview
# ═══════════════════════════════════════════════════════════════════════════════
st.subheader("Two Objectives β€” Full Pipeline for Each")
st.markdown(
"""
| Objective | Problem Type | Key Algorithms | Pages |
|---|---|---|---|
| **Customer Churn** | Binary Classification | Logistic Regression, Naive Bayes, Random Forest, XGBoost, SGDClassifier | EDA β†’ Preprocessing β†’ Churn Models β†’ Live Updates |
| **Product Recommendation** | Matrix Factorization | SVD, ALS, SGD (Funk SVD), NMF, Item-Based CF | Rec EDA β†’ Rec Preprocessing β†’ Rec Models β†’ Rec Live |
"""
)
st.markdown("---")
# ═══════════════════════════════════════════════════════════════════════════════
# The ML Journey β€” Step-by-step pipeline
# ═══════════════════════════════════════════════════════════════════════════════
st.subheader("The Machine Learning Journey β€” What We're Doing and Why")
st.markdown(
"Both objectives follow the same well-defined pipeline. "
"Each step has a purpose and builds on the previous one:"
)
st.graphviz_chart("""
digraph pipeline {
rankdir=LR
node [shape=box, style="rounded,filled", fontname="Helvetica", fontsize=11, margin="0.3,0.15"]
edge [color="#888888", penwidth=1.5]
data [label="1. Data\\nCollection", fillcolor="#dbeafe", color="#3b82f6"]
eda [label="2. Exploratory\\nData Analysis", fillcolor="#e0e7ff", color="#6366f1"]
preproc [label="3. Data\\nPreprocessing", fillcolor="#fce7f3", color="#ec4899"]
train [label="4. Model\\nTraining", fillcolor="#d1fae5", color="#10b981"]
eval [label="5. Evaluation\\n& Comparison", fillcolor="#fef3c7", color="#f59e0b"]
deploy [label="6. Live Updates\\n& Deployment", fillcolor="#fee2e2", color="#ef4444"]
data -> eda -> preproc -> train -> eval -> deploy
}
""")
steps = [
{
"num": "1",
"title": "Data Collection",
"churn": "**Telco Customer Churn** β€” 7,043 customers with demographics, services, billing, and churn labels.",
"rec": "**UCI Online Retail** β€” 541K purchase transactions from a UK e-commerce store.",
"why": "You can't build a model without data. The quality and relevance of the data determines everything that follows.",
},
{
"num": "2",
"title": "Exploratory Data Analysis",
"churn": "Visualize churn distribution, feature histograms, category-based churn rates, correlation heatmaps, and UMAP customer segments.",
"rec": "Analyze purchase distributions, product popularity (long tail), user-item matrix sparsity, revenue trends, and geography.",
"why": "Before building models, we need to understand the problem. EDA reveals patterns, catches data issues, and guides modeling choices.",
},
{
"num": "3",
"title": "Data Preprocessing",
"churn": "Handle missing values (median imputation), encode categories (Label Encoding for trees, One-Hot for Logistic Regression), scale features (StandardScaler).",
"rec": "Clean transactions (remove cancellations, missing IDs), aggregate into a user-item matrix, handle implicit feedback.",
"why": "ML algorithms need clean, structured input. Raw data has missing values, text, and different scales β€” all of which confuse models.",
},
{
"num": "4",
"title": "Model Training",
"churn": "Train **Logistic Regression**, **Naive Bayes**, **Random Forest**, and **XGBoost** β€” each with the encoding that's optimal for it.",
"rec": "Train **SVD**, **ALS**, **SGD (Funk SVD)**, **NMF**, and **Item-Based CF** β€” five approaches to matrix factorization and collaborative filtering.",
"why": "Different algorithms have different strengths. Training multiple models lets us find the best approach for each problem.",
},
{
"num": "5",
"title": "Evaluation & Comparison",
"churn": "Compare using Accuracy, Precision, Recall, F1, AUC. Explain predictions with **SHAP**.",
"rec": "Compare using Precision@K, Recall@K, Hit Rate. Inspect individual recommendations for quality.",
"why": "A model is only useful if it's accurate and we can measure how accurate. Different metrics capture different aspects of quality.",
},
{
"num": "6",
"title": "Live Updates & Deployment",
"churn": "Incremental learning with **SGDClassifier.partial_fit()** β€” update the model as new customer data streams in.",
"rec": "Monthly batch retraining of **ALS** β€” model quality improves as purchase history accumulates over time.",
"why": "In the real world, data doesn't stop arriving. Deployed models must adapt to new patterns without expensive full retraining.",
},
]
for step in steps:
with st.expander(f"Step {step['num']}: {step['title']}", expanded=False):
st.markdown(f"**Why:** {step['why']}")
col_churn, col_rec = st.columns(2)
with col_churn:
st.markdown(f"**Churn Prediction:** \n{step['churn']}")
with col_rec:
st.markdown(f"**Recommendation:** \n{step['rec']}")
st.markdown("---")
# ═══════════════════════════════════════════════════════════════════════════════
# How a model actually "learns" β€” quick primer
# ═══════════════════════════════════════════════════════════════════════════════
st.subheader("How Does a Machine Learning Model Actually \"Learn\"?")
st.markdown(
"""
At its core, every ML model follows the same loop:
"""
)
st.graphviz_chart("""
digraph learning {
rankdir=LR
node [shape=box, style="rounded,filled", fontname="Helvetica", fontsize=11, margin="0.3,0.15"]
edge [color="#888888", penwidth=1.5]
input [label="Input Data\\n(features)", fillcolor="#dbeafe", color="#3b82f6"]
guess [label="Model Makes\\na Prediction", fillcolor="#d1fae5", color="#10b981"]
check [label="Compare to\\nActual Answer", fillcolor="#fef3c7", color="#f59e0b"]
adjust [label="Adjust Model\\n(reduce error)", fillcolor="#fee2e2", color="#ef4444"]
input -> guess -> check -> adjust
adjust -> guess [style=dashed, label="repeat", fontsize=9]
}
""")
st.markdown(
"""
1. **Input** β€” The model receives data (customer features for churn, purchase history for recommendations)
2. **Predict** β€” It makes a guess (will they churn? what would they buy?)
3. **Compare** β€” We check the guess against reality
4. **Adjust** β€” The model tweaks its internal parameters to reduce the error
5. **Repeat** β€” This cycle runs until the model gets good
Different algorithms differ in *how* they make predictions and *how* they adjust.
The algorithm explanation pages cover each one with diagrams.
"""
)
st.markdown("---")
# ═══════════════════════════════════════════════════════════════════════════════
# Business Impact
# ═══════════════════════════════════════════════════════════════════════════════
st.subheader("Business Impact")
impact_churn, impact_rec = st.columns(2)
with impact_churn:
st.markdown("#### Churn Reduction")
reduction_pct = st.slider(
"If we reduce churn by this %",
min_value=1, max_value=50, value=10, step=1, format="%d%%",
)
savings = revenue_at_risk * (reduction_pct / 100)
st.success(f"**{reduction_pct}% churn reduction** β†’ **${savings:,.0f}/year** saved")
with impact_rec:
st.markdown("#### Recommendation Revenue")
avg_order = df_rec["TotalPrice"].mean()
conversion_pct = st.slider(
"If recommendations lift conversion by this %",
min_value=1, max_value=30, value=5, step=1, format="%d%%",
)
additional = total_revenue * (conversion_pct / 100)
st.success(f"**{conversion_pct}% conversion lift** β†’ **Β£{additional:,.0f}/year** additional revenue")
st.markdown("---")
# ═══════════════════════════════════════════════════════════════════════════════
# Page Directory
# ═══════════════════════════════════════════════════════════════════════════════
st.subheader("Page Directory")
st.markdown(
"""
**Customer Churn Prediction:**
| Page | What You'll See |
|---|---|
| **EDA** | Interactive charts exploring customer demographics, services, and churn patterns |
| **Preprocessing** | Step-by-step data cleaning, encoding (Label + One-Hot), scaling with before/after visuals |
| **Churn Models** | Algorithm explainers with diagrams, model comparison, SHAP explanations, What-If predictor |
| **Live Updates** | All 4 churn models compete β€” SGD (partial_fit) vs LR, RF, XGBoost (full retrain) |
**Product Recommendation:**
| Page | What You'll See |
|---|---|
| **Rec EDA** | Purchase distributions, product popularity (long tail), sparsity analysis, user-item matrix |
| **Rec Preprocessing** | Transaction-to-matrix pipeline, implicit vs explicit feedback, train/test split |
| **Rec Models** | Algorithm explainers for SVD/ALS/SGD/NMF/Item-CF, model comparison, interactive recommendations |
| **Rec Live** | Monthly data streaming demo β€” all 5 models compete as purchase history grows |
"""
)
st.markdown("---")
st.caption(
"Data: Telco Customer Churn Dataset + UCI Online Retail Dataset β€’ "
"Built with Streamlit, scikit-learn, XGBoost, SHAP, implicit, scipy"
)