leadscore-ai / app.py
SrihariV's picture
Rename src/app.py to app.py
f7476be verified
"""
LeadScore AI — Enterprise Intent Engine
LightGBM + SHAP | AUC 0.83
"""
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import shap
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use("Agg")
import io
import os
import base64
from datetime import datetime
# ─── PAGE CONFIG ────────────────────────────────────────────────────────────
# Page-level Streamlit settings: browser-tab title and icon, wide layout,
# sidebar collapsed on first load, and the text shown for the menu's
# "About" entry.
st.set_page_config(
    page_title="LeadScore AI | Enterprise Intent Engine",
    page_icon="✨",
    layout="wide",
    initial_sidebar_state="collapsed",
    menu_items={
        "About": "LeadScore AI — Enterprise lead intelligence powered by LightGBM + SHAP."
    }
)
# ─── CUSTOM CSS ──────────────────────────────────────────────────────────────
# Inject the app-wide stylesheet as a raw <style> tag (unsafe_allow_html=True
# is what lets the tag through). The rules define the colour palette as CSS
# variables on :root, the typography, overrides for Streamlit's own widgets
# (.stTabs, .stButton, .stDataFrame, ...), and the custom classes (.hero,
# .card, .score-display, .stat-row, .cm-grid, .tag, ...) that the HTML
# snippets passed to st.markdown elsewhere in this file rely on.
# The string content is kept flush-left on purpose: it is emitted verbatim.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Crimson+Pro:wght@400;600&family=Inter:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap');
:root {
--bg-main: #FAF9F6;
--bg-card: #FFFFFF;
--bg-code: #F7F6F3;
--text-main: #2D2A26;
--text-muted: #66635F;
--text-light: #9A9590;
--border: #E8E6E1;
--border-light: #F0EEEA;
--accent: #D96A47;
--accent-light: #FDF2EE;
--teal: #3E7C77;
--teal-light: #EDF5F4;
--warm: #B38622;
--warm-light: #FDF8EC;
--shadow: 0 4px 20px rgba(0, 0, 0, 0.03);
--shadow-hover: 0 8px 30px rgba(0, 0, 0, 0.06);
}
html, body, .stApp {
background-color: var(--bg-main) !important;
color: var(--text-main) !important;
font-family: 'Inter', -apple-system, sans-serif !important;
}
#MainMenu, footer, header { visibility: hidden; }
.block-container { padding-top: 3rem; padding-bottom: 4rem; max-width: 1100px; }
h1, h2, h3, h4, .serif {
font-family: 'Crimson Pro', serif !important;
color: var(--text-main) !important;
}
/* ─── Hero Section ─── */
.hero {
text-align: center;
padding: 2.5rem 1rem 3.5rem 1rem;
border-bottom: 1px solid var(--border);
margin-bottom: 2.5rem;
}
.hero-tag {
font-size: 0.72rem;
font-weight: 600;
text-transform: uppercase;
letter-spacing: 0.12em;
color: var(--accent);
margin-bottom: 1rem;
display: inline-block;
}
.hero h1 {
font-size: 3.5rem;
font-weight: 400;
line-height: 1.1;
margin: 0 0 1.2rem 0;
letter-spacing: -0.02em;
}
.hero p {
font-size: 1.1rem;
color: var(--text-muted);
max-width: 680px;
margin: 0 auto 2rem auto;
line-height: 1.7;
}
/* ─── Metrics Row ─── */
.metrics-row {
display: flex;
justify-content: center;
gap: 3rem;
flex-wrap: wrap;
margin-top: 1.5rem;
}
.metric-pill {
text-align: center;
min-width: 100px;
}
.metric-pill .val {
display: block;
font-family: 'Inter', sans-serif;
font-weight: 600;
font-size: 1.3rem;
color: var(--text-main);
}
.metric-pill .lbl {
font-size: 0.72rem;
color: var(--text-muted);
text-transform: uppercase;
letter-spacing: 0.06em;
margin-top: 0.25rem;
display: block;
}
/* ─── Section Titles ─── */
.section-title {
font-family: 'Crimson Pro', serif;
font-size: 2rem;
border-bottom: 1px solid var(--border);
padding-bottom: 0.5rem;
margin-bottom: 2rem;
color: var(--text-main);
}
.section-subtitle {
font-size: 1.05rem;
color: var(--text-muted);
line-height: 1.7;
margin-bottom: 2rem;
max-width: 720px;
}
/* ─── Cards ─── */
.card {
background: var(--bg-card);
border: 1px solid var(--border);
border-radius: 8px;
padding: 2rem;
margin-bottom: 1.5rem;
box-shadow: var(--shadow);
height: 100%;
display: flex;
flex-direction: column;
}
.card h3 {
font-size: 1.35rem;
margin: 0 0 0.8rem 0;
}
.card h4 {
font-size: 1.1rem;
margin: 1.2rem 0 0.5rem 0;
color: var(--text-main);
}
.card p, .card li {
color: var(--text-muted);
font-size: 0.95rem;
line-height: 1.75;
margin: 0 0 0.7rem 0;
}
.card ul { padding-left: 1.5rem; margin-bottom: 0.5rem; }
/* Card with accent top border */
.card-accent { border-top: 3px solid var(--accent); }
.card-teal { border-top: 3px solid var(--teal); }
.card-warm { border-top: 3px solid var(--warm); }
/* ─── Score Display ─── */
.score-display {
text-align: center;
padding: 3rem 2rem;
background: var(--bg-card);
border: 1px solid var(--border);
border-radius: 8px;
box-shadow: var(--shadow);
}
.score-number {
font-family: 'Crimson Pro', serif;
font-size: 6rem;
line-height: 1;
margin-bottom: 0.5rem;
}
.score-label {
font-size: 0.8rem;
font-weight: 600;
text-transform: uppercase;
letter-spacing: 0.1em;
color: var(--text-muted);
margin-bottom: 1.5rem;
}
.score-badge {
display: inline-block;
padding: 0.4rem 1.2rem;
border-radius: 100px;
font-size: 0.85rem;
font-weight: 500;
}
.badge-hot { background: var(--accent-light); color: var(--accent); border: 1px solid #F6D6CB; }
.badge-warm { background: var(--warm-light); color: var(--warm); border: 1px solid #F5E8C3; }
.badge-cold { background: #F4F4F4; color: var(--text-muted); border: 1px solid #E4E3E0; }
/* ─── Form Elements ─── */
.stSlider label, .stSelectbox label {
color: var(--text-main) !important;
font-weight: 500 !important;
font-size: 0.95rem !important;
}
/* ─── Tabs ─── */
.stTabs [data-baseweb="tab-list"] {
background: transparent;
border-bottom: 1px solid var(--border);
gap: 2rem;
padding-bottom: 0;
margin-bottom: 2rem;
}
.stTabs [data-baseweb="tab"] {
background: transparent !important;
color: var(--text-muted) !important;
font-family: 'Inter', sans-serif !important;
font-weight: 500 !important;
font-size: 0.95rem !important;
padding: 0.5rem 0 1rem 0 !important;
border: none !important;
}
.stTabs [data-baseweb="tab"]:hover {
color: var(--text-main) !important;
}
.stTabs [aria-selected="true"] {
color: var(--text-main) !important;
border-bottom: 2px solid var(--text-main) !important;
}
/* ─── Tables ─── */
.stDataFrame { border-radius: 8px; border: 1px solid var(--border); }
.stDataFrame thead th {
background: var(--bg-main) !important;
color: var(--text-main) !important;
font-size: 0.85rem !important;
font-weight: 600 !important;
border-bottom: 1px solid var(--border) !important;
}
/* ─── Buttons ─── */
.stButton > button, .stDownloadButton > button {
background: var(--bg-card) !important;
color: var(--text-main) !important;
border: 1px solid var(--border) !important;
border-radius: 6px !important;
font-weight: 500 !important;
padding: 0.5rem 1.5rem !important;
transition: all 0.2s !important;
}
.stButton > button:hover, .stDownloadButton > button:hover {
border-color: var(--text-main) !important;
box-shadow: var(--shadow) !important;
background: #fdfdfd !important;
}
/* ─── Progress Bar ─── */
.stProgress > div > div { background: var(--teal) !important; }
/* ─── Highlight Box ─── */
.highlight-box {
background: #FDFCF9;
border-left: 3px solid var(--accent);
padding: 1.25rem 1.5rem;
margin: 1.5rem 0;
font-family: 'Crimson Pro', serif;
font-size: 1.15rem;
color: var(--text-main);
line-height: 1.6;
}
/* ─── Stat Row ─── */
.stat-row {
display: flex;
justify-content: space-between;
align-items: center;
padding: 0.75rem 0;
border-bottom: 1px solid var(--border-light);
}
.stat-row:last-child { border-bottom: none; }
.stat-row .stat-label {
color: var(--text-muted);
font-size: 0.9rem;
}
.stat-row .stat-value {
font-family: 'Crimson Pro', serif;
font-size: 1.15rem;
font-weight: 600;
}
/* ─── Code Blocks ─── */
.code-block {
background: var(--bg-code);
border: 1px solid var(--border);
border-radius: 6px;
padding: 1.25rem 1.5rem;
font-family: 'JetBrains Mono', monospace;
font-size: 0.82rem;
line-height: 1.7;
color: var(--text-main);
overflow-x: auto;
white-space: pre;
margin: 1rem 0;
}
/* ─── Pipeline Step ─── */
.pipeline-step {
display: flex;
align-items: flex-start;
gap: 1.25rem;
padding: 1.5rem 0;
border-bottom: 1px solid var(--border-light);
}
.pipeline-step:last-child { border-bottom: none; }
.step-number {
flex-shrink: 0;
width: 36px;
height: 36px;
border-radius: 50%;
display: flex;
align-items: center;
justify-content: center;
font-family: 'Inter', sans-serif;
font-weight: 600;
font-size: 0.85rem;
color: white;
margin-top: 2px;
}
.step-content h4 {
font-family: 'Crimson Pro', serif !important;
font-size: 1.1rem;
margin: 0 0 0.3rem 0;
color: var(--text-main);
}
.step-content p {
color: var(--text-muted);
font-size: 0.9rem;
line-height: 1.65;
margin: 0;
}
/* ─── Feature Table ─── */
.feature-table {
width: 100%;
border-collapse: collapse;
margin: 1rem 0;
font-size: 0.88rem;
}
.feature-table th {
text-align: left;
padding: 0.7rem 1rem;
background: var(--bg-main);
color: var(--text-main);
font-weight: 600;
font-size: 0.8rem;
text-transform: uppercase;
letter-spacing: 0.04em;
border-bottom: 2px solid var(--border);
}
.feature-table td {
padding: 0.6rem 1rem;
border-bottom: 1px solid var(--border-light);
color: var(--text-muted);
}
.feature-table tr:hover td {
background: #FDFCFA;
}
/* ─── Confusion Matrix ─── */
.cm-grid {
display: grid;
grid-template-columns: auto 1fr 1fr;
grid-template-rows: auto 1fr 1fr;
gap: 3px;
max-width: 360px;
margin: 1rem auto;
font-size: 0.85rem;
}
.cm-header {
text-align: center;
font-weight: 600;
color: var(--text-muted);
padding: 0.5rem;
font-size: 0.75rem;
text-transform: uppercase;
letter-spacing: 0.04em;
}
.cm-cell {
text-align: center;
padding: 1.25rem 0.75rem;
border-radius: 6px;
font-weight: 600;
font-size: 1.1rem;
}
.cm-tp { background: var(--teal-light); color: var(--teal); }
.cm-tn { background: var(--teal-light); color: var(--teal); }
.cm-fp { background: var(--accent-light); color: var(--accent); }
.cm-fn { background: var(--accent-light); color: var(--accent); }
.cm-label {
font-size: 0.7rem;
display: block;
margin-top: 0.3rem;
font-weight: 400;
opacity: 0.7;
}
/* ─── Tag/Chip ─── */
.tag {
display: inline-block;
padding: 0.2rem 0.6rem;
border-radius: 4px;
font-size: 0.75rem;
font-weight: 500;
margin-right: 0.4rem;
margin-bottom: 0.4rem;
}
.tag-accent { background: var(--accent-light); color: var(--accent); }
.tag-teal { background: var(--teal-light); color: var(--teal); }
.tag-warm { background: var(--warm-light); color: var(--warm); }
.tag-muted { background: #F4F4F4; color: var(--text-muted); }
</style>
""", unsafe_allow_html=True)
# ─── LOAD MODEL ─────────────────────────────────────────────────────────────
@st.cache_resource(show_spinner=False)
def load_model():
    """Load the serialized model bundle from the first candidate path that exists.

    The candidate locations are checked in order and are relative to the
    process's current working directory.

    Returns:
        Whatever ``joblib.load`` yields for the first existing file, or
        ``None`` when none of the candidate files is present.
    """
    candidates = (
        "model/lead_model_bundle.pkl",
        "lead_model_bundle.pkl",
        "../model/lead_model_bundle.pkl",
    )
    found = next((path for path in candidates if os.path.exists(path)), None)
    if found is None:
        return None
    return joblib.load(found)
# Module-level handle to the loaded bundle. It is None when no bundle file was
# found; the prediction helpers then return None and the scoring tab renders
# its "Engine Offline" placeholder.
model_bundle = load_model()
# Acquisition channels offered in the UI, in display order.
LEAD_SOURCES = ["Referral", "Partner", "Cold Call", "Instagram", "Google", "Facebook", "Direct"]

# Model input columns: the three numeric engagement signals followed by one
# "Lead Source_<channel>" indicator per channel, in alphabetical order.
EXPECTED_FEATURES = [
    "TotalVisits",
    "Total Time Spent on Website",
    "Page Views Per Visit",
    *(f"Lead Source_{channel}" for channel in sorted(LEAD_SOURCES)),
]
# ─── HELPERS ────────────────────────────────────────────────────────────────
def build_row(visits, time_spent, page_views, lead_source):
    """Assemble a one-row DataFrame of model inputs for a single lead.

    Args:
        visits: value for the "TotalVisits" column.
        time_spent: value for the "Total Time Spent on Website" column.
        page_views: value for the "Page Views Per Visit" column.
        lead_source: channel name; the matching "Lead Source_<channel>"
            indicator is set to 1 and every other indicator to 0. A name
            that is not in LEAD_SOURCES leaves all indicators at 0.

    Returns:
        A single-row ``pandas.DataFrame`` with the three numeric columns
        followed by one indicator column per entry of LEAD_SOURCES.
    """
    numeric = {
        "TotalVisits": visits,
        "Total Time Spent on Website": time_spent,
        "Page Views Per Visit": page_views,
    }
    indicators = {
        f"Lead Source_{channel}": (1 if channel == lead_source else 0)
        for channel in LEAD_SOURCES
    }
    return pd.DataFrame([{**numeric, **indicators}])
def get_score_tier(prob):
    """Map a conversion probability to its intent tier.

    Args:
        prob: probability in the 0-1 range.

    Returns:
        A ``(label, css_badge_class, hex_colour)`` tuple: "High Intent" at
        0.70 and above, "Evaluating" from 0.40 up to (not including) 0.70,
        and "Low Intent" otherwise.
    """
    # Ordered from the highest threshold down; the first one met wins.
    tiers = (
        (0.70, ("High Intent", "badge-hot", "#D96A47")),
        (0.40, ("Evaluating", "badge-warm", "#B38622")),
    )
    for threshold, tier in tiers:
        if prob >= threshold:
            return tier
    return "Low Intent", "badge-cold", "#66635F"
def predict_single(visits, time_spent, page_views, lead_source):
    """Score one lead with the loaded model.

    Args:
        visits, time_spent, page_views, lead_source: forwarded unchanged to
            ``build_row``.

    Returns:
        ``(probability, features)`` where ``probability`` is the second
        column of ``predict_proba`` for the single row and ``features`` is
        the one-row DataFrame that was fed to the model (columns aligned to
        the bundle's feature list, missing ones filled with 0).
        ``(None, None)`` when no model bundle is loaded.
    """
    if not model_bundle:
        return None, None

    estimator = model_bundle["model"]
    columns = model_bundle["features"]

    features = build_row(visits, time_spent, page_views, lead_source)
    # Align to the exact column set and order the bundle declares.
    features = features.reindex(columns=columns, fill_value=0)

    positive_prob = estimator.predict_proba(features)[0][1]
    return positive_prob, features
def predict_batch(df_input):
    """Score every row of an uploaded lead table.

    Args:
        df_input: DataFrame of raw leads. If it has a "Lead Source" column,
            that column is one-hot encoded; otherwise the frame is used
            as-is (copied).

    Returns:
        The second column of ``predict_proba`` for each input row, or
        ``None`` when no model bundle is loaded.
    """
    if not model_bundle:
        return None

    estimator = model_bundle["model"]
    columns = model_bundle["features"]

    if "Lead Source" in df_input.columns:
        encoded = pd.get_dummies(df_input, columns=["Lead Source"])
    else:
        encoded = df_input.copy()

    # Keep only the bundle's columns, in its order; absent ones become 0.
    aligned = encoded.reindex(columns=columns, fill_value=0)
    return estimator.predict_proba(aligned)[:, 1]
def shap_waterfall(df_row):
    """Draw a horizontal bar chart of per-feature SHAP contributions for one lead.

    Args:
        df_row: DataFrame whose first row is the lead to explain. It is
            re-aligned to the bundle's feature list before being explained.

    Returns:
        A matplotlib ``Figure`` with one bar per feature, sorted so the
        largest absolute contribution is at the top, or ``None`` when no
        model bundle is loaded. The caller is responsible for closing the
        figure.
    """
    if not model_bundle:
        return None
    model, feature_names = model_bundle["model"], model_bundle["features"]
    df_r = df_row.reindex(columns=feature_names, fill_value=0)

    explainer = shap.TreeExplainer(model)
    sv = explainer.shap_values(df_r)

    # The container returned for a classifier is not uniform (NOTE(review):
    # depends on the installed shap / model combination — confirm against the
    # pinned version). Handle the three layouts and always pick the LAST
    # output, i.e. the positive class for a binary model:
    #   * list of per-class arrays, each (n_rows, n_features)
    #   * single array (n_rows, n_features, n_classes)
    #   * single array (n_rows, n_features) — already the positive class
    if isinstance(sv, list):
        vals = np.asarray(sv[-1])[0]
    else:
        sv = np.asarray(sv)
        vals = sv[0, :, -1] if sv.ndim == 3 else sv[0]

    # Ascending by magnitude so barh() places the strongest driver on top.
    feat_vals = sorted(zip(feature_names, vals), key=lambda pair: abs(pair[1]))
    names = [name.replace("Lead Source_", "Src: ") for name, _ in feat_vals]
    values = [float(value) for _, value in feat_vals]
    # Orange = pushes the score up, teal = pulls it down.
    colors = ["#D96A47" if v > 0 else "#3E7C77" for v in values]

    fig, ax = plt.subplots(figsize=(8, 4))
    fig.patch.set_facecolor("#FFFFFF")
    ax.set_facecolor("#FFFFFF")
    ax.barh(names, values, color=colors, height=0.6, edgecolor="none")
    ax.axvline(0, color="#E8E6E1", linewidth=1, linestyle="--")
    ax.set_xlabel("Impact on Conversion Probability", color="#66635F", fontsize=9, fontfamily="Inter")
    ax.tick_params(colors="#66635F", labelsize=8)
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.xaxis.label.set_color("#66635F")
    plt.tight_layout()
    return fig
# ─── HERO ───────────────────────────────────────────────────────────────────
# Landing banner: tag line, headline, intro paragraph and four headline
# metrics, emitted as raw HTML that relies on the .hero / .metrics-row /
# .metric-pill classes from the injected stylesheet. All figures shown here
# are static text, not computed from the loaded model.
st.markdown("""
<div class="hero">
<div class="hero-tag">Predictive GTM Intelligence</div>
<h1>Stop Guessing. Know Who Converts.</h1>
<p>A LightGBM engine trained on 300,000 B2B interactions with full SHAP explainability.
Every prediction comes with transparent reasoning — not just a score, but the <em>why</em> behind it — so sales teams
can act on intent, not intuition.</p>
<div class="metrics-row">
<div class="metric-pill"><span class="val">0.83</span><span class="lbl">AUC-ROC</span></div>
<div class="metric-pill"><span class="val">300K</span><span class="lbl">Leads Trained</span></div>
<div class="metric-pill"><span class="val">10</span><span class="lbl">Feature Signals</span></div>
<div class="metric-pill"><span class="val">100%</span><span class="lbl">Explainable</span></div>
</div>
</div>
""", unsafe_allow_html=True)
# ─── TABS ───────────────────────────────────────────────────────────────────
# Create the four top-level tabs; the returned containers are filled in by
# the corresponding `with tabN:` sections of the script.
tab1, tab2, tab3, tab4 = st.tabs([
    "Real-Time Profiling",
    "Pipeline Scoring",
    "Model Deep Dive",
    "How It Works"
])
# ═══════════════════════════════════════════════════════════════════
# TAB 1: SINGLE LEAD SCORING
# ═══════════════════════════════════════════════════════════════════
# Single-lead scoring: inputs on the left, score + SHAP chart + recommended
# action on the right. HTML string contents are flush-left because they are
# emitted verbatim.
with tab1:
    col1, col2 = st.columns([1, 1], gap="large")

    # ── Left column: lead inputs and explanatory copy ──
    with col1:
        st.markdown('<div class="section-title">Lead Parameters</div>', unsafe_allow_html=True)
        with st.container(border=True):
            lead_source = st.selectbox("Acquisition Channel", LEAD_SOURCES, index=0)
            total_visits = st.slider("Total Web Properties Visited", 1, 30, 8)
            time_on_site = st.slider("Cumulative Dwell Time (sec)", 0, 2700, 420, step=30)
            page_views = st.slider("Pages Viewed Per Session", 1, 20, 5)
        st.markdown("""
<div class="highlight-box">
"Interpretability is revenue. Knowing a lead scored 85% <em>because of</em> '12 visits + Referral channel' gives Sales
the context to personalize outreach and close faster. Black-box models leave teams guessing."
</div>
""", unsafe_allow_html=True)
        # Quick guide on reading the chart
        st.markdown("""
<div style="margin-top:0.5rem;">
<p style="color:var(--text-muted); font-size:0.85rem; line-height:1.7;">
<strong style="color:var(--text-main);">Reading the Decision Drivers chart:</strong><br>
Each bar shows how much a single feature pushed the prediction up or down.
<span style="color:#D96A47;">Orange bars</span> are conversion drivers — they increase the lead's score.
<span style="color:#3E7C77;">Teal bars</span> indicate friction — factors pulling the score down.
The longer the bar, the stronger that feature's influence on this specific prediction.
</p>
</div>
""", unsafe_allow_html=True)

    # ── Right column: prediction, explanation and next step ──
    with col2:
        if model_bundle:
            prob, df_row = predict_single(total_visits, time_on_site, page_views, lead_source)
            pct = int(prob * 100)  # truncates, so 0.999 displays as 99%
            tier_label, tier_class, tier_color = get_score_tier(prob)
            st.markdown(f"""
<div class="score-display">
<div class="score-number" style="color:{tier_color};">{pct}%</div>
<div class="score-label">Predicted Conversion Probability</div>
<div class="score-badge {tier_class}">{tier_label}</div>
</div>
""", unsafe_allow_html=True)
            st.markdown("<br>", unsafe_allow_html=True)
            st.markdown('<b style="color:#2D2A26; font-family:\'Crimson Pro\',serif; font-size:1.4rem;">Decision Drivers</b>', unsafe_allow_html=True)
            st.markdown('<p style="color:#66635F; font-size:0.88rem; margin-bottom:1rem;">SHAP feature attribution for this prediction</p>', unsafe_allow_html=True)
            fig = shap_waterfall(df_row)
            if fig:
                st.pyplot(fig, use_container_width=True)
                # Close this exact figure: pyplot's "current figure" is
                # process-global state, so a bare plt.close() could target a
                # figure that belongs to another concurrently running session.
                plt.close(fig)
            # Recommended action based on tier
            if tier_label == "High Intent":
                action_html = """
<div style="background:var(--accent-light); border:1px solid #F6D6CB; border-radius:8px; padding:1.25rem; margin-top:1rem;">
<strong style="color:var(--accent); font-size:0.85rem; text-transform:uppercase; letter-spacing:0.05em;">Recommended Action</strong>
<p style="color:var(--text-main); font-size:0.9rem; margin:0.5rem 0 0 0; line-height:1.6;">
Route directly to a senior Account Executive. This lead shows strong buying signals —
prioritize personalized outreach within 24 hours to maximize conversion window.
</p>
</div>
"""
            elif tier_label == "Evaluating":
                action_html = """
<div style="background:var(--warm-light); border:1px solid #F5E8C3; border-radius:8px; padding:1.25rem; margin-top:1rem;">
<strong style="color:var(--warm); font-size:0.85rem; text-transform:uppercase; letter-spacing:0.05em;">Recommended Action</strong>
<p style="color:var(--text-main); font-size:0.9rem; margin:0.5rem 0 0 0; line-height:1.6;">
Assign to SDR for nurturing. Send targeted case studies relevant to their browsing behavior.
Schedule follow-up touchpoints over the next 2 weeks.
</p>
</div>
"""
            else:
                action_html = """
<div style="background:#F4F4F4; border:1px solid #E4E3E0; border-radius:8px; padding:1.25rem; margin-top:1rem;">
<strong style="color:var(--text-muted); font-size:0.85rem; text-transform:uppercase; letter-spacing:0.05em;">Recommended Action</strong>
<p style="color:var(--text-main); font-size:0.9rem; margin:0.5rem 0 0 0; line-height:1.6;">
Enroll in automated drip campaigns. No manual sales effort required — let marketing
nurture over time. Re-score in 30 days if engagement increases.
</p>
</div>
"""
            st.markdown(action_html, unsafe_allow_html=True)
        else:
            # No model bundle on disk: show a greyed-out placeholder score.
            st.markdown("""
<div class="score-display">
<div class="score-number" style="color:#E8E6E1;">—%</div>
<div class="score-label">Engine Offline</div>
</div>
""", unsafe_allow_html=True)
# ═══════════════════════════════════════════════════════════════════
# TAB 2: BATCH / PIPELINE SCORING
# ═══════════════════════════════════════════════════════════════════
# Batch scoring: upload a CSV, score every row, show tier counts, the ranked
# table, a per-channel summary and an export button. With no upload, show the
# expected schema and a static preview. HTML string contents are flush-left
# because they are emitted verbatim.
with tab2:
    st.markdown('<div class="section-title">Enrich Your Pipeline</div>', unsafe_allow_html=True)
    st.markdown("""
<p class="section-subtitle">
Upload a CSV of raw leads. The engine scores each record with a conversion probability and intent tier,
enabling your team to prioritize outreach by predicted value rather than gut feel.
Every row gets the same model that powers the real-time profiler — LightGBM with 300K training rows.
</p>
""", unsafe_allow_html=True)
    sample_data = pd.DataFrame({
        "TotalVisits": [15, 3, 22, 1, 8],
        "Total Time Spent on Website": [890, 45, 1200, 15, 420],
        "Page Views Per Visit": [12, 2, 18, 1, 6],
        "Lead Source": ["Referral", "Cold Call", "Partner", "Cold Call", "Google"]
    })
    # Template & Upload
    col_btn1, _ = st.columns([1, 4])
    with col_btn1:
        st.download_button("Download Template CSV", sample_data.to_csv(index=False).encode(), "template.csv", "text/csv", use_container_width=True)
    # A non-empty label is kept for accessibility and hidden visually,
    # instead of passing an empty string.
    uploaded_file = st.file_uploader("Upload leads CSV", type=["csv"], label_visibility="collapsed")
    if uploaded_file:
        try:
            df_up = pd.read_csv(uploaded_file)
            st.markdown(f"<p style='color:var(--text-main); font-weight:600; font-size:1.05rem;'>{len(df_up):,} leads ingested.</p>", unsafe_allow_html=True)
            if model_bundle:
                with st.spinner("Analyzing intent signals..."):
                    probs = predict_batch(df_up)
                if probs is not None:
                    probs = np.asarray(probs, dtype=float)
                    # Capture channels now, while rows are still in upload
                    # order and therefore aligned with `probs`. The upload
                    # buffer has already been consumed, so it must not be
                    # parsed a second time.
                    lead_sources = df_up["Lead Source"].to_numpy() if "Lead Source" in df_up.columns else None
                    df_up.insert(0, "Score Tier", [get_score_tier(p)[0] for p in probs])
                    # Plain Python formatting: adding "%" to a NumPy string
                    # array is not supported on every NumPy version.
                    df_up.insert(0, "Win Probability", [f"{p * 100:.1f}%" for p in probs])
                    # Rank by the numeric probability; sorting the formatted
                    # strings would be lexicographic ("9.5%" above "87.2%").
                    df_up = df_up.iloc[np.argsort(-probs, kind="stable")]
                    hot = (probs >= 0.7).sum()
                    warm = ((probs >= 0.4) & (probs < 0.7)).sum()
                    cold = (probs < 0.4).sum()
                    m1, m2, m3 = st.columns(3)
                    m1.markdown(f"""<div class='card card-accent' style='text-align:center;'>
<h3 style='color:var(--accent); font-size:2rem; margin-bottom:0.3rem;'>{hot:,}</h3>
<p style='margin:0; font-weight:500;'>High Intent</p>
<p style='font-size:0.8rem; margin:0.3rem 0 0 0;'>Route to Account Executives</p>
</div>""", unsafe_allow_html=True)
                    m2.markdown(f"""<div class='card card-warm' style='text-align:center;'>
<h3 style='color:var(--warm); font-size:2rem; margin-bottom:0.3rem;'>{warm:,}</h3>
<p style='margin:0; font-weight:500;'>Evaluating</p>
<p style='font-size:0.8rem; margin:0.3rem 0 0 0;'>SDR follow-up sequence</p>
</div>""", unsafe_allow_html=True)
                    m3.markdown(f"""<div class='card' style='text-align:center; border-top:3px solid var(--text-light);'>
<h3 style='color:var(--text-muted); font-size:2rem; margin-bottom:0.3rem;'>{cold:,}</h3>
<p style='margin:0; font-weight:500;'>Long-term Nurture</p>
<p style='font-size:0.8rem; margin:0.3rem 0 0 0;'>Automated drip campaigns</p>
</div>""", unsafe_allow_html=True)
                    st.dataframe(df_up, use_container_width=True)
                    # Channel breakdown
                    if lead_sources is not None:
                        st.markdown("<br>", unsafe_allow_html=True)
                        st.markdown('<b style="font-family:\'Crimson Pro\',serif; font-size:1.2rem;">Channel Performance Breakdown</b>', unsafe_allow_html=True)
                        channel_df = pd.DataFrame({
                            "Lead Source": lead_sources,
                            "Probability": probs
                        })
                        channel_summary = channel_df.groupby("Lead Source").agg(
                            Leads=("Probability", "count"),
                            Avg_Score=("Probability", "mean"),
                            High_Intent=("Probability", lambda x: (x >= 0.7).sum())
                        ).sort_values("Avg_Score", ascending=False)
                        # Format only after sorting so the ordering above is numeric.
                        channel_summary["Avg_Score"] = (channel_summary["Avg_Score"] * 100).round(1).astype(str) + "%"
                        st.dataframe(channel_summary, use_container_width=True)
                    out_csv = df_up.to_csv(index=False).encode()
                    st.download_button("Export Scored Pipeline", out_csv, f"scored_routing_{datetime.now().strftime('%Y%m%d')}.csv", "text/csv")
        except Exception as e:
            # UI boundary: surface any parsing/scoring failure to the user
            # instead of crashing the page.
            st.error(f"Ingestion error: {e}")
    else:
        st.markdown("""
<div style="margin-top:1rem;">
<p style="color:var(--text-muted); font-size:0.95rem; margin-bottom:0.5rem;">
<strong style="color:var(--text-main);">Expected CSV columns:</strong>
</p>
<div style="border:1px solid var(--border); border-radius:6px; overflow:hidden; margin:0.5rem 0;">
<div style="display:grid; grid-template-columns:1fr 70px 70px 1fr; background:var(--bg-main); padding:0.6rem 1rem; font-size:0.78rem; font-weight:600; text-transform:uppercase; letter-spacing:0.04em; border-bottom:2px solid var(--border);">
<span>Column</span><span>Type</span><span>Example</span><span>Notes</span>
</div>
<div style="display:grid; grid-template-columns:1fr 70px 70px 1fr; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);">
<span>TotalVisits</span><span>Integer</span><span>15</span><span>Unique web visits (1–30)</span>
</div>
<div style="display:grid; grid-template-columns:1fr 70px 70px 1fr; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);">
<span>Total Time Spent on Website</span><span>Integer</span><span>890</span><span>Seconds on site (30–1200)</span>
</div>
<div style="display:grid; grid-template-columns:1fr 70px 70px 1fr; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);">
<span>Page Views Per Visit</span><span>Integer</span><span>12</span><span>Pages per session (1–20)</span>
</div>
<div style="display:grid; grid-template-columns:1fr 70px 70px 1fr; padding:0.5rem 1rem; font-size:0.88rem; color:var(--text-muted);">
<span>Lead Source</span><span>String</span><span>Referral</span><span>One of 7 channels</span>
</div>
</div>
</div>
""", unsafe_allow_html=True)
        st.markdown("<p style='color:var(--text-muted); margin-top:1.5rem;'><em>Demonstration preview:</em></p>", unsafe_allow_html=True)
        # Static illustration only: these tiers and percentages are
        # hard-coded, not produced by the model.
        sample_data_demo = sample_data.copy()
        sample_data_demo.insert(0, "Score Tier", ["High Intent", "Low Intent", "High Intent", "Low Intent", "Evaluating"])
        sample_data_demo.insert(0, "Win Probability", ["87.2%", "12.4%", "94.1%", "6.8%", "61.3%"])
        st.dataframe(sample_data_demo, use_container_width=True)
# ═══════════════════════════════════════════════════════════════════
# TAB 3: MODEL DEEP DIVE
# ═══════════════════════════════════════════════════════════════════
with tab3:
st.markdown('<div class="section-title">The Analytics Engine</div>', unsafe_allow_html=True)
st.markdown("""
<p class="section-subtitle">
A transparent look at the model architecture, performance metrics, and design decisions.
We believe every production ML system should be auditable — this section exists so you can verify,
not just trust.
</p>
""", unsafe_allow_html=True)
# ── Algorithm Selection ──
st.markdown("""
<div class="card card-teal">
<h3>Algorithm Selection</h3>
<p>We evaluated four model families on the same 300K-row dataset before selecting LightGBM for production deployment.
The decision was driven by three criteria: <strong>predictive accuracy</strong> (AUC on held-out data),
<strong>training efficiency</strong> (GPU utilization and wall-clock time), and <strong>explainability</strong>
(compatibility with SHAP for per-prediction reasoning).</p>
<div style="border:1px solid var(--border); border-radius:6px; overflow:hidden; margin-top:1rem;">
<div style="display:grid; grid-template-columns:140px 90px 1fr; background:var(--bg-main); padding:0.7rem 1rem; font-size:0.8rem; font-weight:600; text-transform:uppercase; letter-spacing:0.04em; border-bottom:2px solid var(--border);">
<span>Algorithm</span><span>Verdict</span><span>Rationale</span>
</div>
<div style="display:grid; grid-template-columns:140px 90px 1fr; padding:0.75rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.9rem; color:var(--text-muted); align-items:start;">
<span><strong style="color:var(--teal);">LightGBM</strong></span>
<span><span class="tag tag-teal">Deployed</span></span>
<span>Leaf-wise tree growth with native CUDA GPU support. Trained 300K rows in seconds with early stopping. Natively handles categorical interactions and integrates cleanly with TreeSHAP for exact Shapley value computation.</span>
</div>
<div style="display:grid; grid-template-columns:140px 90px 1fr; padding:0.75rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.9rem; color:var(--text-muted); align-items:start;">
<span><strong>XGBoost</strong></span>
<span><span class="tag tag-warm">Rejected</span></span>
<span>Depth-wise growth is more memory-intensive. On our dataset, XGBoost trained 3–4x slower with no measurable AUC improvement. The accuracy parity didn't justify the compute cost.</span>
</div>
<div style="display:grid; grid-template-columns:140px 90px 1fr; padding:0.75rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.9rem; color:var(--text-muted); align-items:start;">
<span><strong>Logistic Reg.</strong></span>
<span><span class="tag tag-accent">Failed</span></span>
<span>Assumes linear feature-target relationships. Completely missed interaction effects — a visitor with 20 visits but 0 seconds on page is likely a bot, not a hot lead.</span>
</div>
<div style="display:grid; grid-template-columns:140px 90px 1fr; padding:0.75rem 1rem; font-size:0.9rem; color:var(--text-muted); align-items:start;">
<span><strong>Deep Learning</strong></span>
<span><span class="tag tag-muted">Overkill</span></span>
<span>Neural networks require spatial/temporal locality to outperform trees. On flat tabular data with 10 features, DNNs overfit to training noise and failed to beat gradient boosted trees.</span>
</div>
</div>
</div>
""", unsafe_allow_html=True)
# ── Model Performance ──
perf_col1, perf_col2 = st.columns(2, gap="large")
with perf_col1:
st.markdown("""
<div class="card card-accent">
<h3>Performance Metrics</h3>
<p>Validated against a stratified 60,000-row held-out test set (20% of total data, never seen during training).</p>
<div class="stat-row">
<span class="stat-label">AUC-ROC</span>
<span class="stat-value" style="color:var(--teal);">0.8302</span>
</div>
<div class="stat-row">
<span class="stat-label">Accuracy</span>
<span class="stat-value">78.5%</span>
</div>
<div class="stat-row">
<span class="stat-label">Precision (Converters)</span>
<span class="stat-value">78.0%</span>
</div>
<div class="stat-row">
<span class="stat-label">Recall (Converters)</span>
<span class="stat-value">59.0%</span>
</div>
<div class="stat-row">
<span class="stat-label">F1 Score</span>
<span class="stat-value">0.67</span>
</div>
<div class="stat-row">
<span class="stat-label">Training Iterations</span>
<span class="stat-value">149 / 1,500</span>
</div>
<p style="font-size:0.82rem; margin-top:1rem; color:var(--text-light);">
Early stopping halted training at epoch 149 (patience: 50 rounds) to prevent overfitting.
The model converged to its best validation score quickly, confirming the signal-to-noise
ratio in the feature set is strong.
</p>
</div>
""", unsafe_allow_html=True)
with perf_col2:
st.markdown("""
<div class="card">
<h3>Confusion Matrix</h3>
<p>How the model's predictions map against ground truth on the 60K test set.</p>
<div class="cm-grid">
<div></div>
<div class="cm-header">Pred: No</div>
<div class="cm-header">Pred: Yes</div>
<div class="cm-header" style="writing-mode:vertical-lr; transform:rotate(180deg); padding:0.3rem;">Actual: No</div>
<div class="cm-cell cm-tn">33,988<span class="cm-label">True Neg</span></div>
<div class="cm-cell cm-fp">3,680<span class="cm-label">False Pos</span></div>
<div class="cm-header" style="writing-mode:vertical-lr; transform:rotate(180deg); padding:0.3rem;">Actual: Yes</div>
<div class="cm-cell cm-fn">9,235<span class="cm-label">False Neg</span></div>
<div class="cm-cell cm-tp">13,097<span class="cm-label">True Pos</span></div>
</div>
<p style="font-size:0.85rem; margin-top:1.2rem;">
The model is <strong>conservative by design</strong> — it has higher precision (78%) than recall (59%)
for converters. This means when it flags a lead as "High Intent," that signal is reliable. The trade-off
is that some true converters land in the "Evaluating" bucket, where SDR follow-up catches them.
</p>
<p style="font-size:0.85rem;">
This bias is intentional for B2B sales: a false positive (wasting an AE's time on a cold lead) is more
expensive than a false negative (a warm lead getting an SDR call instead of a direct AE route).
</p>
</div>
""", unsafe_allow_html=True)
# ── Hyperparameters & Feature Signals ──
hyper_col1, hyper_col2 = st.columns(2, gap="large")
with hyper_col1:
st.markdown("""
<div class="card">
<h3>Hyperparameters</h3>
<p>The LightGBM classifier was tuned for a 300K-row, 10-feature tabular dataset with GPU acceleration.</p>
<div class="code-block">LGBMClassifier(
device = "gpu"
n_estimators = 1,500 (with early stopping)
learning_rate = 0.05
num_leaves = 63
max_depth = 10
min_child_samples = 50
subsample = 0.8
colsample_bytree = 0.8
reg_alpha = 0.1 (L1 regularization)
reg_lambda = 1.0 (L2 regularization)
random_state = 42
)</div>
<p style="font-size:0.85rem; margin-top:0.5rem;">
<strong>Key design choices:</strong> 63 leaves with depth-10 cap balances expressiveness against overfitting.
L1 + L2 regularization penalizes unnecessarily complex splits. 80% row and column subsampling adds stochasticity,
reducing variance across boosting rounds. Early stopping with 50-round patience ensures the model stops
learning before it starts memorizing.
</p>
</div>
""", unsafe_allow_html=True)
# Right-hand card: two hand-written CSS-grid tables (numeric feature
# correlations, then per-channel logit coefficients). All values are
# hard-coded in the markup; nothing here is computed at runtime.
feature_signal_card_html = """
<div class="card">
<h3>Feature Signal Strength</h3>
<p>Pearson correlations between each numeric feature and the conversion target, measured across the full 300K dataset.</p>
<div style="border:1px solid var(--border); border-radius:6px; overflow:hidden; margin:1rem 0;">
<div style="display:grid; grid-template-columns:1fr 100px 80px; background:var(--bg-main); padding:0.6rem 1rem; font-size:0.78rem; font-weight:600; text-transform:uppercase; letter-spacing:0.04em; border-bottom:2px solid var(--border);">
<span>Feature</span><span>Correlation</span><span>Strength</span>
</div>
<div style="display:grid; grid-template-columns:1fr 100px 80px; padding:0.55rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);">
<span>Total Visits</span><span style="font-weight:600; color:var(--teal);">+0.498</span><span><span class="tag tag-teal">Strong</span></span>
</div>
<div style="display:grid; grid-template-columns:1fr 100px 80px; padding:0.55rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);">
<span>Time on Website</span><span style="font-weight:600; color:var(--teal);">+0.460</span><span><span class="tag tag-teal">Strong</span></span>
</div>
<div style="display:grid; grid-template-columns:1fr 100px 80px; padding:0.55rem 1rem; font-size:0.88rem; color:var(--text-muted);">
<span>Page Views / Visit</span><span style="font-weight:600; color:var(--warm);">+0.329</span><span><span class="tag tag-warm">Moderate</span></span>
</div>
</div>
<h4>Channel Conversion Propensity</h4>
<p>Logit coefficients assigned during data generation — representing each channel's inherent quality.</p>
<div style="border:1px solid var(--border); border-radius:6px; overflow:hidden; margin:1rem 0;">
<div style="display:grid; grid-template-columns:1fr 90px 80px; background:var(--bg-main); padding:0.6rem 1rem; font-size:0.78rem; font-weight:600; text-transform:uppercase; letter-spacing:0.04em; border-bottom:2px solid var(--border);">
<span>Channel</span><span>Coeff.</span><span>Effect</span>
</div>
<div style="display:grid; grid-template-columns:1fr 90px 80px; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);">
<span>Referral</span><span style="color:var(--teal); font-weight:600;">+1.30</span><span><span class="tag tag-teal">Strongest</span></span>
</div>
<div style="display:grid; grid-template-columns:1fr 90px 80px; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);">
<span>Partner</span><span style="color:var(--teal); font-weight:600;">+0.90</span><span><span class="tag tag-teal">Strong</span></span>
</div>
<div style="display:grid; grid-template-columns:1fr 90px 80px; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);">
<span>Google</span><span style="color:var(--warm); font-weight:600;">+0.25</span><span><span class="tag tag-warm">Moderate</span></span>
</div>
<div style="display:grid; grid-template-columns:1fr 90px 80px; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);">
<span>Direct</span><span>0.00</span><span><span class="tag tag-muted">Neutral</span></span>
</div>
<div style="display:grid; grid-template-columns:1fr 90px 80px; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);">
<span>Facebook</span><span style="color:var(--accent);">-0.25</span><span><span class="tag tag-accent">Weak</span></span>
</div>
<div style="display:grid; grid-template-columns:1fr 90px 80px; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);">
<span>Instagram</span><span style="color:var(--accent);">-0.60</span><span><span class="tag tag-accent">Negative</span></span>
</div>
<div style="display:grid; grid-template-columns:1fr 90px 80px; padding:0.5rem 1rem; font-size:0.88rem; color:var(--text-muted);">
<span>Cold Call</span><span style="color:var(--accent); font-weight:600;">-1.00</span><span><span class="tag tag-accent">Weakest</span></span>
</div>
</div>
</div>
"""

with hyper_col2:
    # Raw HTML must be explicitly allowed or Streamlit would escape the tags.
    st.markdown(feature_signal_card_html, unsafe_allow_html=True)
# ── Explainability & Routing ──
# Two side-by-side columns: the left one (rendered here) explains why SHAP
# is used; explain_col2 holds the routing protocol card.
explain_col1, explain_col2 = st.columns(2, gap="large")
with explain_col1:
    # Static explanatory copy. The footnote states TreeSHAP's published
    # complexity, O(TLD²) — quadratic in tree depth, not linear.
    st.markdown("""
<div class="card">
<h3>Why Explainability Matters</h3>
<p>Traditional lead scoring is a black box. A model says "82%" and the sales team has no idea why. Was it the
channel? The browsing depth? The time spent? Without that context, every follow-up is generic.</p>
<p>We deploy <strong>TreeSHAP</strong> — a game-theoretic framework based on Shapley values from cooperative game theory.
For every single prediction, TreeSHAP computes the exact marginal contribution of each feature to the final score.</p>
<h4>What this means in practice</h4>
<ul>
<li><strong>For Sales:</strong> "This lead scored 87% primarily because of 14 return visits via the Referral channel."
That's actionable — the AE knows to reference the referrer in their outreach.</li>
<li><strong>For Marketing:</strong> Aggregate SHAP values across cohorts reveal which channels and behaviors
consistently drive conversions, informing budget allocation.</li>
<li><strong>For Compliance:</strong> Full audit trail. Every prediction can be decomposed into its constituent
factors — no unexplainable decisions.</li>
</ul>
<p style="font-size:0.85rem; color:var(--text-light); margin-top:0.5rem;">
TreeSHAP runs in O(TLD²) time where T = trees, L = leaves, D = depth. For our 149-tree model with depth 10,
explanation latency is under 50ms per prediction.
</p>
</div>
""", unsafe_allow_html=True)
with explain_col2:
    # Static card describing the three intent tiers and the action tied to
    # each. The comparison signs in the tier headings are written as HTML
    # entities (&gt; / &lt;) because a bare "<" inside HTML text is a
    # tokenizer parse error; the rendered text is unchanged.
    st.markdown("""
<div class="card">
<h3>Intent Routing Protocol</h3>
<p>The scoring engine classifies every lead into one of three intent tiers. Each tier maps to a specific
go-to-market action, ensuring that sales resources are allocated proportionally to predicted value.</p>
<div style="background:var(--accent-light); border:1px solid #F6D6CB; border-radius:6px; padding:1rem 1.25rem; margin:1rem 0;">
<strong style="color:var(--accent); font-size:0.9rem;">High Intent — P(conv) &gt; 70%</strong>
<p style="font-size:0.88rem; margin:0.4rem 0 0 0; color:var(--text-main);">
Bypass SDRs entirely. Route to senior Account Executives for personalized, white-glove outreach
within 24 hours. These leads have demonstrated strong multi-signal buying behavior.
</p>
</div>
<div style="background:var(--warm-light); border:1px solid #F5E8C3; border-radius:6px; padding:1rem 1.25rem; margin:1rem 0;">
<strong style="color:var(--warm); font-size:0.9rem;">Evaluating — P(conv) 40–70%</strong>
<p style="font-size:0.88rem; margin:0.4rem 0 0 0; color:var(--text-main);">
Interested but undecided. Route to SDRs for structured follow-up: targeted case studies, product
demos, and persistent multi-touch cadences over 2–3 weeks.
</p>
</div>
<div style="background:#F4F4F4; border:1px solid #E4E3E0; border-radius:6px; padding:1rem 1.25rem; margin:1rem 0;">
<strong style="color:var(--text-muted); font-size:0.9rem;">Low Intent — P(conv) &lt; 40%</strong>
<p style="font-size:0.88rem; margin:0.4rem 0 0 0; color:var(--text-main);">
Do not allocate manual sales time. Enroll in automated newsletter sequences and long-term
nurture campaigns. Re-score monthly — intent can change as needs evolve.
</p>
</div>
<p style="font-size:0.85rem; color:var(--text-light); margin-top:0.5rem;">
This protocol ensures AEs spend 100% of their time on qualified opportunities, while no lead
is ever abandoned — just routed to the appropriate engagement tier.
</p>
</div>
""", unsafe_allow_html=True)
# ── Feature Importance Images ──
# Each chart is looked up relative to the current working directory, first
# under ./model and then under ../model. The first candidate that exists
# wins; the result is None when neither file is present.
fi_path = next(
    (
        candidate
        for candidate in ("model/feature_importance.png", "../model/feature_importance.png")
        if os.path.exists(candidate)
    ),
    None,
)
shap_path = next(
    (
        candidate
        for candidate in ("model/shap_summary.png", "../model/shap_summary.png")
        if os.path.exists(candidate)
    ),
    None,
)
def get_base64_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text.

    The file is opened in binary mode; the returned ``str`` is suitable for
    embedding in a ``data:`` URI. Errors from ``open`` (for example a missing
    file) propagate to the caller.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()
if fi_path or shap_path:
    # Spacer, then one column per chart. A chart whose file was not found is
    # skipped, leaving its column empty.
    st.markdown("<br>", unsafe_allow_html=True)
    img_col1, img_col2 = st.columns(2, gap="large")

    # (image path, target column, card heading, card caption)
    image_cards = (
        (
            fi_path,
            img_col1,
            "Feature Importance (Split Count)",
            "Number of times each feature was used for a decision split across all 149 trees.",
        ),
        (
            shap_path,
            img_col2,
            "SHAP Summary (Beeswarm)",
            "Distribution of SHAP values per feature across a 500-row sample. Color = feature value (red = high).",
        ),
    )
    for card_path, card_column, card_title, card_caption in image_cards:
        if not card_path:
            continue
        with card_column:
            # The PNG is inlined as a data: URI inside the card's HTML.
            encoded_png = get_base64_image(card_path)
            st.markdown(f"""
<div class="card">
<h3 style="font-family:'Crimson Pro',serif; font-size:1.35rem; margin-bottom:0.5rem;">{card_title}</h3>
<p style="color:var(--text-muted); font-size:0.88rem; margin-bottom:1rem;">{card_caption}</p>
<img src="data:image/png;base64,{encoded_png}" style="width:100%; border-radius:4px;">
</div>
""", unsafe_allow_html=True)
# ═══════════════════════════════════════════════════════════════════
# TAB 4: HOW IT WORKS (PIPELINE WALKTHROUGH)
# ═══════════════════════════════════════════════════════════════════
with tab4:
# Tab heading followed by its subtitle paragraph, emitted as two separate
# markdown elements in that order.
how_it_works_title_html = '<div class="section-title">How It Works</div>'
how_it_works_subtitle_html = """
<p class="section-subtitle">
From raw data to actionable scores — a walkthrough of every stage in the ML pipeline.
Each step is designed to be reproducible, auditable, and free of data leakage.
</p>
"""
for header_fragment in (how_it_works_title_html, how_it_works_subtitle_html):
    st.markdown(header_fragment, unsafe_allow_html=True)
# ── Pipeline Steps ──
# One card listing the seven pipeline stages. The last step overrides
# border-bottom inline (border-bottom:none) so it has no divider. All copy
# is static text.
pipeline_card_html = """
<div class="card">
<h3>The Pipeline — End to End</h3>
<div class="pipeline-step">
<div class="step-number" style="background:var(--accent);">1</div>
<div class="step-content">
<h4>Synthetic Data Generation</h4>
<p>300,000 B2B leads are generated using a statistically rigorous process. Each lead has 3 behavioral
features (visits, dwell time, page views) and 1 categorical feature (acquisition channel).
Conversion labels are generated using a sigmoid function with main effects, interaction terms, and
controlled noise — producing realistic, non-trivial decision boundaries that require a non-linear
model to learn.</p>
</div>
</div>
<div class="pipeline-step">
<div class="step-number" style="background:var(--accent);">2</div>
<div class="step-content">
<h4>Feature Engineering</h4>
<p>The categorical "Lead Source" column is one-hot encoded into 7 binary features, producing a
total of 10 model inputs. No feature scaling is applied — tree-based models are invariant to
monotonic transformations, so normalization would add complexity without benefit. Feature names
are serialized alongside the model to prevent column-mismatch errors at inference time.</p>
</div>
</div>
<div class="pipeline-step">
<div class="step-number" style="background:var(--teal);">3</div>
<div class="step-content">
<h4>Train / Test Split</h4>
<p>An 80/20 stratified split produces 240,000 training rows and 60,000 test rows. Stratification
ensures both sets maintain the same conversion rate (~37%). The test set is strictly held out — it
is never used for hyperparameter tuning or feature selection, only for final evaluation.</p>
</div>
</div>
<div class="pipeline-step">
<div class="step-number" style="background:var(--teal);">4</div>
<div class="step-content">
<h4>GPU-Accelerated Training</h4>
<p>LightGBM trains with CUDA GPU acceleration. The model is configured for up to 1,500 boosting rounds,
but early stopping (patience = 50) halts training when the validation AUC stops improving. In practice,
the model converges at iteration 149 — just 10% of the maximum budget. This aggressive early stop
confirms the feature set carries strong, learnable signal.</p>
</div>
</div>
<div class="pipeline-step">
<div class="step-number" style="background:var(--warm);">5</div>
<div class="step-content">
<h4>Evaluation & Validation</h4>
<p>The trained model is evaluated on the 60K held-out test set across multiple metrics: AUC-ROC (0.83),
accuracy (78.5%), precision (78%), and recall (59%). The precision-recall trade-off is intentionally
conservative — we accept lower recall to ensure high-intent predictions are trustworthy. A full
confusion matrix and classification report are generated for auditability.</p>
</div>
</div>
<div class="pipeline-step">
<div class="step-number" style="background:var(--warm);">6</div>
<div class="step-content">
<h4>SHAP Explainability Layer</h4>
<p>TreeSHAP is applied post-training to provide per-prediction feature attribution. For each scored lead,
a waterfall chart decomposes the prediction into individual feature contributions. This transforms
a single probability score into a narrative — "72% because of 14 visits and Referral source,
partially offset by low page depth" — that sales teams can act on immediately.</p>
</div>
</div>
<div class="pipeline-step" style="border-bottom:none;">
<div class="step-number" style="background:var(--text-main);">7</div>
<div class="step-content">
<h4>Model Serialization & Deployment</h4>
<p>The trained LightGBM classifier and its feature name list are bundled into a single pickle file
using joblib. This bundle is loaded at app startup with Streamlit's cache_resource decorator —
ensuring the model is loaded once and shared across all user sessions. The app supports both
single-lead real-time scoring and batch CSV uploads with identical model inference.</p>
</div>
</div>
</div>
"""
st.markdown(pipeline_card_html, unsafe_allow_html=True)
# ── Data Generation Details ──
# Two side-by-side columns: the left one (rendered here) shows the logit
# formula used to label the synthetic data; gen_col2 covers the feature
# distributions.
gen_col1, gen_col2 = st.columns(2, gap="large")

# Static card markup; the formula inside .code-block is display text only.
data_generation_card_html = """
<div class="card card-accent">
<h3>Data Generation — The Math</h3>
<p>Conversion labels aren't randomly assigned — they're generated from a sigmoid (logistic) function
with carefully designed coefficients, ensuring the dataset has <em>learnable structure</em> that mirrors
real-world B2B dynamics.</p>
<h4>The Logit Function</h4>
<div class="code-block">logit = -0.80 # base intercept
+ 0.80 * visits_z # main: visit frequency
+ 0.75 * time_z # main: engagement depth
+ 0.60 * pages_z # main: content breadth
+ 0.35 * visits_z * time_z # interaction
+ 0.25 * visits_z * pages_z # interaction
+ 0.20 * time_z * pages_z # interaction
+ source_bonus * 0.50 # channel quality
+ noise ~ N(0, 0.35) # controlled noise
P(convert) = sigmoid(logit)</div>
<p style="font-size:0.85rem;">
The interaction terms are critical — they ensure that <strong>combinations of features</strong> matter,
not just individual values. A lead with high visits but zero time-on-site (possible bot behavior)
gets a different score than one with moderate visits and deep engagement.
</p>
</div>
"""

with gen_col1:
    st.markdown(data_generation_card_html, unsafe_allow_html=True)
# Right-hand card: a hand-written CSS-grid table of the three numeric
# features (distribution, range, correlation) plus two explanatory notes.
# Every figure is hard-coded in the markup.
feature_distributions_card_html = """
<div class="card card-teal">
<h3>Feature Distributions</h3>
<p>Each numeric feature is generated from a gamma distribution (right-skewed), reflecting
the typical long-tail patterns in real web analytics data. Most visitors have few visits;
a small fraction are highly engaged.</p>
<div style="border:1px solid var(--border); border-radius:6px; overflow:hidden; margin:1rem 0;">
<div style="display:grid; grid-template-columns:1fr 1fr 80px 80px; background:var(--bg-main); padding:0.6rem 1rem; font-size:0.78rem; font-weight:600; text-transform:uppercase; letter-spacing:0.04em; border-bottom:2px solid var(--border);">
<span>Feature</span><span>Distribution</span><span>Range</span><span>Corr.</span>
</div>
<div style="display:grid; grid-template-columns:1fr 1fr 80px 80px; padding:0.55rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.85rem; color:var(--text-muted);">
<span>TotalVisits</span><span>Gamma(3.0, 2.5)</span><span>1 – 30</span><span style="font-weight:600; color:var(--teal);">+0.498</span>
</div>
<div style="display:grid; grid-template-columns:1fr 1fr 80px 80px; padding:0.55rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.85rem; color:var(--text-muted);">
<span>Time on Site</span><span>visits×18 + Gamma</span><span>30 – 1200s</span><span style="font-weight:600; color:var(--teal);">+0.460</span>
</div>
<div style="display:grid; grid-template-columns:1fr 1fr 80px 80px; padding:0.55rem 1rem; font-size:0.85rem; color:var(--text-muted);">
<span>Page Views</span><span>visits×0.25 + Gamma</span><span>1 – 20</span><span style="font-weight:600; color:var(--warm);">+0.329</span>
</div>
</div>
<h4>Why Gamma?</h4>
<p style="font-size:0.88rem;">
Real web traffic is never normally distributed — it's always right-skewed. Most visitors browse briefly;
a small but important tail spends significant time. Gamma distributions naturally model this shape,
producing more realistic synthetic data than a simple uniform or Gaussian would.
</p>
<h4>Inter-Feature Correlations</h4>
<p style="font-size:0.88rem;">
Time-on-site and page views are partially derived from visit count (with independent gamma noise),
creating <strong>moderate multicollinearity</strong> — exactly what you'd see in real web analytics.
The model must learn to separate the shared signal from the unique information each feature carries.
</p>
</div>
"""

with gen_col2:
    st.markdown(feature_distributions_card_html, unsafe_allow_html=True)
# ── Tech Stack ──
# Card with three flex columns (ML & Data, Application, Infrastructure).
# The columns wrap on narrow viewports via flex-wrap. Static text only.
tech_stack_card_html = """
<div class="card">
<h3>Technology Stack</h3>
<div style="display:flex; flex-wrap:wrap; gap:2rem; margin-top:0.5rem;">
<div style="flex:1; min-width:200px;">
<h4 style="color:var(--teal); margin-top:0;">ML & Data</h4>
<ul>
<li><strong>LightGBM 4.0+</strong> — Gradient boosted decision trees with GPU</li>
<li><strong>SHAP 0.44+</strong> — TreeSHAP for per-prediction explanations</li>
<li><strong>scikit-learn 1.3+</strong> — Train/test splits, metrics, preprocessing</li>
<li><strong>Pandas / NumPy</strong> — Data manipulation and numerical ops</li>
</ul>
</div>
<div style="flex:1; min-width:200px;">
<h4 style="color:var(--accent); margin-top:0;">Application</h4>
<ul>
<li><strong>Streamlit 1.32+</strong> — Interactive web application framework</li>
<li><strong>Matplotlib</strong> — SHAP waterfall visualizations</li>
<li><strong>joblib</strong> — Model serialization with compression</li>
<li><strong>Faker</strong> — Realistic synthetic data generation</li>
</ul>
</div>
<div style="flex:1; min-width:200px;">
<h4 style="margin-top:0;">Infrastructure</h4>
<ul>
<li><strong>CUDA GPU</strong> — Hardware-accelerated model training</li>
<li><strong>Python 3.10+</strong> — Runtime environment</li>
<li><strong>Git</strong> — Version control for code and pipeline artifacts</li>
<li><strong>Streamlit Cache</strong> — Model loaded once, shared across sessions</li>
</ul>
</div>
</div>
</div>
"""
st.markdown(tech_stack_card_html, unsafe_allow_html=True)