Spaces:
Runtime error
Runtime error
| """ | |
| LeadScore AI — Enterprise Intent Engine | |
| LightGBM + SHAP | AUC 0.83 | |
| """ | |
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import joblib | |
| import shap | |
| import matplotlib.pyplot as plt | |
| import matplotlib | |
| matplotlib.use("Agg") | |
| import io | |
| import os | |
| import base64 | |
| from datetime import datetime | |
| # ─── PAGE CONFIG ──────────────────────────────────────────────────────────── | |
# Browser-tab metadata and layout; must run before any other Streamlit call.
_PAGE_SETTINGS = {
    "page_title": "LeadScore AI | Enterprise Intent Engine",
    "page_icon": "✨",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
    "menu_items": {
        "About": "LeadScore AI — Enterprise lead intelligence powered by LightGBM + SHAP."
    },
}
st.set_page_config(**_PAGE_SETTINGS)
| # ─── CUSTOM CSS ────────────────────────────────────────────────────────────── | |
| st.markdown(""" | |
| <style> | |
| @import url('https://fonts.googleapis.com/css2?family=Crimson+Pro:wght@400;600&family=Inter:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap'); | |
| :root { | |
| --bg-main: #FAF9F6; | |
| --bg-card: #FFFFFF; | |
| --bg-code: #F7F6F3; | |
| --text-main: #2D2A26; | |
| --text-muted: #66635F; | |
| --text-light: #9A9590; | |
| --border: #E8E6E1; | |
| --border-light: #F0EEEA; | |
| --accent: #D96A47; | |
| --accent-light: #FDF2EE; | |
| --teal: #3E7C77; | |
| --teal-light: #EDF5F4; | |
| --warm: #B38622; | |
| --warm-light: #FDF8EC; | |
| --shadow: 0 4px 20px rgba(0, 0, 0, 0.03); | |
| --shadow-hover: 0 8px 30px rgba(0, 0, 0, 0.06); | |
| } | |
| html, body, .stApp { | |
| background-color: var(--bg-main) !important; | |
| color: var(--text-main) !important; | |
| font-family: 'Inter', -apple-system, sans-serif !important; | |
| } | |
| #MainMenu, footer, header { visibility: hidden; } | |
| .block-container { padding-top: 3rem; padding-bottom: 4rem; max-width: 1100px; } | |
| h1, h2, h3, h4, .serif { | |
| font-family: 'Crimson Pro', serif !important; | |
| color: var(--text-main) !important; | |
| } | |
| /* ─── Hero Section ─── */ | |
| .hero { | |
| text-align: center; | |
| padding: 2.5rem 1rem 3.5rem 1rem; | |
| border-bottom: 1px solid var(--border); | |
| margin-bottom: 2.5rem; | |
| } | |
| .hero-tag { | |
| font-size: 0.72rem; | |
| font-weight: 600; | |
| text-transform: uppercase; | |
| letter-spacing: 0.12em; | |
| color: var(--accent); | |
| margin-bottom: 1rem; | |
| display: inline-block; | |
| } | |
| .hero h1 { | |
| font-size: 3.5rem; | |
| font-weight: 400; | |
| line-height: 1.1; | |
| margin: 0 0 1.2rem 0; | |
| letter-spacing: -0.02em; | |
| } | |
| .hero p { | |
| font-size: 1.1rem; | |
| color: var(--text-muted); | |
| max-width: 680px; | |
| margin: 0 auto 2rem auto; | |
| line-height: 1.7; | |
| } | |
| /* ─── Metrics Row ─── */ | |
| .metrics-row { | |
| display: flex; | |
| justify-content: center; | |
| gap: 3rem; | |
| flex-wrap: wrap; | |
| margin-top: 1.5rem; | |
| } | |
| .metric-pill { | |
| text-align: center; | |
| min-width: 100px; | |
| } | |
| .metric-pill .val { | |
| display: block; | |
| font-family: 'Inter', sans-serif; | |
| font-weight: 600; | |
| font-size: 1.3rem; | |
| color: var(--text-main); | |
| } | |
| .metric-pill .lbl { | |
| font-size: 0.72rem; | |
| color: var(--text-muted); | |
| text-transform: uppercase; | |
| letter-spacing: 0.06em; | |
| margin-top: 0.25rem; | |
| display: block; | |
| } | |
| /* ─── Section Titles ─── */ | |
| .section-title { | |
| font-family: 'Crimson Pro', serif; | |
| font-size: 2rem; | |
| border-bottom: 1px solid var(--border); | |
| padding-bottom: 0.5rem; | |
| margin-bottom: 2rem; | |
| color: var(--text-main); | |
| } | |
| .section-subtitle { | |
| font-size: 1.05rem; | |
| color: var(--text-muted); | |
| line-height: 1.7; | |
| margin-bottom: 2rem; | |
| max-width: 720px; | |
| } | |
| /* ─── Cards ─── */ | |
| .card { | |
| background: var(--bg-card); | |
| border: 1px solid var(--border); | |
| border-radius: 8px; | |
| padding: 2rem; | |
| margin-bottom: 1.5rem; | |
| box-shadow: var(--shadow); | |
| height: 100%; | |
| display: flex; | |
| flex-direction: column; | |
| } | |
| .card h3 { | |
| font-size: 1.35rem; | |
| margin: 0 0 0.8rem 0; | |
| } | |
| .card h4 { | |
| font-size: 1.1rem; | |
| margin: 1.2rem 0 0.5rem 0; | |
| color: var(--text-main); | |
| } | |
| .card p, .card li { | |
| color: var(--text-muted); | |
| font-size: 0.95rem; | |
| line-height: 1.75; | |
| margin: 0 0 0.7rem 0; | |
| } | |
| .card ul { padding-left: 1.5rem; margin-bottom: 0.5rem; } | |
| /* Card with accent top border */ | |
| .card-accent { border-top: 3px solid var(--accent); } | |
| .card-teal { border-top: 3px solid var(--teal); } | |
| .card-warm { border-top: 3px solid var(--warm); } | |
| /* ─── Score Display ─── */ | |
| .score-display { | |
| text-align: center; | |
| padding: 3rem 2rem; | |
| background: var(--bg-card); | |
| border: 1px solid var(--border); | |
| border-radius: 8px; | |
| box-shadow: var(--shadow); | |
| } | |
| .score-number { | |
| font-family: 'Crimson Pro', serif; | |
| font-size: 6rem; | |
| line-height: 1; | |
| margin-bottom: 0.5rem; | |
| } | |
| .score-label { | |
| font-size: 0.8rem; | |
| font-weight: 600; | |
| text-transform: uppercase; | |
| letter-spacing: 0.1em; | |
| color: var(--text-muted); | |
| margin-bottom: 1.5rem; | |
| } | |
| .score-badge { | |
| display: inline-block; | |
| padding: 0.4rem 1.2rem; | |
| border-radius: 100px; | |
| font-size: 0.85rem; | |
| font-weight: 500; | |
| } | |
| .badge-hot { background: var(--accent-light); color: var(--accent); border: 1px solid #F6D6CB; } | |
| .badge-warm { background: var(--warm-light); color: var(--warm); border: 1px solid #F5E8C3; } | |
| .badge-cold { background: #F4F4F4; color: var(--text-muted); border: 1px solid #E4E3E0; } | |
| /* ─── Form Elements ─── */ | |
| .stSlider label, .stSelectbox label { | |
| color: var(--text-main) !important; | |
| font-weight: 500 !important; | |
| font-size: 0.95rem !important; | |
| } | |
| /* ─── Tabs ─── */ | |
| .stTabs [data-baseweb="tab-list"] { | |
| background: transparent; | |
| border-bottom: 1px solid var(--border); | |
| gap: 2rem; | |
| padding-bottom: 0; | |
| margin-bottom: 2rem; | |
| } | |
| .stTabs [data-baseweb="tab"] { | |
| background: transparent !important; | |
| color: var(--text-muted) !important; | |
| font-family: 'Inter', sans-serif !important; | |
| font-weight: 500 !important; | |
| font-size: 0.95rem !important; | |
| padding: 0.5rem 0 1rem 0 !important; | |
| border: none !important; | |
| } | |
| .stTabs [data-baseweb="tab"]:hover { | |
| color: var(--text-main) !important; | |
| } | |
| .stTabs [aria-selected="true"] { | |
| color: var(--text-main) !important; | |
| border-bottom: 2px solid var(--text-main) !important; | |
| } | |
| /* ─── Tables ─── */ | |
| .stDataFrame { border-radius: 8px; border: 1px solid var(--border); } | |
| .stDataFrame thead th { | |
| background: var(--bg-main) !important; | |
| color: var(--text-main) !important; | |
| font-size: 0.85rem !important; | |
| font-weight: 600 !important; | |
| border-bottom: 1px solid var(--border) !important; | |
| } | |
| /* ─── Buttons ─── */ | |
| .stButton > button, .stDownloadButton > button { | |
| background: var(--bg-card) !important; | |
| color: var(--text-main) !important; | |
| border: 1px solid var(--border) !important; | |
| border-radius: 6px !important; | |
| font-weight: 500 !important; | |
| padding: 0.5rem 1.5rem !important; | |
| transition: all 0.2s !important; | |
| } | |
| .stButton > button:hover, .stDownloadButton > button:hover { | |
| border-color: var(--text-main) !important; | |
| box-shadow: var(--shadow) !important; | |
| background: #fdfdfd !important; | |
| } | |
| /* ─── Progress Bar ─── */ | |
| .stProgress > div > div { background: var(--teal) !important; } | |
| /* ─── Highlight Box ─── */ | |
| .highlight-box { | |
| background: #FDFCF9; | |
| border-left: 3px solid var(--accent); | |
| padding: 1.25rem 1.5rem; | |
| margin: 1.5rem 0; | |
| font-family: 'Crimson Pro', serif; | |
| font-size: 1.15rem; | |
| color: var(--text-main); | |
| line-height: 1.6; | |
| } | |
| /* ─── Stat Row ─── */ | |
| .stat-row { | |
| display: flex; | |
| justify-content: space-between; | |
| align-items: center; | |
| padding: 0.75rem 0; | |
| border-bottom: 1px solid var(--border-light); | |
| } | |
| .stat-row:last-child { border-bottom: none; } | |
| .stat-row .stat-label { | |
| color: var(--text-muted); | |
| font-size: 0.9rem; | |
| } | |
| .stat-row .stat-value { | |
| font-family: 'Crimson Pro', serif; | |
| font-size: 1.15rem; | |
| font-weight: 600; | |
| } | |
| /* ─── Code Blocks ─── */ | |
| .code-block { | |
| background: var(--bg-code); | |
| border: 1px solid var(--border); | |
| border-radius: 6px; | |
| padding: 1.25rem 1.5rem; | |
| font-family: 'JetBrains Mono', monospace; | |
| font-size: 0.82rem; | |
| line-height: 1.7; | |
| color: var(--text-main); | |
| overflow-x: auto; | |
| white-space: pre; | |
| margin: 1rem 0; | |
| } | |
| /* ─── Pipeline Step ─── */ | |
| .pipeline-step { | |
| display: flex; | |
| align-items: flex-start; | |
| gap: 1.25rem; | |
| padding: 1.5rem 0; | |
| border-bottom: 1px solid var(--border-light); | |
| } | |
| .pipeline-step:last-child { border-bottom: none; } | |
| .step-number { | |
| flex-shrink: 0; | |
| width: 36px; | |
| height: 36px; | |
| border-radius: 50%; | |
| display: flex; | |
| align-items: center; | |
| justify-content: center; | |
| font-family: 'Inter', sans-serif; | |
| font-weight: 600; | |
| font-size: 0.85rem; | |
| color: white; | |
| margin-top: 2px; | |
| } | |
| .step-content h4 { | |
| font-family: 'Crimson Pro', serif !important; | |
| font-size: 1.1rem; | |
| margin: 0 0 0.3rem 0; | |
| color: var(--text-main); | |
| } | |
| .step-content p { | |
| color: var(--text-muted); | |
| font-size: 0.9rem; | |
| line-height: 1.65; | |
| margin: 0; | |
| } | |
| /* ─── Feature Table ─── */ | |
| .feature-table { | |
| width: 100%; | |
| border-collapse: collapse; | |
| margin: 1rem 0; | |
| font-size: 0.88rem; | |
| } | |
| .feature-table th { | |
| text-align: left; | |
| padding: 0.7rem 1rem; | |
| background: var(--bg-main); | |
| color: var(--text-main); | |
| font-weight: 600; | |
| font-size: 0.8rem; | |
| text-transform: uppercase; | |
| letter-spacing: 0.04em; | |
| border-bottom: 2px solid var(--border); | |
| } | |
| .feature-table td { | |
| padding: 0.6rem 1rem; | |
| border-bottom: 1px solid var(--border-light); | |
| color: var(--text-muted); | |
| } | |
| .feature-table tr:hover td { | |
| background: #FDFCFA; | |
| } | |
| /* ─── Confusion Matrix ─── */ | |
| .cm-grid { | |
| display: grid; | |
| grid-template-columns: auto 1fr 1fr; | |
| grid-template-rows: auto 1fr 1fr; | |
| gap: 3px; | |
| max-width: 360px; | |
| margin: 1rem auto; | |
| font-size: 0.85rem; | |
| } | |
| .cm-header { | |
| text-align: center; | |
| font-weight: 600; | |
| color: var(--text-muted); | |
| padding: 0.5rem; | |
| font-size: 0.75rem; | |
| text-transform: uppercase; | |
| letter-spacing: 0.04em; | |
| } | |
| .cm-cell { | |
| text-align: center; | |
| padding: 1.25rem 0.75rem; | |
| border-radius: 6px; | |
| font-weight: 600; | |
| font-size: 1.1rem; | |
| } | |
| .cm-tp { background: var(--teal-light); color: var(--teal); } | |
| .cm-tn { background: var(--teal-light); color: var(--teal); } | |
| .cm-fp { background: var(--accent-light); color: var(--accent); } | |
| .cm-fn { background: var(--accent-light); color: var(--accent); } | |
| .cm-label { | |
| font-size: 0.7rem; | |
| display: block; | |
| margin-top: 0.3rem; | |
| font-weight: 400; | |
| opacity: 0.7; | |
| } | |
| /* ─── Tag/Chip ─── */ | |
| .tag { | |
| display: inline-block; | |
| padding: 0.2rem 0.6rem; | |
| border-radius: 4px; | |
| font-size: 0.75rem; | |
| font-weight: 500; | |
| margin-right: 0.4rem; | |
| margin-bottom: 0.4rem; | |
| } | |
| .tag-accent { background: var(--accent-light); color: var(--accent); } | |
| .tag-teal { background: var(--teal-light); color: var(--teal); } | |
| .tag-warm { background: var(--warm-light); color: var(--warm); } | |
| .tag-muted { background: #F4F4F4; color: var(--text-muted); } | |
| </style> | |
| """, unsafe_allow_html=True) | |
| # ─── LOAD MODEL ───────────────────────────────────────────────────────────── | |
@st.cache_resource(show_spinner=False)
def _load_bundle(path):
    """Deserialize the model bundle stored at *path*.

    Wrapped in ``st.cache_resource`` so the bundle is read from disk once per
    path and reused across script reruns instead of being unpickled on every
    widget interaction.
    """
    # NOTE(security): joblib.load unpickles arbitrary objects. Only bundle
    # files shipped with the app should ever be placed at these paths.
    return joblib.load(path)


def load_model():
    """Return the first model bundle found on disk, or ``None`` if none exists.

    The candidate locations are probed in order, relative to the current
    working directory. A missing bundle is not cached, so a file that appears
    later is picked up on the next rerun.
    """
    candidates = (
        "model/lead_model_bundle.pkl",
        "lead_model_bundle.pkl",
        "../model/lead_model_bundle.pkl",
    )
    for candidate in candidates:
        # isfile (rather than exists) so a directory with this name is skipped.
        if os.path.isfile(candidate):
            return _load_bundle(candidate)
    return None


model_bundle = load_model()
# Acquisition channels offered in the UI; one indicator column is built per entry.
LEAD_SOURCES = [
    "Referral",
    "Partner",
    "Cold Call",
    "Instagram",
    "Google",
    "Facebook",
    "Direct",
]

# Three numeric signals followed by the channel indicators in alphabetical order.
EXPECTED_FEATURES = [
    "TotalVisits",
    "Total Time Spent on Website",
    "Page Views Per Visit",
] + [f"Lead Source_{channel}" for channel in sorted(LEAD_SOURCES)]


# ─── HELPERS ────────────────────────────────────────────────────────────────
def build_row(visits, time_spent, page_views, lead_source):
    """Build a one-row DataFrame describing a single lead.

    The frame holds the three numeric signals plus one ``Lead Source_<name>``
    indicator per entry in ``LEAD_SOURCES``; the indicator is 1 for the channel
    equal to *lead_source* and 0 for every other channel.
    """
    record = {
        "TotalVisits": visits,
        "Total Time Spent on Website": time_spent,
        "Page Views Per Visit": page_views,
    }
    record.update(
        {f"Lead Source_{channel}": int(channel == lead_source) for channel in LEAD_SOURCES}
    )
    return pd.DataFrame([record])
def get_score_tier(prob):
    """Map a conversion probability to ``(label, badge CSS class, hex colour)``.

    Thresholds are inclusive: 0.70 and above is "High Intent", 0.40 and above
    is "Evaluating", and anything else is "Low Intent".
    """
    thresholds = (
        (0.70, ("High Intent", "badge-hot", "#D96A47")),
        (0.40, ("Evaluating", "badge-warm", "#B38622")),
    )
    for floor, tier in thresholds:
        if prob >= floor:
            return tier
    return "Low Intent", "badge-cold", "#66635F"
def predict_single(visits, time_spent, page_views, lead_source):
    """Score one lead.

    Returns ``(probability, feature_frame)``, where *probability* is the
    second column of ``predict_proba`` for the single row and *feature_frame*
    is the one-row DataFrame, aligned to the bundle's feature list, that was
    fed to the model. Returns ``(None, None)`` when no model bundle is loaded.
    """
    if not model_bundle:
        return None, None

    estimator = model_bundle["model"]
    columns = model_bundle["features"]

    # Align to the bundle's column order; columns the row lacks are set to 0.
    features = build_row(visits, time_spent, page_views, lead_source).reindex(
        columns=columns, fill_value=0
    )
    positive_prob = estimator.predict_proba(features)[0][1]
    return positive_prob, features
def predict_batch(df_input):
    """Score every row of *df_input*.

    A ``Lead Source`` column, when present, is one-hot encoded; the frame is
    then aligned to the bundle's feature list, with absent columns filled with
    0. Returns the second ``predict_proba`` column for all rows, or ``None``
    when no model bundle is loaded.
    """
    if not model_bundle:
        return None

    estimator = model_bundle["model"]
    columns = model_bundle["features"]

    if "Lead Source" in df_input.columns:
        encoded = pd.get_dummies(df_input, columns=["Lead Source"])
    else:
        encoded = df_input.copy()

    aligned = encoded.reindex(columns=columns, fill_value=0)
    return estimator.predict_proba(aligned)[:, 1]
def shap_waterfall(df_row):
    """Plot per-feature SHAP contributions for a single lead.

    Despite the name, this draws a horizontal bar chart sorted by absolute
    contribution, so the strongest driver ends up as the top bar.

    Args:
        df_row: One-row DataFrame of model features (as returned by
            ``predict_single``); it is re-aligned to the bundle's feature list.

    Returns:
        A matplotlib ``Figure``, or ``None`` when no model bundle is loaded.
        The caller is responsible for closing the figure.
    """
    if not model_bundle:
        return None
    model, feature_names = model_bundle["model"], model_bundle["features"]
    df_r = df_row.reindex(columns=feature_names, fill_value=0)

    explainer = shap.TreeExplainer(model)
    sv = explainer.shap_values(df_r)

    # The shape of shap_values() depends on the SHAP version and model type.
    # Reduce every variant to a 1-D vector for the single row being explained.
    if isinstance(sv, list):
        # One (n_samples, n_features) array per class. The last entry is
        # presumably the positive class for a binary model — verify against
        # the pinned SHAP version.
        vals = np.asarray(sv[-1])[0]
    else:
        sv = np.asarray(sv)
        # A 3-D result is (n_samples, n_features, n_outputs); take the last output.
        vals = sv[0, :, -1] if sv.ndim == 3 else sv[0]

    # Ascending by magnitude: barh draws the first item at the bottom.
    ranked = sorted(zip(feature_names, vals), key=lambda pair: abs(pair[1]))
    names = [name.replace("Lead Source_", "Src: ") for name, _ in ranked]
    values = [float(value) for _, value in ranked]
    colors = ["#D96A47" if v > 0 else "#3E7C77" for v in values]

    fig, ax = plt.subplots(figsize=(8, 4))
    fig.patch.set_facecolor("#FFFFFF")
    ax.set_facecolor("#FFFFFF")
    ax.barh(names, values, color=colors, height=0.6, edgecolor="none")
    ax.axvline(0, color="#E8E6E1", linewidth=1, linestyle="--")
    ax.set_xlabel("Impact on Conversion Probability", color="#66635F", fontsize=9, fontfamily="Inter")
    ax.tick_params(colors="#66635F", labelsize=8)
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.xaxis.label.set_color("#66635F")
    fig.tight_layout()
    return fig
| # ─── HERO ─────────────────────────────────────────────────────────────────── | |
# Landing banner: tagline, headline, pitch paragraph and four headline metrics.
_HERO_HTML = """
<div class="hero">
<div class="hero-tag">Predictive GTM Intelligence</div>
<h1>Stop Guessing. Know Who Converts.</h1>
<p>A LightGBM engine trained on 300,000 B2B interactions with full SHAP explainability.
Every prediction comes with transparent reasoning — not just a score, but the <em>why</em> behind it — so sales teams
can act on intent, not intuition.</p>
<div class="metrics-row">
<div class="metric-pill"><span class="val">0.83</span><span class="lbl">AUC-ROC</span></div>
<div class="metric-pill"><span class="val">300K</span><span class="lbl">Leads Trained</span></div>
<div class="metric-pill"><span class="val">10</span><span class="lbl">Feature Signals</span></div>
<div class="metric-pill"><span class="val">100%</span><span class="lbl">Explainable</span></div>
</div>
</div>
"""
st.markdown(_HERO_HTML, unsafe_allow_html=True)
| # ─── TABS ─────────────────────────────────────────────────────────────────── | |
# Top-level navigation; the order here fixes which tab each `with tabN:` fills.
_TAB_LABELS = (
    "Real-Time Profiling",
    "Pipeline Scoring",
    "Model Deep Dive",
    "How It Works",
)
tab1, tab2, tab3, tab4 = st.tabs(list(_TAB_LABELS))
| # ═══════════════════════════════════════════════════════════════════ | |
| # TAB 1: SINGLE LEAD SCORING | |
| # ═══════════════════════════════════════════════════════════════════ | |
with tab1:
    # Left column collects the lead's attributes; right column shows the score.
    col1, col2 = st.columns([1, 1], gap="large")

    with col1:
        st.markdown('<div class="section-title">Lead Parameters</div>', unsafe_allow_html=True)
        with st.container(border=True):
            lead_source = st.selectbox("Acquisition Channel", LEAD_SOURCES, index=0)
            total_visits = st.slider("Total Web Properties Visited", 1, 30, 8)
            time_on_site = st.slider("Cumulative Dwell Time (sec)", 0, 2700, 420, step=30)
            page_views = st.slider("Pages Viewed Per Session", 1, 20, 5)
        st.markdown("""
<div class="highlight-box">
"Interpretability is revenue. Knowing a lead scored 85% <em>because of</em> '12 visits + Referral channel' gives Sales
the context to personalize outreach and close faster. Black-box models leave teams guessing."
</div>
""", unsafe_allow_html=True)
        # Quick guide on reading the chart
        st.markdown("""
<div style="margin-top:0.5rem;">
<p style="color:var(--text-muted); font-size:0.85rem; line-height:1.7;">
<strong style="color:var(--text-main);">Reading the Decision Drivers chart:</strong><br>
Each bar shows how much a single feature pushed the prediction up or down.
<span style="color:#D96A47;">Orange bars</span> are conversion drivers — they increase the lead's score.
<span style="color:#3E7C77;">Teal bars</span> indicate friction — factors pulling the score down.
The longer the bar, the stronger that feature's influence on this specific prediction.
</p>
</div>
""", unsafe_allow_html=True)

    with col2:
        if not model_bundle:
            # No model on disk: show a greyed-out placeholder instead of a score.
            st.markdown("""
<div class="score-display">
<div class="score-number" style="color:#E8E6E1;">—%</div>
<div class="score-label">Engine Offline</div>
</div>
""", unsafe_allow_html=True)
        else:
            prob, df_row = predict_single(total_visits, time_on_site, page_views, lead_source)
            pct = int(prob * 100)
            tier_label, tier_class, tier_color = get_score_tier(prob)
            st.markdown(f"""
<div class="score-display">
<div class="score-number" style="color:{tier_color};">{pct}%</div>
<div class="score-label">Predicted Conversion Probability</div>
<div class="score-badge {tier_class}">{tier_label}</div>
</div>
""", unsafe_allow_html=True)
            st.markdown("<br>", unsafe_allow_html=True)
            st.markdown('<b style="color:#2D2A26; font-family:\'Crimson Pro\',serif; font-size:1.4rem;">Decision Drivers</b>', unsafe_allow_html=True)
            st.markdown('<p style="color:#66635F; font-size:0.88rem; margin-bottom:1rem;">SHAP feature attribution for this prediction</p>', unsafe_allow_html=True)
            fig = shap_waterfall(df_row)
            if fig:
                st.pyplot(fig, use_container_width=True)
                plt.close()

            # Recommended action based on tier; unknown labels fall back to
            # the low-intent guidance.
            low_intent_action = """
<div style="background:#F4F4F4; border:1px solid #E4E3E0; border-radius:8px; padding:1.25rem; margin-top:1rem;">
<strong style="color:var(--text-muted); font-size:0.85rem; text-transform:uppercase; letter-spacing:0.05em;">Recommended Action</strong>
<p style="color:var(--text-main); font-size:0.9rem; margin:0.5rem 0 0 0; line-height:1.6;">
Enroll in automated drip campaigns. No manual sales effort required — let marketing
nurture over time. Re-score in 30 days if engagement increases.
</p>
</div>
"""
            actions_by_tier = {
                "High Intent": """
<div style="background:var(--accent-light); border:1px solid #F6D6CB; border-radius:8px; padding:1.25rem; margin-top:1rem;">
<strong style="color:var(--accent); font-size:0.85rem; text-transform:uppercase; letter-spacing:0.05em;">Recommended Action</strong>
<p style="color:var(--text-main); font-size:0.9rem; margin:0.5rem 0 0 0; line-height:1.6;">
Route directly to a senior Account Executive. This lead shows strong buying signals —
prioritize personalized outreach within 24 hours to maximize conversion window.
</p>
</div>
""",
                "Evaluating": """
<div style="background:var(--warm-light); border:1px solid #F5E8C3; border-radius:8px; padding:1.25rem; margin-top:1rem;">
<strong style="color:var(--warm); font-size:0.85rem; text-transform:uppercase; letter-spacing:0.05em;">Recommended Action</strong>
<p style="color:var(--text-main); font-size:0.9rem; margin:0.5rem 0 0 0; line-height:1.6;">
Assign to SDR for nurturing. Send targeted case studies relevant to their browsing behavior.
Schedule follow-up touchpoints over the next 2 weeks.
</p>
</div>
""",
            }
            action_html = actions_by_tier.get(tier_label, low_intent_action)
            st.markdown(action_html, unsafe_allow_html=True)
| # ═══════════════════════════════════════════════════════════════════ | |
| # TAB 2: BATCH / PIPELINE SCORING | |
| # ═══════════════════════════════════════════════════════════════════ | |
with tab2:
    # Batch scoring: upload a CSV, score every row, summarise by tier and
    # channel, and offer the scored table for download.
    st.markdown('<div class="section-title">Enrich Your Pipeline</div>', unsafe_allow_html=True)
    st.markdown("""
<p class="section-subtitle">
Upload a CSV of raw leads. The engine scores each record with a conversion probability and intent tier,
enabling your team to prioritize outreach by predicted value rather than gut feel.
Every row gets the same model that powers the real-time profiler — LightGBM with 300K training rows.
</p>
""", unsafe_allow_html=True)
    sample_data = pd.DataFrame({
        "TotalVisits": [15, 3, 22, 1, 8],
        "Total Time Spent on Website": [890, 45, 1200, 15, 420],
        "Page Views Per Visit": [12, 2, 18, 1, 6],
        "Lead Source": ["Referral", "Cold Call", "Partner", "Cold Call", "Google"]
    })
    # Template & Upload
    col_btn1, col_btn2 = st.columns([1, 4])
    with col_btn1:
        st.download_button("Download Template CSV", sample_data.to_csv(index=False).encode(), "template.csv", "text/csv", use_container_width=True)
    # A non-empty label keeps the widget accessible; it is hidden visually.
    uploaded_file = st.file_uploader("Upload leads CSV", type=["csv"], label_visibility="collapsed")
    if uploaded_file is not None:
        # Top-level UI boundary: any parsing or scoring failure is shown to the
        # user instead of crashing the page.
        try:
            df_up = pd.read_csv(uploaded_file)
            st.markdown(f"<p style='color:var(--text-main); font-weight:600; font-size:1.05rem;'>{len(df_up):,} leads ingested.</p>", unsafe_allow_html=True)
            if model_bundle:
                with st.spinner("Analyzing intent signals..."):
                    raw_probs = predict_batch(df_up)
                if raw_probs is not None:
                    # Keep numeric probabilities in a Series indexed like the
                    # upload so sorting and grouping stay row-aligned.
                    probs = pd.Series(np.asarray(raw_probs, dtype=float), index=df_up.index)
                    has_channel = "Lead Source" in df_up.columns

                    scored = df_up.copy()
                    scored.insert(0, "Score Tier", probs.apply(lambda p: get_score_tier(p)[0]))
                    scored.insert(0, "Win Probability", (probs * 100).round(1).astype(str) + "%")
                    # Rank by the numeric score; the formatted "87.2%" strings
                    # would sort lexicographically.
                    scored = scored.loc[probs.sort_values(ascending=False).index]

                    hot = int((probs >= 0.7).sum())
                    warm = int(((probs >= 0.4) & (probs < 0.7)).sum())
                    cold = int((probs < 0.4).sum())
                    m1, m2, m3 = st.columns(3)
                    m1.markdown(f"""<div class='card card-accent' style='text-align:center;'>
<h3 style='color:var(--accent); font-size:2rem; margin-bottom:0.3rem;'>{hot:,}</h3>
<p style='margin:0; font-weight:500;'>High Intent</p>
<p style='font-size:0.8rem; margin:0.3rem 0 0 0;'>Route to Account Executives</p>
</div>""", unsafe_allow_html=True)
                    m2.markdown(f"""<div class='card card-warm' style='text-align:center;'>
<h3 style='color:var(--warm); font-size:2rem; margin-bottom:0.3rem;'>{warm:,}</h3>
<p style='margin:0; font-weight:500;'>Evaluating</p>
<p style='font-size:0.8rem; margin:0.3rem 0 0 0;'>SDR follow-up sequence</p>
</div>""", unsafe_allow_html=True)
                    m3.markdown(f"""<div class='card' style='text-align:center; border-top:3px solid var(--text-light);'>
<h3 style='color:var(--text-muted); font-size:2rem; margin-bottom:0.3rem;'>{cold:,}</h3>
<p style='margin:0; font-weight:500;'>Long-term Nurture</p>
<p style='font-size:0.8rem; margin:0.3rem 0 0 0;'>Automated drip campaigns</p>
</div>""", unsafe_allow_html=True)
                    st.dataframe(scored, use_container_width=True)
                    # Channel breakdown
                    if has_channel:
                        st.markdown("<br>", unsafe_allow_html=True)
                        st.markdown('<b style="font-family:\'Crimson Pro\',serif; font-size:1.2rem;">Channel Performance Breakdown</b>', unsafe_allow_html=True)
                        # Reuse the parsed frame: the upload buffer is already
                        # consumed, so reading it again would fail.
                        channel_df = pd.DataFrame({
                            "Lead Source": df_up["Lead Source"],
                            "Probability": probs
                        })
                        channel_summary = channel_df.groupby("Lead Source").agg(
                            Leads=("Probability", "count"),
                            Avg_Score=("Probability", "mean"),
                            High_Intent=("Probability", lambda x: (x >= 0.7).sum())
                        ).sort_values("Avg_Score", ascending=False)
                        channel_summary["Avg_Score"] = (channel_summary["Avg_Score"] * 100).round(1).astype(str) + "%"
                        st.dataframe(channel_summary, use_container_width=True)
                    out_csv = scored.to_csv(index=False).encode()
                    st.download_button("Export Scored Pipeline", out_csv, f"scored_routing_{datetime.now().strftime('%Y%m%d')}.csv", "text/csv")
            else:
                # Without this, an upload with no model loaded looked like a no-op.
                st.warning("Scoring engine is offline: the file was read, but no model bundle is loaded.")
        except Exception as e:
            st.error(f"Ingestion error: {e}")
    else:
        st.markdown("""
<div style="margin-top:1rem;">
<p style="color:var(--text-muted); font-size:0.95rem; margin-bottom:0.5rem;">
<strong style="color:var(--text-main);">Expected CSV columns:</strong>
</p>
<div style="border:1px solid var(--border); border-radius:6px; overflow:hidden; margin:0.5rem 0;">
<div style="display:grid; grid-template-columns:1fr 70px 70px 1fr; background:var(--bg-main); padding:0.6rem 1rem; font-size:0.78rem; font-weight:600; text-transform:uppercase; letter-spacing:0.04em; border-bottom:2px solid var(--border);">
<span>Column</span><span>Type</span><span>Example</span><span>Notes</span>
</div>
<div style="display:grid; grid-template-columns:1fr 70px 70px 1fr; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);">
<span>TotalVisits</span><span>Integer</span><span>15</span><span>Unique web visits (1–30)</span>
</div>
<div style="display:grid; grid-template-columns:1fr 70px 70px 1fr; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);">
<span>Total Time Spent on Website</span><span>Integer</span><span>890</span><span>Seconds on site (30–1200)</span>
</div>
<div style="display:grid; grid-template-columns:1fr 70px 70px 1fr; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);">
<span>Page Views Per Visit</span><span>Integer</span><span>12</span><span>Pages per session (1–20)</span>
</div>
<div style="display:grid; grid-template-columns:1fr 70px 70px 1fr; padding:0.5rem 1rem; font-size:0.88rem; color:var(--text-muted);">
<span>Lead Source</span><span>String</span><span>Referral</span><span>One of 7 channels</span>
</div>
</div>
</div>
""", unsafe_allow_html=True)
        st.markdown("<p style='color:var(--text-muted); margin-top:1.5rem;'><em>Demonstration preview:</em></p>", unsafe_allow_html=True)
        # Hard-coded illustrative scores; these are not model output.
        sample_data_demo = sample_data.copy()
        sample_data_demo.insert(0, "Score Tier", ["High Intent", "Low Intent", "High Intent", "Low Intent", "Evaluating"])
        sample_data_demo.insert(0, "Win Probability", ["87.2%", "12.4%", "94.1%", "6.8%", "61.3%"])
        st.dataframe(sample_data_demo, use_container_width=True)
| # ═══════════════════════════════════════════════════════════════════ | |
| # TAB 3: MODEL DEEP DIVE | |
| # ═══════════════════════════════════════════════════════════════════ | |
| with tab3: | |
| st.markdown('<div class="section-title">The Analytics Engine</div>', unsafe_allow_html=True) | |
| st.markdown(""" | |
| <p class="section-subtitle"> | |
| A transparent look at the model architecture, performance metrics, and design decisions. | |
| We believe every production ML system should be auditable — this section exists so you can verify, | |
| not just trust. | |
| </p> | |
| """, unsafe_allow_html=True) | |
| # ── Algorithm Selection ── | |
| st.markdown(""" | |
| <div class="card card-teal"> | |
| <h3>Algorithm Selection</h3> | |
| <p>We evaluated four model families on the same 300K-row dataset before selecting LightGBM for production deployment. | |
| The decision was driven by three criteria: <strong>predictive accuracy</strong> (AUC on held-out data), | |
| <strong>training efficiency</strong> (GPU utilization and wall-clock time), and <strong>explainability</strong> | |
| (compatibility with SHAP for per-prediction reasoning).</p> | |
| <div style="border:1px solid var(--border); border-radius:6px; overflow:hidden; margin-top:1rem;"> | |
| <div style="display:grid; grid-template-columns:140px 90px 1fr; background:var(--bg-main); padding:0.7rem 1rem; font-size:0.8rem; font-weight:600; text-transform:uppercase; letter-spacing:0.04em; border-bottom:2px solid var(--border);"> | |
| <span>Algorithm</span><span>Verdict</span><span>Rationale</span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:140px 90px 1fr; padding:0.75rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.9rem; color:var(--text-muted); align-items:start;"> | |
| <span><strong style="color:var(--teal);">LightGBM</strong></span> | |
| <span><span class="tag tag-teal">Deployed</span></span> | |
| <span>Leaf-wise tree growth with native CUDA GPU support. Trained 300K rows in seconds with early stopping. Natively handles categorical interactions and integrates cleanly with TreeSHAP for exact Shapley value computation.</span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:140px 90px 1fr; padding:0.75rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.9rem; color:var(--text-muted); align-items:start;"> | |
| <span><strong>XGBoost</strong></span> | |
| <span><span class="tag tag-warm">Rejected</span></span> | |
| <span>Depth-wise growth is more memory-intensive. On our dataset, XGBoost trained 3–4x slower with no measurable AUC improvement. The accuracy parity didn't justify the compute cost.</span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:140px 90px 1fr; padding:0.75rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.9rem; color:var(--text-muted); align-items:start;"> | |
| <span><strong>Logistic Reg.</strong></span> | |
| <span><span class="tag tag-accent">Failed</span></span> | |
| <span>Assumes linear feature-target relationships. Completely missed interaction effects — a visitor with 20 visits but 0 seconds on page is likely a bot, not a hot lead.</span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:140px 90px 1fr; padding:0.75rem 1rem; font-size:0.9rem; color:var(--text-muted); align-items:start;"> | |
| <span><strong>Deep Learning</strong></span> | |
| <span><span class="tag tag-muted">Overkill</span></span> | |
| <span>Neural networks require spatial/temporal locality to outperform trees. On flat tabular data with 10 features, DNNs overfit to training noise and failed to beat gradient boosted trees.</span> | |
| </div> | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # ── Model Performance ── | |
| perf_col1, perf_col2 = st.columns(2, gap="large") | |
| with perf_col1: | |
| st.markdown(""" | |
| <div class="card card-accent"> | |
| <h3>Performance Metrics</h3> | |
| <p>Validated against a stratified 60,000-row held-out test set (20% of total data, never seen during training).</p> | |
| <div class="stat-row"> | |
| <span class="stat-label">AUC-ROC</span> | |
| <span class="stat-value" style="color:var(--teal);">0.8302</span> | |
| </div> | |
| <div class="stat-row"> | |
| <span class="stat-label">Accuracy</span> | |
| <span class="stat-value">78.5%</span> | |
| </div> | |
| <div class="stat-row"> | |
| <span class="stat-label">Precision (Converters)</span> | |
| <span class="stat-value">78.0%</span> | |
| </div> | |
| <div class="stat-row"> | |
| <span class="stat-label">Recall (Converters)</span> | |
| <span class="stat-value">59.0%</span> | |
| </div> | |
| <div class="stat-row"> | |
| <span class="stat-label">F1 Score</span> | |
| <span class="stat-value">0.67</span> | |
| </div> | |
| <div class="stat-row"> | |
| <span class="stat-label">Training Iterations</span> | |
| <span class="stat-value">149 / 1,500</span> | |
| </div> | |
| <p style="font-size:0.82rem; margin-top:1rem; color:var(--text-light);"> | |
| Early stopping halted training at boosting iteration 149 (patience: 50 rounds) to prevent overfitting. | |
| The model converged to its best validation score quickly, confirming the signal-to-noise | |
| ratio in the feature set is strong. | |
| </p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with perf_col2: | |
| st.markdown(""" | |
| <div class="card"> | |
| <h3>Confusion Matrix</h3> | |
| <p>How the model's predictions map against ground truth on the 60K test set.</p> | |
| <div class="cm-grid"> | |
| <div></div> | |
| <div class="cm-header">Pred: No</div> | |
| <div class="cm-header">Pred: Yes</div> | |
| <div class="cm-header" style="writing-mode:vertical-lr; transform:rotate(180deg); padding:0.3rem;">Actual: No</div> | |
| <div class="cm-cell cm-tn">33,988<span class="cm-label">True Neg</span></div> | |
| <div class="cm-cell cm-fp">3,680<span class="cm-label">False Pos</span></div> | |
| <div class="cm-header" style="writing-mode:vertical-lr; transform:rotate(180deg); padding:0.3rem;">Actual: Yes</div> | |
| <div class="cm-cell cm-fn">9,235<span class="cm-label">False Neg</span></div> | |
| <div class="cm-cell cm-tp">13,097<span class="cm-label">True Pos</span></div> | |
| </div> | |
| <p style="font-size:0.85rem; margin-top:1.2rem;"> | |
| The model is <strong>conservative by design</strong> — it has higher precision (78%) than recall (59%) | |
| for converters. This means when it flags a lead as "High Intent," that signal is reliable. The trade-off | |
| is that some true converters land in the "Evaluating" bucket, where SDR follow-up catches them. | |
| </p> | |
| <p style="font-size:0.85rem;"> | |
| This bias is intentional for B2B sales: a false positive (wasting an AE's time on a cold lead) is more | |
| expensive than a false negative (a warm lead getting an SDR call instead of a direct AE route). | |
| </p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # ── Hyperparameters & Feature Signals ── | |
| hyper_col1, hyper_col2 = st.columns(2, gap="large") | |
| with hyper_col1: | |
| st.markdown(""" | |
| <div class="card"> | |
| <h3>Hyperparameters</h3> | |
| <p>The LightGBM classifier was tuned for a 300K-row, 10-feature tabular dataset with GPU acceleration.</p> | |
| <div class="code-block">LGBMClassifier( | |
| device = "gpu" | |
| n_estimators = 1,500 (with early stopping) | |
| learning_rate = 0.05 | |
| num_leaves = 63 | |
| max_depth = 10 | |
| min_child_samples = 50 | |
| subsample = 0.8 | |
| colsample_bytree = 0.8 | |
| reg_alpha = 0.1 (L1 regularization) | |
| reg_lambda = 1.0 (L2 regularization) | |
| random_state = 42 | |
| )</div> | |
| <p style="font-size:0.85rem; margin-top:0.5rem;"> | |
| <strong>Key design choices:</strong> A 63-leaf limit with a depth-10 cap balances expressiveness against overfitting. | |
| L1 + L2 regularization penalizes unnecessarily complex splits. 80% row and column subsampling adds stochasticity, | |
| reducing variance across boosting rounds. Early stopping with 50-round patience ensures the model stops | |
| learning before it starts memorizing. | |
| </p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with hyper_col2: | |
| st.markdown(""" | |
| <div class="card"> | |
| <h3>Feature Signal Strength</h3> | |
| <p>Pearson correlations between each numeric feature and the conversion target, measured across the full 300K dataset.</p> | |
| <div style="border:1px solid var(--border); border-radius:6px; overflow:hidden; margin:1rem 0;"> | |
| <div style="display:grid; grid-template-columns:1fr 100px 80px; background:var(--bg-main); padding:0.6rem 1rem; font-size:0.78rem; font-weight:600; text-transform:uppercase; letter-spacing:0.04em; border-bottom:2px solid var(--border);"> | |
| <span>Feature</span><span>Correlation</span><span>Strength</span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:1fr 100px 80px; padding:0.55rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);"> | |
| <span>Total Visits</span><span style="font-weight:600; color:var(--teal);">+0.498</span><span><span class="tag tag-teal">Strong</span></span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:1fr 100px 80px; padding:0.55rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);"> | |
| <span>Time on Website</span><span style="font-weight:600; color:var(--teal);">+0.460</span><span><span class="tag tag-teal">Strong</span></span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:1fr 100px 80px; padding:0.55rem 1rem; font-size:0.88rem; color:var(--text-muted);"> | |
| <span>Page Views / Visit</span><span style="font-weight:600; color:var(--warm);">+0.329</span><span><span class="tag tag-warm">Moderate</span></span> | |
| </div> | |
| </div> | |
| <h4>Channel Conversion Propensity</h4> | |
| <p>Logit coefficients assigned during data generation — representing each channel's inherent quality.</p> | |
| <div style="border:1px solid var(--border); border-radius:6px; overflow:hidden; margin:1rem 0;"> | |
| <div style="display:grid; grid-template-columns:1fr 90px 80px; background:var(--bg-main); padding:0.6rem 1rem; font-size:0.78rem; font-weight:600; text-transform:uppercase; letter-spacing:0.04em; border-bottom:2px solid var(--border);"> | |
| <span>Channel</span><span>Coeff.</span><span>Effect</span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:1fr 90px 80px; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);"> | |
| <span>Referral</span><span style="color:var(--teal); font-weight:600;">+1.30</span><span><span class="tag tag-teal">Strongest</span></span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:1fr 90px 80px; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);"> | |
| <span>Partner</span><span style="color:var(--teal); font-weight:600;">+0.90</span><span><span class="tag tag-teal">Strong</span></span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:1fr 90px 80px; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);"> | |
| <span>Google</span><span style="color:var(--warm); font-weight:600;">+0.25</span><span><span class="tag tag-warm">Moderate</span></span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:1fr 90px 80px; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);"> | |
| <span>Direct</span><span>0.00</span><span><span class="tag tag-muted">Neutral</span></span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:1fr 90px 80px; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);"> | |
| <span>Facebook</span><span style="color:var(--accent);">-0.25</span><span><span class="tag tag-accent">Weak</span></span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:1fr 90px 80px; padding:0.5rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.88rem; color:var(--text-muted);"> | |
| <span>Instagram</span><span style="color:var(--accent);">-0.60</span><span><span class="tag tag-accent">Negative</span></span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:1fr 90px 80px; padding:0.5rem 1rem; font-size:0.88rem; color:var(--text-muted);"> | |
| <span>Cold Call</span><span style="color:var(--accent); font-weight:600;">-1.00</span><span><span class="tag tag-accent">Weakest</span></span> | |
| </div> | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # ── Explainability & Routing ── | |
| explain_col1, explain_col2 = st.columns(2, gap="large") | |
| with explain_col1: | |
| st.markdown(""" | |
| <div class="card"> | |
| <h3>Why Explainability Matters</h3> | |
| <p>Traditional lead scoring is a black box. A model says "82%" and the sales team has no idea why. Was it the | |
| channel? The browsing depth? The time spent? Without that context, every follow-up is generic.</p> | |
| <p>We deploy <strong>TreeSHAP</strong> — a game-theoretic framework based on Shapley values from cooperative game theory. | |
| For every single prediction, TreeSHAP computes the exact marginal contribution of each feature to the final score.</p> | |
| <h4>What this means in practice</h4> | |
| <ul> | |
| <li><strong>For Sales:</strong> "This lead scored 87% primarily because of 14 return visits via the Referral channel." | |
| That's actionable — the AE knows to reference the referrer in their outreach.</li> | |
| <li><strong>For Marketing:</strong> Aggregate SHAP values across cohorts reveal which channels and behaviors | |
| consistently drive conversions, informing budget allocation.</li> | |
| <li><strong>For Compliance:</strong> Full audit trail. Every prediction can be decomposed into its constituent | |
| factors — no unexplainable decisions.</li> | |
| </ul> | |
| <p style="font-size:0.85rem; color:var(--text-light); margin-top:0.5rem;"> | |
| TreeSHAP runs in O(TLD²) time where T = trees, L = leaves, D = depth. For our 149-tree model with depth 10, | |
| explanation latency is under 50ms per prediction. | |
| </p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with explain_col2: | |
| st.markdown(""" | |
| <div class="card"> | |
| <h3>Intent Routing Protocol</h3> | |
| <p>The scoring engine classifies every lead into one of three intent tiers. Each tier maps to a specific | |
| go-to-market action, ensuring that sales resources are allocated proportionally to predicted value.</p> | |
| <div style="background:var(--accent-light); border:1px solid #F6D6CB; border-radius:6px; padding:1rem 1.25rem; margin:1rem 0;"> | |
| <strong style="color:var(--accent); font-size:0.9rem;">High Intent — P(conv) > 70%</strong> | |
| <p style="font-size:0.88rem; margin:0.4rem 0 0 0; color:var(--text-main);"> | |
| Bypass SDRs entirely. Route to senior Account Executives for personalized, white-glove outreach | |
| within 24 hours. These leads have demonstrated strong multi-signal buying behavior. | |
| </p> | |
| </div> | |
| <div style="background:var(--warm-light); border:1px solid #F5E8C3; border-radius:6px; padding:1rem 1.25rem; margin:1rem 0;"> | |
| <strong style="color:var(--warm); font-size:0.9rem;">Evaluating — P(conv) 40–70%</strong> | |
| <p style="font-size:0.88rem; margin:0.4rem 0 0 0; color:var(--text-main);"> | |
| Interested but undecided. Route to SDRs for structured follow-up: targeted case studies, product | |
| demos, and persistent multi-touch cadences over 2–3 weeks. | |
| </p> | |
| </div> | |
| <div style="background:#F4F4F4; border:1px solid #E4E3E0; border-radius:6px; padding:1rem 1.25rem; margin:1rem 0;"> | |
| <strong style="color:var(--text-muted); font-size:0.9rem;">Low Intent — P(conv) < 40%</strong> | |
| <p style="font-size:0.88rem; margin:0.4rem 0 0 0; color:var(--text-main);"> | |
| Do not allocate manual sales time. Enroll in automated newsletter sequences and long-term | |
| nurture campaigns. Re-score monthly — intent can change as needs evolve. | |
| </p> | |
| </div> | |
| <p style="font-size:0.85rem; color:var(--text-light); margin-top:0.5rem;"> | |
| This protocol ensures AEs spend 100% of their time on qualified opportunities, while no lead | |
| is ever abandoned — just routed to the appropriate engagement tier. | |
| </p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
# ── Feature Importance Images ──
def first_existing_file(candidates):
    """Return the first path in *candidates* that is a regular file.

    Args:
        candidates: Iterable of filesystem paths, checked in order.

    Returns:
        The first path for which ``os.path.isfile`` is true, or ``None``
        when no candidate qualifies (including an empty iterable).
    """
    # isfile (not exists) so a directory that happens to share the name is
    # skipped instead of being handed to open() later on.
    return next((path for path in candidates if os.path.isfile(path)), None)


# Each artifact is searched for relative to the current working directory:
# first under ./model, then under ../model.
fi_path = first_existing_file(
    ["model/feature_importance.png", "../model/feature_importance.png"]
)
shap_path = first_existing_file(
    ["model/shap_summary.png", "../model/shap_summary.png"]
)
def get_base64_image(image_path):
    """Read the file at *image_path* in binary mode and return its contents
    base64-encoded as a text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()
# Render the two-column image row only when at least one artifact was located.
if any((fi_path, shap_path)):
    st.markdown("<br>", unsafe_allow_html=True)
    img_col1, img_col2 = st.columns(2, gap="large")

    # Left column: split-count feature importance chart, inlined as a data URI.
    if fi_path:
        with img_col1:
            b64_fi = get_base64_image(fi_path)
            fi_card_html = f"""
<div class="card">
<h3 style="font-family:'Crimson Pro',serif; font-size:1.35rem; margin-bottom:0.5rem;">Feature Importance (Split Count)</h3>
<p style="color:var(--text-muted); font-size:0.88rem; margin-bottom:1rem;">Number of times each feature was used for a decision split across all 149 trees.</p>
<img src="data:image/png;base64,{b64_fi}" style="width:100%; border-radius:4px;">
</div>
"""
            st.markdown(fi_card_html, unsafe_allow_html=True)

    # Right column: SHAP beeswarm summary, inlined the same way.
    if shap_path:
        with img_col2:
            b64_shap = get_base64_image(shap_path)
            shap_card_html = f"""
<div class="card">
<h3 style="font-family:'Crimson Pro',serif; font-size:1.35rem; margin-bottom:0.5rem;">SHAP Summary (Beeswarm)</h3>
<p style="color:var(--text-muted); font-size:0.88rem; margin-bottom:1rem;">Distribution of SHAP values per feature across a 500-row sample. Color = feature value (red = high).</p>
<img src="data:image/png;base64,{b64_shap}" style="width:100%; border-radius:4px;">
</div>
"""
            st.markdown(shap_card_html, unsafe_allow_html=True)
| # ═══════════════════════════════════════════════════════════════════ | |
| # TAB 4: HOW IT WORKS (PIPELINE WALKTHROUGH) | |
| # ═══════════════════════════════════════════════════════════════════ | |
| with tab4: | |
| st.markdown('<div class="section-title">How It Works</div>', unsafe_allow_html=True) | |
| st.markdown(""" | |
| <p class="section-subtitle"> | |
| From raw data to actionable scores — a walkthrough of every stage in the ML pipeline. | |
| Each step is designed to be reproducible, auditable, and free of data leakage. | |
| </p> | |
| """, unsafe_allow_html=True) | |
| # ── Pipeline Steps ── | |
| st.markdown(""" | |
| <div class="card"> | |
| <h3>The Pipeline — End to End</h3> | |
| <div class="pipeline-step"> | |
| <div class="step-number" style="background:var(--accent);">1</div> | |
| <div class="step-content"> | |
| <h4>Synthetic Data Generation</h4> | |
| <p>300,000 B2B leads are generated using a statistically rigorous process. Each lead has 3 behavioral | |
| features (visits, dwell time, page views) and 1 categorical feature (acquisition channel). | |
| Conversion labels are generated using a sigmoid function with main effects, interaction terms, and | |
| controlled noise — producing realistic, non-trivial decision boundaries that require a non-linear | |
| model to learn.</p> | |
| </div> | |
| </div> | |
| <div class="pipeline-step"> | |
| <div class="step-number" style="background:var(--accent);">2</div> | |
| <div class="step-content"> | |
| <h4>Feature Engineering</h4> | |
| <p>The categorical "Lead Source" column is one-hot encoded into 7 binary features, producing a | |
| total of 10 model inputs. No feature scaling is applied — tree-based models are invariant to | |
| monotonic transformations, so normalization would add complexity without benefit. Feature names | |
| are serialized alongside the model to prevent column-mismatch errors at inference time.</p> | |
| </div> | |
| </div> | |
| <div class="pipeline-step"> | |
| <div class="step-number" style="background:var(--teal);">3</div> | |
| <div class="step-content"> | |
| <h4>Train / Test Split</h4> | |
| <p>An 80/20 stratified split produces 240,000 training rows and 60,000 test rows. Stratification | |
| ensures both sets maintain the same conversion rate (~37%). The test set is strictly held out — it | |
| is never used for hyperparameter tuning or feature selection, only for final evaluation.</p> | |
| </div> | |
| </div> | |
| <div class="pipeline-step"> | |
| <div class="step-number" style="background:var(--teal);">4</div> | |
| <div class="step-content"> | |
| <h4>GPU-Accelerated Training</h4> | |
| <p>LightGBM trains with CUDA GPU acceleration. The model is configured for up to 1,500 boosting rounds, | |
| but early stopping (patience = 50) halts training when the validation AUC stops improving. In practice, | |
| the model converges at iteration 149 — just 10% of the maximum budget. This aggressive early stop | |
| confirms the feature set carries strong, learnable signal.</p> | |
| </div> | |
| </div> | |
| <div class="pipeline-step"> | |
| <div class="step-number" style="background:var(--warm);">5</div> | |
| <div class="step-content"> | |
| <h4>Evaluation & Validation</h4> | |
| <p>The trained model is evaluated on the 60K held-out test set across multiple metrics: AUC-ROC (0.83), | |
| accuracy (78.5%), precision (78%), and recall (59%). The precision-recall trade-off is intentionally | |
| conservative — we accept lower recall to ensure high-intent predictions are trustworthy. A full | |
| confusion matrix and classification report are generated for auditability.</p> | |
| </div> | |
| </div> | |
| <div class="pipeline-step"> | |
| <div class="step-number" style="background:var(--warm);">6</div> | |
| <div class="step-content"> | |
| <h4>SHAP Explainability Layer</h4> | |
| <p>TreeSHAP is applied post-training to provide per-prediction feature attribution. For each scored lead, | |
| a waterfall chart decomposes the prediction into individual feature contributions. This transforms | |
| a single probability score into a narrative — "72% because of 14 visits and Referral source, | |
| partially offset by low page depth" — that sales teams can act on immediately.</p> | |
| </div> | |
| </div> | |
| <div class="pipeline-step" style="border-bottom:none;"> | |
| <div class="step-number" style="background:var(--text-main);">7</div> | |
| <div class="step-content"> | |
| <h4>Model Serialization & Deployment</h4> | |
| <p>The trained LightGBM classifier and its feature name list are bundled into a single pickle file | |
| using joblib. This bundle is loaded at app startup with Streamlit's cache_resource decorator — | |
| ensuring the model is loaded once and shared across all user sessions. The app supports both | |
| single-lead real-time scoring and batch CSV uploads with identical model inference.</p> | |
| </div> | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # ── Data Generation Details ── | |
| gen_col1, gen_col2 = st.columns(2, gap="large") | |
| with gen_col1: | |
| st.markdown(""" | |
| <div class="card card-accent"> | |
| <h3>Data Generation — The Math</h3> | |
| <p>Conversion labels aren't randomly assigned — they're generated from a sigmoid (logistic) function | |
| with carefully designed coefficients, ensuring the dataset has <em>learnable structure</em> that mirrors | |
| real-world B2B dynamics.</p> | |
| <h4>The Logit Function</h4> | |
| <div class="code-block">logit = -0.80 # base intercept | |
| + 0.80 * visits_z # main: visit frequency | |
| + 0.75 * time_z # main: engagement depth | |
| + 0.60 * pages_z # main: content breadth | |
| + 0.35 * visits_z * time_z # interaction | |
| + 0.25 * visits_z * pages_z # interaction | |
| + 0.20 * time_z * pages_z # interaction | |
| + source_bonus * 0.50 # channel quality | |
| + noise ~ N(0, 0.35) # controlled noise | |
| P(convert) = sigmoid(logit)</div> | |
| <p style="font-size:0.85rem;"> | |
| The interaction terms are critical — they ensure that <strong>combinations of features</strong> matter, | |
| not just individual values. A lead with high visits but zero time-on-site (possible bot behavior) | |
| gets a different score than one with moderate visits and deep engagement. | |
| </p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with gen_col2: | |
| st.markdown(""" | |
| <div class="card card-teal"> | |
| <h3>Feature Distributions</h3> | |
| <p>Each numeric feature is generated from a gamma distribution (right-skewed), reflecting | |
| the typical long-tail patterns in real web analytics data. Most visitors have few visits; | |
| a small fraction are highly engaged.</p> | |
| <div style="border:1px solid var(--border); border-radius:6px; overflow:hidden; margin:1rem 0;"> | |
| <div style="display:grid; grid-template-columns:1fr 1fr 80px 80px; background:var(--bg-main); padding:0.6rem 1rem; font-size:0.78rem; font-weight:600; text-transform:uppercase; letter-spacing:0.04em; border-bottom:2px solid var(--border);"> | |
| <span>Feature</span><span>Distribution</span><span>Range</span><span>Corr.</span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:1fr 1fr 80px 80px; padding:0.55rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.85rem; color:var(--text-muted);"> | |
| <span>TotalVisits</span><span>Gamma(3.0, 2.5)</span><span>1 – 30</span><span style="font-weight:600; color:var(--teal);">+0.498</span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:1fr 1fr 80px 80px; padding:0.55rem 1rem; border-bottom:1px solid var(--border-light); font-size:0.85rem; color:var(--text-muted);"> | |
| <span>Time on Site</span><span>visits×18 + Gamma</span><span>30 – 1200s</span><span style="font-weight:600; color:var(--teal);">+0.460</span> | |
| </div> | |
| <div style="display:grid; grid-template-columns:1fr 1fr 80px 80px; padding:0.55rem 1rem; font-size:0.85rem; color:var(--text-muted);"> | |
| <span>Page Views</span><span>visits×0.25 + Gamma</span><span>1 – 20</span><span style="font-weight:600; color:var(--warm);">+0.329</span> | |
| </div> | |
| </div> | |
| <h4>Why Gamma?</h4> | |
| <p style="font-size:0.88rem;"> | |
| Real web traffic is never normally distributed — it's always right-skewed. Most visitors browse briefly; | |
| a small but important tail spends significant time. Gamma distributions naturally model this shape, | |
| producing more realistic synthetic data than a simple uniform or Gaussian would. | |
| </p> | |
| <h4>Inter-Feature Correlations</h4> | |
| <p style="font-size:0.88rem;"> | |
| Time-on-site and page views are partially derived from visit count (with independent gamma noise), | |
| creating <strong>moderate multicollinearity</strong> — exactly what you'd see in real web analytics. | |
| The model must learn to separate the shared signal from the unique information each feature carries. | |
| </p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # ── Tech Stack ── | |
| st.markdown(""" | |
| <div class="card"> | |
| <h3>Technology Stack</h3> | |
| <div style="display:flex; flex-wrap:wrap; gap:2rem; margin-top:0.5rem;"> | |
| <div style="flex:1; min-width:200px;"> | |
| <h4 style="color:var(--teal); margin-top:0;">ML & Data</h4> | |
| <ul> | |
| <li><strong>LightGBM 4.0+</strong> — Gradient boosted decision trees with GPU</li> | |
| <li><strong>SHAP 0.44+</strong> — TreeSHAP for per-prediction explanations</li> | |
| <li><strong>scikit-learn 1.3+</strong> — Train/test splits, metrics, preprocessing</li> | |
| <li><strong>Pandas / NumPy</strong> — Data manipulation and numerical ops</li> | |
| </ul> | |
| </div> | |
| <div style="flex:1; min-width:200px;"> | |
| <h4 style="color:var(--accent); margin-top:0;">Application</h4> | |
| <ul> | |
| <li><strong>Streamlit 1.32+</strong> — Interactive web application framework</li> | |
| <li><strong>Matplotlib</strong> — SHAP waterfall visualizations</li> | |
| <li><strong>joblib</strong> — Model serialization with compression</li> | |
| <li><strong>Faker</strong> — Realistic synthetic data generation</li> | |
| </ul> | |
| </div> | |
| <div style="flex:1; min-width:200px;"> | |
| <h4 style="margin-top:0;">Infrastructure</h4> | |
| <ul> | |
| <li><strong>CUDA GPU</strong> — Hardware-accelerated model training</li> | |
| <li><strong>Python 3.10+</strong> — Runtime environment</li> | |
| <li><strong>Git</strong> — Version control for code and pipeline artifacts</li> | |
| <li><strong>Streamlit Cache</strong> — Model loaded once, shared across sessions</li> | |
| </ul> | |
| </div> | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |