uncertainrods commited on
Commit
e1283b0
·
0 Parent(s):

deploy: initial push with LFS-tracked models

Browse files
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.joblib filter=lfs diff=lfs merge=lfs -text
Cleaned_OutreachAI_Dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
Dockerfile ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ── Stage 1: Builder ─────────────────────────────────────────────
# Use slim Python 3.11 to keep the image lean
FROM python:3.11-slim AS builder

# Install system-level build deps needed by shap/scipy/numpy
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /build

# Copy only the requirements first so Docker can cache this layer
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# ── Stage 2: Runtime ─────────────────────────────────────────────
# Fresh slim base: the gcc/g++ toolchain from the builder stage is not shipped.
FROM python:3.11-slim AS runtime

# libgomp1 is required at runtime by XGBoost/SHAP for OpenMP
RUN apt-get update && apt-get install -y --no-install-recommends libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Copy installed packages from builder (site-packages plus console scripts
# such as the `uvicorn` entry point)
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin

WORKDIR /app

# Copy application source — single flat structure for HF Spaces
COPY main.py .
COPY requirements.txt .

# Models are expected in a flat /app/models directory
COPY models/ ./models/

# Dataset for buyer_id lookups (read once at startup by main.py)
COPY Cleaned_OutreachAI_Dataset.csv ./data/Cleaned_OutreachAI_Dataset.csv

# HF Spaces requires port 7860
EXPOSE 7860

# Health-check so HF Spaces can detect a ready container.
# start-period is generous because model deserialization at startup can be slow.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health')"

# Start the FastAPI server on HF's required port
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ByteMe LOC8A2 — Agentic ML Microservice
3
+ emoji: 🤖
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ license: mit
10
+ ---
11
+
12
+ # ByteMe LOC8A2 — Agentic ML Microservice
13
+
14
+ A production-ready FastAPI microservice providing **ML-powered lead scoring, firmographic segmentation, channel recommendation, and deep SHAP explainability** for B2B outreach campaigns.
15
+
16
+ ## Endpoints
17
+
18
+ | Method | Endpoint | Description |
19
+ |--------|----------|-------------|
20
+ | `GET` | `/health` | Service health and model readiness check |
21
+ | `POST` | `/predict/profile` | Full ML inference + explainability for a `buyer_id` |
22
+ | `GET` | `/explain/params` | Dictionary of all parameter definitions |
23
+ | `GET` | `/explain/narrative/{buyer_id}` | Plain-text narrative + chart-ready arrays |
24
+
25
+ ## Models
26
+
27
+ | Model | Algorithm | Purpose |
28
+ |-------|-----------|---------|
29
+ | Propensity Scorer | XGBoost | Predicts conversion likelihood (score 75–98) |
30
+ | Firmographic Segmenter | KMeans (k=3) | Classifies into Startup / Mid-Market / Enterprise |
31
+ | Channel Predictor | Random Forest | Recommends outreach channel (LinkedIn / Email / Direct Call / WhatsApp) |
main.py ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from typing import Dict, Any, List
4
+ import joblib
5
+ import os
6
+ import pandas as pd
7
+ import numpy as np
8
+ import shap
9
+
10
# FastAPI application object; served by uvicorn as `main:app`.
app = FastAPI(title="Agentic ML Microservice", version="2.0.0")

# --- Globals for Model Loading ---
# Populated once by the startup hook below; None means "not loaded yet",
# which the /health endpoint surfaces as HTTP 503.
prop_model = None       # XGBoost propensity classifier
firmo_model = None      # KMeans firmographic segmenter
chan_model = None       # Random Forest channel recommender
shap_explainer = None   # shap.TreeExplainer built around prop_model
df_db = None            # in-memory pandas lookup table keyed by 'buyer_id'

# --- Constants ---
# Channel names indexed by the channel model's predicted class id.
CHANNELS = ["LinkedIn", "Email", "Direct Call", "WhatsApp"]
# KMeans cluster id -> human-readable firmographic tier.
SEGMENT_MAP = {0: "Startup", 1: "Mid-Market", 2: "Enterprise"}
# Firmographic tier -> recommended messaging tone.
TONE_MAP = {
    "Enterprise": "Formal and value-driven",
    "Startup": "Casual and high-energy",
    "Mid-Market": "Professional and balanced",
}
# Top SHAP driver feature -> personalization constraint sentence for the agent.
CONSTRAINT_MAP = {
    "Job_Promotion_Flag": "Congratulate them on their new role priorities.",
    "Hiring_Increase_Flag": "Mention their recent hiring spree.",
    "Revenue_Growth_Score": "Acknowledge their strong financial trajectory.",
    "Clay_Intent_Signal": "Reference their third-party intent signals.",
    "Apollo_Engagement_Score": "Highlight their active engagement score.",
    "Composite_Growth_Signal": "Highlight their explosive scaling metrics (hiring + revenue).",
}
# Model feature name -> display name used in explainability payloads.
FEATURE_PLAIN_NAMES = {
    "Job_Promotion_Flag": "Recent Job Promotion",
    "Hiring_Increase_Flag": "Active Hiring",
    "Revenue_Growth_Score": "Revenue Growth",
    "Clay_Intent_Signal": "Third-Party Intent (Clay)",
    "Apollo_Engagement_Score": "Engagement Score (Apollo)",
    "Composite_Growth_Signal": "Growth Momentum (Hiring × Revenue)",
}
43
+
44
# --- Pydantic Schemas ---
class BuyerRequest(BaseModel):
    """Request body for POST /predict/profile: identifies the lead to score."""

    # Matched against the 'buyer_id' column of the loaded CSV lookup table.
    buyer_id: str
47
+
48
+
49
@app.on_event("startup")
def load_models():
    """Loads models and the local dataset into memory when the FastAPI server initializes."""
    global prop_model, firmo_model, chan_model, shap_explainer, df_db
    # In HF Spaces Docker, models are at /app/models/ and data at /app/data/
    models_dir = os.path.join(os.path.dirname(__file__), 'models')
    data_path = os.path.join(os.path.dirname(__file__), 'data', 'Cleaned_OutreachAI_Dataset.csv')

    try:
        prop_model = joblib.load(os.path.join(models_dir, 'propensity_xgb.joblib'))
        firmo_model = joblib.load(os.path.join(models_dir, 'firmographics_kmeans.joblib'))
        chan_model = joblib.load(os.path.join(models_dir, 'channel_rf.joblib'))
        # Built once here so per-request SHAP computations reuse one explainer.
        shap_explainer = shap.TreeExplainer(prop_model)

        # Load the CSV purely as an in-memory lookup table
        df_db = pd.read_csv(data_path)
        print("Successfully loaded stateful models and CSV Data into memory.")
    except Exception as e:
        # Deliberate best-effort: swallow the error so the app still starts and
        # /health can report 503, instead of the container crash-looping.
        print(f"Warning: Failed to load models or data. Ensure train_models.py has been run. Error: {e}")
68
+
69
+
70
@app.get("/health")
def health_check():
    """Readiness probe: 503 until the models and the lookup CSV are loaded."""
    ready = prop_model is not None and df_db is not None
    if not ready:
        raise HTTPException(status_code=503, detail="Models or Database not loaded")
    return {"status": "ok", "message": "API and Models are fully operational."}
75
+
76
+
77
+ # ================================================================
78
+ # EXPLAINABILITY ENGINE
79
+ # ================================================================
80
+
81
def build_propensity_explainability(prop_arr: pd.DataFrame, raw_prob: float, lead_score: int) -> dict:
    """
    Full SHAP waterfall: per-feature contributions with direction, magnitude,
    impact percentages, and an auto-generated narrative.

    Args:
        prop_arr: single-row DataFrame holding the six propensity features.
        raw_prob: raw positive-class probability from the XGBoost model.
        lead_score: raw_prob already scaled onto the 75-98 display curve.

    Returns:
        dict with instance-level SHAP contributions, model-level feature
        weights, the explainer base value, and a human-readable narrative.
    """
    # Get signed SHAP values (not absolute) for direction detection
    raw_shap = shap_explainer.shap_values(prop_arr)[0]
    abs_shap = np.abs(raw_shap)
    # Guard against an all-zero SHAP row so impact_pct never divides by zero.
    total_impact = float(abs_shap.sum()) if abs_shap.sum() > 0 else 1.0

    # Build feature contributions sorted by descending magnitude
    feature_contributions = []
    for idx in np.argsort(abs_shap)[::-1]:
        fname = prop_arr.columns[idx]
        sval = float(raw_shap[idx])
        feature_contributions.append({
            "feature": fname,
            "display_name": FEATURE_PLAIN_NAMES.get(fname, fname),
            "feature_value": float(prop_arr.iloc[0, idx]),
            "shap_value": round(sval, 4),
            "direction": "positive" if sval >= 0 else "negative",
            "impact_pct": round((abs_shap[idx] / total_impact) * 100, 1),
        })

    # Base value (expected model output) from the SHAP explainer.
    # BUG FIX: the previous chained conditional tested `__float__` first, but
    # numpy arrays define __float__ and raise TypeError for size > 1, so an
    # array-valued expected_value crashed. Check dimensionality instead.
    ev = shap_explainer.expected_value
    if np.ndim(ev) == 0:
        base_value = float(ev)      # scalar expected value
    elif len(ev) > 1:
        base_value = float(ev[1])   # per-class array: take the positive class
    else:
        base_value = float(ev[0])   # one-element array

    # Auto-generate narrative from top 2 drivers
    top1 = feature_contributions[0]
    top2 = feature_contributions[1] if len(feature_contributions) > 1 else None
    narrative = f"Lead score of {lead_score} driven primarily by {top1['display_name']} ({top1['impact_pct']}% impact, {top1['direction']})."
    if top2 and top2['impact_pct'] > 10:
        narrative += f" {top2['display_name']} reinforces this signal ({top2['impact_pct']}% impact)."

    # XGBoost feature importances (model-level, not instance-level)
    model_importances = dict(zip(prop_arr.columns, prop_model.feature_importances_))
    total_imp = sum(model_importances.values()) or 1.0
    model_weight_pct = {k: round((v / total_imp) * 100, 1) for k, v in sorted(model_importances.items(), key=lambda x: -x[1])}

    return {
        "raw_probability": round(raw_prob, 4),
        "scaled_lead_score": lead_score,
        "scaling_formula": "75 + (raw_probability × 23)",
        "qualification_threshold": 80,
        "base_value": round(base_value, 4),
        "feature_contributions": feature_contributions,
        "model_level_feature_weights": model_weight_pct,
        "narrative": narrative,
    }
130
+
131
+
132
def build_segmentation_explainability(firmo_arr: pd.DataFrame, cluster_idx: int, segmentation_class: str) -> dict:
    """
    Explain a KMeans segment assignment: distance to every centroid,
    centroid profiles for context, a confidence ratio, and a narrative.
    """
    centroids = firmo_model.cluster_centers_
    point = firmo_arr.values[0]

    # Euclidean distance from the input company to each cluster centre.
    distances = {
        SEGMENT_MAP.get(cid, f"Cluster_{cid}"): round(float(np.linalg.norm(point - c)), 1)
        for cid, c in enumerate(centroids)
    }

    # Confidence as an inverse-distance share: the nearer the winning
    # centroid relative to all centroids, the higher the value.
    dist_values = list(distances.values())
    nearest = min(dist_values) if min(dist_values) > 0 else 0.001
    inv_total = sum(1.0 / d if d > 0 else 1000.0 for d in dist_values)
    confidence = round((1.0 / nearest) / inv_total, 3) if inv_total > 0 else 0.5

    # Centroid coordinates, exposed for charting context.
    centroid_profiles = {
        SEGMENT_MAP.get(cid, f"Cluster_{cid}"): {
            "avg_revenue_usd": round(float(c[0]), 0),
            "avg_headcount": round(float(c[1]), 0),
        }
        for cid, c in enumerate(centroids)
    }

    revenue = float(firmo_arr.iloc[0]['Revenue_Size_USD'])
    headcount = float(firmo_arr.iloc[0]['Headcount_Size'])

    narrative = (
        f"Classified as {segmentation_class} (closest centroid, distance: {distances[segmentation_class]:,.0f}). "
        f"Revenue of ${revenue:,.0f} and headcount of {headcount:,.0f} place this company in the {segmentation_class} tier."
    )

    # Mention the runner-up tier and how far away its boundary lies.
    by_distance = sorted(distances.items(), key=lambda item: item[1])
    if len(by_distance) > 1:
        runner_up_label, runner_up_dist = by_distance[1]
        gap = round(runner_up_dist - by_distance[0][1], 1)
        narrative += f" Next closest tier: {runner_up_label} (gap: {gap:,.0f} units)."

    return {
        "assigned_cluster": segmentation_class,
        "revenue_usd": revenue,
        "headcount": headcount,
        "centroid_distances": distances,
        "centroid_profiles": centroid_profiles,
        "cluster_confidence": confidence,
        "narrative": narrative,
    }
185
+
186
+
187
def build_channel_explainability(chan_arr: pd.DataFrame, channel_probs: np.ndarray, recommended_channel: str) -> dict:
    """
    Explain the channel recommendation: per-channel probabilities, the
    Random Forest's strongest feature signals, and a short narrative.
    """
    # Probability per channel label, aligned with the model's class order.
    prob_spread = {
        ch: round(float(channel_probs[i]), 3)
        for i, ch in enumerate(CHANNELS)
        if i < len(channel_probs)
    }

    # Global feature importances of the channel model, strongest first.
    importance_pairs = sorted(
        zip(chan_arr.columns, chan_model.feature_importances_),
        key=lambda pair: -pair[1],
    )
    top_drivers = [
        {"feature": name, "importance": round(float(weight), 3)}
        for name, weight in importance_pairs[:5]
    ]

    # Narrative ingredients: winner, strongest signal, runner-up channel.
    strongest = importance_pairs[0][0].replace("_", " ") if importance_pairs else "unknown"
    winner_prob = prob_spread.get(recommended_channel, 0)
    ranked = sorted(prob_spread.items(), key=lambda pair: -pair[1])
    runner_up = ranked[1] if len(ranked) > 1 else ("N/A", 0)

    narrative = (
        f"{recommended_channel} recommended with {winner_prob*100:.0f}% probability. "
        f"{strongest} is the strongest channel signal. "
        f"Runner-up: {runner_up[0]} ({runner_up[1]*100:.0f}%)."
    )

    # Snapshot of the exact feature values the model saw for this lead.
    feature_snapshot = {col: float(chan_arr.iloc[0][col]) for col in chan_arr.columns}

    return {
        "recommended": recommended_channel,
        "probability_spread": prob_spread,
        "top_channel_drivers": top_drivers,
        "feature_values_used": feature_snapshot,
        "narrative": narrative,
    }
231
+
232
+
233
def build_composite_signals(
    profile, composite_growth: float, lead_score: int,
    model_conf: float, chan_arr: pd.DataFrame
) -> dict:
    """
    Aggregate high-level signals across all three models into simple
    HIGH/MODERATE/LOW indices plus a 0-100 outreach-readiness percentage.
    """
    # Growth momentum, banded from the engineered Composite_Growth_Signal.
    if composite_growth >= 0.7:
        growth_momentum = "HIGH"
    elif composite_growth >= 0.3:
        growth_momentum = "MODERATE"
    else:
        growth_momentum = "LOW"

    # Engagement intensity: mean of whichever behaviour columns are present.
    behaviour_cols = ['LinkedIn_Active', 'LinkedIn_Post_Engagement', 'Cold_Call_Response']
    present = [float(chan_arr.iloc[0].get(c, 0)) for c in behaviour_cols if c in chan_arr.columns]
    engagement_avg = np.mean(present) if present else 0
    if engagement_avg >= 0.6:
        engagement_intensity = "HIGH"
    elif engagement_avg >= 0.3:
        engagement_intensity = "MODERATE"
    else:
        engagement_intensity = "LOW"

    # Data quality: fraction of key profile fields that are non-null and non-zero.
    key_fields = [
        'job_promotion_flag', 'hiring_increase_flag', 'revenue_growth_score',
        'clay_intent_signal', 'apollo_engagement_score', 'revenue_size_usd',
        'headcount_size', 'linkedin_active', 'email_open_rate', 'cold_call_response'
    ]
    filled = sum(1 for f in key_fields if pd.notna(profile.get(f)) and float(profile.get(f, 0)) != 0)
    data_quality = round(filled / len(key_fields), 2)

    # Distance from the 80-point qualification threshold, as display text.
    margin = lead_score - 80
    if margin > 0:
        margin_text = f"+{margin} points above threshold"
    elif margin == 0:
        margin_text = "Exactly at threshold"
    else:
        margin_text = f"{margin} points below threshold"

    # Overall readiness: banded sub-scores (3/2/1 per dimension, 2/1 for
    # confidence) summed and normalised against the maximum total of 11.
    momentum_points = {"HIGH": 3, "MODERATE": 2}.get(growth_momentum, 1)
    engagement_points = {"HIGH": 3, "MODERATE": 2}.get(engagement_intensity, 1)
    if lead_score >= 85:
        score_points = 3
    elif lead_score >= 80:
        score_points = 2
    else:
        score_points = 1
    confidence_points = 2 if model_conf >= 0.6 else 1

    readiness_score = momentum_points + engagement_points + score_points + confidence_points
    readiness_pct = round((readiness_score / 11) * 100)

    return {
        "growth_momentum": growth_momentum,
        "composite_growth_value": round(composite_growth, 3),
        "engagement_intensity": engagement_intensity,
        "engagement_average": round(engagement_avg, 3),
        "data_quality_score": data_quality,
        "qualification_margin": margin_text,
        "outreach_readiness_pct": readiness_pct,
    }
303
+
304
+
305
+ # ================================================================
306
+ # EXPLAINABILITY UTILITY ENDPOINTS
307
+ # ================================================================
308
+
309
@app.get("/explain/params")
def explain_parameters() -> Dict[str, Any]:
    """
    Returns a dictionary of all explainability parameters, features, and model outputs
    used in the pipeline with human-readable definitions.

    Static content only — no model or database access, so it always responds.
    """
    # Inputs and outputs of the XGBoost propensity model.
    propensity_features = {
        "Job_Promotion_Flag": "Binary (0/1): Has a key decision-maker recently been promoted? Indicates openness to new tooling.",
        "Hiring_Increase_Flag": "Binary (0/1): Is the company actively hiring? Strongest signal of capital deployment and growth.",
        "Revenue_Growth_Score": "Float (0-1): How fast is revenue growing? Indicator of available budget.",
        "Clay_Intent_Signal": "Float (0-1): Third-party buying intent from Clay. Indicates active market research.",
        "Apollo_Engagement_Score": "Float (0-1): Engagement score from Apollo indicating general market activity.",
        "Composite_Growth_Signal": "Engineered Value: Hiring_Increase_Flag × Revenue_Growth_Score. Amplifies profiles that are both hiring AND growing."
    }
    propensity_outputs = {
        "raw_probability": "Raw ML output (0.0 to 1.0) directly from the XGBoost model.",
        "scaled_lead_score": "Score scaled to a 75-98 curve for natural human interpretation.",
        "is_qualified": "Boolean threshold: true if scaled_lead_score >= 80.",
        "shap_value": "The exact magnitude of impact a specific feature had on pulling the score up (positive) or down (negative)."
    }
    # Inputs and outputs of the KMeans firmographic segmenter.
    firmographic_features = {
        "Revenue_Size_USD": "Annual revenue in USD.",
        "Headcount_Size": "Total number of employees."
    }
    firmographic_outputs = {
        "segmentation_class": "KMeans cluster mapped to 'Startup', 'Mid-Market', or 'Enterprise'.",
        "centroid_distances": "Euclidean distance to the center of each of the 3 cluster tiers identifying how close they are to boundaries."
    }
    # Inputs and outputs of the Random Forest channel recommender.
    channel_features = {
        "historical_engagement": "Matrix across 10 datapoints including LinkedIn views, email open rates, and previous call responses."
    }
    channel_outputs = {
        "recommended_channel": "The specific communication medium with the highest predicted response probability.",
        "probability_spread": "The exact baseline probability the model predicts across all 4 channels.",
        "model_confidence": "The maximum probability value achieved across all available channels."
    }
    # Cross-model aggregate indices.
    composite_signals = {
        "growth_momentum": "HIGH/MODERATE/LOW index derived from the multiplier of internal growth constraints.",
        "engagement_intensity": "HIGH/MODERATE/LOW average computed across all historical interaction modes.",
        "data_quality_score": "Float (0-1): Ratio representing how complete the lead's profile data was prior to inference."
    }

    return {
        "propensity_features": propensity_features,
        "propensity_outputs": propensity_outputs,
        "firmographic_features": firmographic_features,
        "firmographic_outputs": firmographic_outputs,
        "channel_features": channel_features,
        "channel_outputs": channel_outputs,
        "composite_signals": composite_signals,
    }
352
+
353
+
354
@app.get("/explain/narrative/{buyer_id}")
def explain_narrative(buyer_id: str) -> Dict[str, Any]:
    """
    Executes inference internally but ONLY returns the clean, human-readable narrative
    strings explaining exactly why the outputs were generated.

    Raises:
        HTTPException 503: models/CSV not loaded (startup failed or in progress).
        HTTPException 404: buyer_id not present in the CSV lookup table.
    """
    global prop_model, firmo_model, chan_model, df_db

    if prop_model is None or df_db is None:
        raise HTTPException(status_code=503, detail="Models or Database not loaded")

    # DB Lookup
    buyer = df_db[df_db['buyer_id'] == buyer_id]
    if buyer.empty:
        raise HTTPException(status_code=404, detail=f"Buyer ID '{buyer_id}' not found in database")

    # NaNs are coerced to 0 so the float() conversions below cannot fail.
    profile = buyer.iloc[0].fillna(0)

    # 1. Propensity — NOTE(review): this feature frame duplicates the one in
    # predict_profile; keep the two in sync if columns change.
    composite_growth = float(profile.get('hiring_increase_flag', 0)) * float(profile.get('revenue_growth_score', 0))
    prop_arr = pd.DataFrame([{
        'Job_Promotion_Flag': float(profile.get('job_promotion_flag', 0)),
        'Hiring_Increase_Flag': float(profile.get('hiring_increase_flag', 0)),
        'Revenue_Growth_Score': float(profile.get('revenue_growth_score', 0)),
        'Clay_Intent_Signal': float(profile.get('clay_intent_signal', 0)),
        'Apollo_Engagement_Score': float(profile.get('apollo_engagement_score', 0)),
        'Composite_Growth_Signal': composite_growth,
    }])

    raw_prob = float(prop_model.predict_proba(prop_arr)[0, 1])
    # Scale the raw probability onto the 75-98 display curve.
    lead_score = int(round(75 + (raw_prob * 23)))

    prop_expl = build_propensity_explainability(prop_arr, raw_prob, lead_score)

    # 2. Firmographics
    firmo_arr = pd.DataFrame([{
        'Revenue_Size_USD': float(profile.get('revenue_size_usd', 0)),
        'Headcount_Size': float(profile.get('headcount_size', 0)),
    }])
    cluster_idx = int(firmo_model.predict(firmo_arr)[0])
    segmentation_class = SEGMENT_MAP.get(cluster_idx, "Unknown")

    firmo_expl = build_segmentation_explainability(firmo_arr, cluster_idx, segmentation_class)

    # 3. Channel — string-ish CSV columns ('yes'/'past'/…) normalized to 1.0/0.0.
    chan_arr = pd.DataFrame([{
        'LinkedIn_Active': float(profile.get('linkedin_active', 0)),
        'LinkedIn_Post_Engagement': float(profile.get('linkedin_post_engagement', 0)),
        'LinkedIn_Profile_Views': float(profile.get('linkedin_profile_views', 0)),
        'Email_Verified': 1.0 if str(profile.get('email_verified', '0')).lower() in ['yes', '1', '1.0'] else 0.0,
        'Email_Open_Rate': float(profile.get('email_open_rate', 0)),
        'Email_Reply_History': 1.0 if str(profile.get('email_reply_history', '0')).lower() in ['past', '1', '1.0'] else 0.0,
        'Cold_Call_Response': float(profile.get('cold_call_response', 0)),
        'WhatsApp_Verified': float(profile.get('whatsapp_verified', 0)),
        'SMS_Verified': float(profile.get('sms_verified', 0)),
        'Previous_Channel_Response': 1.0 if pd.notna(profile.get('previous_channel_response')) else 0.0,
    }])

    channel_probs = chan_model.predict_proba(chan_arr)[0]
    best_idx = int(np.argmax(channel_probs))
    # min() clamps the index in case the model has more classes than CHANNELS.
    recommended_channel = CHANNELS[min(best_idx, 3)]

    chan_expl = build_channel_explainability(chan_arr, channel_probs, recommended_channel)

    # Return pure narratives + charting data
    return {
        "buyer_id": buyer_id,
        "narratives": {
            "propensity": prop_expl["narrative"],
            "segmentation": firmo_expl["narrative"],
            "channel": chan_expl["narrative"]
        },
        "chart_data": {
            "propensity_waterfall_chart": {
                "base_value": prop_expl["base_value"],
                "features": [f["display_name"] for f in prop_expl["feature_contributions"]],
                "shap_values": [f["shap_value"] for f in prop_expl["feature_contributions"]],
                "actual_values": [f["feature_value"] for f in prop_expl["feature_contributions"]]
            },
            "firmographic_scatter_chart": {
                "x_axis": "Revenue_Size_USD",
                "y_axis": "Headcount_Size",
                "company_position": {"x": firmo_expl["revenue_usd"], "y": firmo_expl["headcount"]},
                "cluster_centroids": [
                    {
                        "cluster": k,
                        "x": v["avg_revenue_usd"],
                        "y": v["avg_headcount"],
                        "distance_from_company": firmo_expl["centroid_distances"].get(k)
                    }
                    for k, v in firmo_expl["centroid_profiles"].items()
                ]
            },
            "channel_radar_chart": {
                "labels": list(chan_expl["probability_spread"].keys()),
                "probabilities": list(chan_expl["probability_spread"].values())
            }
        }
    }
453
+
454
+
455
+ # ================================================================
456
+ # MAIN INFERENCE ENDPOINT
457
+ # ================================================================
458
+
459
@app.post("/predict/profile")
def predict_profile(request: BuyerRequest) -> Dict[str, Any]:
    """
    Retrieves lead data by buyer_id, executes ML inference across all 3 models,
    and returns a comprehensive explainability report alongside the predictions.

    Raises:
        HTTPException 503: models/CSV not loaded (startup failed or in progress).
        HTTPException 404: buyer_id not present in the CSV lookup table.
    """
    global prop_model, firmo_model, chan_model, shap_explainer, df_db

    if prop_model is None or df_db is None:
        raise HTTPException(status_code=503, detail="Models or Database not loaded")

    # 0. Database Lookup
    buyer = df_db[df_db['buyer_id'] == request.buyer_id]
    if buyer.empty:
        raise HTTPException(status_code=404, detail=f"Buyer ID '{request.buyer_id}' not found in database")

    # NaNs are coerced to 0 so the float() conversions below cannot fail.
    profile = buyer.iloc[0].fillna(0)

    # ── 1. PROPENSITY ENGINE ──────────────────────────────────
    # Engineered feature: hiring flag × revenue growth (see /explain/params).
    composite_growth = float(profile.get('hiring_increase_flag', 0)) * float(profile.get('revenue_growth_score', 0))
    prop_arr = pd.DataFrame([{
        'Job_Promotion_Flag': float(profile.get('job_promotion_flag', 0)),
        'Hiring_Increase_Flag': float(profile.get('hiring_increase_flag', 0)),
        'Revenue_Growth_Score': float(profile.get('revenue_growth_score', 0)),
        'Clay_Intent_Signal': float(profile.get('clay_intent_signal', 0)),
        'Apollo_Engagement_Score': float(profile.get('apollo_engagement_score', 0)),
        'Composite_Growth_Signal': composite_growth,
    }])

    raw_prob = float(prop_model.predict_proba(prop_arr)[0, 1])
    # Scale raw probability onto the 75-98 display curve; qualified at >= 80.
    lead_score = int(round(75 + (raw_prob * 23)))
    is_qualified = lead_score >= 80

    # SHAP primary driver (for backward compatibility)
    shap_vals_abs = np.abs(shap_explainer.shap_values(prop_arr)[0])
    top_idx = int(np.argsort(shap_vals_abs)[-1])
    p_driver = str(prop_arr.columns[top_idx])
    langgraph_constraint = CONSTRAINT_MAP.get(p_driver, f"Acknowledge their {p_driver}")

    # ── 2. FIRMOGRAPHICS ENGINE ───────────────────────────────
    firmo_arr = pd.DataFrame([{
        'Revenue_Size_USD': float(profile.get('revenue_size_usd', 0)),
        'Headcount_Size': float(profile.get('headcount_size', 0)),
    }])
    cluster_idx = int(firmo_model.predict(firmo_arr)[0])
    segmentation_class = SEGMENT_MAP.get(cluster_idx, "Unknown")
    tone = TONE_MAP.get(segmentation_class, "Professional and balanced")

    # ── 3. CHANNEL ENGINE ─────────────────────────────────────
    # String-ish CSV columns ('yes'/'past'/…) are normalized to 1.0/0.0 flags.
    chan_arr = pd.DataFrame([{
        'LinkedIn_Active': float(profile.get('linkedin_active', 0)),
        'LinkedIn_Post_Engagement': float(profile.get('linkedin_post_engagement', 0)),
        'LinkedIn_Profile_Views': float(profile.get('linkedin_profile_views', 0)),
        'Email_Verified': 1.0 if str(profile.get('email_verified', '0')).lower() in ['yes', '1', '1.0'] else 0.0,
        'Email_Open_Rate': float(profile.get('email_open_rate', 0)),
        'Email_Reply_History': 1.0 if str(profile.get('email_reply_history', '0')).lower() in ['past', '1', '1.0'] else 0.0,
        'Cold_Call_Response': float(profile.get('cold_call_response', 0)),
        'WhatsApp_Verified': float(profile.get('whatsapp_verified', 0)),
        'SMS_Verified': float(profile.get('sms_verified', 0)),
        'Previous_Channel_Response': 1.0 if pd.notna(profile.get('previous_channel_response')) else 0.0,
    }])

    channel_probs = chan_model.predict_proba(chan_arr)[0]
    best_idx = int(np.argmax(channel_probs))
    # min() clamps the index in case the model has more classes than CHANNELS.
    recommended_channel = CHANNELS[min(best_idx, 3)]
    model_conf = float(np.max(channel_probs))

    # Low-confidence predictions are routed to verification, not to the agent.
    decision_status = "ROUTE_TO_AGENT" if model_conf >= 0.60 else "ROUTE_TO_VERIFICATION"

    # ── 4. BUILD EXPLAINABILITY ───────────────────────────────
    explainability = {
        "propensity_breakdown": build_propensity_explainability(prop_arr, raw_prob, lead_score),
        "segmentation_reasoning": build_segmentation_explainability(firmo_arr, cluster_idx, segmentation_class),
        "channel_reasoning": build_channel_explainability(chan_arr, channel_probs, recommended_channel),
        "composite_signals": build_composite_signals(profile, composite_growth, lead_score, model_conf, chan_arr),
    }

    # ── 5. FINAL PAYLOAD ──────────────────────────────────────
    return {
        "buyer_id": request.buyer_id,
        "ml_decision": {
            "status": decision_status,
            "is_qualified": is_qualified,
            "lead_score": lead_score,
            "segmentation_class": segmentation_class,
            "recommended_channel": recommended_channel,
            "recommended_tone": tone,
            "model_confidence": round(model_conf, 3),
        },
        "shap_anchors": {
            "primary_driver": p_driver,
            "langgraph_constraint": langgraph_constraint,
        },
        "explainability": explainability,
        "raw_context": {
            "industry": str(profile.get('industry', 'Unknown')),
            "country": str(profile.get('country', 'Unknown')),
            "group_memberships": str(profile.get('group_memberships', 'None')),
        },
    }
559
+
560
+
561
# Local development entry point; in the Docker image the container CMD runs
# uvicorn directly on port 7860 instead of this port-8000 reload server.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
models/channel_rf.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35309f8fbea1887454c1cb65e776161911e4c2722bae6cd8920602359c717f3f
3
+ size 17752441
models/firmographics_kmeans.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79b6ac872d35e886fd43f28706f2b642a4f2be58a1df46ef70c38d1418812b39
3
+ size 61927
models/propensity_xgb.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b68b75fb16944cbba55b185f5bd85211e64f9cc720de362133ac84e94b985c4
3
+ size 63802
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas==2.2.0
2
+ numpy==1.26.4
3
+ openpyxl==3.1.2
4
+ scikit-learn==1.4.0
5
+ shap
6
+ xgboost==1.7.6
7
+ fastapi==0.109.0
8
+ uvicorn==0.27.0
9
+ joblib==1.3.2
10
+ pydantic==2.5.3
11
+ requests