Spaces:
Running
Running
File size: 9,873 Bytes
"""
Scorer v2 - Signal Extraction + Deterministic Scoring
KEY DESIGN CHANGE:
Old: LLM computes score directly -> hallucination risk
New: LLM extracts SIGNALS -> Code computes score -> zero hallucination
LLM is good at: "Does this company have legacy SAP?" (yes/no)
LLM is bad at: "Give this company 73 out of 100" (arbitrary)
So: LLM extracts signals, code does math.
"""
import logging
from nvidia_client import call_llm
logger = logging.getLogger(__name__)
# --- Signal extraction prompt ---
# System prompt sent with every signal-extraction request (passed as
# `system_prompt` to call_llm in _extract_signals). It restricts the model to
# reporting evidence-backed signals as JSON; the numeric score is computed in
# code, not by the model.
SYSTEM_PROMPT = """You are a lead qualification engine.
Your job: extract SIGNALS from company data. You do NOT compute the final score.
The system computes scores deterministically from your signal extraction.
CRITICAL RULES:
- Extract only what the evidence supports
- For each signal, cite which piece of evidence supports it
- If evidence is weak or missing, say so honestly
- Output ONLY the structured JSON requested"""
def build_signal_prompt(data: dict, profile: dict, contacts: list) -> str:
    """Build the user prompt that asks the LLM to extract scoring signals.

    Company facts from ``data``, the AI-readiness value from ``profile`` and
    three booleans derived from ``contacts`` are rendered into the prompt,
    followed by the JSON template the model is asked to fill in.

    Args:
        data: Company record. Reads ``name``, ``industry``,
            ``employee_count``, ``tech_stack``, ``ai_job_count``,
            ``pain_signals``, ``service_match`` and ``growth_signals``.
        profile: Company profile. Reads ``ai_readiness``.
        contacts: Contact dicts. Reads ``email_verified``,
            ``linkedin_personal_url`` and ``social_profiles``. ``None`` is
            treated as an empty list.

    Returns:
        The complete prompt text.
    """
    # dict.get(key, default) only applies the default when the key is missing;
    # a key stored with a None value must be normalised explicitly.
    contacts = contacts or []
    tech_stack = data.get('tech_stack') or []
    pain_signals = data.get('pain_signals') or []
    growth_signals = data.get('growth_signals') or []

    has_verified_email = any(c.get("email_verified") for c in contacts)
    has_linkedin = any(c.get("linkedin_personal_url") for c in contacts)
    has_social = any(c.get("social_profiles") for c in contacts)

    # Empty lists render as the literal NONE so the model sees an explicit gap.
    tech_stack_text = ', '.join(map(str, tech_stack)) or 'NONE'
    pain_text = ', '.join(map(str, pain_signals)) or 'NONE'
    growth_count = len(growth_signals)

    return f"""EXTRACT SIGNALS for lead scoring. Do not compute a score.
Company: {data.get('name', 'UNKNOWN')}
Industry: {data.get('industry', 'UNKNOWN')}
Employees: {data.get('employee_count', 'UNKNOWN')}
Tech stack: {tech_stack_text}
AI job postings: {data.get('ai_job_count', 0)}
Pain signals: {pain_text}
Service match: {data.get('service_match') or 'NONE'}
AI readiness (from profile): {profile.get('ai_readiness', 'UNKNOWN')}
Has verified email: {has_verified_email}
Has personal LinkedIn: {has_linkedin}
Has social profiles: {has_social}
Growth signals count: {growth_count}
Output JSON:
{{
"company_fit_signals": {{
"industry_match": true,
"size_appropriate": true,
"evidence": "why"
}},
"ai_readiness_signals": {{
"level": "none|low|medium|high",
"tech_stack_relevant": false,
"ai_jobs_present": false,
"evidence": "why"
}},
"service_match_signals": {{
"matched": true,
"service_name": "which service",
"pain_count": 0,
"evidence": "which pain signals"
}},
"contact_quality_signals": {{
"email_verified": {str(has_verified_email).lower()},
"linkedin_found": {str(has_linkedin).lower()},
"decision_maker_identified": true
}},
"timing_signals": {{
"actively_growing": false,
"recently_active": true,
"evidence": "what suggests timing"
}},
"confidence": 0.0
}}"""
# --- Main scoring function ---
async def compute_score(
    company_data: dict,
    profile: dict,
    contacts: list,
    trace_id: str = ""
) -> dict:
    """Score a lead in two stages.

    Stage 1 obtains qualitative signals through ``_extract_signals``.
    Stage 2 feeds those signals to ``_compute_deterministic_score`` and
    returns its result dict unchanged.
    """
    # Stage 1: signal extraction
    extracted = await _extract_signals(company_data, profile, contacts, trace_id)
    # Stage 2: arithmetic on the extracted signals
    return _compute_deterministic_score(extracted, company_data, profile, contacts)
async def _extract_signals(data, profile, contacts, trace_id) -> dict:
    """Ask the LLM to identify signals (not to score), with a rule-based fallback.

    Args:
        data: Company record; also supplies ``id`` for the ``company_id``
            argument of the LLM call.
        profile: Company profile forwarded to the prompt builder.
        contacts: Contact dicts forwarded to the prompt builder.
        trace_id: Identifier forwarded to ``call_llm``.

    Returns:
        The parsed JSON object from the LLM when it is a non-empty dict;
        otherwise the output of ``_extract_signals_deterministic``.
    """
    try:
        prompt = build_signal_prompt(data, profile, contacts)
        result = await call_llm(
            operation="score",
            system_prompt=SYSTEM_PROMPT,
            user_prompt=prompt,
            model_index=2,  # 8B model - signal extraction is simple
            temperature=0.1,
            max_tokens=400,
            json_mode=True,
            trace_id=trace_id,
            company_id=data.get("id"),
        )
        parsed = result.get("parsed")
        # Only a JSON object is usable downstream (the scorer calls .get on
        # it); a list, string or empty payload is treated as a failed parse.
        if isinstance(parsed, dict) and parsed:
            return parsed
        logger.warning(
            "Signal extraction returned no usable JSON object; using rule-based fallback"
        )
    except Exception as e:
        # Deliberately broad: any LLM/client failure must degrade to the
        # rule-based path instead of aborting the scoring run.
        logger.warning("Signal extraction failed: %s", e)
    # Fallback: extract signals from raw data
    return _extract_signals_deterministic(data, profile, contacts)
def _extract_signals_deterministic(data, profile, contacts) -> dict:
"""Rule-based signal extraction when LLM fails."""
has_email = any(c.get("email_verified") for c in contacts)
has_linkedin = any(c.get("linkedin_personal_url") for c in contacts)
return {
"company_fit_signals": {
"industry_match": bool(data.get("industry")),
"size_appropriate": (data.get("employee_count") or 0) >= 3,
"evidence": "deterministic",
},
"ai_readiness_signals": {
"level": profile.get("ai_readiness", "low"),
"tech_stack_relevant": len(data.get("tech_stack", [])) > 0,
"ai_jobs_present": data.get("ai_job_count", 0) > 0,
"evidence": "deterministic",
},
"service_match_signals": {
"matched": bool(data.get("service_match")),
"service_name": data.get("service_match", "NONE"),
"pain_count": len(data.get("pain_signals", [])),
"evidence": "deterministic",
},
"contact_quality_signals": {
"email_verified": has_email,
"linkedin_found": has_linkedin,
"decision_maker_identified": len(contacts) > 0,
},
"timing_signals": {
"actively_growing": data.get("ai_job_count", 0) > 0,
"recently_active": True,
"evidence": "deterministic",
},
"confidence": 0.5,
}
# --- Deterministic score computation ---
# This is where the ACTUAL score is calculated.
# No LLM involved β pure math from signals.
def _compute_deterministic_score(signals: dict, data: dict, profile: dict, contacts: list) -> dict:
    """Convert extracted signals into a lead score using fixed point rules.

    Weights:
        company_fit:    25 pts
        ai_readiness:   20 pts
        service_match:  20 pts (NEW - replaces old AI readiness weight)
        decision_maker: 20 pts
        timing:         15 pts

    ``signals`` may originate from an LLM, so malformed pieces (a signal
    group that is not a dict, a non-numeric or negative ``pain_count``, a
    ``level`` with different casing) are tolerated rather than raising.

    Args:
        signals: Signal groups keyed ``company_fit_signals``,
            ``ai_readiness_signals``, ``service_match_signals``,
            ``contact_quality_signals`` and ``timing_signals``.
        data: Company record. Reads ``employee_count`` and ``growth_signals``.
        profile: Not read by this function; kept for interface stability.
        contacts: Contact dicts. Reads ``email``. ``None`` is treated as empty.

    Returns:
        Dict with the five component scores, ``total_score``, ``tier``,
        a ``score_breakdown`` of "points/max" strings and metadata fields.
    """
    if not isinstance(signals, dict):
        signals = {}
    contacts = contacts or []

    def group(key: str) -> dict:
        # A group the model returned as null or as a non-object counts as absent.
        section = signals.get(key)
        return section if isinstance(section, dict) else {}

    # -- Company fit (25 pts) --
    fit = group("company_fit_signals")
    company_fit = 0
    if fit.get("industry_match"):
        company_fit += 10
    if fit.get("size_appropriate"):
        company_fit += 10
    emp = data.get("employee_count") or 0
    if emp >= 200:
        company_fit += 5
    elif emp >= 50:
        company_fit += 3
    elif emp >= 10:
        company_fit += 1

    # -- AI readiness (20 pts) --
    ai_sig = group("ai_readiness_signals")
    level = ai_sig.get("level", "low")
    # Normalise casing/whitespace so "High" or " medium " still score.
    level = level.strip().lower() if isinstance(level, str) else None
    ai_readiness = {"high": 12, "medium": 8, "low": 3}.get(level, 0)
    if ai_sig.get("tech_stack_relevant"):
        ai_readiness += 4
    if ai_sig.get("ai_jobs_present"):
        ai_readiness += 4
    ai_readiness = min(20, ai_readiness)

    # -- Service match (20 pts) - key differentiator --
    svc = group("service_match_signals")
    service_match = 0
    if svc.get("matched"):
        service_match += 10
        # NOTE(review): the pain-signal bonus is applied only when a service
        # matched - confirm this nesting is the intended rule.
        try:
            pain_count = max(0, int(svc.get("pain_count") or 0))
        except (TypeError, ValueError, OverflowError):
            # Non-numeric pain_count from the model earns no bonus.
            pain_count = 0
        service_match += min(10, pain_count * 3)  # up to 10 pts for pain signals
    service_match = min(20, service_match)

    # -- Decision maker access (20 pts) --
    contact = group("contact_quality_signals")
    dm = 0
    if contact.get("email_verified"):
        dm += 12
    elif any(c.get("email") for c in contacts):
        # An unverified address is worth half of a verified one.
        dm += 6
    if contact.get("linkedin_found"):
        dm += 5
    if contact.get("decision_maker_identified"):
        dm += 3
    dm = min(20, dm)

    # -- Timing (15 pts) --
    timing = group("timing_signals")
    timing_score = 5  # base: company exists and has website
    if timing.get("actively_growing"):
        timing_score += 5
    if timing.get("recently_active"):
        timing_score += 3
    if len(data.get("growth_signals") or []) >= 2:
        timing_score += 2
    timing_score = min(15, timing_score)

    # -- Total --
    total = company_fit + ai_readiness + service_match + dm + timing_score
    tier = _score_to_tier(total)
    return {
        "company_fit": company_fit,
        "ai_readiness_score": ai_readiness,
        "service_match_score": service_match,
        "decision_maker_access": dm,
        "timing_score": timing_score,
        "total_score": total,
        "tier": tier,
        "score_breakdown": {
            "company_fit": f"{company_fit}/25",
            "ai_readiness": f"{ai_readiness}/20",
            "service_match": f"{service_match}/20",
            "decision_maker": f"{dm}/20",
            "timing": f"{timing_score}/15",
        },
        "score_reasoning": f"Deterministic score from {len(signals)} signal groups",
        "llm_model": "deterministic_scorer",
        "is_fallback": False,
    }
def _score_to_tier(score: int) -> str:
if score >= 85: return "hot"
if score >= 70: return "warm"
if score >= 50: return "nurture"
return "archive"
|