"""
Scorer v2 — Signal Extraction + Deterministic Scoring

KEY DESIGN CHANGE:
  Old: LLM computes score directly → hallucination risk
  New: LLM extracts SIGNALS → Code computes score → zero hallucination

LLM is good at: "Does this company have legacy SAP?" (yes/no)
LLM is bad at:  "Give this company 73 out of 100" (arbitrary)

So: LLM extracts signals, code does math.
"""

import logging

from nvidia_client import call_llm

logger = logging.getLogger(__name__)


# ─── Signal extraction prompt ────────────────────────────────

SYSTEM_PROMPT = """You are a lead qualification engine.

Your job: extract SIGNALS from company data.
You do NOT compute the final score. The system computes scores deterministically from your signal extraction.

CRITICAL RULES:
- Extract only what the evidence supports
- For each signal, cite which piece of evidence supports it
- If evidence is weak or missing, say so honestly
- Output ONLY the structured JSON requested"""


def _as_list(value) -> list:
    """Return *value* as a list; ``None`` becomes ``[]`` and a scalar becomes ``[value]``."""
    if value is None:
        return []
    if isinstance(value, (list, tuple, set, frozenset)):
        return list(value)
    return [value]


def _as_count(value) -> int:
    """Coerce *value* to an int >= 0; ``None`` or unparseable input counts as 0."""
    try:
        return max(0, int(value))
    except (TypeError, ValueError, OverflowError):
        return 0


def build_signal_prompt(data: dict, profile: dict, contacts: list) -> str:
    """Build the user prompt asking the LLM to extract scoring signals.

    Args:
        data: Company record. Keys read: ``name``, ``industry``,
            ``employee_count``, ``tech_stack``, ``ai_job_count``,
            ``pain_signals``, ``service_match``, ``growth_signals``.
        profile: Company profile; only ``ai_readiness`` is read.
        contacts: Contact dicts; ``email_verified``, ``linkedin_personal_url``
            and ``social_profiles`` are checked for truthiness.

    Returns:
        The prompt text, ending with the JSON template the LLM must fill in.
    """
    # dict.get(key, default) only applies the default when the key is ABSENT;
    # a key that is present with value None would otherwise break join()/len().
    contacts = _as_list(contacts)
    profile = profile or {}

    has_verified_email = any(c.get("email_verified") for c in contacts)
    has_linkedin = any(c.get("linkedin_personal_url") for c in contacts)
    has_social = any(c.get("social_profiles") for c in contacts)

    tech_stack = ", ".join(str(t) for t in _as_list(data.get("tech_stack"))) or "NONE"
    pain_signals = ", ".join(str(p) for p in _as_list(data.get("pain_signals"))) or "NONE"
    growth_count = len(_as_list(data.get("growth_signals")))

    name = data.get("name") or "UNKNOWN"
    industry = data.get("industry") or "UNKNOWN"
    employees = data.get("employee_count") or "UNKNOWN"
    ai_jobs = data.get("ai_job_count") or 0
    service = data.get("service_match") or "NONE"
    readiness = profile.get("ai_readiness") or "UNKNOWN"

    return f"""EXTRACT SIGNALS for lead scoring. Do not compute a score.

Company: {name}
Industry: {industry}
Employees: {employees}
Tech stack: {tech_stack}
AI job postings: {ai_jobs}
Pain signals: {pain_signals}
Service match: {service}
AI readiness (from profile): {readiness}
Has verified email: {has_verified_email}
Has personal LinkedIn: {has_linkedin}
Has social profiles: {has_social}
Growth signals count: {growth_count}

Output JSON:
{{
  "company_fit_signals": {{
    "industry_match": true,
    "size_appropriate": true,
    "evidence": "why"
  }},
  "ai_readiness_signals": {{
    "level": "none|low|medium|high",
    "tech_stack_relevant": false,
    "ai_jobs_present": false,
    "evidence": "why"
  }},
  "service_match_signals": {{
    "matched": true,
    "service_name": "which service",
    "pain_count": 0,
    "evidence": "which pain signals"
  }},
  "contact_quality_signals": {{
    "email_verified": {str(has_verified_email).lower()},
    "linkedin_found": {str(has_linkedin).lower()},
    "decision_maker_identified": true
  }},
  "timing_signals": {{
    "actively_growing": false,
    "recently_active": true,
    "evidence": "what suggests timing"
  }},
  "confidence": 0.0
}}"""


# ─── Main scoring function ───────────────────────────────────

async def compute_score(
    company_data: dict,
    profile: dict,
    contacts: list,
    trace_id: str = ""
) -> dict:
    """Score a lead in two steps.

    Step 1: LLM extracts signals (qualitative)
    Step 2: Code computes score (deterministic, reproducible)

    Args:
        company_data: Company record passed to both steps.
        profile: Company profile passed to both steps.
        contacts: Contact dicts passed to both steps.
        trace_id: Forwarded to the LLM call.

    Returns:
        The dict produced by ``_compute_deterministic_score``.
    """
    # ── Step 1: Signal extraction via LLM ─────────────────────
    signals = await _extract_signals(company_data, profile, contacts, trace_id)

    # ── Step 2: Deterministic scoring ─────────────────────────
    return _compute_deterministic_score(signals, company_data, profile, contacts)


async def _extract_signals(data, profile, contacts, trace_id) -> dict:
    """Ask LLM to identify signals — NOT to score.

    Best-effort: any exception from prompt building or the LLM call, and any
    response whose ``parsed`` payload is not a non-empty dict, results in the
    rule-based fallback instead of an error.
    """
    try:
        prompt = build_signal_prompt(data, profile, contacts)
        result = await call_llm(
            operation="score",
            system_prompt=SYSTEM_PROMPT,
            user_prompt=prompt,
            model_index=2,       # 8B model — signal extraction is simple
            temperature=0.1,
            max_tokens=400,
            json_mode=True,
            trace_id=trace_id,
            company_id=data.get("id"),
        )
        parsed = result.get("parsed")
    except Exception as e:  # deliberate boundary: scoring must not fail on LLM errors
        logger.warning("Signal extraction failed: %s", e)
    else:
        # Only a JSON object is usable downstream (the scorer calls .get on it);
        # a list/string/number payload must not be handed on.
        if isinstance(parsed, dict) and parsed:
            return parsed
        logger.warning(
            "Signal extraction returned no usable JSON object; using deterministic fallback"
        )

    # Fallback: extract signals from raw data
    return _extract_signals_deterministic(data, profile, contacts)


def _extract_signals_deterministic(data, profile, contacts) -> dict:
    """Rule-based signal extraction when LLM fails.

    Produces the same top-level keys as the JSON template in
    ``build_signal_prompt``, with ``evidence`` set to ``"deterministic"`` and
    ``confidence`` fixed at 0.5. Missing or ``None`` fields are treated as
    empty/zero so this safety net cannot itself raise on sparse records.
    """
    contacts = _as_list(contacts)
    profile = profile or {}

    has_email = any(c.get("email_verified") for c in contacts)
    has_linkedin = any(c.get("linkedin_personal_url") for c in contacts)
    ai_jobs_present = _as_count(data.get("ai_job_count")) > 0

    return {
        "company_fit_signals": {
            "industry_match": bool(data.get("industry")),
            "size_appropriate": _as_count(data.get("employee_count")) >= 3,
            "evidence": "deterministic",
        },
        "ai_readiness_signals": {
            "level": profile.get("ai_readiness") or "low",
            "tech_stack_relevant": bool(_as_list(data.get("tech_stack"))),
            "ai_jobs_present": ai_jobs_present,
            "evidence": "deterministic",
        },
        "service_match_signals": {
            "matched": bool(data.get("service_match")),
            "service_name": data.get("service_match") or "NONE",
            "pain_count": len(_as_list(data.get("pain_signals"))),
            "evidence": "deterministic",
        },
        "contact_quality_signals": {
            "email_verified": has_email,
            "linkedin_found": has_linkedin,
            "decision_maker_identified": len(contacts) > 0,
        },
        "timing_signals": {
            # AI job postings are the only growth proxy used by this fallback.
            "actively_growing": ai_jobs_present,
            "recently_active": True,
            "evidence": "deterministic",
        },
        "confidence": 0.5,
    }


# ─── Deterministic score computation ─────────────────────────
# This is where the ACTUAL score is calculated.
# No LLM involved — pure math from signals.

def _compute_deterministic_score(signals: dict, data: dict, profile: dict, contacts: list) -> dict: """ Weights: company_fit: 25 pts ai_readiness: 20 pts service_match: 20 pts (NEW — replaces old AI readiness weight) decision_maker: 20 pts timing: 15 pts """ # ── Company Fit (25 pts) ────────────────────────────────── fit = signals.get("company_fit_signals", {}) company_fit = 0 if fit.get("industry_match"): company_fit += 10 if fit.get("size_appropriate"): company_fit += 10 emp = data.get("employee_count") or 0 if emp >= 200: company_fit += 5 elif emp >= 50: company_fit += 3 elif emp >= 10: company_fit += 1 # ── AI Readiness (20 pts) ───────────────────────────────── ai_sig = signals.get("ai_readiness_signals", {}) ai_readiness = 0 level = ai_sig.get("level", "low") if level == "high": ai_readiness += 12 elif level == "medium": ai_readiness += 8 elif level == "low": ai_readiness += 3 if ai_sig.get("tech_stack_relevant"): ai_readiness += 4 if ai_sig.get("ai_jobs_present"): ai_readiness += 4 ai_readiness = min(20, ai_readiness) # ── Service Match (20 pts) — KEY DIFFERENTIATOR ─────────── svc = signals.get("service_match_signals", {}) service_match = 0 if svc.get("matched"): service_match += 10 pain_count = svc.get("pain_count", 0) service_match += min(10, pain_count * 3) # up to 10 pts for pain signals service_match = min(20, service_match) # ── Decision Maker Access (20 pts) ──────────────────────── contact = signals.get("contact_quality_signals", {}) dm = 0 if contact.get("email_verified"): dm += 12 elif any(c.get("email") for c in contacts): dm += 6 if contact.get("linkedin_found"): dm += 5 if contact.get("decision_maker_identified"): dm += 3 dm = min(20, dm) # ── Timing (15 pts) ─────────────────────────────────────── timing = signals.get("timing_signals", {}) timing_score = 5 # base: company exists and has website if timing.get("actively_growing"): timing_score += 5 if timing.get("recently_active"): timing_score += 3 if len(data.get("growth_signals", [])) >= 2: 
timing_score += 2 timing_score = min(15, timing_score) # ── Total ───────────────────────────────────────────────── total = company_fit + ai_readiness + service_match + dm + timing_score tier = _score_to_tier(total) return { "company_fit": company_fit, "ai_readiness_score": ai_readiness, "service_match_score": service_match, "decision_maker_access": dm, "timing_score": timing_score, "total_score": total, "tier": tier, "score_breakdown": { "company_fit": f"{company_fit}/25", "ai_readiness": f"{ai_readiness}/20", "service_match": f"{service_match}/20", "decision_maker": f"{dm}/20", "timing": f"{timing_score}/15", }, "score_reasoning": f"Deterministic score from {len(signals)} signal groups", "llm_model": "deterministic_scorer", "is_fallback": False, } def _score_to_tier(score: int) -> str: if score >= 85: return "hot" if score >= 70: return "warm" if score >= 50: return "nurture" return "archive"