""" processors/validator.py — Lead quality validation and rejection filters. Rejects leads that are: - Missing all contact info (no phone AND no email AND no website) - Duplicate names across sources (fuzzy match) - Obviously tech companies (they don't need our services) Also cleans and validates phone numbers using PhoneValidator. """ import re from difflib import SequenceMatcher from typing import List, Set, Tuple import config from models import Lead from utils.logger import get_logger from utils.phone_validator import PhoneValidator logger = get_logger(__name__) class LeadValidator: """ Quality gate that filters out low-quality or irrelevant leads. Usage:: validator = LeadValidator() valid_leads, rejected = validator.validate_all(leads) """ def __init__(self): self.phone_validator = PhoneValidator() self._seen_names: List[str] = [] self._tech_patterns = [ re.compile(re.escape(kw), re.I) for kw in config.TECH_COMPANY_KEYWORDS ] def validate_all( self, leads: List[Lead] ) -> Tuple[List[Lead], List[Tuple[Lead, str]]]: """ Validate all leads and return (valid_leads, rejected_list). Each rejected entry is a tuple of (lead, reason_string). """ valid = [] rejected = [] self._seen_names = [] for lead in leads: reason = self._check_lead(lead) if reason: rejected.append((lead, reason)) logger.debug( f"Rejected: {lead.business_name!r} — {reason}" ) else: # Clean phone number if lead.phone: formatted, is_valid = self.phone_validator.validate(lead.phone) if is_valid: lead.phone = formatted else: lead.phone = "" # Remove invalid phone self._seen_names.append(lead.business_name.lower().strip()) valid.append(lead) logger.info( f"Validation: {len(leads)} leads -> " f"{len(valid)} valid, {len(rejected)} rejected" ) if rejected: reasons = {} for _, r in rejected: reasons[r] = reasons.get(r, 0) + 1 for reason, count in sorted(reasons.items(), key=lambda x: -x[1]): logger.info(f" Rejection reason: {reason} ({count})") return valid, rejected def _check_lead(self, lead: Lead) -> str: """ Check a single lead. Returns rejection reason string, or "" if valid. """ # ── Rule 1: Must have at least one contact method ─────────────── has_phone = bool(lead.phone and lead.phone.strip()) has_email = bool(lead.email and lead.email.strip()) has_website = bool(lead.website and lead.website.strip()) if not has_phone and not has_email and not has_website: return "No contact info (no phone, email, or website)" # ── Rule 2: Must have a business name ─────────────────────────── if not lead.business_name or not lead.business_name.strip(): return "Missing business name" # ── Rule 3: Reject obvious tech companies ─────────────────────── name_lower = lead.business_name.lower() for pattern in self._tech_patterns: if pattern.search(name_lower): return f"Tech company detected: matches '{pattern.pattern}'" # Also check notes/industry if populated if lead.industry: industry_lower = lead.industry.lower() for pattern in self._tech_patterns: if pattern.search(industry_lower): return f"Tech industry: {lead.industry}" # ── Rule 4: Fuzzy duplicate name check ────────────────────────── clean_name = name_lower.strip() for seen_name in self._seen_names: similarity = SequenceMatcher( None, clean_name, seen_name ).ratio() if similarity >= config.FUZZY_MATCH_THRESHOLD: return ( f"Fuzzy duplicate: '{lead.business_name}' " f"~= '{seen_name}' ({similarity:.0%} match)" ) return "" # Valid def validate_single(self, lead: Lead) -> Tuple[bool, str]: """ Validate a single lead without modifying internal seen-names list. Returns (is_valid, reason). """ reason = self._check_lead(lead) if not reason and lead.phone: formatted, is_valid = self.phone_validator.validate(lead.phone) if is_valid: lead.phone = formatted else: lead.phone = "" return (not bool(reason), reason)