Spaces:
Running
Running
| """ | |
| processors/validator.py — Lead quality validation and rejection filters. | |
| Rejects leads that are: | |
| - Missing all contact info (no phone AND no email AND no website) | |
| - Duplicate names across sources (fuzzy match) | |
| - Obviously tech companies (they don't need our services) | |
| Also cleans and validates phone numbers using PhoneValidator. | |
| """ | |
| import re | |
| from difflib import SequenceMatcher | |
| from typing import List, Set, Tuple | |
| import config | |
| from models import Lead | |
| from utils.logger import get_logger | |
| from utils.phone_validator import PhoneValidator | |
| logger = get_logger(__name__) | |
class LeadValidator:
    """
    Quality gate that filters out low-quality or irrelevant leads.

    A lead is rejected when it:

    - has no usable contact method (phone, email or website); a phone that
      fails ``PhoneValidator`` validation does not count as usable,
    - has no business name,
    - matches one of ``config.TECH_COMPANY_KEYWORDS`` in its name or industry,
    - has a name that fuzzy-matches a lead already accepted in the current
      ``validate_all`` run.

    Usage::

        validator = LeadValidator()
        valid_leads, rejected = validator.validate_all(leads)
    """

    # Reason shared by every "there is no way to reach this lead" rejection.
    _NO_CONTACT_REASON = "No contact info (no phone, email, or website)"

    def __init__(self):
        self.phone_validator = PhoneValidator()
        # Lower-cased, stripped names of the leads accepted so far by the
        # most recent validate_all() call.
        self._seen_names: List[str] = []
        # Keywords are escaped so they match literally, case-insensitively,
        # anywhere inside the searched text (substring match).
        self._tech_patterns = [
            re.compile(re.escape(kw), re.I)
            for kw in config.TECH_COMPANY_KEYWORDS
        ]

    def validate_all(
        self, leads: List["Lead"]
    ) -> Tuple[List["Lead"], List[Tuple["Lead", str]]]:
        """
        Validate all leads and return (valid_leads, rejected_list).

        Each rejected entry is a tuple of (lead, reason_string). The
        seen-names list used for duplicate detection is reset at the start
        of every call. Accepted leads have ``lead.phone`` replaced by the
        formatted number, or by "" when the number is invalid. A lead that
        is left without any contact method once its invalid phone has been
        removed is rejected (its ``phone`` is "" at that point).
        """
        valid = []
        rejected = []
        self._seen_names = []

        for lead in leads:
            # The phone is only normalised when the rule checks pass, so
            # leads rejected by a rule keep their original phone value.
            reason = self._check_lead(lead) or self._clean_phone(lead)
            if reason:
                rejected.append((lead, reason))
                logger.debug(
                    f"Rejected: {lead.business_name!r} — {reason}"
                )
                continue
            self._seen_names.append(lead.business_name.lower().strip())
            valid.append(lead)

        logger.info(
            f"Validation: {len(leads)} leads -> "
            f"{len(valid)} valid, {len(rejected)} rejected"
        )
        if rejected:
            # Summarise rejection reasons, most frequent first.
            reason_counts = {}
            for _, why in rejected:
                reason_counts[why] = reason_counts.get(why, 0) + 1
            for why, count in sorted(
                reason_counts.items(), key=lambda item: -item[1]
            ):
                logger.info(f" Rejection reason: {why} ({count})")
        return valid, rejected

    def _check_lead(self, lead: "Lead") -> str:
        """
        Check a single lead. Returns rejection reason string, or "" if valid.

        Rules run in order and the first failure wins. This method never
        modifies the lead or the seen-names list.
        """
        # Rule 1: must have at least one contact method.
        if not self._has_contact_info(lead):
            return self._NO_CONTACT_REASON

        # Rule 2: must have a business name.
        if not lead.business_name or not lead.business_name.strip():
            return "Missing business name"

        # Rule 3: reject obvious tech companies (by name, then by industry).
        name_lower = lead.business_name.lower()
        for pattern in self._tech_patterns:
            if pattern.search(name_lower):
                return (
                    "Tech company detected: "
                    f"matches '{self._keyword_of(pattern)}'"
                )
        if lead.industry:
            industry_lower = lead.industry.lower()
            if any(p.search(industry_lower) for p in self._tech_patterns):
                return f"Tech industry: {lead.industry}"

        # Rule 4: fuzzy duplicate of a name that was already accepted.
        clean_name = name_lower.strip()
        for seen_name in self._seen_names:
            similarity = self._fuzzy_ratio(
                clean_name, seen_name, config.FUZZY_MATCH_THRESHOLD
            )
            if similarity is not None:
                return (
                    f"Fuzzy duplicate: '{lead.business_name}' "
                    f"~= '{seen_name}' ({similarity:.0%} match)"
                )
        return ""  # Valid

    def validate_single(self, lead: "Lead") -> Tuple[bool, str]:
        """
        Validate a single lead without modifying internal seen-names list.

        The duplicate check still compares against the names recorded by the
        most recent ``validate_all`` call. When the rule checks pass,
        ``lead.phone`` is normalised in place exactly as in ``validate_all``.

        Returns (is_valid, reason); reason is "" when the lead is valid.
        """
        reason = self._check_lead(lead) or self._clean_phone(lead)
        return (not reason, reason)

    def _clean_phone(self, lead: "Lead") -> str:
        """
        Normalise ``lead.phone`` in place and re-check reachability.

        A valid phone is replaced by its formatted form; an invalid one is
        replaced by "". Returns a rejection reason when that leaves the lead
        with no contact method at all, otherwise "".
        """
        if lead.phone:
            formatted, is_valid = self.phone_validator.validate(lead.phone)
            lead.phone = formatted if is_valid else ""
        if self._has_contact_info(lead):
            return ""
        return self._NO_CONTACT_REASON

    @staticmethod
    def _has_contact_info(lead: "Lead") -> bool:
        """Return True if phone, email or website is a non-blank string."""
        return any(
            value and value.strip()
            for value in (lead.phone, lead.email, lead.website)
        )

    @staticmethod
    def _keyword_of(pattern) -> str:
        """
        Recover the literal keyword from a pattern built with ``re.escape``.

        ``re.escape`` backslash-escapes spaces and punctuation, so printing
        ``pattern.pattern`` directly would show e.g. ``web\\ development``.
        """
        return re.sub(r"\\(.)", r"\1", pattern.pattern, flags=re.DOTALL)

    @staticmethod
    def _fuzzy_ratio(name: str, seen_name: str, cutoff: float):
        """
        Return the similarity of the two names if it is >= cutoff, else None.

        The cheap upper bounds are tested first so the expensive ``ratio()``
        only runs for plausible candidates; the outcome is the same as
        computing ``ratio()`` every time.
        """
        matcher = SequenceMatcher(None, name, seen_name)
        if (
            matcher.real_quick_ratio() < cutoff
            or matcher.quick_ratio() < cutoff
        ):
            return None
        similarity = matcher.ratio()
        return similarity if similarity >= cutoff else None