Spaces:
Running
Running
File size: 5,096 Bytes
beec01d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | """
processors/validator.py — Lead quality validation and rejection filters.
Rejects leads that are:
- Missing all contact info (no phone AND no email AND no website)
- Duplicate names across sources (fuzzy match)
- Obviously tech companies (they don't need our services)
Also cleans and validates phone numbers using PhoneValidator.
"""
import re
from difflib import SequenceMatcher
from typing import List, Set, Tuple
import config
from models import Lead
from utils.logger import get_logger
from utils.phone_validator import PhoneValidator
logger = get_logger(__name__)
class LeadValidator:
    """
    Quality gate that filters out low-quality or irrelevant leads.

    A lead is rejected when it has no usable contact method, no business
    name, a name or industry containing one of
    ``config.TECH_COMPANY_KEYWORDS``, or a name that fuzzy-matches a lead
    already accepted in the current ``validate_all`` run.  Phone numbers of
    accepted leads are normalised through ``PhoneValidator``.

    Usage::

        validator = LeadValidator()
        valid_leads, rejected = validator.validate_all(leads)
    """

    def __init__(self):
        self.phone_validator = PhoneValidator()
        # Lower-cased, stripped names of the leads accepted so far; the
        # fuzzy-duplicate rule compares every new name against this list.
        self._seen_names: List[str] = []
        # Keywords are escaped, so they match as literal, case-insensitive
        # substrings rather than as regular expressions.
        self._tech_patterns = [
            re.compile(re.escape(kw), re.I)
            for kw in config.TECH_COMPANY_KEYWORDS
        ]

    def validate_all(
        self, leads: List["Lead"]
    ) -> Tuple[List["Lead"], List[Tuple["Lead", str]]]:
        """
        Validate all leads and return (valid_leads, rejected_list).

        Each rejected entry is a tuple of (lead, reason_string).  The
        seen-names list is reset at the start of every call, so duplicates
        are only detected within the given batch.  Accepted leads have
        their ``phone`` attribute rewritten in place (formatted, or blanked
        when the number is invalid but another contact method exists).
        """
        valid = []
        rejected = []
        self._seen_names = []

        for lead in leads:
            # Phone clean-up only runs for leads that passed the rule checks.
            reason = self._check_lead(lead) or self._clean_phone(lead)
            if reason:
                rejected.append((lead, reason))
                logger.debug(
                    f"Rejected: {lead.business_name!r} — {reason}"
                )
                continue
            # Only accepted leads feed the duplicate check.
            self._seen_names.append(lead.business_name.lower().strip())
            valid.append(lead)

        logger.info(
            f"Validation: {len(leads)} leads -> "
            f"{len(valid)} valid, {len(rejected)} rejected"
        )
        if rejected:
            # Summarise rejections, most frequent reason first.
            reasons = {}
            for _, r in rejected:
                reasons[r] = reasons.get(r, 0) + 1
            for reason, count in sorted(reasons.items(), key=lambda x: -x[1]):
                logger.info(f" Rejection reason: {reason} ({count})")
        return valid, rejected

    def _check_lead(self, lead: "Lead") -> str:
        """
        Check a single lead. Returns rejection reason string, or "" if valid.

        Does not modify the lead or the seen-names list.
        """
        # -- Rule 1: must have at least one contact method ----------------
        if not any(
            value and value.strip()
            for value in (lead.phone, lead.email, lead.website)
        ):
            return "No contact info (no phone, email, or website)"

        # -- Rule 2: must have a business name ----------------------------
        if not lead.business_name or not lead.business_name.strip():
            return "Missing business name"

        # -- Rule 3: reject obvious tech companies ------------------------
        name_lower = lead.business_name.lower()
        for pattern in self._tech_patterns:
            if pattern.search(name_lower):
                return f"Tech company detected: matches '{pattern.pattern}'"
        # Also check the industry field if populated.
        if lead.industry:
            industry_lower = lead.industry.lower()
            if any(p.search(industry_lower) for p in self._tech_patterns):
                return f"Tech industry: {lead.industry}"

        # -- Rule 4: fuzzy duplicate name check ---------------------------
        clean_name = name_lower.strip()
        for seen_name in self._seen_names:
            threshold = config.FUZZY_MATCH_THRESHOLD
            matcher = SequenceMatcher(None, clean_name, seen_name)
            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(); if either is below the threshold the pair cannot
            # match, so the expensive ratio() call is skipped.
            if (
                matcher.real_quick_ratio() < threshold
                or matcher.quick_ratio() < threshold
            ):
                continue
            similarity = matcher.ratio()
            if similarity >= threshold:
                return (
                    f"Fuzzy duplicate: '{lead.business_name}' "
                    f"~= '{seen_name}' ({similarity:.0%} match)"
                )
        return ""  # Valid

    def _clean_phone(self, lead: "Lead") -> str:
        """
        Normalise ``lead.phone`` in place.

        A valid number is replaced by its formatted form.  An invalid
        number is blanked when the lead still has an email or website.  If
        the invalid number was the only contact method, the lead would end
        up with no contact info at all, so a rejection reason is returned
        and the phone value is left untouched.

        Returns a rejection reason string, or "" when the lead stays valid.
        """
        if not lead.phone:
            return ""
        formatted, is_valid = self.phone_validator.validate(lead.phone)
        if is_valid:
            lead.phone = formatted
            return ""
        has_email = bool(lead.email and lead.email.strip())
        has_website = bool(lead.website and lead.website.strip())
        if not has_email and not has_website:
            return "No contact info (invalid phone, no email or website)"
        lead.phone = ""  # Remove invalid phone
        return ""

    def validate_single(self, lead: "Lead") -> Tuple[bool, str]:
        """
        Validate a single lead without modifying internal seen-names list.

        The duplicate rule still compares against the names recorded by the
        most recent ``validate_all`` call.  When the lead passes the rule
        checks its ``phone`` attribute is normalised in place.

        Returns (is_valid, reason); ``reason`` is "" for a valid lead.
        """
        reason = self._check_lead(lead) or self._clean_phone(lead)
        return (not reason, reason)
|