LeadGenPro / lead_gen /processors /validator.py
MaSTer-suFYan
feat: LeadGen Pro v2.0 — full system with bug fixes
beec01d
"""
processors/validator.py β€” Lead quality validation and rejection filters.
Rejects leads that are:
- Missing all contact info (no phone AND no email AND no website)
- Duplicate names across sources (fuzzy match)
- Obviously tech companies (they don't need our services)
Also cleans and validates phone numbers using PhoneValidator.
"""
import re
from difflib import SequenceMatcher
from typing import List, Set, Tuple
import config
from models import Lead
from utils.logger import get_logger
from utils.phone_validator import PhoneValidator
# Module-level logger, named after this module via ``__name__``.
logger = get_logger(__name__)
class LeadValidator:
    """
    Quality gate that filters out low-quality or irrelevant leads.

    A lead is rejected when it has no usable contact method, has no business
    name, matches one of ``config.TECH_COMPANY_KEYWORDS`` (in its name or
    industry), or fuzzy-matches the name of a lead already accepted during
    the current ``validate_all`` run. Accepted leads have their phone number
    normalised in place through ``PhoneValidator``.

    Usage::

        validator = LeadValidator()
        valid_leads, rejected = validator.validate_all(leads)
    """

    def __init__(self):
        self.phone_validator = PhoneValidator()
        # Normalised (lower-cased, stripped) names of the leads accepted so far.
        self._seen_names: List[str] = []
        # Blank keywords are skipped: an empty pattern matches every string
        # and would reject every lead as a "tech company".
        self._tech_keywords: List[str] = [
            kw for kw in config.TECH_COMPANY_KEYWORDS if kw and kw.strip()
        ]
        # Parallel to ``_tech_keywords``; escaped so keywords match literally.
        self._tech_patterns = [
            re.compile(re.escape(kw), re.I) for kw in self._tech_keywords
        ]

    def validate_all(
        self, leads: List[Lead]
    ) -> Tuple[List[Lead], List[Tuple[Lead, str]]]:
        """
        Validate all leads and return (valid_leads, rejected_list).

        Each rejected entry is a tuple of (lead, reason_string). The list of
        seen names is reset at the start of every call, so duplicate detection
        only spans the leads passed to this call. Accepted leads are mutated:
        a valid phone is replaced by its formatted form, an invalid one by "".
        Rejected leads are left untouched.
        """
        valid: List[Lead] = []
        rejected: List[Tuple[Lead, str]] = []
        self._seen_names = []

        for lead in leads:
            reason = self._evaluate(lead)
            if reason:
                rejected.append((lead, reason))
                logger.debug("Rejected: %r — %s", lead.business_name, reason)
                continue
            self._seen_names.append(lead.business_name.lower().strip())
            valid.append(lead)

        logger.info(
            "Validation: %d leads -> %d valid, %d rejected",
            len(leads), len(valid), len(rejected),
        )
        if rejected:
            # Tally reasons; the stable sort keeps first-seen order among ties.
            reasons = {}
            for _, r in rejected:
                reasons[r] = reasons.get(r, 0) + 1
            for reason, count in sorted(reasons.items(), key=lambda x: -x[1]):
                logger.info(" Rejection reason: %s (%d)", reason, count)
        return valid, rejected

    @staticmethod
    def _has_text(value) -> bool:
        """Return True when *value* is a non-empty, non-whitespace string."""
        return bool(value and value.strip())

    def _evaluate(self, lead: Lead) -> str:
        """
        Apply the rejection rules, then normalise the phone of a passing lead.

        Returns the rejection reason, or "" when the lead is accepted. The
        lead's phone is only modified when the lead is accepted.
        """
        reason = self._check_lead(lead)
        if reason or not lead.phone:
            return reason

        formatted, is_valid = self.phone_validator.validate(lead.phone)
        if is_valid:
            lead.phone = formatted
            return ""

        # The phone is unusable. If it was the only contact method, accepting
        # the lead would yield a "valid" lead with no way to reach it.
        if not self._has_text(lead.email) and not self._has_text(lead.website):
            return "No contact info (invalid phone, no email or website)"

        lead.phone = ""  # Remove invalid phone
        return ""

    def _check_lead(self, lead: Lead) -> str:
        """
        Check a single lead. Returns rejection reason string, or "" if valid.

        This method does not modify the lead or the seen-names list.
        """
        # ── Rule 1: Must have at least one contact method ───────────────
        if not (
            self._has_text(lead.phone)
            or self._has_text(lead.email)
            or self._has_text(lead.website)
        ):
            return "No contact info (no phone, email, or website)"

        # ── Rule 2: Must have a business name ───────────────────────────
        if not self._has_text(lead.business_name):
            return "Missing business name"

        # ── Rule 3: Reject obvious tech companies ───────────────────────
        name_lower = lead.business_name.lower()
        for keyword, pattern in zip(self._tech_keywords, self._tech_patterns):
            if pattern.search(name_lower):
                # Report the raw keyword, not the regex-escaped pattern source.
                return f"Tech company detected: matches '{keyword}'"

        # Also check industry if populated
        if lead.industry:
            industry_lower = lead.industry.lower()
            if any(p.search(industry_lower) for p in self._tech_patterns):
                return f"Tech industry: {lead.industry}"

        # ── Rule 4: Fuzzy duplicate name check ──────────────────────────
        clean_name = name_lower.strip()
        threshold = config.FUZZY_MATCH_THRESHOLD
        for seen_name in self._seen_names:
            matcher = SequenceMatcher(None, clean_name, seen_name)
            # Both quick ratios are cheap upper bounds on ratio(), so skipping
            # pairs that fail them cannot change which pairs match.
            if (
                matcher.real_quick_ratio() < threshold
                or matcher.quick_ratio() < threshold
            ):
                continue
            similarity = matcher.ratio()
            if similarity >= threshold:
                return (
                    f"Fuzzy duplicate: '{lead.business_name}' "
                    f"~= '{seen_name}' ({similarity:.0%} match)"
                )

        return ""  # Valid

    def validate_single(self, lead: Lead) -> Tuple[bool, str]:
        """
        Validate a single lead without modifying internal seen-names list.

        The duplicate check runs against whatever names the most recent
        ``validate_all`` call accepted. When the lead passes, its phone is
        normalised in place exactly as in ``validate_all``.

        Returns (is_valid, reason).
        """
        reason = self._evaluate(lead)
        return (not reason, reason)