File size: 5,096 Bytes
beec01d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
processors/validator.py — Lead quality validation and rejection filters.

Rejects leads that are:
  - Missing all contact info (no phone AND no email AND no website)
  - Missing a business name
  - Duplicate names across sources (fuzzy match)
  - Obviously tech companies (they don't need our services)

Also cleans and validates phone numbers using PhoneValidator.
"""

import re
from difflib import SequenceMatcher
from typing import List, Set, Tuple

import config
from models import Lead
from utils.logger import get_logger
from utils.phone_validator import PhoneValidator

logger = get_logger(__name__)


class LeadValidator:
    """
    Quality gate that filters out low-quality or irrelevant leads.

    A lead is rejected when it has no usable contact method, has no
    business name, has a name or industry containing one of
    ``config.TECH_COMPANY_KEYWORDS``, or has a name that fuzzily matches a
    lead already accepted in the current ``validate_all`` batch.  Phone
    numbers of accepted leads are normalised in place through
    ``PhoneValidator``.

    Usage::

        validator = LeadValidator()
        valid_leads, rejected = validator.validate_all(leads)
    """

    def __init__(self):
        self.phone_validator = PhoneValidator()
        # Lower-cased, stripped names of the leads accepted so far; reset at
        # the start of every validate_all() call.
        self._seen_names: List[str] = []
        # Keywords are escaped, so each one is matched as a literal,
        # case-insensitive substring.
        self._tech_patterns = [
            re.compile(re.escape(kw), re.I)
            for kw in config.TECH_COMPANY_KEYWORDS
        ]

    def validate_all(
        self, leads: List[Lead]
    ) -> Tuple[List[Lead], List[Tuple[Lead, str]]]:
        """
        Validate all leads and return (valid_leads, rejected_list).

        Each rejected entry is a tuple of (lead, reason_string).  The
        seen-names list used for duplicate detection is reset first, so
        duplicates are only detected within a single call.  Accepted leads
        have their ``phone`` rewritten in place (formatted, or cleared when
        invalid); rejected leads are left untouched.
        """
        valid = []
        rejected = []
        self._seen_names = []

        for lead in leads:
            # Rule checks first; the phone is only inspected for leads that
            # passed them.  An invalid phone can still cause a rejection
            # when it was the lead's only contact method.
            reason = self._check_lead(lead) or self._clean_phone(lead)
            if reason:
                rejected.append((lead, reason))
                logger.debug(
                    f"Rejected: {lead.business_name!r} — {reason}"
                )
                continue

            self._seen_names.append(lead.business_name.lower().strip())
            valid.append(lead)

        logger.info(
            f"Validation: {len(leads)} leads -> "
            f"{len(valid)} valid, {len(rejected)} rejected"
        )
        if rejected:
            # Summarise rejections, most frequent reason first.
            reasons = {}
            for _, r in rejected:
                reasons[r] = reasons.get(r, 0) + 1
            for reason, count in sorted(reasons.items(), key=lambda x: -x[1]):
                logger.info(f"  Rejection reason: {reason} ({count})")

        return valid, rejected

    @staticmethod
    def _has_text(value) -> bool:
        """Return True when *value* is a string with non-whitespace content."""
        return bool(value and value.strip())

    def _clean_phone(self, lead: Lead) -> str:
        """
        Normalise ``lead.phone`` in place for a lead that passed the rules.

        Returns a rejection reason when the phone is invalid and the lead
        has no email or website to fall back on (it would otherwise be
        accepted with no contact info at all); in that case the lead is not
        modified.  Returns "" otherwise, after replacing the phone with its
        formatted form, or with "" when it is invalid but another contact
        method exists.
        """
        if not lead.phone:
            return ""

        formatted, is_valid = self.phone_validator.validate(lead.phone)
        if is_valid:
            lead.phone = formatted
            return ""

        if not self._has_text(lead.email) and not self._has_text(lead.website):
            return "No contact info (invalid phone, no email or website)"

        lead.phone = ""  # Remove invalid phone, keep the lead
        return ""

    def _check_lead(self, lead: Lead) -> str:
        """
        Check a single lead. Returns rejection reason string, or "" if valid.

        Rules are applied in order and the first failure wins.  The lead is
        not modified and the seen-names list is only read.
        """
        # Rule 1: Must have at least one contact method
        if not (
            self._has_text(lead.phone)
            or self._has_text(lead.email)
            or self._has_text(lead.website)
        ):
            return "No contact info (no phone, email, or website)"

        # Rule 2: Must have a business name
        if not self._has_text(lead.business_name):
            return "Missing business name"

        # Rule 3: Reject obvious tech companies
        name_lower = lead.business_name.lower()
        for pattern in self._tech_patterns:
            if pattern.search(name_lower):
                return f"Tech company detected: matches '{pattern.pattern}'"

        # Also check the industry field if populated
        if lead.industry:
            industry_lower = lead.industry.lower()
            for pattern in self._tech_patterns:
                if pattern.search(industry_lower):
                    return f"Tech industry: {lead.industry}"

        # Rule 4: Fuzzy duplicate name check against accepted leads
        clean_name = name_lower.strip()
        for seen_name in self._seen_names:
            similarity = SequenceMatcher(
                None, clean_name, seen_name
            ).ratio()
            if similarity >= config.FUZZY_MATCH_THRESHOLD:
                return (
                    f"Fuzzy duplicate: '{lead.business_name}' "
                    f"~= '{seen_name}' ({similarity:.0%} match)"
                )

        return ""  # Valid

    def validate_single(self, lead: Lead) -> Tuple[bool, str]:
        """
        Validate a single lead without modifying internal seen-names list.
        Returns (is_valid, reason).

        Duplicate detection still compares against names recorded by the
        most recent ``validate_all`` call.  When the lead is valid its
        ``phone`` is normalised in place, exactly as in ``validate_all``.
        """
        reason = self._check_lead(lead) or self._clean_phone(lead)
        return (not reason, reason)