File size: 5,674 Bytes
4655858
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import re
import os
from typing import Optional
from text_extractor import JobCore
from llm_client import google_search
from metrics import log_metric


def patch_missing(core: JobCore) -> JobCore:
    """Fill empty JobCore fields from Google search results.

    Patching is skipped entirely when the ``GOOGLE_PATCH_ENABLED``
    environment variable (default ``"true"``) is not one of ``true``, ``1``
    or ``yes`` (case-insensitive), or when ``core.company`` is empty.
    Each field that gets filled is recorded as ``"google"`` in
    ``core.source_map``, and a single ``patch_missing`` metric is logged
    with the number of patches applied.

    Returns:
        The same ``core`` object, mutated in place.
    """
    enabled_flag = os.getenv("GOOGLE_PATCH_ENABLED", "true").lower()
    if enabled_flag not in ("true", "1", "yes"):
        return core

    # Every lookup below is keyed on the company name.
    if not core.company:
        return core

    filled = 0

    # Salary is the only lookup that also needs the role and fills two fields;
    # it runs only when both bounds are missing.
    if not (core.salary_low or core.salary_high):
        salary_range = _patch_salary(core.company, core.role)
        if salary_range:
            low, high = salary_range
            core.salary_low = low
            core.salary_high = high
            core.source_map["salary"] = "google"
            filled += 1

    # The remaining fields share one shape: look up by company, store the
    # result under the attribute of the same name.
    single_field_lookups = (
        ("funding", _patch_funding),
        ("mission", _patch_mission),
        ("location", _patch_location),
    )
    for field_name, lookup in single_field_lookups:
        if getattr(core, field_name):
            continue
        found = lookup(core.company)
        if found:
            setattr(core, field_name, found)
            core.source_map[field_name] = "google"
            filled += 1

    log_metric("patch_missing", {
        "company": core.company,
        "patches_applied": filled,
        "source_map": core.source_map
    })

    return core


def _patch_salary(company: str, role: str) -> Optional[tuple[int, int]]:
    """Search Google for the salary range of *role* at *company*.

    Args:
        company: Company name; quoted verbatim in the search query.
        role: Job title; quoted verbatim in the search query.

    Returns:
        ``(low, high)`` in whole dollars, taken from the first snippet that
        contains a plausible range, or ``None`` when either argument is
        empty or no snippet yields one.
    """
    if not company or not role:
        return None

    query = f'"{company}" "{role}" salary range'
    # `or ()` keeps the loop safe should the search client hand back None
    # instead of an empty sequence.
    snippets = google_search(query, top=3, timeout=5) or ()

    for snippet in snippets:
        salary_range = _parse_salary_range(snippet)
        if salary_range is not None:
            return salary_range

    return None


# One amount: comma-grouped thousands ("150,000") or a plain run of digits.
_SALARY_AMOUNT = r'(?:\d{1,3}(?:,\d{3})+|\d+)'
# Hyphen, en/em dash or the word "to", with optional surrounding whitespace.
_SALARY_SEPARATOR = r'\s*(?:[-–—]|to)\s*'
_SALARY_PATTERNS = (
    # Dollar-prefixed: "$120k-$180k", "$150,000 - $200,000", "$120-180k".
    re.compile(
        r'\$\s*(?P<low>' + _SALARY_AMOUNT + r')(?P<low_k>k)?'
        + _SALARY_SEPARATOR
        + r'\$?\s*(?P<high>' + _SALARY_AMOUNT + r')(?P<high_k>k)?',
        re.IGNORECASE,
    ),
    # No dollar sign, so require a trailing "year": "120k-180k per year".
    re.compile(
        r'(?P<low>' + _SALARY_AMOUNT + r')(?P<low_k>k)?'
        + _SALARY_SEPARATOR
        + r'(?P<high>' + _SALARY_AMOUNT + r')(?P<high_k>k)?'
        + r'\s*(?:per|/)?\s*year',
        re.IGNORECASE,
    ),
)


def _parse_salary_range(text: str) -> Optional[tuple[int, int]]:
    """Return the first plausible (low, high) salary range found in *text*."""
    for pattern in _SALARY_PATTERNS:
        # finditer, not search: an early implausible hit (e.g. a small bonus
        # range) must not hide a valid range later in the same text.
        for match in pattern.finditer(text):
            low = int(match['low'].replace(',', ''))
            high = int(match['high'].replace(',', ''))
            # Shorthand such as "$120-180k" carries the suffix on one bound
            # only, so a "k" on either side scales both.
            if match['low_k'] or match['high_k']:
                low *= 1000
                high *= 1000
            # Sanity check: both bounds within 30k-500k and strictly ordered.
            if 30000 <= low < high <= 500000:
                return (low, high)
    return None


def _patch_funding(company: str) -> Optional[str]:
    """Search Google for a funding mention for *company*.

    Args:
        company: Company name; quoted verbatim in the search query.

    Returns:
        The matched funding phrase (for example ``"raised $50 million"`` or
        ``"Series B $30M"``), truncated to 50 characters, or ``None`` when
        *company* is empty or no snippet contains a recognised phrase.
    """
    if not company:
        return None

    query = f'"{company}" funding round raised'
    # `or ()` keeps the loop safe should the search client hand back None
    # instead of an empty sequence.
    snippets = google_search(query, top=3, timeout=5) or ()

    for snippet in snippets:
        for pattern in _FUNDING_PATTERNS:
            match = pattern.search(snippet)
            if match:
                return match.group(0)[:50]  # Limit length

    return None


# A dollar amount with its magnitude attached. The unit is part of the amount
# so that "raised $50 million" is not cut down to "raised $50". Single-letter
# units must touch the digits, which stops "$50 more" from reading as "$50 m".
_FUNDING_AMOUNT = r'\$\d+(?:\.\d+)?(?:\s?(?:million|billion)|[MB]n?|K)?'
_FUNDING_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'raised ' + _FUNDING_AMOUNT,
        r'Series [A-Z] ' + _FUNDING_AMOUNT,
        _FUNDING_AMOUNT + r' (?:Series|round|funding)',
        # Bare amount: accepted only with a spelled-out unit.
        r'\$\d+(?:\.\d+)?\s?(?:million|billion)',
    )
)


def _patch_mission(company: str) -> Optional[str]:
    """Look up a mission-style sentence for *company* via Google search.

    Each snippet is split on runs of ``.``, ``!`` and ``?``. The first
    fragment that is longer than 20 and shorter than 200 characters after
    stripping, and that contains one of the action keywords
    (case-insensitive substring match), is returned.

    Returns:
        The stripped fragment, or ``None`` when *company* is empty or no
        fragment qualifies.
    """
    if not company:
        return None

    keywords = ('build', 'create', 'develop', 'provide', 'help', 'enable', 'platform')
    hits = google_search(f'"{company}" company mission tagline about', top=3, timeout=5)

    # Flatten snippets into stripped sentence fragments, preserving order.
    fragments = (
        piece.strip()
        for text in hits
        for piece in re.split(r'[.!?]+', text)
    )

    for candidate in fragments:
        if not 20 < len(candidate) < 200:
            continue
        lowered = candidate.lower()
        if any(keyword in lowered for keyword in keywords):
            return candidate

    return None


def _patch_location(company: str) -> Optional[str]:
    """Search Google for the headquarters location of *company*.

    Only "City, ST" locations whose two-letter state code is one of
    CA, NY, WA, TX or MA are accepted. City names of one to three
    capitalised words are recognised, so "San Francisco, CA" is returned
    whole rather than as "Francisco, CA".

    Args:
        company: Company name; quoted verbatim in the search query.

    Returns:
        The matched "City, ST" text, or ``None`` when *company* is empty or
        no snippet contains an accepted location.
    """
    if not company:
        return None

    query = f'"{company}" headquarters location'
    # `or ()` keeps the loop safe should the search client hand back None
    # instead of an empty sequence.
    snippets = google_search(query, top=3, timeout=5) or ()

    for snippet in snippets:
        # finditer, not search: a leading out-of-list location such as
        # "Denver, CO" must not hide an accepted one later in the snippet.
        for match in _CITY_STATE_RE.finditer(snippet):
            if match['state'] in _ACCEPTED_STATES:
                return match.group(0)

    return None


# State codes accepted by the sanity check (common US tech-hub states).
_ACCEPTED_STATES = frozenset({'CA', 'NY', 'WA', 'TX', 'MA'})
# Capitalised words that often sit directly before a city in a snippet but are
# not part of its name. This is a heuristic: any other capitalised word
# immediately preceding the city is still absorbed into the match.
_NOT_A_CITY_WORD = (
    r'(?!(?:Headquarters|Headquartered|Location|Located|Office|Offices'
    r'|Address|Based|In|At)\b)'
)
_CITY_WORD = _NOT_A_CITY_WORD + r'[A-Z][a-z]+'
# One to three capitalised words, a comma, then a two-letter state code.
# A "City, Country" form is deliberately absent: the state check above can
# never accept it.
_CITY_STATE_RE = re.compile(
    r'\b' + _CITY_WORD + r'(?:\s+' + _CITY_WORD + r'){0,2}'
    r',\s*(?P<state>[A-Z]{2})\b'
)