# File size: 5,674 Bytes
# 4655858
import re
import os
from typing import Optional
from text_extractor import JobCore
from llm_client import google_search
from metrics import log_metric
def patch_missing(core: JobCore) -> JobCore:
    """Fill in missing JobCore fields from Google search results.

    Patching is skipped entirely when the ``GOOGLE_PATCH_ENABLED``
    environment variable is set to anything other than ``true``, ``1`` or
    ``yes`` (case-insensitive, surrounding whitespace ignored; unset means
    enabled), or when ``core.company`` is empty. In both cases no metric is
    logged.

    Args:
        core: Job record to patch. It is modified in place: salary, funding,
            mission and location are only written when currently falsy, and
            every field that gets filled is recorded in ``core.source_map``
            with the value ``"google"``.

    Returns:
        The same ``core`` object that was passed in.
    """
    enabled = os.getenv("GOOGLE_PATCH_ENABLED", "true").strip().lower()
    if enabled not in ("true", "1", "yes"):
        return core

    # Every lookup is keyed on the company name; without it nothing can be
    # searched for.
    if not core.company:
        return core

    patches_applied = 0

    # Salary is the odd one out: it also needs the role and fills two
    # attributes from a single (low, high) result.
    if not core.salary_low and not core.salary_high:
        salary_info = _patch_salary(core.company, core.role)
        if salary_info:
            core.salary_low, core.salary_high = salary_info
            core.source_map["salary"] = "google"
            patches_applied += 1

    # The remaining fields all follow the same shape: one attribute, one
    # helper that takes the company name and returns a value or None.
    single_field_patchers = (
        ("funding", _patch_funding),
        ("mission", _patch_mission),
        ("location", _patch_location),
    )
    for field, patcher in single_field_patchers:
        if getattr(core, field):
            continue
        value = patcher(core.company)
        if value:
            setattr(core, field, value)
            core.source_map[field] = "google"
            patches_applied += 1

    log_metric("patch_missing", {
        "company": core.company,
        "patches_applied": patches_applied,
        "source_map": core.source_map
    })
    return core
# One salary range, e.g. "$120k-$180k", "$150,000 - $200,000",
# "$120,000 to $180,000" or "120k-180k per year". Each bound is captured as a
# whole number (thousands separators included) with its own optional "k".
_SALARY_RANGE_RE = re.compile(
    r'(?<![\d.,])'                                      # never start mid-number
    r'(?:(?P<dollar>\$)\s*)?'
    r'(?P<low>\d{1,3}(?:,\d{3})+|\d+)(?P<low_k>k\b)?'
    r'(?:\s*[-–]\s*|\s+to\s+)'
    r'\$?\s*'
    r'(?P<high>\d{1,3}(?:,\d{3})+|\d+)(?P<high_k>k\b)?'
    r'(?P<per_year>\s*(?:per|/)?\s*year)?',
    re.IGNORECASE,
)


def _salary_amount(digits: str, has_k: bool, other_has_k: bool) -> int:
    """Convert one captured bound to whole dollars."""
    value = int(digits.replace(",", ""))
    # "$120-180k" is shorthand for 120k-180k: a bare sub-1000 bound inherits
    # the "k" written on the other bound.
    if has_k or (other_has_k and value < 1000):
        value *= 1000
    return value


def _extract_salary_range(text: str) -> Optional[tuple[int, int]]:
    """Return the first plausible (low, high) salary range found in *text*."""
    for match in _SALARY_RANGE_RE.finditer(text):
        # A bare "2019-2021" is not a salary: require a leading "$" or a
        # trailing "per year" / "/year" marker.
        if not (match.group("dollar") or match.group("per_year")):
            continue
        low_k = bool(match.group("low_k"))
        high_k = bool(match.group("high_k"))
        low = _salary_amount(match.group("low"), low_k, high_k)
        high = _salary_amount(match.group("high"), high_k, low_k)
        # Sanity check: reasonable annual salary range.
        if 30000 <= low < high <= 500000:
            return (low, high)
    return None


def _patch_salary(company: str, role: str, *, search=None) -> Optional[tuple[int, int]]:
    """Search for salary information and extract a (low, high) range.

    Args:
        company: Company name; quoted verbatim in the search query.
        role: Job title; quoted verbatim in the search query.
        search: Optional callable invoked as
            ``search(query, top=3, timeout=5)`` in place of ``google_search``
            (for example another backend or canned snippets). It is assumed
            to return an iterable of snippet strings; a falsy result is
            treated as "no snippets".

    Returns:
        ``(low, high)`` in whole dollars for the first range that lies within
        30,000-500,000 with ``low < high``, or ``None`` when *company* or
        *role* is empty or no snippet contains such a range.
    """
    if not company or not role:
        return None

    fetch = google_search if search is None else search
    query = f'"{company}" "{role}" salary range'
    for snippet in fetch(query, top=3, timeout=5) or []:
        salary_range = _extract_salary_range(snippet)
        if salary_range:
            return salary_range
    return None
# A dollar amount together with its unit, e.g. "$50", "$12.5M", "$1,200",
# "$50 million". Keeping the unit inside the amount stops "raised $50 million"
# from being reported as just "raised $50".
_FUNDING_AMOUNT = (
    r'\$\d+(?:,\d{3})*(?:\.\d+)?'
    r'(?:\s?(?:million|billion)|[MB]\b)?'
)

# Tried in order for every snippet; the first hit wins.
_FUNDING_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        rf'raised {_FUNDING_AMOUNT}',
        rf'Series [A-Z] {_FUNDING_AMOUNT}',
        rf'{_FUNDING_AMOUNT} (?:Series|round|funding)',
        r'\$\d+(?:,\d{3})*(?:\.\d+)?\s?(?:million|billion)',
    )
)


def _patch_funding(company: str, *, search=None) -> Optional[str]:
    """Search for funding information.

    Args:
        company: Company name; quoted verbatim in the search query.
        search: Optional callable invoked as
            ``search(query, top=3, timeout=5)`` in place of ``google_search``.
            It is assumed to return an iterable of snippet strings; a falsy
            result is treated as "no snippets".

    Returns:
        The matched funding phrase exactly as it appears in the snippet
        (for example ``"raised $50 million"`` or ``"Series B $30M"``),
        truncated to 50 characters, or ``None`` when *company* is empty or
        nothing matches.
    """
    if not company:
        return None

    fetch = google_search if search is None else search
    query = f'"{company}" funding round raised'
    for snippet in fetch(query, top=3, timeout=5) or []:
        for pattern in _FUNDING_PATTERNS:
            match = pattern.search(snippet)
            if match:
                return match.group(0)[:50]  # Limit length
    return None
# Words that suggest a sentence describes what the company does.
_MISSION_KEYWORDS = ('build', 'create', 'develop', 'provide', 'help', 'enable', 'platform')

# Sentence-ending punctuation only counts when followed by whitespace or the
# end of the text, so "Acme.io" and "3.5" are not cut in half.
_SENTENCE_BREAK_RE = re.compile(r'[.!?]+(?:\s+|$)')


def _patch_mission(company: str, *, search=None) -> Optional[str]:
    """Search for a company mission/tagline sentence.

    Args:
        company: Company name; quoted verbatim in the search query.
        search: Optional callable invoked as
            ``search(query, top=3, timeout=5)`` in place of ``google_search``.
            It is assumed to return an iterable of snippet strings; a falsy
            result is treated as "no snippets".

    Returns:
        The first sentence (without its closing punctuation) that is longer
        than 20 and shorter than 200 characters and contains one of the
        mission keywords as a case-insensitive substring, or ``None`` when
        *company* is empty or no sentence qualifies.
    """
    if not company:
        return None

    fetch = google_search if search is None else search
    query = f'"{company}" company mission tagline about'
    for snippet in fetch(query, top=3, timeout=5) or []:
        for sentence in _SENTENCE_BREAK_RE.split(snippet):
            sentence = sentence.strip()
            if not 20 < len(sentence) < 200:
                continue
            lowered = sentence.lower()
            if any(keyword in lowered for keyword in _MISSION_KEYWORDS):
                return sentence
    return None
# "City, ST" with an optional second capitalised word in front, so that
# "San Francisco, CA" is captured whole rather than as "Francisco, CA".
# The trailing \b keeps "Cambridge, MAINE" from being read as "Cambridge, MA".
_US_CITY_STATE_RE = re.compile(
    r'\b(?:[A-Z][a-z]+\s+)?[A-Z][a-z]+,\s*(?P<state>[A-Z]{2})\b'
)

# Sanity check for common US locations: only these state codes are accepted.
_ACCEPTED_STATE_CODES = frozenset({'CA', 'NY', 'WA', 'TX', 'MA'})


def _patch_location(company: str, *, search=None) -> Optional[str]:
    """Search for the company's headquarters location.

    Only US-style "City, ST" locations in an accepted state are recognised.
    A "City, Country" form is not returned because it cannot contain one of
    the accepted two-letter state codes.

    Args:
        company: Company name; quoted verbatim in the search query.
        search: Optional callable invoked as
            ``search(query, top=3, timeout=5)`` in place of ``google_search``.
            It is assumed to return an iterable of snippet strings; a falsy
            result is treated as "no snippets".

    Returns:
        The first "City, ST" text (one- or two-word city, spacing as written
        in the snippet) whose state code is one of CA, NY, WA, TX or MA, or
        ``None`` when *company* is empty or nothing qualifies.
    """
    if not company:
        return None

    fetch = google_search if search is None else search
    query = f'"{company}" headquarters location'
    for snippet in fetch(query, top=3, timeout=5) or []:
        # Scan every candidate so an unaccepted hit earlier in the snippet
        # does not hide a valid one later on.
        for match in _US_CITY_STATE_RE.finditer(snippet):
            if match.group("state") in _ACCEPTED_STATE_CODES:
                return match.group(0).strip()
    return None