Gamortsey committed on
Commit
87e5329
·
verified ·
1 Parent(s): 5916940

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -151
app.py CHANGED
@@ -124,150 +124,19 @@ def extract_phones(text, region="GH"):
124
  pass
125
  return list(set(phones))
126
 
127
- # ---------- REPLACE scrape_contacts WITH THIS FUNCTION ----------
128
- def _fetch_url_text(url, timeout=10):
129
- """Fetch url and return BeautifulSoup-parsed object and raw text (or (None, ""))"""
130
- try:
131
- r = requests.get(url, headers=HEADERS, timeout=timeout, allow_redirects=True)
132
- if not r.ok or not r.text:
133
- return None, ""
134
- soup = BeautifulSoup(r.text, "html.parser")
135
- text = soup.get_text(separator=" ")
136
- text = " ".join(text.split())[:300000]
137
- return soup, text
138
- except Exception as e:
139
- # network/DNS errors will be logged by caller
140
- return None, ""
141
-
142
- def _extract_emails_from_soup(soup, text):
143
- """Return list of unique candidate emails found in anchors, JSON-LD, meta, and text."""
144
- emails = set()
145
-
146
- # 1) mailto: links
147
- try:
148
- for a in soup.find_all("a", href=True):
149
- href = a["href"].strip()
150
- if href.startswith("mailto:"):
151
- # mailto may contain name and params -> split
152
- mail = href.split("mailto:")[1].split("?")[0]
153
- if EMAIL_REGEX.fullmatch(mail):
154
- emails.add(mail)
155
- except Exception:
156
- pass
157
-
158
- # 2) JSON-LD structured data (common for org pages)
159
- try:
160
- for script in soup.find_all("script", type="application/ld+json"):
161
- try:
162
- import json
163
- data = json.loads(script.string or "{}")
164
- # walk data for email fields (simple)
165
- def walk(o):
166
- if isinstance(o, dict):
167
- for k,v in o.items():
168
- if isinstance(v, (dict,list)):
169
- walk(v)
170
- else:
171
- if isinstance(v, str) and EMAIL_REGEX.search(v):
172
- emails.add(EMAIL_REGEX.search(v).group(0))
173
- elif isinstance(o, list):
174
- for it in o:
175
- walk(it)
176
- walk(data)
177
- except Exception:
178
- continue
179
- except Exception:
180
- pass
181
-
182
- # 3) meta tags
183
- try:
184
- for meta in soup.find_all("meta"):
185
- for attr in ("content","name"):
186
- if meta.get(attr) and isinstance(meta.get(attr), str):
187
- m = EMAIL_REGEX.search(meta.get(attr))
188
- if m:
189
- emails.add(m.group(0))
190
- except Exception:
191
- pass
192
-
193
- # 4) text regex fallback
194
- try:
195
- for m in EMAIL_REGEX.findall(text or ""):
196
- emails.add(m)
197
- except Exception:
198
- pass
199
-
200
- return list(emails)
201
-
202
  def scrape_contacts(url, region="GH"):
203
- """
204
- Robustly scrape the given URL for emails and phones.
205
- Strategy:
206
- 1) Fetch the page, extract mailto and regex emails.
207
- 2) If none found, try common contact/about/team URLs (bounded attempts).
208
- 3) Return {"emails": [..], "phones": [..]}
209
- """
210
- urls_tried = set()
211
  try:
212
- # normalize url
213
- orig = url or ""
214
- if not orig:
215
  return {"emails": [], "phones": []}
216
- # ensure scheme
217
- if not orig.startswith("http"):
218
- orig = "http://" + orig
219
- # first fetch main page
220
- soup, text = _fetch_url_text(orig)
221
- urls_tried.add(orig)
222
- emails = []
223
- phones = []
224
-
225
- if soup or text:
226
- emails = _extract_emails_from_soup(soup if soup else BeautifulSoup("", "html.parser"), text)
227
- phones = extract_phones(text or "", region)
228
-
229
- # If we have no emails, attempt a small set of common contact pages (bounded)
230
- if not emails:
231
- contact_paths = ["/contact", "/contact-us", "/contact-us/", "/contact.html",
232
- "/about", "/about-us", "/team", "/staff", "/contactus"]
233
- # prefer same host; build base url
234
- try:
235
- from urllib.parse import urljoin
236
- for p in contact_paths:
237
- next_url = urljoin(orig, p)
238
- if next_url in urls_tried:
239
- continue
240
- soup2, text2 = _fetch_url_text(next_url)
241
- urls_tried.add(next_url)
242
- if not soup2 and not text2:
243
- continue
244
- emails2 = _extract_emails_from_soup(soup2 if soup2 else BeautifulSoup("", "html.parser"), text2)
245
- phones2 = extract_phones(text2 or "", region)
246
- if emails2:
247
- emails = emails2
248
- if phones2 and not phones:
249
- phones = phones2
250
- # stop early if found emails
251
- if emails:
252
- break
253
- except Exception:
254
- pass
255
-
256
- # Final dedup & sanitization: prefer readable emails
257
- final_emails = []
258
- for e in emails:
259
- if isinstance(e, str) and EMAIL_REGEX.fullmatch(e):
260
- final_emails.append(e.strip())
261
- final_emails = list(dict.fromkeys(final_emails)) # preserve order unique
262
-
263
- final_phones = list(dict.fromkeys(phones))
264
-
265
- return {"emails": final_emails, "phones": final_phones}
266
  except Exception as e:
267
  print(f"[scrape error] {url} -> {e}")
268
  return {"emails": [], "phones": []}
269
- # ---------- END scrape_contacts replacement ----------
270
-
271
 
272
  # ============================
273
  # NER + STORY → PROFESSIONS
@@ -387,17 +256,13 @@ def find_professionals_from_story(story, country=DEFAULT_COUNTRY, results_per_qu
387
  "source_query": r.get("query","")
388
  })
389
 
390
- # Second pass: for entries with "Not found", try a focused contact path (sequentially, bounded)
391
- for p in professionals:
392
- if p["email"] == "Not found":
393
- try:
394
- contacts = scrape_contacts(p["url"], region)
395
- if contacts["emails"]:
396
- p["email"] = contacts["emails"][0]
397
- if contacts["phones"]:
398
- p["phone"] = contacts["phones"][0]
399
- except Exception:
400
- pass
401
 
402
  # ============================
403
  # DRAFT (mailto + .eml)
@@ -420,8 +285,7 @@ def build_mailto_and_eml(to_addr, subject, body, default_from="noreply@ally.ai")
420
  f.write(msg.as_bytes())
421
 
422
  # Create mailto link (this part is fine)
423
- mailto = f"mailto:{urllib.parse.quote(to_addr)}?subject={urllib.parse.quote(subject or '')}&body={urllib.parse.quote(body or '')}"
424
-
425
 
426
  return mailto, fname
427
 
 
124
  pass
125
  return list(set(phones))
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  def scrape_contacts(url, region="GH"):
 
 
 
 
 
 
 
 
128
  try:
129
+ res = requests.get(url, headers=HEADERS, timeout=12)
130
+ if not res.ok or not res.text:
 
131
  return {"emails": [], "phones": []}
132
+ text = BeautifulSoup(res.text, "html.parser").get_text(separator=" ")
133
+ text = " ".join(text.split())[:300000]
134
+ emails = list(set(EMAIL_REGEX.findall(text)))
135
+ phones = extract_phones(text, region)
136
+ return {"emails": emails, "phones": phones}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  except Exception as e:
138
  print(f"[scrape error] {url} -> {e}")
139
  return {"emails": [], "phones": []}
 
 
140
 
141
  # ============================
142
  # NER + STORY → PROFESSIONS
 
256
  "source_query": r.get("query","")
257
  })
258
 
259
+ summary = generate_summary("; ".join(queries[:3]) + (" ..." if len(queries)>3 else ""),
260
+ list(set(all_people)), list(set(all_orgs)), list(set(all_locs)))
261
+
262
+ # Sort by availability of email/phone
263
+ professionals.sort(key=lambda it: (0 if it["email"]!="Not found" else 1,
264
+ 0 if it["phone"]!="Not found" else 1))
265
+ return {"summary": summary, "professionals": professionals, "queries_used": queries}
 
 
 
 
266
 
267
  # ============================
268
  # DRAFT (mailto + .eml)
 
285
  f.write(msg.as_bytes())
286
 
287
  # Create mailto link (this part is fine)
288
+ mailto = f"mailto:{to_addr}?subject={subject}&body={body}"
 
289
 
290
  return mailto, fname
291