Gamortsey committed on
Commit
87e5329
·
verified ·
1 Parent(s): 5916940

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -151
app.py CHANGED
@@ -124,150 +124,19 @@ def extract_phones(text, region="GH"):
124
  pass
125
  return list(set(phones))
126
 
127
- # ---------- REPLACE scrape_contacts WITH THIS FUNCTION ----------
128
- def _fetch_url_text(url, timeout=10):
129
- """Fetch url and return BeautifulSoup-parsed object and raw text (or (None, ""))"""
130
- try:
131
- r = requests.get(url, headers=HEADERS, timeout=timeout, allow_redirects=True)
132
- if not r.ok or not r.text:
133
- return None, ""
134
- soup = BeautifulSoup(r.text, "html.parser")
135
- text = soup.get_text(separator=" ")
136
- text = " ".join(text.split())[:300000]
137
- return soup, text
138
- except Exception as e:
139
- # network/DNS errors will be logged by caller
140
- return None, ""
141
-
142
- def _extract_emails_from_soup(soup, text):
143
- """Return list of unique candidate emails found in anchors, JSON-LD, meta, and text."""
144
- emails = set()
145
-
146
- # 1) mailto: links
147
- try:
148
- for a in soup.find_all("a", href=True):
149
- href = a["href"].strip()
150
- if href.startswith("mailto:"):
151
- # mailto may contain name and params -> split
152
- mail = href.split("mailto:")[1].split("?")[0]
153
- if EMAIL_REGEX.fullmatch(mail):
154
- emails.add(mail)
155
- except Exception:
156
- pass
157
-
158
- # 2) JSON-LD structured data (common for org pages)
159
- try:
160
- for script in soup.find_all("script", type="application/ld+json"):
161
- try:
162
- import json
163
- data = json.loads(script.string or "{}")
164
- # walk data for email fields (simple)
165
- def walk(o):
166
- if isinstance(o, dict):
167
- for k,v in o.items():
168
- if isinstance(v, (dict,list)):
169
- walk(v)
170
- else:
171
- if isinstance(v, str) and EMAIL_REGEX.search(v):
172
- emails.add(EMAIL_REGEX.search(v).group(0))
173
- elif isinstance(o, list):
174
- for it in o:
175
- walk(it)
176
- walk(data)
177
- except Exception:
178
- continue
179
- except Exception:
180
- pass
181
-
182
- # 3) meta tags
183
- try:
184
- for meta in soup.find_all("meta"):
185
- for attr in ("content","name"):
186
- if meta.get(attr) and isinstance(meta.get(attr), str):
187
- m = EMAIL_REGEX.search(meta.get(attr))
188
- if m:
189
- emails.add(m.group(0))
190
- except Exception:
191
- pass
192
-
193
- # 4) text regex fallback
194
- try:
195
- for m in EMAIL_REGEX.findall(text or ""):
196
- emails.add(m)
197
- except Exception:
198
- pass
199
-
200
- return list(emails)
201
-
202
  def scrape_contacts(url, region="GH"):
203
- """
204
- Robustly scrape the given URL for emails and phones.
205
- Strategy:
206
- 1) Fetch the page, extract mailto and regex emails.
207
- 2) If none found, try common contact/about/team URLs (bounded attempts).
208
- 3) Return {"emails": [..], "phones": [..]}
209
- """
210
- urls_tried = set()
211
  try:
212
- # normalize url
213
- orig = url or ""
214
- if not orig:
215
  return {"emails": [], "phones": []}
216
- # ensure scheme
217
- if not orig.startswith("http"):
218
- orig = "http://" + orig
219
- # first fetch main page
220
- soup, text = _fetch_url_text(orig)
221
- urls_tried.add(orig)
222
- emails = []
223
- phones = []
224
-
225
- if soup or text:
226
- emails = _extract_emails_from_soup(soup if soup else BeautifulSoup("", "html.parser"), text)
227
- phones = extract_phones(text or "", region)
228
-
229
- # If we have no emails, attempt a small set of common contact pages (bounded)
230
- if not emails:
231
- contact_paths = ["/contact", "/contact-us", "/contact-us/", "/contact.html",
232
- "/about", "/about-us", "/team", "/staff", "/contactus"]
233
- # prefer same host; build base url
234
- try:
235
- from urllib.parse import urljoin
236
- for p in contact_paths:
237
- next_url = urljoin(orig, p)
238
- if next_url in urls_tried:
239
- continue
240
- soup2, text2 = _fetch_url_text(next_url)
241
- urls_tried.add(next_url)
242
- if not soup2 and not text2:
243
- continue
244
- emails2 = _extract_emails_from_soup(soup2 if soup2 else BeautifulSoup("", "html.parser"), text2)
245
- phones2 = extract_phones(text2 or "", region)
246
- if emails2:
247
- emails = emails2
248
- if phones2 and not phones:
249
- phones = phones2
250
- # stop early if found emails
251
- if emails:
252
- break
253
- except Exception:
254
- pass
255
-
256
- # Final dedup & sanitization: prefer readable emails
257
- final_emails = []
258
- for e in emails:
259
- if isinstance(e, str) and EMAIL_REGEX.fullmatch(e):
260
- final_emails.append(e.strip())
261
- final_emails = list(dict.fromkeys(final_emails)) # preserve order unique
262
-
263
- final_phones = list(dict.fromkeys(phones))
264
-
265
- return {"emails": final_emails, "phones": final_phones}
266
  except Exception as e:
267
  print(f"[scrape error] {url} -> {e}")
268
  return {"emails": [], "phones": []}
269
- # ---------- END scrape_contacts replacement ----------
270
-
271
 
272
  # ============================
273
  # NER + STORY → PROFESSIONS
@@ -387,17 +256,13 @@ def find_professionals_from_story(story, country=DEFAULT_COUNTRY, results_per_qu
387
  "source_query": r.get("query","")
388
  })
389
 
390
- # Second pass: for entries with "Not found", try a focused contact path (sequentially, bounded)
391
- for p in professionals:
392
- if p["email"] == "Not found":
393
- try:
394
- contacts = scrape_contacts(p["url"], region)
395
- if contacts["emails"]:
396
- p["email"] = contacts["emails"][0]
397
- if contacts["phones"]:
398
- p["phone"] = contacts["phones"][0]
399
- except Exception:
400
- pass
401
 
402
  # ============================
403
  # DRAFT (mailto + .eml)
@@ -420,8 +285,7 @@ def build_mailto_and_eml(to_addr, subject, body, default_from="noreply@ally.ai")
420
  f.write(msg.as_bytes())
421
 
422
  # Create mailto link (this part is fine)
423
- mailto = f"mailto:{urllib.parse.quote(to_addr)}?subject={urllib.parse.quote(subject or '')}&body={urllib.parse.quote(body or '')}"
424
-
425
 
426
  return mailto, fname
427
 
 
124
  pass
125
  return list(set(phones))
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  def scrape_contacts(url, region="GH"):
 
 
 
 
 
 
 
 
128
  try:
129
+ res = requests.get(url, headers=HEADERS, timeout=12)
130
+ if not res.ok or not res.text:
 
131
  return {"emails": [], "phones": []}
132
+ text = BeautifulSoup(res.text, "html.parser").get_text(separator=" ")
133
+ text = " ".join(text.split())[:300000]
134
+ emails = list(set(EMAIL_REGEX.findall(text)))
135
+ phones = extract_phones(text, region)
136
+ return {"emails": emails, "phones": phones}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  except Exception as e:
138
  print(f"[scrape error] {url} -> {e}")
139
  return {"emails": [], "phones": []}
 
 
140
 
141
  # ============================
142
  # NER + STORY → PROFESSIONS
 
256
  "source_query": r.get("query","")
257
  })
258
 
259
+ summary = generate_summary("; ".join(queries[:3]) + (" ..." if len(queries)>3 else ""),
260
+ list(set(all_people)), list(set(all_orgs)), list(set(all_locs)))
261
+
262
+ # Sort by availability of email/phone
263
+ professionals.sort(key=lambda it: (0 if it["email"]!="Not found" else 1,
264
+ 0 if it["phone"]!="Not found" else 1))
265
+ return {"summary": summary, "professionals": professionals, "queries_used": queries}
 
 
 
 
266
 
267
  # ============================
268
  # DRAFT (mailto + .eml)
 
285
  f.write(msg.as_bytes())
286
 
287
  # Create mailto link (this part is fine)
288
+ mailto = f"mailto:{to_addr}?subject={subject}&body={body}"
 
289
 
290
  return mailto, fname
291