Spaces:
Sleeping
Sleeping
| # email_finder.py | |
| import requests | |
| import re | |
| from urllib.parse import urljoin | |
| from bs4 import BeautifulSoup | |
def find_emails(domain):
    """Collect email addresses published on a site's homepage and common subpages.

    The homepage and a fixed list of candidate paths (``about``, ``contact``,
    ``team``, ...) are fetched over HTTPS. Every page that answers with HTTP
    200 is scanned for email-like strings, and its ``<a href>`` links whose
    target mentions one of the keywords are followed one level deep. Only
    http(s) links on the same site (the domain itself or one of its
    subdomains) are followed, so addresses from unrelated external pages are
    not attributed to ``domain``.

    The crawl is best-effort: pages that cannot be fetched or parsed are
    skipped silently.

    Args:
        domain: Host name of the site to inspect, without scheme
            (e.g. ``"example.com"``).

    Returns:
        A sorted list of the distinct addresses found, or the single-element
        list ``["No emails found."]`` when nothing matched.
    """
    # Function-scope import: the file header only imports ``urljoin``.
    from urllib.parse import urldefrag, urlparse

    base_url = f"https://{domain}"
    candidate_paths = ["", "about", "contact", "team", "staff", "leadership", "info"]
    link_keywords = ("about", "contact", "team", "staff", "leadership")
    # Compiled once; the TLD part accepts upper case so "Sales@Example.COM" matches.
    email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; EmailFinderBot/1.0; +https://example.com/bot)"
    }

    # Host used for the same-site check; "www." is dropped so that
    # "example.com" and "www.example.com" count as the same site.
    site_host = (urlparse(base_url).hostname or "").lower()
    if site_host.startswith("www."):
        site_host = site_host[4:]

    def is_same_site(url):
        """Return True for http(s) URLs on the target host or a subdomain of it."""
        try:
            parts = urlparse(url)
        except ValueError:
            return False
        if parts.scheme not in ("http", "https") or not site_host:
            return False
        host = (parts.hostname or "").lower()
        return host == site_host or host.endswith("." + site_host)

    def fetch(url):
        """Return the response for ``url`` if it answered HTTP 200, else None."""
        try:
            response = requests.get(url, timeout=10, headers=headers)
        except (requests.RequestException, ValueError):
            # Network failure or a URL that could not be parsed: skip the page.
            return None
        return response if response.status_code == 200 else None

    visited_pages = set()
    found_emails = set()

    for path in candidate_paths:
        url = urljoin(base_url, path)
        if url in visited_pages:
            continue
        visited_pages.add(url)

        response = fetch(url)
        if response is None:
            continue
        # Remember the post-redirect URL too so a link back to it is not refetched.
        visited_pages.add(response.url)

        content = response.text
        found_emails.update(email_pattern.findall(content))

        # Look for links to nested team/contact/about pages.
        try:
            anchors = BeautifulSoup(content, "html.parser").find_all("a", href=True)
        except Exception:
            # Best-effort crawl: markup the parser rejects must not abort the
            # whole run; the page's own emails were already collected above.
            continue

        for anchor in anchors:
            href = anchor["href"]
            if not any(word in href.lower() for word in link_keywords):
                continue
            try:
                # Resolve against the page the link was found on (after any
                # redirect) and drop the fragment, which never changes the
                # document that is served.
                sub_url = urldefrag(urljoin(response.url, href)).url
            except ValueError:
                continue
            if sub_url in visited_pages or not is_same_site(sub_url):
                continue
            visited_pages.add(sub_url)

            sub_response = fetch(sub_url)
            if sub_response is not None:
                found_emails.update(email_pattern.findall(sub_response.text))

    # Sorted so repeated runs over the same pages give the same order.
    return sorted(found_emails) if found_emails else ["No emails found."]