# EmailFinder / email_finder.py
# zanegraper's picture
# Updated
# 26082dc
# email_finder.py
import re
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
# Email-like tokens. The TLD class accepts upper-case letters too, so that
# addresses written as "INFO@EXAMPLE.COM" are not silently skipped.
_EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")

# Retina-style asset names such as "logo@2x.png" match the email pattern but
# are never addresses; none of these suffixes is a real TLD.
_ASSET_SUFFIXES = (".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".css", ".js")

_CANDIDATE_PATHS = ("", "about", "contact", "team", "staff", "leadership", "info")
_LINK_KEYWORDS = ("about", "contact", "team", "staff", "leadership")
_NO_EMAILS = "No emails found."
_REQUEST_TIMEOUT = 10
_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; EmailFinderBot/1.0; +https://example.com/bot)"
}


def _extract_emails(text):
    """Return the set of email-like strings in *text*, minus asset filenames."""
    return {
        candidate
        for candidate in _EMAIL_RE.findall(text)
        if not candidate.lower().endswith(_ASSET_SUFFIXES)
    }


def _fetch_html(url):
    """Return ``(final_url, body)`` for *url*, or None unless it answers 200."""
    try:
        response = requests.get(
            url, timeout=_REQUEST_TIMEOUT, headers=_REQUEST_HEADERS
        )
    except (requests.RequestException, ValueError):
        # Best-effort crawl: an unreachable or malformed URL is skipped.
        # ValueError also covers UnicodeError from un-encodable host names.
        return None
    if response.status_code != 200:
        return None
    return response.url, response.text


def _related_links(html, page_url, site_host):
    """Yield same-site http(s) URLs linked from *html* whose href has a keyword."""
    try:
        soup = BeautifulSoup(html, "html.parser")
    except Exception:
        # A page the parser rejects must not abort the rest of the crawl.
        return
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        lowered = href.lower()
        if not any(word in lowered for word in _LINK_KEYWORDS):
            continue
        try:
            # Resolve against the page the link appears on (not the site
            # root) and drop the fragment so "/contact#form" and "/contact"
            # are fetched only once.
            target = urljoin(page_url, href).split("#", 1)[0]
            parsed = urlparse(target)
        except ValueError:
            # One malformed href must not discard the page's other links.
            continue
        if parsed.scheme not in ("http", "https"):
            continue  # mailto:, tel:, javascript: ... are not fetchable
        link_host = (parsed.hostname or "").lower()
        if link_host == site_host or link_host.endswith("." + site_host):
            yield target


def find_emails(domain):
    """
    Attempts to find emails by visiting the homepage and common subpages
    like /about, /contact, /team, etc.

    Each candidate page that answers HTTP 200 is scanned for email-like
    strings. Links on that page whose href mentions about/contact/team/
    staff/leadership are followed one level deep, but only when they are
    http(s) URLs on the same site (the domain itself, its "www." variant,
    or a subdomain) and themselves answer HTTP 200.

    Args:
        domain: Host name to crawl, without scheme (e.g. "example.com").
            Surrounding whitespace is ignored.

    Returns:
        A sorted list of the distinct addresses found, or the single-item
        list ["No emails found."] when nothing was found, including when
        *domain* is empty, blank or None (no request is made in that case).
    """
    host = str(domain or "").strip()
    if not host:
        return [_NO_EMAILS]

    base_url = f"https://{host}"
    try:
        site_host = (urlparse(base_url).hostname or "").lower()
    except ValueError:
        return [_NO_EMAILS]
    if site_host.startswith("www."):
        site_host = site_host[4:]
    if not site_host:
        return [_NO_EMAILS]

    visited_pages = set()
    found_emails = set()

    for path in _CANDIDATE_PATHS:
        url = urljoin(base_url, path)
        if url in visited_pages:
            continue
        visited_pages.add(url)

        page = _fetch_html(url)
        if page is None:
            continue
        page_url, html = page
        found_emails |= _extract_emails(html)

        # Also check nested team/contact/about pages linked from this one.
        for sub_url in _related_links(html, page_url, site_host):
            if sub_url in visited_pages:
                continue
            visited_pages.add(sub_url)
            sub_page = _fetch_html(sub_url)
            if sub_page is not None:
                found_emails |= _extract_emails(sub_page[1])

    # Sorted so repeated runs over the same pages give the same order.
    return sorted(found_emails) if found_emails else [_NO_EMAILS]