Spaces:

zanegraper
/

EmailFinder

Sleeping

App Files Files Community

EmailFinder / email_finder.py

zanegraper

Updated

26082dc 10 months ago

raw

history blame contribute delete

2.11 kB

	# email_finder.py

	import requests
	import re
	from urllib.parse import urljoin
	from bs4 import BeautifulSoup

	# Matches typical e-mail addresses. The TLD part accepts either letter case
	# so that addresses such as INFO@EXAMPLE.COM are not silently missed.
	_EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")

	# Substrings that mark a link as likely leading to a page listing contacts.
	_LINK_KEYWORDS = ("about", "contact", "team", "staff", "leadership")


	def _fetch_html(url, headers):
	    """Return the response body of *url* if it answers HTTP 200, else None."""
	    try:
	        response = requests.get(url, timeout=10, headers=headers)
	    except requests.RequestException:
	        # Best-effort crawl: an unreachable page must not abort the search.
	        return None
	    if response.status_code != 200:
	        return None
	    return response.text


	def _keyword_links(html, page_url):
	    """Return absolute HTTP(S) URLs of links in *html* whose href contains a keyword."""
	    try:
	        anchors = BeautifulSoup(html, "html.parser").find_all("a", href=True)
	    except Exception:
	        # Deliberately broad: markup the parser rejects should only cost us
	        # this page's links, not the whole crawl.
	        return []

	    urls = []
	    for anchor in anchors:
	        href = anchor["href"]
	        lowered = href.lower()
	        if not any(word in lowered for word in _LINK_KEYWORDS):
	            continue
	        try:
	            # Resolve against the page the link was found on so relative
	            # hrefs (e.g. "team.html" on /about/) point at the right place.
	            target = urljoin(page_url, href)
	        except ValueError:
	            # Malformed href; skip it and keep the remaining links.
	            continue
	        # Drop the fragment so "/about" and "/about#team" are fetched once.
	        target = target.split("#", 1)[0]
	        # Skip mailto:, tel:, javascript: and similar schemes that cannot be
	        # fetched over HTTP.
	        if target.startswith(("http://", "https://")):
	            urls.append(target)
	    return urls


	def find_emails(domain):
	    """
	    Attempts to find emails by visiting the homepage and common subpages
	    like /about, /contact, /team, etc.

	    Every page that answers HTTP 200 is scanned for e-mail addresses, and
	    links on it whose href mentions about/contact/team/staff/leadership are
	    followed one level deep. Fetch failures are skipped (best effort).

	    Args:
	        domain: Host name such as "example.com". A value that already starts
	            with "http://" or "https://" is used as the base URL as-is.

	    Returns:
	        A sorted list of the distinct addresses found, or the single-element
	        list ["No emails found."] when nothing was found.
	    """
	    domain = str(domain).strip()
	    if domain.startswith(("http://", "https://")):
	        base_url = domain
	    else:
	        base_url = f"https://{domain}"

	    candidate_paths = ["", "about", "contact", "team", "staff", "leadership", "info"]
	    headers = {
	        "User-Agent": "Mozilla/5.0 (compatible; EmailFinderBot/1.0; +https://example.com/bot)"
	    }
	    visited_pages = set()
	    found_emails = set()

	    for path in candidate_paths:
	        url = urljoin(base_url, path)
	        if url in visited_pages:
	            continue
	        visited_pages.add(url)

	        content = _fetch_html(url, headers)
	        if content is None:
	            continue
	        found_emails.update(_EMAIL_PATTERN.findall(content))

	        # Also check nested team/contact/about pages linked from this page.
	        # NOTE: links pointing to other hosts are followed too.
	        for sub_url in _keyword_links(content, url):
	            if sub_url in visited_pages:
	                continue
	            visited_pages.add(sub_url)
	            sub_content = _fetch_html(sub_url, headers)
	            if sub_content is not None:
	                found_emails.update(_EMAIL_PATTERN.findall(sub_content))

	    # The sentinel list is kept so existing callers see the same "empty" value.
	    return sorted(found_emails) if found_emails else ["No emails found."]