Spaces:
Sleeping
Sleeping
Commit ·
26082dc
1
Parent(s): 377a2c0
Updated
Browse files
__pycache__/email_finder.cpython-311.pyc
ADDED
|
Binary file (2.89 kB). View file
|
|
|
__pycache__/emailfinder_wrapper.cpython-311.pyc
ADDED
|
Binary file (1.18 kB). View file
|
|
|
email_finder.py
CHANGED
|
@@ -1,12 +1,56 @@
|
|
| 1 |
# email_finder.py
|
|
|
|
| 2 |
import requests
|
| 3 |
import re
|
|
|
|
|
|
|
|
|
|
| 4 |
def find_emails(domain):
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# email_finder.py
|
| 2 |
+
|
| 3 |
import requests
|
| 4 |
import re
|
| 5 |
+
from urllib.parse import urljoin
|
| 6 |
+
from bs4 import BeautifulSoup
|
| 7 |
+
|
| 8 |
def find_emails(domain):
    """Scrape publicly visible email addresses from a site's common pages.

    Fetches the homepage plus likely contact-style paths (/about, /contact,
    /team, ...), harvests anything matching an email pattern, then follows
    in-page links whose href looks contact-related and harvests those pages
    too (one level deep).

    Args:
        domain: Bare host name, e.g. "example.com" (no scheme); "https://"
            is prepended here.

    Returns:
        A list of unique email addresses found, or the sentinel list
        ["No emails found."] when nothing was discovered (kept for
        backward compatibility with existing callers).
    """
    base_url = f"https://{domain}"
    candidate_paths = ["", "about", "contact", "team", "staff", "leadership", "info"]
    visited_pages = set()
    found_emails = set()

    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; EmailFinderBot/1.0; +https://example.com/bot)"
    }

    # Compile once instead of letting re.findall re-fetch the pattern on every
    # page. IGNORECASE also catches upper-case TLDs (e.g. USER@HOST.COM),
    # which the original lowercase-only [a-z]{2,} class missed.
    email_re = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}", re.IGNORECASE)
    # Href keywords suggesting a page likely to list people/contacts;
    # hoisted so the tuple is not rebuilt for every link.
    contact_words = ("about", "contact", "team", "staff", "leadership")

    for path in candidate_paths:
        url = urljoin(base_url, path)
        if url in visited_pages:
            continue
        visited_pages.add(url)
        try:
            response = requests.get(url, timeout=10, headers=headers)
            if response.status_code != 200:
                continue
            content = response.text

            # Extract emails from the page itself.
            found_emails.update(email_re.findall(content))

            # Follow nested about/contact/team-style links in case the site
            # keeps contact info one level deeper.
            soup = BeautifulSoup(content, "html.parser")
            for link in soup.find_all("a", href=True):
                href = link["href"]
                if not any(word in href.lower() for word in contact_words):
                    continue
                sub_url = urljoin(base_url, href)
                if sub_url in visited_pages:
                    continue
                visited_pages.add(sub_url)
                try:
                    sub_resp = requests.get(sub_url, timeout=10, headers=headers)
                    # Mirror the top-level status check: the original parsed
                    # 404/500 error bodies here, which can yield junk matches.
                    if sub_resp.status_code != 200:
                        continue
                    found_emails.update(email_re.findall(sub_resp.text))
                except requests.RequestException:
                    # Best-effort crawl: an unreachable subpage is not fatal.
                    continue

        except Exception:
            # Deliberate best-effort: network errors, bad TLS, or a parser
            # failure on one page should not abort the whole crawl.
            continue

    # Sentinel string preserved for callers that display the list directly.
    return list(found_emails) if found_emails else ["No emails found."]