File size: 2,108 Bytes
d2019da
26082dc
d2019da
 
26082dc
 
 
d2019da
26082dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# email_finder.py

import requests
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Email pattern, compiled once. re.IGNORECASE also matches uppercase
# addresses/TLDs (the original lowercase-only TLD class [a-z]{2,} missed
# e.g. "USER@EXAMPLE.COM").
_EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}", re.IGNORECASE)

# Path keywords that suggest a page may list people/contact details.
_INTERESTING_WORDS = ("about", "contact", "team", "staff", "leadership")


def _extract_emails(text):
    """Return the set of email-looking substrings found in *text*."""
    return set(_EMAIL_RE.findall(text))


def find_emails(domain):
    """
    Best-effort email scrape for *domain*.

    Visits the homepage and common subpages (/about, /contact, /team, ...),
    plus any about/contact/team-style links discovered on those pages
    (one level deep), and collects every string matching a conventional
    email pattern.

    Parameters
    ----------
    domain : str
        Bare domain name, e.g. "example.com"; "https://" is prepended here.

    Returns
    -------
    list[str]
        Unique email addresses found, or the sentinel list
        ["No emails found."] when nothing matched (kept for backward
        compatibility with existing callers).
    """
    base_url = f"https://{domain}"
    candidate_paths = ["", "about", "contact", "team", "staff", "leadership", "info"]
    visited_pages = set()
    found_emails = set()

    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; EmailFinderBot/1.0; +https://example.com/bot)"
    }

    for path in candidate_paths:
        url = urljoin(base_url, path)
        if url in visited_pages:
            continue
        visited_pages.add(url)
        try:
            response = requests.get(url, timeout=10, headers=headers)
        except requests.RequestException:
            # Network/DNS/timeout trouble on one page must not abort the
            # whole scan — this is a deliberate best-effort crawler.
            continue
        if response.status_code != 200:
            continue
        content = response.text
        found_emails.update(_extract_emails(content))

        # Follow nested team/contact/about-style links one level deep.
        soup = BeautifulSoup(content, "html.parser")
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if not any(word in href.lower() for word in _INTERESTING_WORDS):
                continue
            # Resolve relative hrefs against the page they appeared on
            # (response.url also reflects any redirects). Resolving against
            # the site root — as the original did — mis-resolved links like
            # "team/john" found on /about.
            sub_url = urljoin(response.url, href)
            if sub_url in visited_pages:
                continue
            visited_pages.add(sub_url)
            try:
                sub_resp = requests.get(sub_url, timeout=10, headers=headers)
            except requests.RequestException:
                continue
            # The original skipped this status check for sub-pages; apply
            # the same 200-only rule as the top-level fetch.
            if sub_resp.status_code != 200:
                continue
            found_emails.update(_extract_emails(sub_resp.text))

    return list(found_emails) if found_emails else ["No emails found."]