zanegraper committed on
Commit
26082dc
·
1 Parent(s): 377a2c0
__pycache__/email_finder.cpython-311.pyc ADDED
Binary file (2.89 kB). View file
 
__pycache__/emailfinder_wrapper.cpython-311.pyc ADDED
Binary file (1.18 kB). View file
 
email_finder.py CHANGED
@@ -1,12 +1,56 @@
1
  # email_finder.py
 
2
  import requests
3
  import re
 
 
 
4
def find_emails(domain):
    """
    Fetch https://<domain> and return every email-shaped string on the page.

    Returns a list of unique addresses. When no address matches, returns
    ["No emails found."]; when the request fails for any reason, returns a
    one-element list describing the error instead of raising.
    """
    page_url = f"https://{domain}"
    try:
        response = requests.get(page_url, timeout=10)
        # Deduplicate in one pass; the pattern matches typical user@host.tld forms.
        unique_addresses = set(
            re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}", response.text)
        )
    except Exception as e:
        # Best-effort by design: report the failure as data, never raise.
        return [f"Error fetching domain: {e}"]
    return list(unique_addresses) if unique_addresses else ["No emails found."]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # email_finder.py
2
+
3
  import requests
4
  import re
5
+ from urllib.parse import urljoin
6
+ from bs4 import BeautifulSoup
7
+
8
def find_emails(domain):
    """
    Attempt to find email addresses published on *domain*'s website.

    Fetches the homepage plus common subpages (/about, /contact, /team, ...)
    over HTTPS and scans each 200-response body for email-shaped strings.
    Links on those pages whose href mentions about/contact/team/staff/
    leadership are followed one level deeper, but only when they resolve to
    the same site.

    Parameters:
        domain: bare host name, e.g. "example.com" (no scheme).

    Returns:
        A list of unique email addresses, or ["No emails found."] when
        nothing matched (including when every request failed — this
        function never raises for network errors).
    """
    base_url = f"https://{domain}"
    # Compiled once and reused by both scan sites below (the original
    # duplicated the pattern inline). TLD class is [a-zA-Z] so uppercase
    # TLDs (e.g. "User@Example.COM") are not silently dropped.
    email_re = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
    link_words = ("about", "contact", "team", "staff", "leadership")
    candidate_paths = ["", "about", "contact", "team", "staff", "leadership", "info"]
    visited_pages = set()
    found_emails = set()

    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; EmailFinderBot/1.0; +https://example.com/bot)"
    }

    def _fetch(url):
        """GET *url*; return body text on HTTP 200, else None (errors swallowed)."""
        try:
            resp = requests.get(url, timeout=10, headers=headers)
        except Exception:
            return None
        return resp.text if resp.status_code == 200 else None

    for path in candidate_paths:
        url = urljoin(base_url, path)
        if url in visited_pages:
            continue
        visited_pages.add(url)

        content = _fetch(url)
        if content is None:
            continue
        found_emails.update(email_re.findall(content))

        # Follow promising-looking links one level deep, in case the
        # team/contact/about pages live at non-standard paths.
        soup = BeautifulSoup(content, "html.parser")
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if not any(word in href.lower() for word in link_words):
                continue
            sub_url = urljoin(base_url, href)
            # Stay on the target site: an absolute href to another host
            # (or a mailto:/js: link) resolves outside base_url and is
            # skipped — the original version would crawl external domains.
            if not sub_url.startswith(base_url) or sub_url in visited_pages:
                continue
            visited_pages.add(sub_url)
            sub_content = _fetch(sub_url)
            if sub_content is not None:
                # Fix vs. original: sub-pages now also honor the
                # 200-only rule instead of scanning error pages.
                found_emails.update(email_re.findall(sub_content))

    return list(found_emails) if found_emails else ["No emails found."]