princemaxp committed on
Commit
86cb7f3
·
verified ·
1 Parent(s): 3d7d6cb

Update parse_email.py

Browse files
Files changed (1) hide show
  1. parse_email.py +40 -14
parse_email.py CHANGED
@@ -1,27 +1,53 @@
1
  import email
2
- import re
3
  from bs4 import BeautifulSoup
 
4
 
5
def parse_email(file_path):
    """Parse an email file into headers, a text body, and extracted URLs.

    Args:
        file_path: Path to a raw email message on disk.

    Returns:
        A ``(headers, body, urls)`` tuple: ``headers`` is a dict of the
        message headers, ``body`` is the concatenated text of every
        text/plain part plus the stripped text of every text/html part
        (or the whole payload for a single-part message), and ``urls``
        is the list of ``http(s)://`` URLs found in the body text.
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f)

    headers = dict(msg.items())

    # Accumulate body fragments and join once at the end.
    body_chunks = []
    if msg.is_multipart():
        for part in msg.walk():
            content_type = part.get_content_type()
            if content_type == "text/plain":
                body_chunks.append(
                    part.get_payload(decode=True).decode(errors="ignore")
                )
            elif content_type == "text/html":
                markup = part.get_payload(decode=True).decode(errors="ignore")
                # Strip tags so only the readable text reaches the body.
                body_chunks.append(
                    BeautifulSoup(markup, "html.parser").get_text()
                )
    else:
        body_chunks.append(msg.get_payload(decode=True).decode(errors="ignore"))
    body = "".join(body_chunks)

    # Pull every literal http(s) URL out of the assembled body text.
    urls = re.findall(r'(https?://\S+)', body)

    return headers, body, urls
 
1
  import email
2
+ from email import policy
3
  from bs4 import BeautifulSoup
4
+ import re
5
 
6
def parse_email(file_path):
    """Parse an email file into its headers, text body, and embedded URLs.

    Args:
        file_path: Path to a raw RFC 5322 email message on disk.

    Returns:
        A ``(headers, body, urls)`` tuple: ``headers`` maps header names
        to values, ``body`` is the concatenated plain text of the message
        (HTML parts are reduced to their visible text), and ``urls`` is a
        de-duplicated list of URLs found both literally in the body text
        and in ``<a href>`` attributes of HTML parts.
    """
    with open(file_path, "rb") as f:
        # policy.default selects the modern EmailMessage API (get_content).
        msg = email.message_from_binary_file(f, policy=policy.default)

    # --- 1. Extract headers ---
    headers = dict(msg.items())

    # --- 2. Extract body (text + html) and anchor URLs in a single walk ---
    body = ""
    urls = set()

    if msg.is_multipart():
        for part in msg.walk():
            content_type = part.get_content_type()
            if content_type == "text/plain":
                # Best-effort: skip parts with broken charsets/encodings
                # instead of failing the whole parse (was a bare except).
                try:
                    body += part.get_content()
                except Exception:
                    continue
            elif content_type == "text/html":
                try:
                    html_body = part.get_content()
                except Exception:
                    continue
                soup = BeautifulSoup(html_body, "html.parser")
                body += soup.get_text(" ", strip=True)
                # Collect anchor targets now — avoids a second walk that
                # would re-decode and re-parse every HTML part.
                urls.update(link["href"] for link in soup.find_all("a", href=True))
    else:
        try:
            body = msg.get_content()
        except Exception:
            body = ""
        # A single-part HTML message still contributes its anchor targets.
        if body and msg.get_content_type() == "text/html":
            soup = BeautifulSoup(body, "html.parser")
            urls.update(link["href"] for link in soup.find_all("a", href=True))

    # --- 3. Extract URLs appearing literally in the body text ---
    urls.update(re.findall(r"https?://[^\s]+", body))

    return headers, body, list(urls)