princemaxp committed on
Commit
86cb7f3
·
verified ·
1 Parent(s): 3d7d6cb

Update parse_email.py

Browse files
Files changed (1) hide show
  1. parse_email.py +40 -14
parse_email.py CHANGED
@@ -1,27 +1,53 @@
1
  import email
2
- import re
3
  from bs4 import BeautifulSoup
 
4
 
5
def parse_email(file_path):
    """Parse an email file into headers, a text body, and extracted URLs.

    Args:
        file_path: Path to a raw email message on disk.

    Returns:
        A ``(headers, body, urls)`` tuple: ``headers`` is a dict of the
        message headers, ``body`` is the concatenated text of every
        text/plain part plus the stripped text of every text/html part
        (or the whole payload for a single-part message), and ``urls``
        is the list of ``http(s)://`` URLs found in the body text.
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f)

    headers = dict(msg.items())

    # Accumulate body fragments and join once at the end.
    body_chunks = []
    if msg.is_multipart():
        for part in msg.walk():
            content_type = part.get_content_type()
            if content_type == "text/plain":
                body_chunks.append(
                    part.get_payload(decode=True).decode(errors="ignore")
                )
            elif content_type == "text/html":
                markup = part.get_payload(decode=True).decode(errors="ignore")
                # Strip tags so only the readable text reaches the body.
                body_chunks.append(
                    BeautifulSoup(markup, "html.parser").get_text()
                )
    else:
        body_chunks.append(msg.get_payload(decode=True).decode(errors="ignore"))
    body = "".join(body_chunks)

    # Pull every literal http(s) URL out of the assembled body text.
    urls = re.findall(r'(https?://\S+)', body)

    return headers, body, urls
 
1
  import email
2
+ from email import policy
3
  from bs4 import BeautifulSoup
4
+ import re
5
 
6
def parse_email(file_path):
    """Parse an email file into its headers, text body, and embedded URLs.

    Args:
        file_path: Path to a raw RFC 5322 email message on disk.

    Returns:
        A ``(headers, body, urls)`` tuple: ``headers`` maps header names
        to values, ``body`` is the concatenated plain text of the message
        (HTML parts are reduced to their visible text), and ``urls`` is a
        de-duplicated list of URLs found both literally in the body text
        and in ``<a href>`` attributes of HTML parts.
    """
    with open(file_path, "rb") as f:
        # policy.default selects the modern EmailMessage API (get_content).
        msg = email.message_from_binary_file(f, policy=policy.default)

    # --- 1. Extract headers ---
    headers = dict(msg.items())

    # --- 2. Extract body (text + html) and anchor URLs in a single walk ---
    body = ""
    urls = set()

    if msg.is_multipart():
        for part in msg.walk():
            content_type = part.get_content_type()
            if content_type == "text/plain":
                # Best-effort: skip parts with broken charsets/encodings
                # instead of failing the whole parse (was a bare except).
                try:
                    body += part.get_content()
                except Exception:
                    continue
            elif content_type == "text/html":
                try:
                    html_body = part.get_content()
                except Exception:
                    continue
                soup = BeautifulSoup(html_body, "html.parser")
                body += soup.get_text(" ", strip=True)
                # Collect anchor targets now — avoids a second walk that
                # would re-decode and re-parse every HTML part.
                urls.update(link["href"] for link in soup.find_all("a", href=True))
    else:
        try:
            body = msg.get_content()
        except Exception:
            body = ""
        # A single-part HTML message still contributes its anchor targets.
        if body and msg.get_content_type() == "text/html":
            soup = BeautifulSoup(body, "html.parser")
            urls.update(link["href"] for link in soup.find_all("a", href=True))

    # --- 3. Extract URLs appearing literally in the body text ---
    urls.update(re.findall(r"https?://[^\s]+", body))

    return headers, body, list(urls)