princemaxp committed on
Commit
e30d91f
·
verified ·
1 Parent(s): b00d456

Update parse_email.py

Browse files
Files changed (1) hide show
  1. parse_email.py +146 -65
parse_email.py CHANGED
@@ -1,96 +1,177 @@
1
  # parse_email.py
2
  import email
3
  from email import policy
 
 
4
  from bs4 import BeautifulSoup
5
  import re
6
  import base64
7
- import io
8
 
9
def _extract_inline_images_from_html(html):
    """Return decoded image bytes for every base64 ``data:image/...`` URI in *html*.

    Only ``<img>`` tags whose ``src`` is a data URI are considered; regular
    http(s) image references are ignored (nothing is fetched).
    """
    images = []
    # ``html or ""`` guards against None being passed in.
    soup = BeautifulSoup(html or "", "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src", "")
        if src.startswith("data:image/"):
            # e.g. data:image/png;base64,iVBORw0...
            try:
                # Split "data:image/png;base64" header from the payload.
                header, b64 = src.split(",", 1)
                data = base64.b64decode(b64)
                images.append(data)
            except Exception:
                # Malformed data URI — skip this image and keep going.
                continue
    return images
 
 
 
 
 
23
 
24
def parse_email(file_path):
    """Parse an RFC 822 message file into its interesting pieces.

    Returns: headers(dict), subject(str), body(str), urls(list), images(list of bytes)
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    # NOTE(review): dict(msg.items()) keeps only the LAST value when a header
    # repeats (e.g. Received) — confirm callers do not need all occurrences.
    headers = dict(msg.items())
    subject = headers.get("Subject", "") or ""

    body = ""
    images = []

    # Walk parts - handle multipart and attachments
    if msg.is_multipart():
        for part in msg.walk():
            ctype = part.get_content_type()
            disp = str(part.get("Content-Disposition") or "").lower()
            # attachments that are images
            if ctype.startswith("image/"):
                try:
                    data = part.get_payload(decode=True)
                    if data:
                        images.append(data)
                except Exception:
                    pass

            # text/plain
            if ctype == "text/plain" and "attachment" not in disp:
                try:
                    body += part.get_content()
                except Exception:
                    pass

            # text/html
            if ctype == "text/html" and "attachment" not in disp:
                try:
                    html_body = part.get_content()
                    # extract inline images from this html (data URIs)
                    images += _extract_inline_images_from_html(html_body)
                    # convert html to text
                    soup = BeautifulSoup(html_body, "html.parser")
                    body += soup.get_text(" ", strip=True)
                except Exception:
                    pass
    else:
        # not multipart
        try:
            if msg.get_content_type() == "text/html":
                html_body = msg.get_content()
                images += _extract_inline_images_from_html(html_body)
                soup = BeautifulSoup(html_body, "html.parser")
                body = soup.get_text(" ", strip=True)
            else:
                body = msg.get_content()
        except Exception:
            # Any decode failure leaves an empty body rather than raising.
            body = ""

    # URL extraction (from combined body)
    urls = set()
    try:
        urls.update(re.findall(r"https?://[^\s\"'<>]+", body))
    except Exception:
        pass

    # Also try to find URLs in headers (e.g., List-Unsubscribe) or other parts
    for k, v in headers.items():
        try:
            urls.update(re.findall(r"https?://[^\s\"'<>]+", str(v)))
        except Exception:
            pass

    return headers, subject, body, list(urls), images
 
 
 
 
 
 
 
 
 
1
  # parse_email.py
2
  import email
3
  from email import policy
4
+ from email.parser import BytesParser
5
+ from email.utils import parsedate_to_datetime
6
  from bs4 import BeautifulSoup
7
  import re
8
  import base64
9
+ import quopri
10
 
11
# ------------------------------------------------
# Helpers
# ------------------------------------------------

# Pre-compiled http(s) URL matcher shared by all extraction paths.
# A match runs until the first whitespace, quote, or angle bracket.
URL_REGEX = re.compile(r"https?://[^\s\"'<>]+")
16
+
17
def normalize_text(text: str) -> str:
    """Collapse every run of whitespace into one space and strip the ends.

    Falsy input (``None`` or ``""``) yields ``""``.
    """
    return re.sub(r"\s+", " ", text).strip() if text else ""
22
+
23
def decode_payload(part):
    """Decode a MIME part's payload to text.

    Uses the part's declared charset when available, falling back to UTF-8.
    Because ``errors="replace"`` suppresses decode errors, the only realistic
    failure is an unknown/bogus charset label, so the except clause is
    narrowed to ``LookupError``/``UnicodeError`` instead of swallowing
    every exception.

    Args:
        part: an ``email.message.Message`` (or ``EmailMessage``) part.

    Returns:
        str: the decoded body, or ``""`` when the part has no payload.
    """
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""

    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="replace")
    except (LookupError, UnicodeError):
        # Unknown charset label in the message: fall back to UTF-8.
        return payload.decode("utf-8", errors="replace")
33
+
34
def extract_urls_from_text(text):
    """Return the deduplicated set of http/https URLs appearing in *text*."""
    matches = URL_REGEX.findall(text or "")
    return set(matches)
36
+
37
+ # ------------------------------------------------
38
+ # HTML ANALYSIS
39
+ # ------------------------------------------------
40
+
41
def analyze_html(html):
    """Analyze one HTML body for phishing-relevant signals.

    Returns a 4-tuple:
        text          -- whitespace-normalized visible text
        urls          -- set of http(s) hrefs found on <a> tags
        hidden_links  -- [{"displayed": ..., "actual": ...}] where the anchor
                         text looks like a URL but differs from the real href
        inline_images -- decoded bytes of base64 ``data:image`` <img> sources
    """
    soup = BeautifulSoup(html or "", "html.parser")

    urls = set()
    hidden_links = []
    for anchor in soup.find_all("a"):
        href = anchor.get("href", "").strip()
        shown = normalize_text(anchor.get_text())

        if href.startswith("http"):
            urls.add(href)

        # Anchor mismatch (classic phishing trick): the link text displays
        # one URL while the href points somewhere else.
        if shown and shown.startswith("http") and shown not in href:
            hidden_links.append({"displayed": shown, "actual": href})

    inline_images = []
    for image in soup.find_all("img"):
        source = image.get("src", "")
        if not source.startswith("data:image"):
            continue
        try:
            # "data:image/png;base64,<payload>" -> decode the payload half.
            _, encoded = source.split(",", 1)
            inline_images.append(base64.b64decode(encoded))
        except Exception:
            pass

    text = soup.get_text(" ", strip=True)
    return normalize_text(text), urls, hidden_links, inline_images
74
+
75
+ # ------------------------------------------------
76
+ # MAIN PARSER
77
+ # ------------------------------------------------
78
 
79
def parse_email(file_path):
    """Parse an email file on disk into analysis-ready pieces.

    Args:
        file_path: path to an RFC 822 message (.eml) read in binary mode.

    Returns (7-tuple):
        headers: dict           -- raw header name -> value (last wins on repeats)
        metadata: dict          -- subject/from/to/date/message_id convenience view
        body: str               -- normalized text of all text/plain + text/html parts
        urls: list              -- unique URLs from body parts and header values
        hidden_links: list      -- anchor-text/href mismatches from HTML parts
        attachments: list       -- {"filename", "content_type", "size"} per attachment
        images: list (bytes)    -- image-part payloads plus inline data-URI images
    """

    with open(file_path, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # -----------------------
    # HEADERS
    # -----------------------
    # NOTE(review): dict() keeps only the LAST value for repeated headers
    # (e.g. Received) — confirm callers do not need every occurrence.
    headers = dict(msg.items())

    metadata = {
        "subject": headers.get("Subject", ""),
        "from": headers.get("From", ""),
        "to": headers.get("To", ""),
        "date": None,
        "message_id": headers.get("Message-ID", "")
    }

    # Best-effort date parse; a missing or malformed Date header leaves None.
    try:
        metadata["date"] = parsedate_to_datetime(headers.get("Date"))
    except Exception:
        pass

    # -----------------------
    # CONTENT EXTRACTION
    # -----------------------
    full_text = ""
    urls = set()
    hidden_links = []
    images = []
    attachments = []

    # walk() visits the message itself plus every nested part, so both
    # singlepart and multipart messages take the same path here.
    for part in msg.walk():
        ctype = part.get_content_type()
        disp = (part.get("Content-Disposition") or "").lower()

        # -------- Attachments --------
        # Recorded as metadata only (name/type/size); payload is not kept.
        if "attachment" in disp:
            attachments.append({
                "filename": part.get_filename(),
                "content_type": ctype,
                "size": len(part.get_payload(decode=True) or b""),
            })
            continue

        # -------- Images --------
        # Non-attachment image parts (e.g. cid-referenced inline images).
        if ctype.startswith("image/"):
            try:
                data = part.get_payload(decode=True)
                if data:
                    images.append(data)
            except Exception:
                pass
            continue

        # -------- Text Plain --------
        if ctype == "text/plain":
            text = decode_payload(part)
            text = normalize_text(text)
            full_text += " " + text
            urls.update(extract_urls_from_text(text))

        # -------- HTML --------
        elif ctype == "text/html":
            html = decode_payload(part)
            # analyze_html also yields anchor mismatches and data-URI images.
            text, found_urls, hidden, inline_imgs = analyze_html(html)
            full_text += " " + text
            urls.update(found_urls)
            hidden_links.extend(hidden)
            images.extend(inline_imgs)

    # -----------------------
    # HEADER URL EXTRACTION
    # -----------------------
    # Headers such as List-Unsubscribe often carry URLs too.
    for k, v in headers.items():
        try:
            urls.update(extract_urls_from_text(str(v)))
        except Exception:
            pass

    return (
        headers,
        metadata,
        normalize_text(full_text),
        list(urls),
        hidden_links,
        attachments,
        images,
    )