princemaxp committed on
Commit
748e27f
·
verified ·
1 Parent(s): a4c47a0

Update parse_email.py

Browse files
Files changed (1) hide show
  1. parse_email.py +72 -140
parse_email.py CHANGED
@@ -1,177 +1,109 @@
1
  # parse_email.py
2
  import email
3
  from email import policy
4
- from email.parser import BytesParser
5
- from email.utils import parsedate_to_datetime
6
  from bs4 import BeautifulSoup
7
  import re
8
  import base64
9
- import quopri
10
 
11
- # ------------------------------------------------
12
- # Helpers
13
- # ------------------------------------------------
14
 
15
# Pre-compiled matcher for http/https URLs embedded in free text.
URL_REGEX = re.compile(r"https?://[^\s\"'<>]+")


def normalize_text(text: str) -> str:
    """Collapse every run of whitespace in *text* to a single space and trim.

    Falsy input (None, "") yields the empty string.
    """
    if not text:
        return ""
    return re.sub(r"\s+", " ", text).strip()
22
-
23
def decode_payload(part):
    """Decode a MIME part's body to text.

    Returns "" when the part has no decodable payload (e.g. a multipart
    container). The part's declared charset is used when present; decoding
    errors are replaced rather than raised, and an unknown charset falls
    back to UTF-8.
    """
    raw = part.get_payload(decode=True)
    if raw is None:
        return ""

    charset = part.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, errors="replace")
    except Exception:
        # Unknown/invalid charset name: decode as UTF-8 instead.
        return raw.decode("utf-8", errors="replace")
33
-
34
def extract_urls_from_text(text):
    """Return the set of http(s) URLs appearing in *text* ("" and None yield an empty set)."""
    return set(re.findall(r"https?://[^\s\"'<>]+", text or ""))
36
-
37
- # ------------------------------------------------
38
- # HTML ANALYSIS
39
- # ------------------------------------------------
40
-
41
def analyze_html(html):
    """Analyze an HTML email body for phishing-relevant features.

    Returns a 4-tuple:
        text (str)              - whitespace-normalized visible text
        urls (set[str])         - absolute http(s) hrefs found on <a> tags
        hidden_links (list)     - {"displayed", "actual"} pairs where the
                                  anchor text looks like a URL but differs
                                  from the real href (classic phishing trick)
        inline_images (list[bytes]) - decoded base64 images from data: URIs
    """
    soup = BeautifulSoup(html or "", "html.parser")
    text = soup.get_text(" ", strip=True)

    urls = set()
    hidden_links = []
    for tag in soup.find_all("a"):
        href = tag.get("href", "").strip()
        anchor_text = normalize_text(tag.get_text())

        if href.startswith("http"):
            urls.add(href)

        # Anchor mismatch: the URL the user *sees* differs from the destination.
        if anchor_text and anchor_text.startswith("http") and anchor_text not in href:
            hidden_links.append({
                "displayed": anchor_text,
                "actual": href,
            })

    # Inline base64-encoded images embedded via data: URIs.
    inline_images = []
    for img in soup.find_all("img"):
        src = img.get("src", "")
        # Require the trailing slash so only real image media types
        # ("data:image/png;base64,...") match, not arbitrary "data:image..." strings.
        if src.startswith("data:image/"):
            try:
                _, b64 = src.split(",", 1)
                inline_images.append(base64.b64decode(b64))
            except Exception:
                # Malformed data URI: skip it rather than abort the analysis.
                pass

    return normalize_text(text), urls, hidden_links, inline_images
74
-
75
- # ------------------------------------------------
76
- # MAIN PARSER
77
- # ------------------------------------------------
78
 
79
def parse_email(file_path):
    """Parse an .eml file and extract phishing-analysis features.

    Returns a 7-tuple:
        headers: dict
        metadata: dict (subject/from/to/date/message_id)
        body: str (whitespace-normalized)
        urls: list
        hidden_links: list
        attachments: list
        images: list (bytes)
    """
    with open(file_path, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # ----- headers & metadata -----
    headers = dict(msg.items())

    metadata = {
        "subject": headers.get("Subject", ""),
        "from": headers.get("From", ""),
        "to": headers.get("To", ""),
        "date": None,
        "message_id": headers.get("Message-ID", ""),
    }
    try:
        metadata["date"] = parsedate_to_datetime(headers.get("Date"))
    except Exception:
        pass  # missing/malformed Date header: leave as None

    # ----- walk the MIME tree -----
    full_text = ""
    urls = set()
    hidden_links = []
    images = []
    attachments = []

    for part in msg.walk():
        ctype = part.get_content_type()
        disposition = (part.get("Content-Disposition") or "").lower()

        # Attachments: record metadata only, do not inline their content.
        if "attachment" in disposition:
            attachments.append({
                "filename": part.get_filename(),
                "content_type": ctype,
                "size": len(part.get_payload(decode=True) or b""),
            })
            continue

        # Image parts become raw byte blobs.
        if ctype.startswith("image/"):
            try:
                blob = part.get_payload(decode=True)
                if blob:
                    images.append(blob)
            except Exception:
                pass
            continue

        if ctype == "text/plain":
            plain = normalize_text(decode_payload(part))
            full_text += " " + plain
            urls.update(extract_urls_from_text(plain))
        elif ctype == "text/html":
            text, found_urls, hidden, inline_imgs = analyze_html(decode_payload(part))
            full_text += " " + text
            urls.update(found_urls)
            hidden_links.extend(hidden)
            images.extend(inline_imgs)

    # ----- URLs hiding inside header values -----
    for value in headers.values():
        try:
            urls.update(extract_urls_from_text(str(value)))
        except Exception:
            pass

    return (
        headers,
        metadata,
        normalize_text(full_text),
        list(urls),
        hidden_links,
        attachments,
        images,
    )
 
1
  # parse_email.py
2
  import email
3
  from email import policy
 
 
4
  from bs4 import BeautifulSoup
5
  import re
6
  import base64
 
7
 
 
 
 
8
 
9
def _extract_inline_images_from_html(html):
    """Decode every base64 ``data:image/...`` URI in *html* into raw bytes.

    Malformed data URIs are skipped silently; returns a (possibly empty) list.
    """
    decoded = []
    soup = BeautifulSoup(html or "", "html.parser")
    for tag in soup.find_all("img"):
        source = tag.get("src", "")
        if not source.startswith("data:image/"):
            continue
        try:
            # "data:image/png;base64,AAAA..." -> payload after the first comma
            decoded.append(base64.b64decode(source.split(",", 1)[1]))
        except Exception:
            # No comma, or invalid base64: skip this image.
            pass
    return decoded
21
 
 
 
 
 
 
22
 
23
def parse_email(file_path):
    """Parse an .eml file from disk.

    Returns a 6-tuple:
        headers (dict)        - raw header name -> value
        subject (str)         - Subject header ("" when absent)
        body (str)            - concatenated text of every text part, stripped
        urls (list[str])      - de-duplicated http(s) URLs from body + headers
        images (list[bytes])  - raw bytes of image parts and inline data: URIs
        attachments (list[dict]) - filename / content_type / size / data
    """
    # Compile once; the same pattern is applied to the body and every header.
    url_pattern = re.compile(r"https?://[^\s\"'<>]+")

    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    headers = dict(msg.items())
    subject = headers.get("Subject", "") or ""

    body = ""
    images = []
    attachments = []
    urls = set()

    if msg.is_multipart():
        for part in msg.walk():
            ctype = part.get_content_type()
            disp = str(part.get("Content-Disposition") or "").lower()

            # ---------- ATTACHMENTS ----------
            if "attachment" in disp:
                try:
                    data = part.get_payload(decode=True)
                    attachments.append({
                        "filename": part.get_filename(),
                        "content_type": ctype,
                        "size": len(data) if data else 0,
                        "data": data,
                    })
                except Exception:
                    pass  # best-effort: one broken attachment must not kill parsing
                continue

            # ---------- INLINE IMAGES ----------
            if ctype.startswith("image/"):
                try:
                    data = part.get_payload(decode=True)
                    if data:
                        images.append(data)
                except Exception:
                    pass
                continue  # an image part carries no text; skip the text branch

            # ---------- TEXT ----------
            try:
                if ctype == "text/plain":
                    body += part.get_content() + "\n"

                elif ctype == "text/html":
                    html = part.get_content()
                    images += _extract_inline_images_from_html(html)
                    soup = BeautifulSoup(html, "html.parser")
                    body += soup.get_text(" ", strip=True) + "\n"
            except Exception:
                pass
    else:
        try:
            if msg.get_content_type() == "text/html":
                html = msg.get_content()
                images += _extract_inline_images_from_html(html)
                soup = BeautifulSoup(html, "html.parser")
                body = soup.get_text(" ", strip=True)
            else:
                content = msg.get_content()
                # Non-text singletons (e.g. application/*, message/rfc822)
                # decode to bytes or a Message object; keep body a str as
                # the documented contract promises.
                body = content if isinstance(content, str) else ""
        except Exception:
            pass

    # ---------- URL EXTRACTION ----------
    urls.update(url_pattern.findall(body))

    for value in headers.values():
        try:
            urls.update(url_pattern.findall(str(value)))
        except Exception:
            pass

    return headers, subject, body.strip(), list(urls), images, attachments