Spaces:
Sleeping
Sleeping
File size: 1,493 Bytes
import email
from email import policy
from bs4 import BeautifulSoup
import re
def parse_email(file_path):
    """Parse an email file into headers, a plain-text body, and a URL list.

    Parameters
    ----------
    file_path : str or path-like
        Path to an RFC 822 / MIME message, read in binary mode.

    Returns
    -------
    tuple[dict, str, list[str]]
        ``(headers, body, urls)`` where ``headers`` maps header names to
        values (``dict()`` keeps only the last occurrence of repeated
        headers such as ``Received``), ``body`` is the concatenated text
        of all ``text/plain`` and ``text/html`` parts (HTML converted to
        visible text), and ``urls`` is a de-duplicated list of links
        found both in the body text and in ``<a href>`` attributes.
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    # --- 1. Extract headers ---
    headers = dict(msg.items())

    # --- 2. Extract body and anchor URLs in a single pass ---
    # msg.walk() yields the message itself for non-multipart mail, so one
    # loop covers both cases (the original handled them separately and
    # returned raw HTML — or even bytes — for single-part messages).
    body_parts = []
    urls = set()
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == "text/plain":
            try:
                body_parts.append(part.get_content())
            except Exception:
                # Best-effort: skip undecodable/malformed parts. Narrowed
                # from a bare `except:` so SystemExit/KeyboardInterrupt
                # are no longer swallowed.
                pass
        elif content_type == "text/html":
            try:
                soup = BeautifulSoup(part.get_content(), "html.parser")
                body_parts.append(soup.get_text(" ", strip=True))
                # Harvest explicit anchors while the part is parsed,
                # instead of re-decoding/re-parsing it in a second walk.
                for link in soup.find_all("a", href=True):
                    urls.add(link["href"])
            except Exception:
                pass
    body = "".join(body_parts)

    # --- 3. Extract URLs from the visible text ---
    urls.update(re.findall(r"https?://[^\s]+", body))
    return headers, body, list(urls)
|