File size: 1,493 Bytes
3bb3451
86cb7f3
3bb3451
86cb7f3
3bb3451
 
 
86cb7f3
3bb3451
86cb7f3
3bb3451
86cb7f3
 
3bb3451
 
 
86cb7f3
 
 
 
 
 
 
 
 
 
 
 
 
3bb3451
86cb7f3
 
 
 
 
 
 
 
3bb3451
86cb7f3
 
 
 
 
 
 
 
 
3bb3451
86cb7f3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import email
from email import policy
from bs4 import BeautifulSoup
import re

def parse_email(file_path):
    """Parse an email file into headers, visible body text, and URLs.

    Parameters
    ----------
    file_path : str | os.PathLike
        Path to a raw RFC 2822 / MIME email file.

    Returns
    -------
    tuple[dict, str, list[str]]
        ``(headers, body, urls)`` where ``headers`` maps header name to
        value, ``body`` is the concatenated plain text of all text parts
        (HTML parts are tag-stripped), and ``urls`` is a de-duplicated,
        unordered list of links found in the body text and in HTML
        ``<a href>`` attributes.
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    # --- 1. Extract headers ---
    # NOTE: dict() keeps only the *last* value of repeated headers
    # (e.g. multiple "Received" lines) — preserved from the original
    # behavior; use msg.items() directly if all values are needed.
    headers = dict(msg.items())

    # --- 2+3. Extract body text and URLs in a single pass ---
    # walk() yields the message itself for non-multipart mail, so one
    # loop covers both cases (the old code handled them separately and
    # left non-multipart HTML un-stripped). Each HTML part is parsed by
    # BeautifulSoup exactly once and reused for both text and hrefs.
    body_chunks = []
    urls = set()
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == "text/plain":
            try:
                body_chunks.append(part.get_content())
            except Exception:
                # Best-effort: skip undecodable parts rather than fail
                # the whole parse (narrowed from the original bare except).
                pass
        elif content_type == "text/html":
            try:
                soup = BeautifulSoup(part.get_content(), "html.parser")
            except Exception:
                # Undecodable/broken HTML part: skip it entirely.
                continue
            body_chunks.append(soup.get_text(" ", strip=True))
            for link in soup.find_all("a", href=True):
                urls.add(link["href"])
        # Non-text parts (images, attachments) are ignored; the old code
        # could return bytes as `body` for a non-multipart binary message
        # and then crash in re.findall — now body stays a str.

    body = "".join(body_chunks)
    urls.update(re.findall(r"https?://[^\s]+", body))

    return headers, body, list(urls)