Spaces:
Sleeping
Sleeping
File size: 1,493 Bytes
import email
from email import policy
from bs4 import BeautifulSoup
import re
def parse_email(file_path):
    """Parse an email file into headers, a plain-text body, and a URL list.

    Parameters
    ----------
    file_path : str or path-like
        Path to an RFC 822 / MIME message, read in binary mode.

    Returns
    -------
    tuple[dict, str, list[str]]
        ``(headers, body, urls)`` where ``headers`` maps header names to
        values (``dict()`` keeps only the last occurrence of repeated
        headers such as ``Received``), ``body`` is the concatenated text
        of all ``text/plain`` and ``text/html`` parts (HTML converted to
        visible text), and ``urls`` is a de-duplicated list of links
        found both in the body text and in ``<a href>`` attributes.
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    # --- 1. Extract headers ---
    headers = dict(msg.items())

    # --- 2. Extract body and anchor URLs in a single pass ---
    # msg.walk() yields the message itself for non-multipart mail, so one
    # loop covers both cases (the original handled them separately and
    # returned raw HTML — or even bytes — for single-part messages).
    body_parts = []
    urls = set()
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == "text/plain":
            try:
                body_parts.append(part.get_content())
            except Exception:
                # Best-effort: skip undecodable/malformed parts. Narrowed
                # from a bare `except:` so SystemExit/KeyboardInterrupt
                # are no longer swallowed.
                pass
        elif content_type == "text/html":
            try:
                soup = BeautifulSoup(part.get_content(), "html.parser")
                body_parts.append(soup.get_text(" ", strip=True))
                # Harvest explicit anchors while the part is parsed,
                # instead of re-decoding/re-parsing it in a second walk.
                for link in soup.find_all("a", href=True):
                    urls.add(link["href"])
            except Exception:
                pass
    body = "".join(body_parts)

    # --- 3. Extract URLs from the visible text ---
    urls.update(re.findall(r"https?://[^\s]+", body))
    return headers, body, list(urls)
|