"""Extract headers, readable body text, and URLs from an e-mail message file."""

import email
import logging
import re
from email import policy

from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Intentionally loose: everything from the scheme up to the next whitespace.
# Trailing punctuation (".", ")", ...) is therefore kept as part of the match.
_URL_RE = re.compile(r"https?://[^\s]+")


def _decoded_text(part):
    """Return the decoded content of *part* if it is text, otherwise ``None``.

    ``get_content()`` can fail on malformed input (for example an unknown
    charset or a content type without a handler) and returns ``bytes`` for
    non-text parts. Both cases are treated as "nothing usable" so that one
    bad part does not abort parsing of the whole message.
    """
    try:
        content = part.get_content()
    except Exception:  # best effort: real-world mail is frequently malformed
        logger.warning(
            "Skipping undecodable %s part", part.get_content_type(), exc_info=True
        )
        return None
    return content if isinstance(content, str) else None


def _html_to_text_and_links(html):
    """Return ``(visible_text, hrefs)`` for an HTML string, or ``None`` on failure.

    The markup is parsed once; ``hrefs`` holds the ``href`` value of every
    ``<a>`` tag that has one, in document order.
    """
    try:
        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text(" ", strip=True)
        links = [anchor["href"] for anchor in soup.find_all("a", href=True)]
    except Exception:  # best effort: never let broken markup abort the parse
        logger.warning("Skipping unparsable HTML part", exc_info=True)
        return None
    return text, links


def parse_email(file_path):
    """Parse an e-mail file into its headers, body text, and URLs.

    Args:
        file_path: Path of the message file; it is opened in binary mode.

    Returns:
        A ``(headers, body, urls)`` tuple:

        * ``headers`` -- dict built from the message's header items. When a
          header name occurs more than once, only the last value survives.
        * ``body`` -- concatenated text of the message. In a multipart
          message only ``text/plain`` and ``text/html`` parts contribute; a
          single-part message contributes whatever text it carries. HTML is
          always reduced to its visible text. Non-text or undecodable
          content yields an empty contribution.
        * ``urls`` -- list of unique URLs in first-seen order: ``http(s)``
          URLs matched in each part's text, plus every ``<a href>`` value
          found in HTML parts.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    # --- 1. Extract headers ---
    headers = dict(msg.items())

    # --- 2. Extract body text and URLs in a single walk over the parts ---
    single_part = not msg.is_multipart()
    body_chunks = []
    urls = {}  # dict used as an insertion-ordered set

    for part in msg.walk():
        content_type = part.get_content_type()
        is_html = content_type == "text/html"
        if not (is_html or content_type == "text/plain" or single_part):
            continue

        text = _decoded_text(part)
        if text is None:
            continue

        links = []
        if is_html:
            parsed = _html_to_text_and_links(text)
            if parsed is None:
                continue
            text, links = parsed

        body_chunks.append(text)
        # Match URLs per part, not on the joined body: parts are concatenated
        # without a separator, so a URL ending one part would otherwise fuse
        # with the first word of the next.
        for url in _URL_RE.findall(text):
            urls.setdefault(url)
        for url in links:
            urls.setdefault(url)

    return headers, "".join(body_chunks), list(urls)