Spaces:
Sleeping
Sleeping
| import re | |
| from bs4 import BeautifulSoup | |
| import streamlit as st | |
# Compiled regular expressions that recognise the first line of an earlier
# message inside an email thread: reply/forward header fields, "X wrote:"
# attribution lines and separator lines, in several languages. Every pattern
# is anchored at the start of the line it is matched against.
PRIOR_MESSAGE_MARKERS = [
    re.compile(pattern, flags)
    for pattern, flags in (
        # English
        (r'^From:', re.IGNORECASE),
        (r'^Sent:', re.IGNORECASE),
        (r'^Subject:', re.IGNORECASE),
        (r'^To:', re.IGNORECASE),
        (r'^Date:', re.IGNORECASE),
        (r'^On .* wrote:', re.IGNORECASE),
        (r'^----\s?Original Message\s?----$', re.IGNORECASE),
        (r'^Begin forwarded message:', re.IGNORECASE),
        # Separator lines that start and end with dashes, such as
        # "--------------------------------------------------" (no flags).
        (r'^-+.*-+$', 0),
        # Portuguese
        (r'^Em .* escreveu:', re.IGNORECASE),
        (r'^De:\s', re.IGNORECASE),
        (r'^Para:\s', re.IGNORECASE),
        (r'^Data:\s', re.IGNORECASE),
        (r'^Assunto:\s', re.IGNORECASE),
        (r'^Mensagem original', re.IGNORECASE),
        # French
        (r'^De :\s', re.IGNORECASE),  # "From:"
        (r'^Le .* a écrit :', re.IGNORECASE),  # "On DATE, NAME wrote:"
        # German
        (r'^Am .* schrieb.*:', re.IGNORECASE),  # "On DATE, NAME wrote:"
        (r'^Von:\s', re.IGNORECASE),  # "From:"
        # Spanish
        (r'^El .* escribió:', re.IGNORECASE),  # "On DATE, NAME wrote:"
        # Chinese
        (r'^历史邮件$', re.IGNORECASE),  # "Historical Emails"
        # Dutch
        (r'^Op .* schreef.*:', re.IGNORECASE),
        (r'^Van:\s', re.IGNORECASE),
        (r'^Aan:\s', re.IGNORECASE),
        (r'^Onderwerp:\s', re.IGNORECASE),
        (r'^Verzonden:\s', re.IGNORECASE),
        (r'^Oorspronkelijk bericht', re.IGNORECASE),
        # Italian
        (r'^Il .* ha scritto:', re.IGNORECASE),
        (r'^Da:\s', re.IGNORECASE),
        (r'^A:\s', re.IGNORECASE),
        (r'^Oggetto:\s', re.IGNORECASE),
        (r'^Data:\s', re.IGNORECASE),
        (r'^Messaggio originale', re.IGNORECASE),
    )
]
def remove_quoted_text(soup):
    """Remove quoted/forwarded sections from parsed email HTML, in place.

    The following elements are removed from the tree:

    * every ``<blockquote>``;
    * every ``<div>`` with class ``ms-outlook-mobile-reference-message``;
    * every ``<hr>``;
    * every ``<table>`` whose inline ``style`` attribute contains the
      substring ``border-top:1px dotted``.

    Args:
        soup: Parsed document exposing ``find_all``; the elements it returns
            must expose ``get`` and ``decompose`` (e.g. a BeautifulSoup tree).

    Returns:
        The same ``soup`` object that was passed in, after modification.
    """
    # Remove blockquotes or quoted sections (typical for email threads)
    for blockquote in soup.find_all('blockquote'):
        blockquote.decompose()

    # Remove any divs that might indicate forwarded or quoted messages
    for div in soup.find_all('div', class_='ms-outlook-mobile-reference-message'):
        div.decompose()

    # Remove horizontal rules (often used to separate replies)
    for hr in soup.find_all('hr'):
        hr.decompose()

    # Remove tables with dotted borders (a typical marker of a previous
    # conversation). The matching tables are selected *before* any of them is
    # decomposed: decomposing a table also destroys the tables nested inside
    # it, and reading the style of such an already-destroyed nested table can
    # fail, so no attribute is read once removal has started.
    dotted_tables = [
        table
        for table in soup.find_all('table')
        if 'border-top:1px dotted' in (table.get('style') or '')
    ]
    for table in dotted_tables:
        table.decompose()

    return soup
def extract_latest_message_from_lines(lines):
    """Return the newest message from an email body given as a list of lines.

    Lines are processed in order. Each one is stripped of surrounding
    whitespace; ``None`` entries and lines that are empty after stripping are
    skipped. Processing stops at the first line that matches any pattern in
    ``PRIOR_MESSAGE_MARKERS``; that line and everything after it are
    discarded.

    Args:
        lines: Iterable of strings; ``None`` entries are tolerated.

    Returns:
        The kept lines joined with ``'\\n'``, or an empty string when no line
        was kept.
    """
    latest_message_lines = []
    for raw_line in lines:
        # The None test must come before .strip(); calling .strip() on None
        # raises AttributeError, which would defeat the guard.
        if raw_line is None:
            continue

        line = raw_line.strip()
        if not line:
            continue

        # Stop if a prior message marker is found.
        if any(marker.match(line) for marker in PRIOR_MESSAGE_MARKERS):
            break

        latest_message_lines.append(line)

    # Every kept line is already stripped and non-empty, so the joined text
    # needs no further trimming.
    return '\n'.join(latest_message_lines)
def extract_latest_email_text(email_html):
    """Return the text of the most recent message in an HTML email.

    The HTML is parsed with BeautifulSoup's ``html.parser``, passed through
    ``remove_quoted_text`` to drop quoted sections, converted to text with
    newline separators, split into lines, and finally reduced by
    ``extract_latest_message_from_lines`` to the part that precedes the first
    prior-message marker.
    """
    # Parse the markup and strip quoted/forwarded sections from the tree.
    cleaned_soup = remove_quoted_text(BeautifulSoup(email_html, 'html.parser'))

    # Flatten the remaining tree to text, one entry per line.
    text_lines = cleaned_soup.get_text(separator='\n', strip=True).split('\n')

    # Keep only what belongs to the latest message.
    return extract_latest_message_from_lines(text_lines)
| # Streamlit app | |
def main():
    """Render the Streamlit page for extracting the latest email message.

    Shows a title, a short description and a text area for the raw HTML.
    When the button is pressed, the pasted HTML is run through
    ``extract_latest_email_text`` and the result is shown in a second text
    area; empty input produces a warning and any exception raised while
    processing is reported with ``st.error``.
    """
    st.title("Email Latest Message Extractor")
    st.write(
        "\n"
        "This tool extracts the latest message from an HTML email and removes any quoted thread or previous messages.\n"
        "Paste the raw HTML of your email in the text area below, and the tool will parse and display the latest message.\n"
    )

    # Input field for the raw HTML email content
    email_html = st.text_area("Paste the HTML email content here", height=300)

    # Nothing more to do until the user asks for processing.
    if not st.button("Extract Latest Message"):
        return

    # Whitespace-only input counts as empty.
    if not email_html.strip():
        st.warning("Please paste the HTML content of the email.")
        return

    try:
        latest_message = extract_latest_email_text(email_html)
        st.subheader("Extracted Latest Message:")
        st.text_area("Latest Message", value=latest_message, height=200)
    except Exception as e:
        # Top-level UI boundary: show the failure on the page.
        st.error(f"An error occurred: {e}")
# Start the Streamlit app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()