# Smart Summarizer — Streamlit dialogue-summarization app (Hugging Face Space)
# --- Imports ------------------------------------------------------------------
import re

import contractions
import pandas as pd
import streamlit as st
from transformers import BartForConditionalGeneration, BartTokenizer

# --- Page configuration -------------------------------------------------------
st.set_page_config(
    page_title="Smart Summarizer",
    page_icon="✂️",
    layout="centered",
)
# Global dark-theme CSS, injected once at startup via an HTML <style> block.
_DARK_THEME_CSS = """
<style>
/* Force everything to dark mode */
html, body, .main, .stApp {
    background-color: #0f0f11 !important;
    color: #ffffff !important;
}
/* Universal text color */
* {
    color: #e0e0e0 !important;
}
/* Text area styling */
.stTextArea textarea {
    background-color: #1e1e22 !important;
    color: #ffffff !important;
    font-size: 16px !important;
    border: 1px solid #444 !important;
}
textarea:focus, .stTextArea textarea:focus {
    border: 2px solid #89CFF0 !important;
    box-shadow: 0 0 0 0.2rem rgba(137, 207, 240, 0.4);
}
/* Button styling */
.stButton>button {
    background-color: #7b2cbf;
    color: white;
    font-weight: bold;
    border: none;
    border-radius: 6px;
    padding: 0.5rem 1rem;
}
.stButton>button:hover {
    background-color: #5a189a;
    color: #add8e6 !important;
}
/* Sidebar */
section[data-testid="stSidebar"] {
    background-color: #1e1e22 !important;
}
/* Header / white band fix */
header[data-testid="stHeader"] {
    background: transparent !important;
}
/* Table styling */
.stTable td, .stTable th {
    color: #f4f4f4 !important;
    border-color: #333 !important;
}
/* Markdown headers */
h1, h2, h3, h4 {
    color: #9d4edd !important;
}
</style>
"""
st.markdown(_DARK_THEME_CSS, unsafe_allow_html=True)
# --- Model loading ------------------------------------------------------------
MODEL_PATH = "./models/fine-tuned_bart_base"


@st.cache_resource
def _load_summarizer(path):
    """Load the fine-tuned BART model (on CPU) and its tokenizer exactly once.

    Streamlit re-executes the entire script on every user interaction;
    without caching, the model would be re-read from disk on each rerun.

    Args:
        path (str): Directory containing the fine-tuned model/tokenizer files.

    Returns:
        tuple: (BartForConditionalGeneration, BartTokenizer)
    """
    mdl = BartForConditionalGeneration.from_pretrained(path).cpu()
    tok = BartTokenizer.from_pretrained(path)
    return mdl, tok


model, tokenizer = _load_summarizer(MODEL_PATH)
# Helper functions
def extract_speakers(dialogue):
    """
    Extract the names of the first two speakers in a dialogue.

    A speaker name is the text before the colon on a "Name: utterance"
    line. The first two such lines supply the two names. Scanning line by
    line fixes a defect in the original, whose `dialogue.index(':')`
    searched the whole string: a colon appearing only on a later line
    would pull multi-line garbage into a "name".

    Args:
        dialogue (str): Dialogue text with "Name: utterance" turns.

    Returns:
        tuple: (speaker_1, speaker_2), or (None, None) when two speaker
        lines cannot be found.
    """
    names = []
    for line in dialogue.splitlines():
        if ':' in line:
            names.append(line[:line.index(':')].strip())
            if len(names) == 2:
                return names[0], names[1]
    return None, None
# Chat-abbreviation -> full-phrase lookup used by preprocess_text().
# Keys are uppercase; lookups upper-case the token first.
# NOTE: the original literal defined "LOL", "IDC" and "LMAO" twice; Python
# silently keeps the last value, so the duplicates are removed here with the
# winning (later) values preserved.
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laughing my a** off",
    "LOL": "Laughing out loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laughter",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don’t care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can’t stop laughing",
}
def preprocess_text(text):
    """
    Clean a raw dialogue string before summarization.

    Pipeline: lowercase, expand contractions, strip URLs, strip emojis,
    expand chat abbreviations via ``chat_words``, strip HTML tags,
    collapse repeated punctuation, and normalize whitespace around
    punctuation.

    Args:
        text (str): Raw input text.

    Returns:
        str: Cleaned, single-spaced text.
    """
    # Lowercase, then expand contractions (e.g. "can't" -> "cannot").
    text = contractions.fix(text.lower())

    # Drop URLs.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Drop emojis (common Unicode emoji/symbol ranges).
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub(r'', text)

    # Expand chat abbreviations; lookup is effectively case-insensitive
    # because the token is upper-cased before the dict lookup.
    text = " ".join(chat_words.get(token.upper(), token) for token in text.split())

    # Drop HTML tags.
    text = re.sub(r'<.*?>', '', text)

    # Collapse repeated punctuation: "!!!" -> "!", "..." -> ".".
    text = re.sub(r'([!?.,])\1{1,}', r'\1', text)
    text = re.sub(r'\.{2,}', '.', text)

    # Put exactly one space on each side of punctuation, then collapse runs.
    text = re.sub(r'\s*([.,!?\'"-])\s*', r' \1 ', text)
    return re.sub(r'\s+', ' ', text).strip()
def anonymize_speakers(text: str, speaker_1: str, speaker_2: str) -> str:
    """Substitute placeholder tokens for the two speaker names."""
    for name, placeholder in ((speaker_1, "<speaker1>"), (speaker_2, "<speaker2>")):
        text = text.replace(name, placeholder)
    return text
def deanonymize_speakers(text: str, speaker_1: str, speaker_2: str) -> str:
    """Restore the original speaker names in place of the placeholder tokens."""
    for placeholder, name in (("<speaker1>", speaker_1), ("<speaker2>", speaker_2)):
        text = text.replace(placeholder, name)
    return text
# Inference function
def summarize_text(txt):
    """
    Summarize a dialogue with the fine-tuned BART model.

    The speaker names are replaced by placeholder tokens before generation
    and restored afterwards. Two fixes over the original:
    - ``preprocess_text`` lowercases the text, so the names must be
      lowercased before ``anonymize_speakers`` or the replacement never
      matches (the original passed the original-case names).
    - ``extract_speakers`` may return ``(None, None)``; anonymization is
      then skipped instead of crashing on ``str.replace(None, ...)``.

    Args:
        txt (str): Raw dialogue text.

    Returns:
        str: Generated summary with the original speaker names restored.
    """
    speaker_1, speaker_2 = extract_speakers(txt)
    txt = preprocess_text(txt)
    if speaker_1 and speaker_2:
        # Names must match the lowercased, preprocessed text.
        txt = anonymize_speakers(txt, speaker_1.lower(), speaker_2.lower())
    inputs = tokenizer(txt, return_tensors="pt")
    inputs = {k: v.cpu() for k, v in inputs.items()}
    summary_ids = model.generate(**inputs)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    if speaker_1 and speaker_2:
        summary = deanonymize_speakers(summary, speaker_1, speaker_2)
    return summary
# --- UI: title, input, and summarize action -----------------------------------
st.markdown("<h1 style='color:#9d4edd;'>Smart Summarizer</h1>", unsafe_allow_html=True)
st.markdown("A clean, purple-themed summarization app powered by a fine-tuned Transformers model.")

# Dialogue input box.
user_text = st.text_area(
    "Enter the text to summarize:",
    height=300,
    placeholder="Paste a dialogue here...",
)

# Run summarization when the button is pressed.
if st.button("Summarize"):
    if not user_text.strip():
        st.warning("Please enter text to summarize.")
    else:
        with st.spinner("Generating summary..."):
            result = summarize_text(user_text)
        st.markdown("<div class='section-header'>Summary</div>", unsafe_allow_html=True)
        st.write(result)
# --- Static model performance metrics -----------------------------------------
st.markdown("<div class='section-header'>Model Performance</div>", unsafe_allow_html=True)

# ROUGE scores from the fine-tuning evaluation, shown as a small table.
_METRICS = {
    "ROUGE-1": "0.4193",
    "ROUGE-2": "0.2064",
    "ROUGE-L": "0.3469",
}
st.table(
    pd.DataFrame(_METRICS.items(), columns=["Metric", "Value"]).set_index("Metric")
)