import re
from typing import Optional, Tuple

import contractions
import pandas as pd
import streamlit as st
from transformers import BartForConditionalGeneration, BartTokenizer

# Set page config
st.set_page_config(
    page_title="Smart Summarizer",
    page_icon="✂️",
    layout="centered",
)

# NOTE(review): this call currently injects only whitespace; presumably a
# custom <style> block is meant to live here -- confirm against the
# original project.
st.markdown(""" """, unsafe_allow_html=True)

# Load model and tokenizer
# NOTE(review): Streamlit re-executes the script on every interaction, so the
# model is reloaded each time; consider wrapping the load in
# ``st.cache_resource`` if the installed Streamlit version provides it.
MODEL_PATH = "./models/fine-tuned_bart_base"
model = BartForConditionalGeneration.from_pretrained(MODEL_PATH)
model = model.cpu()
tokenizer = BartTokenizer.from_pretrained(MODEL_PATH)

# Placeholder tokens substituted for the speaker names before inference and
# swapped back afterwards. They must be non-empty and distinct:
# ``str.replace("", name)`` would insert the name between every character.
# NOTE(review): the exact spelling should match whatever placeholders the
# model saw during fine-tuning -- confirm against the training pipeline.
SPEAKER_1_PLACEHOLDER = "<speaker_1>"
SPEAKER_2_PLACEHOLDER = "<speaker_2>"


# Helper functions
def extract_speakers(dialogue: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Extracts the speaker labels of the first two lines of a dialogue.

    Speaker 1: text from the first non-space character to the first colon.
    Speaker 2: text from just after the first newline to the next colon.

    Args:
        dialogue (str): The dialogue text containing speaker names and
            conversation.

    Returns:
        tuple: ``(speaker_1, speaker_2)``, or ``(None, None)`` when the text
        has no colon, no newline, or no colon after the first newline.
    """
    try:
        # First speaker: from start (after leading whitespace) to the first colon
        dialogue = dialogue.lstrip()
        speaker_1 = dialogue[:dialogue.index(':')].strip()

        # Second speaker: label at the start of the line after the first newline
        newline_index = dialogue.index('\n')
        sub_dialogue = dialogue[newline_index + 1:]
        speaker_2 = sub_dialogue[:sub_dialogue.index(':')].strip()

        return speaker_1, speaker_2
    except (ValueError, IndexError):
        return None, None


# Chat abbreviation -> expansion table used by preprocess_text (keys are
# looked up upper-cased). Keys that appeared twice in the original literal
# (LOL, LMAO, IDC) are listed once with the value that was actually in
# effect (the later one).
# NOTE(review): some keys are also ordinary words once case is ignored
# (e.g. "TIME", "KISS", "STATS", "ATM"), so those words get expanded too;
# left untouched in case the model was fine-tuned with the same table.
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laughter",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "LOL": "Laughing out loud",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don’t care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "LMAO": "Laughing my a** off",
    "BFF": "Best friends forever",
    "CSL": "Can’t stop laughing",
}

# Compiled once at import time instead of on every preprocess_text call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')

# The original class contained the single range U+24C2..U+1F251, which also
# spans CJK, Hangul, kana and many other letters, so ordinary non-Latin text
# was deleted. It is replaced by the symbol ranges inside it.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE0F"             # variation selector-16
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
    "]+"
)


def preprocess_text(text: str) -> str:
    """
    Preprocesses input text by applying the following cleaning operations:
    - Lowercases the text
    - Expands contractions (e.g., "can't" -> "cannot")
    - Removes URLs
    - Removes emojis
    - Converts chat abbreviations to full forms using the chat_words dictionary
    - Removes HTML tags
    - Normalizes repeated punctuation (e.g., "!!" -> "!", "..." -> ".")
    - Puts a single space around punctuation and collapses whitespace

    Args:
        text (str): The input text string to preprocess.

    Returns:
        str: The cleaned and preprocessed text (single line, no leading or
        trailing whitespace).
    """
    # Lowercase the text
    text = text.lower()

    # Expand contractions
    text = contractions.fix(text)

    # Remove URLs
    text = _URL_PATTERN.sub('', text)

    # Remove emojis
    text = _EMOJI_PATTERN.sub('', text)

    # Convert chat abbreviations (whitespace-delimited tokens only; this also
    # collapses newlines and runs of spaces into single spaces)
    text = " ".join(chat_words.get(word.upper(), word) for word in text.split())

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Normalize repeated punctuation, e.g. "!!!" -> "!", "..." -> "."
    # (this already collapses runs of dots, so no separate dot rule is needed)
    text = re.sub(r'([!?.,])\1{1,}', r'\1', text)

    # Normalize spacing around punctuation
    text = re.sub(r'\s*([.,!?\'"-])\s*', r' \1 ', text)

    # Normalize whitespace
    return re.sub(r'\s+', ' ', text).strip()


def anonymize_speakers(text: str, speaker_1: Optional[str], speaker_2: Optional[str]) -> str:
    """
    Replace speaker names with placeholders.

    Matching is case-insensitive because the text handed in by
    summarize_text has already been lowercased while the names keep their
    original case. Both names are replaced in a single pass, longest name
    first, so a name that is a substring of the other (e.g. "Ann" / "Anna")
    does not corrupt the longer one.

    Args:
        text (str): Text in which to hide the names.
        speaker_1 (str | None): First speaker name; skipped if None or empty.
        speaker_2 (str | None): Second speaker name; skipped if None or empty.

    Returns:
        str: The text with each occurrence of a name replaced by its
        placeholder, or the text unchanged when no usable name was given.
    """
    candidates = (
        (speaker_1, SPEAKER_1_PLACEHOLDER),
        (speaker_2, SPEAKER_2_PLACEHOLDER),
    )
    # Drop missing/empty names: replacing "" would match at every position.
    pairs = [(name, placeholder) for name, placeholder in candidates if name]
    if not pairs:
        return text

    # Stable sort keeps speaker_1 ahead of speaker_2 when lengths are equal.
    pairs.sort(key=lambda pair: len(pair[0]), reverse=True)
    pattern = re.compile(
        "|".join(f"({re.escape(name)})" for name, _ in pairs),
        re.IGNORECASE,
    )
    # Exactly one top-level group matches; its index selects the placeholder.
    return pattern.sub(lambda match: pairs[match.lastindex - 1][1], text)


def deanonymize_speakers(text: str, speaker_1: Optional[str], speaker_2: Optional[str]) -> str:
    """
    Replace placeholders with original speaker names.

    Args:
        text (str): Text possibly containing speaker placeholders.
        speaker_1 (str | None): Name to restore for the first placeholder;
            the placeholder is left as-is if this is None or empty.
        speaker_2 (str | None): Name to restore for the second placeholder;
            the placeholder is left as-is if this is None or empty.

    Returns:
        str: The text with placeholders swapped back to the given names.
    """
    if speaker_1:
        text = text.replace(SPEAKER_1_PLACEHOLDER, speaker_1)
    if speaker_2:
        text = text.replace(SPEAKER_2_PLACEHOLDER, speaker_2)
    return text


# Inference function
def summarize_text(txt: str) -> str:
    """
    Summarize a dialogue with the fine-tuned BART model.

    Pipeline: extract the two speaker names, clean the text, hide the names
    behind placeholders, generate a summary, then restore the names.

    Args:
        txt (str): Raw dialogue text.

    Returns:
        str: The generated summary with speaker names restored.
    """
    speaker_1, speaker_2 = extract_speakers(txt)
    cleaned = preprocess_text(txt)
    cleaned = anonymize_speakers(cleaned, speaker_1, speaker_2)

    # Truncate to the model's position limit so over-long input is cut
    # rather than passed through at full length.
    inputs = tokenizer(
        cleaned,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    inputs = {k: v.cpu() for k, v in inputs.items()}

    summary_ids = model.generate(**inputs)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return deanonymize_speakers(summary, speaker_1, speaker_2)
# Title
# NOTE(review): the heading strings below contain only plain text surrounded
# by newlines; presumably heading markup was intended (they are rendered with
# unsafe_allow_html=True) -- confirm against the original project.
st.markdown("\n\nSmart Summarizer\n\n", unsafe_allow_html=True)
st.markdown(
    "A clean, purple-themed summarization app powered by a fine-tuned Transformers model."
)

# Text input
text_input = st.text_area(
    "Enter the text to summarize:",
    height=300,
    placeholder="Paste a dialogue here...",
)

# Summarize button: warn on blank input, otherwise run the model and show
# the result under a "Summary" heading.
if st.button("Summarize"):
    if not text_input.strip():
        st.warning("Please enter text to summarize.")
    else:
        with st.spinner("Generating summary..."):
            summary = summarize_text(text_input)
        st.markdown("\nSummary\n", unsafe_allow_html=True)
        st.write(summary)

# Show simplified model metrics (hard-coded scores rendered as a static table)
st.markdown("\nModel Performance\n", unsafe_allow_html=True)
metrics = {
    "ROUGE-1": "0.4193",
    "ROUGE-2": "0.2064",
    "ROUGE-L": "0.3469",
}
metrics_df = pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).set_index("Metric")
st.table(metrics_df)