# saharM's picture
# fix cache
# 8ea80e6
import streamlit as st
import re
import contractions
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
# Page-level settings (tab title, page icon and layout), applied in one call.
_PAGE_SETTINGS = {
    "page_title": "Smart Summarizer",
    "page_icon": "✂️",
    "layout": "centered",
}
st.set_page_config(**_PAGE_SETTINGS)
# Custom CSS injected into the page: dark backgrounds, light text and purple
# accents for buttons and headings. Kept as a named constant so the markup is
# easy to find and the injection call stays short.
_DARK_THEME_CSS = """
<style>
/* Force everything to dark mode */
html, body, .main, .stApp {
background-color: #0f0f11 !important;
color: #ffffff !important;
}
/* Universal text color */
* {
color: #e0e0e0 !important;
}
/* Text area styling */
.stTextArea textarea {
background-color: #1e1e22 !important;
color: #ffffff !important;
font-size: 16px !important;
border: 1px solid #444 !important;
}
textarea:focus, .stTextArea textarea:focus {
border: 2px solid #89CFF0 !important;
box-shadow: 0 0 0 0.2rem rgba(137, 207, 240, 0.4);
}
/* Button styling */
.stButton>button {
background-color: #7b2cbf;
color: white;
font-weight: bold;
border: none;
border-radius: 6px;
padding: 0.5rem 1rem;
}
.stButton>button:hover {
background-color: #5a189a;
color: #add8e6 !important;
}
/* Sidebar */
section[data-testid="stSidebar"] {
background-color: #1e1e22 !important;
}
/* Header / white band fix */
header[data-testid="stHeader"] {
background: transparent !important;
}
/* Table styling */
.stTable td, .stTable th {
color: #f4f4f4 !important;
border-color: #333 !important;
}
/* Markdown headers */
h1, h2, h3, h4 {
color: #9d4edd !important;
}
</style>
"""
# unsafe_allow_html is required for the raw <style> element to be emitted.
st.markdown(_DARK_THEME_CSS, unsafe_allow_html=True)
# Load model and tokenizer
MODEL_PATH = "./models/fine-tuned_bart_base"


@st.cache_resource
def _load_model_and_tokenizer(model_path):
    """Load the fine-tuned BART model and its tokenizer from ``model_path``.

    Streamlit re-executes this script on every user interaction. Caching the
    loaded objects as a shared resource means the weights are read from disk
    once per server process instead of on every rerun.

    Args:
        model_path (str): Directory containing the saved model and tokenizer.

    Returns:
        tuple: ``(model, tokenizer)`` with the model placed on the CPU.
    """
    loaded_model = BartForConditionalGeneration.from_pretrained(model_path)
    loaded_model = loaded_model.cpu()
    loaded_tokenizer = BartTokenizer.from_pretrained(model_path)
    return loaded_model, loaded_tokenizer


# Module-level names used by the inference code below.
model, tokenizer = _load_model_and_tokenizer(MODEL_PATH)
# Helper functions
def extract_speakers(dialogue):
    """
    Extract the names of the first two distinct speakers in a dialogue.

    Each line is expected to look like ``Name: utterance``. The text in front
    of the first colon on a line (surrounding whitespace removed) is taken as
    that line's speaker. Lines without a colon, or with nothing in front of
    it, are skipped, so a name never spans more than one line and repeated
    turns by the same speaker do not hide the second speaker.

    Args:
        dialogue (str): The dialogue text containing speaker names and conversation.

    Returns:
        tuple: ``(speaker_1, speaker_2)`` in order of first appearance. When
        only one speaker can be identified, that name is returned in both
        positions so callers always receive two usable strings. When no
        speaker can be identified (including empty or non-string input),
        ``(None, None)`` is returned.
    """
    if not isinstance(dialogue, str):
        return None, None

    speakers = []
    for line in dialogue.splitlines():
        name, colon, _ = line.partition(':')
        name = name.strip()
        # Ignore lines that carry no speaker prefix or repeat a known speaker.
        if not colon or not name or name in speakers:
            continue
        speakers.append(name)
        if len(speakers) == 2:
            break

    if not speakers:
        return None, None
    # With a single speaker, index -1 is the same element as index 0.
    return speakers[0], speakers[-1]
# Lookup table mapping upper-case chat abbreviations to their expansions.
# Each key appears exactly once; where the original table listed a key twice
# (LOL, LMAO, IDC) the later value was the one in effect and is the one kept.
# NOTE(review): some keys (e.g. "TIME", "U", "KISS", "GAL", "STATS") are also
# ordinary words, so any caller that upper-cases tokens before the lookup will
# expand those words too — confirm this is intended.
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laughter",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "LOL": "Laughing out loud",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don’t care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "LMAO": "Laughing my a** off",
    "BFF": "Best friends forever",
    "CSL": "Can’t stop laughing",
}
# Patterns used by preprocess_text, compiled once and listed in the order in
# which they are applied.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
# NOTE(review): the last range (U+24C2 .. U+1F251) is very wide and also
# covers many non-emoji code points (for example the CJK ideograph block), so
# such characters are removed as well.
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)
_HTML_TAG_RE = re.compile(r'<.*?>')
_REPEATED_PUNCT_RE = re.compile(r'([!?.,])\1{1,}')
_MULTI_DOT_RE = re.compile(r'\.{2,}')
_PUNCT_SPACING_RE = re.compile(r'\s*([.,!?\'"-])\s*')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text):
    """
    Clean a piece of text for the summarization model.

    The steps run in this order:
    1. Lowercase the text.
    2. Expand contractions via ``contractions.fix``.
    3. Remove URLs (``http(s)://...`` and ``www....``).
    4. Remove characters matched by the emoji pattern.
    5. Split on whitespace and replace every token whose upper-case form is a
       key of ``chat_words`` with its expansion. Expansions are inserted as
       written in the dictionary, so they are not lowercased. Splitting and
       re-joining also turns every whitespace run, including newlines, into
       a single space.
    6. Remove HTML tags.
    7. Collapse repeated ``!``, ``?``, ``.`` and ``,`` into a single character.
    8. Put exactly one space on each side of ``. , ! ? ' " -``.
    9. Collapse whitespace runs and strip both ends.

    Args:
        text (str): The input text string to preprocess.

    Returns:
        str: The cleaned and preprocessed text.
    """
    cleaned = contractions.fix(text.lower())
    cleaned = _URL_RE.sub('', cleaned)
    cleaned = _EMOJI_RE.sub('', cleaned)

    # Token-wise abbreviation expansion; unknown tokens pass through unchanged.
    cleaned = " ".join(
        chat_words.get(token.upper(), token) for token in cleaned.split()
    )

    cleaned = _HTML_TAG_RE.sub('', cleaned)
    cleaned = _REPEATED_PUNCT_RE.sub(r'\1', cleaned)  # e.g., "!!!" → "!"
    cleaned = _MULTI_DOT_RE.sub('.', cleaned)  # e.g., "..." → "."
    cleaned = _PUNCT_SPACING_RE.sub(r' \1 ', cleaned)
    return _WHITESPACE_RE.sub(' ', cleaned).strip()
def anonymize_speakers(text: str, speaker_1: str, speaker_2: str) -> str:
    """Replace speaker names with placeholders.

    Every occurrence of ``speaker_1`` becomes ``<speaker1>`` and every
    occurrence of ``speaker_2`` becomes ``<speaker2>``. Both names are
    substituted in a single pass over ``text``, trying the longer name first,
    so one name being a prefix of the other (``Ann`` / ``Anna``) cannot
    produce a half-replaced name, and a name cannot match inside a
    placeholder that was just inserted.

    Args:
        text: The text in which names are replaced.
        speaker_1: Name mapped to ``<speaker1>``. ``None`` or ``""`` is skipped.
        speaker_2: Name mapped to ``<speaker2>``. ``None`` or ``""`` is skipped.
            If it equals ``speaker_1``, the ``<speaker1>`` placeholder is used.

    Returns:
        The text with the names replaced; unchanged if there is nothing to
        replace.
    """
    placeholders = {}
    for name, tag in ((speaker_1, "<speaker1>"), (speaker_2, "<speaker2>")):
        # A missing name would raise in a plain replace, and an empty one
        # would match between every character, so both are ignored.
        if name:
            placeholders.setdefault(name, tag)
    if not placeholders:
        return text

    # Longest first so the alternation prefers "Anna" over "Ann".
    ordered_names = sorted(placeholders, key=len, reverse=True)
    pattern = "|".join(re.escape(name) for name in ordered_names)
    return re.sub(pattern, lambda match: placeholders[match.group(0)], text)
def deanonymize_speakers(text: str, speaker_1: str, speaker_2: str) -> str:
    """Replace placeholders with original speaker names.

    ``<speaker1>`` is replaced by ``speaker_1`` and ``<speaker2>`` by
    ``speaker_2`` in a single pass, so a restored name is never re-scanned
    for placeholders.

    Args:
        text: The text containing ``<speaker1>`` / ``<speaker2>`` placeholders.
        speaker_1: Name substituted for ``<speaker1>``. If ``None``, that
            placeholder is left in place.
        speaker_2: Name substituted for ``<speaker2>``. If ``None``, that
            placeholder is left in place.

    Returns:
        The text with placeholders replaced by the given names.
    """
    names = {"<speaker1>": speaker_1, "<speaker2>": speaker_2}

    def _restore(match):
        # Keep the placeholder when no name is available for it.
        placeholder = match.group(0)
        name = names[placeholder]
        return placeholder if name is None else name

    return re.sub(r"<speaker[12]>", _restore, text)
# Inference function
def summarize_text(txt):
    """Summarize a dialogue with the fine-tuned BART model.

    The speaker names are read from the raw text, the text is preprocessed,
    the names are swapped for placeholders, the model generates a summary,
    and the placeholders in the summary are swapped back for the names.

    Args:
        txt (str): The raw dialogue text entered by the user.

    Returns:
        str: The generated summary.
    """
    speaker_1, speaker_2 = extract_speakers(txt)
    # extract_speakers may return (None, None) or an empty name. Passing
    # those on would raise a TypeError or mangle the text, so the
    # (de)anonymization steps are skipped in that case.
    has_speakers = bool(speaker_1) and bool(speaker_2)

    txt = preprocess_text(txt)
    # NOTE(review): preprocess_text lowercases the dialogue while the speaker
    # names come from the raw text, so capitalized names will not match here.
    # Confirm against the training pipeline before changing this.
    if has_speakers:
        txt = anonymize_speakers(txt, speaker_1, speaker_2)

    # Truncate to the model's position limit; longer inputs would otherwise
    # fail inside generate().
    max_input_tokens = getattr(model.config, "max_position_embeddings", 1024)
    inputs = tokenizer(
        txt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )
    inputs = {k: v.cpu() for k, v in inputs.items()}
    summary_ids = model.generate(**inputs)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    if has_speakers:
        summary = deanonymize_speakers(summary, speaker_1, speaker_2)
    return summary
# Page heading and tagline
st.markdown(
    "<h1 style='color:#9d4edd;'>Smart Summarizer</h1>",
    unsafe_allow_html=True,
)
st.markdown("A clean, purple-themed summarization app powered by a fine-tuned Transformers model.")

# Free-text box holding the dialogue to summarize
text_input = st.text_area(
    "Enter the text to summarize:",
    height=300,
    placeholder="Paste a dialogue here...",
)

# Clicking the button summarizes the input, or warns when the box is empty
# or contains only whitespace.
if st.button("Summarize"):
    if not text_input.strip():
        st.warning("Please enter text to summarize.")
    else:
        with st.spinner("Generating summary..."):
            summary = summarize_text(text_input)
        st.markdown(
            "<div class='section-header'>Summary</div>",
            unsafe_allow_html=True,
        )
        st.write(summary)
# Static ROUGE scores shown under a "Model Performance" heading.
st.markdown(
    "<div class='section-header'>Model Performance</div>",
    unsafe_allow_html=True,
)
metrics = {"ROUGE-1": "0.4193", "ROUGE-2": "0.2064", "ROUGE-L": "0.3469"}
# One row per metric, with the metric name as the index of the table.
metrics_df = pd.DataFrame(
    metrics.items(),
    columns=["Metric", "Value"],
).set_index("Metric")
st.table(metrics_df)