Spaces:

saharM
/

dialogue_summarizer

Sleeping

App Files Files Community

dialogue_summarizer / app.py

saharM

fix cache

8ea80e6 7 months ago

raw

history blame contribute delete

9.48 kB

	import streamlit as st
	import re
	import contractions
	import pandas as pd
	from transformers import BartTokenizer, BartForConditionalGeneration

	# Browser-tab title, icon and page layout for the app.
	st.set_page_config(page_title="Smart Summarizer", page_icon="✂️", layout="centered")

	# Stylesheet injected into the page: dark backgrounds, light text, purple
	# buttons and headings. Kept in a named constant so the markdown call
	# below stays readable.
	_DARK_THEME_CSS = """
	<style>
	/* Force everything to dark mode */
	html, body, .main, .stApp {
	background-color: #0f0f11 !important;
	color: #ffffff !important;
	}
	/* Universal text color */
	* {
	color: #e0e0e0 !important;
	}
	/* Text area styling */
	.stTextArea textarea {
	background-color: #1e1e22 !important;
	color: #ffffff !important;
	font-size: 16px !important;
	border: 1px solid #444 !important;
	}
	textarea:focus, .stTextArea textarea:focus {
	border: 2px solid #89CFF0 !important;
	box-shadow: 0 0 0 0.2rem rgba(137, 207, 240, 0.4);
	}
	/* Button styling */
	.stButton>button {
	background-color: #7b2cbf;
	color: white;
	font-weight: bold;
	border: none;
	border-radius: 6px;
	padding: 0.5rem 1rem;
	}
	.stButton>button:hover {
	background-color: #5a189a;
	color: #add8e6 !important;
	}
	/* Sidebar */
	section[data-testid="stSidebar"] {
	background-color: #1e1e22 !important;
	}
	/* Header / white band fix */
	header[data-testid="stHeader"] {
	background: transparent !important;
	}
	/* Table styling */
	.stTable td, .stTable th {
	color: #f4f4f4 !important;
	border-color: #333 !important;
	}
	/* Markdown headers */
	h1, h2, h3, h4 {
	color: #9d4edd !important;
	}
	</style>
	"""

	# unsafe_allow_html is needed so the <style> element is emitted as HTML.
	st.markdown(_DARK_THEME_CSS, unsafe_allow_html=True)

	# Load model and tokenizer
	MODEL_PATH = "./models/fine-tuned_bart_base"


	@st.cache_resource
	def _load_model_and_tokenizer(model_path):
	    """
	    Loads the fine-tuned BART model (moved to CPU) and its tokenizer.

	    Streamlit re-executes this script on every widget interaction; caching
	    the loaded objects with st.cache_resource avoids reading the weights
	    from disk again on each rerun.
	    Args:
	        model_path (str): Directory containing the saved model and tokenizer.
	    Returns:
	        tuple: (model, tokenizer)
	    """
	    loaded_model = BartForConditionalGeneration.from_pretrained(model_path)
	    loaded_model = loaded_model.cpu()
	    loaded_tokenizer = BartTokenizer.from_pretrained(model_path)
	    return loaded_model, loaded_tokenizer


	# Module-level names used by summarize_text.
	model, tokenizer = _load_model_and_tokenizer(MODEL_PATH)


	#Helper functions
	def extract_speakers(dialogue):
	    """
	    Extracts the names of the first two speakers in a dialogue.

	    The dialogue is scanned line by line. A line counts as a speaker turn
	    when it contains a colon with non-blank text before it; that text
	    (stripped) is the speaker label. Lines without a label, such as blank
	    lines or wrapped continuation text, are skipped, so a label never spans
	    more than one line.
	    Speaker 1: the label of the first speaker turn.
	    Speaker 2: the first later label that differs from speaker 1. If every
	    turn belongs to the same speaker, that name is returned for both.
	    Args:
	        dialogue (str): The dialogue text containing speaker names and conversation.
	    Returns:
	        tuple: A tuple of (speaker_1, speaker_2) or (None, None) if fewer
	        than two speaker turns are found or the input is not a string.
	    """
	    if not isinstance(dialogue, str):
	        return None, None

	    first_speaker = None
	    labelled_turns = 0
	    for line in dialogue.splitlines():
	        # Split on the FIRST colon only, so colons inside the utterance
	        # (e.g. "10:30") do not affect the label.
	        label, colon, _ = line.partition(':')
	        label = label.strip()
	        if not colon or not label:
	            continue

	        labelled_turns += 1
	        if first_speaker is None:
	            first_speaker = label
	        elif label != first_speaker:
	            return first_speaker, label

	    # Only one distinct speaker: report the same name twice when there were
	    # at least two turns, otherwise signal failure.
	    if labelled_turns >= 2:
	        return first_speaker, first_speaker
	    return None, None

	# Chat abbreviation -> expansion table used by preprocess_text, which looks
	# up each whitespace-separated token in upper case.
	# Each key appears exactly once. The earlier revision listed "LOL", "LMAO"
	# and "IDC" twice; the value that took effect (the later one) is kept.
	# NOTE(review): preprocess_text lowercases its input before the lookup, so
	# keys that are also ordinary words (e.g. "TIME", "KISS", "STATS") are
	# expanded wherever those words occur - confirm this is intended.
	chat_words = {
	    "AFAIK": "As Far As I Know",
	    "AFK": "Away From Keyboard",
	    "ASAP": "As Soon As Possible",
	    "ATK": "At The Keyboard",
	    "ATM": "At The Moment",
	    "A3": "Anytime, Anywhere, Anyplace",
	    "BAK": "Back At Keyboard",
	    "BBL": "Be Back Later",
	    "BBS": "Be Back Soon",
	    "BFN": "Bye For Now",
	    "B4N": "Bye For Now",
	    "BRB": "Be Right Back",
	    "BRT": "Be Right There",
	    "BTW": "By The Way",
	    "B4": "Before",
	    "CU": "See You",
	    "CUL8R": "See You Later",
	    "CYA": "See You",
	    "FAQ": "Frequently Asked Questions",
	    "FC": "Fingers Crossed",
	    "FWIW": "For What It's Worth",
	    "FYI": "For Your Information",
	    "GAL": "Get A Life",
	    "GG": "Good Game",
	    "GN": "Good Night",
	    "GMTA": "Great Minds Think Alike",
	    "GR8": "Great!",
	    "G9": "Genius",
	    "IC": "I See",
	    "ICQ": "I Seek you (also a chat program)",
	    "ILU": "I Love You",
	    "IMHO": "In My Honest/Humble Opinion",
	    "IMO": "In My Opinion",
	    "IOW": "In Other Words",
	    "IRL": "In Real Life",
	    "KISS": "Keep It Simple, Stupid",
	    "LDR": "Long Distance Relationship",
	    "LMAO": "Laughing my a** off",
	    "LOL": "Laughing out loud",
	    "LTNS": "Long Time No See",
	    "L8R": "Later",
	    "MTE": "My Thoughts Exactly",
	    "M8": "Mate",
	    "NRN": "No Reply Necessary",
	    "OIC": "Oh I See",
	    "PITA": "Pain In The A..",
	    "PRT": "Party",
	    "PRW": "Parents Are Watching",
	    "QPSA": "Que Pasa?",
	    "ROFL": "Rolling On The Floor Laughing",
	    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
	    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
	    "SK8": "Skate",
	    "STATS": "Your sex and age",
	    "ASL": "Age, Sex, Location",
	    "THX": "Thank You",
	    "TTFN": "Ta-Ta For Now!",
	    "TTYL": "Talk To You Later",
	    "U": "You",
	    "U2": "You Too",
	    "U4E": "Yours For Ever",
	    "WB": "Welcome Back",
	    "WTF": "What The F...",
	    "WTG": "Way To Go!",
	    "WUF": "Where Are You From?",
	    "W8": "Wait...",
	    "7K": "Sick:-D Laughter",
	    "TFW": "That feeling when",
	    "MFW": "My face when",
	    "MRW": "My reaction when",
	    "IFYP": "I feel your pain",
	    "TNTL": "Trying not to laugh",
	    "JK": "Just kidding",
	    "IDC": "I don’t care",
	    "ILY": "I love you",
	    "IMU": "I miss you",
	    "ADIH": "Another day in hell",
	    "ZZZ": "Sleeping, bored, tired",
	    "WYWH": "Wish you were here",
	    "TIME": "Tears in my eyes",
	    "BAE": "Before anyone else",
	    "FIMH": "Forever in my heart",
	    "BSAAW": "Big smile and a wink",
	    "BWL": "Bursting with laughter",
	    "BFF": "Best friends forever",
	    "CSL": "Can’t stop laughing",
	}

	def preprocess_text(text):
	    """
	    Preprocesses input text by applying the following cleaning operations,
	    in this order:
	    - Lowercases the text
	    - Expands contractions with contractions.fix
	    - Removes URLs (http://, https:// and www. forms)
	    - Removes emojis
	    - Converts whitespace-separated chat abbreviations to full forms using
	      the chat_words dictionary (expansions keep the dictionary's casing)
	    - Removes HTML tags
	    - Normalizes punctuation (e.g., "!!" -> "!", "..." -> ".")
	    - Cleans up whitespace around punctuation and collapses all whitespace
	      runs (including newlines) to single spaces
	    Args:
	        text (str): The input text string to preprocess.
	    Returns:
	        str: The cleaned and preprocessed text.
	    """
	    # Lowercase the text
	    text = text.lower()

	    # Expand contractions
	    text = contractions.fix(text)

	    # Remove URLs. The "|" must be an unescaped alternation: the previous
	    # pattern used "\|", which matches a literal pipe character, so
	    # ordinary URLs were never removed.
	    text = re.sub(r'https?://\S+|www\.\S+', '', text)

	    # Remove emojis
	    # NOTE(review): the last range runs from U+24C2 to U+1F251, which also
	    # covers many non-emoji characters (for example CJK ideographs).
	    # Left unchanged here; confirm whether that breadth is intended.
	    emoji_pattern = re.compile("["
	                               u"\U0001F600-\U0001F64F"
	                               u"\U0001F300-\U0001F5FF"
	                               u"\U0001F680-\U0001F6FF"
	                               u"\U0001F1E0-\U0001F1FF"
	                               u"\U00002702-\U000027B0"
	                               u"\U000024C2-\U0001F251"
	                               "]+", flags=re.UNICODE)
	    text = emoji_pattern.sub(r'', text)

	    # Convert chat abbreviations. Tokens are upper-cased for the lookup
	    # because the dictionary keys are upper case.
	    text = " ".join(chat_words.get(token.upper(), token) for token in text.split())

	    # Remove HTML tags
	    text = re.sub(r'<.*?>', '', text)

	    # Collapse repeated punctuation, e.g. "!!!" -> "!" and "..." -> ".".
	    # This already covers runs of dots, so no separate pass is needed.
	    text = re.sub(r'([!?.,])\1{1,}', r'\1', text)

	    # Normalize spacing around punctuation
	    text = re.sub(r'\s([.,!?\'"-])\s', r' \1 ', text)

	    # Normalize whitespace
	    text = re.sub(r'\s+', ' ', text).strip()

	    return text

	def anonymize_speakers(text: str, speaker_1: str, speaker_2: str) -> str:
	    """
	    Replace speaker names with the placeholders "<speaker1>" / "<speaker2>".

	    Names are matched as whole words and case-insensitively, in a single
	    pass over the text:
	    - whole-word matching keeps a short name such as "Al" from rewriting
	      part of another word such as "Also";
	    - case-insensitive matching lets the original-case names still match
	      text that has already been lowercased;
	    - a single pass means text inserted for one speaker is never rescanned
	      for the other, and the longer name is tried first so "Ann Marie" is
	      not consumed by "Ann".
	    Args:
	        text (str): The dialogue text.
	        speaker_1 (str): Name replaced by "<speaker1>". None or "" is skipped.
	        speaker_2 (str): Name replaced by "<speaker2>". None or "" is skipped,
	            as is a name equal (ignoring case) to speaker_1.
	    Returns:
	        str: The text with the speaker names replaced.
	    """
	    targets = []
	    seen = set()
	    for name, placeholder in ((speaker_1, "<speaker1>"), (speaker_2, "<speaker2>")):
	        # Skip missing names: str.replace(None, ...) raised TypeError and an
	        # empty name would match between every character.
	        if not name or name.lower() in seen:
	            continue
	        seen.add(name.lower())
	        targets.append((name, placeholder))

	    if not targets:
	        return text

	    # Longest name first so a name that is a prefix of the other cannot win.
	    targets.sort(key=lambda pair: len(pair[0]), reverse=True)
	    alternatives = "|".join("(" + re.escape(name) + ")" for name, _ in targets)
	    pattern = r"(?<!\w)(?:" + alternatives + r")(?!\w)"

	    # Exactly one capture group matches per hit; its index selects the
	    # placeholder.
	    return re.sub(
	        pattern,
	        lambda match: targets[match.lastindex - 1][1],
	        text,
	        flags=re.IGNORECASE,
	    )

	def deanonymize_speakers(text: str, speaker_1: str, speaker_2: str) -> str:
	    """
	    Replace the placeholders "<speaker1>" / "<speaker2>" with speaker names.

	    Args:
	        text (str): Text that may contain the placeholders.
	        speaker_1 (str): Name substituted for "<speaker1>". If None, that
	            placeholder is left in the text unchanged.
	        speaker_2 (str): Name substituted for "<speaker2>". If None, that
	            placeholder is left in the text unchanged.
	    Returns:
	        str: The text with the available names restored.
	    """
	    # str.replace(..., None) raises TypeError, so a missing name is skipped
	    # rather than passed through.
	    for placeholder, name in (("<speaker1>", speaker_1), ("<speaker2>", speaker_2)):
	        if name is not None:
	            text = text.replace(placeholder, name)
	    return text


	# Inference function
	def summarize_text(txt):
	    """
	    Generates a summary of a dialogue with the loaded BART model.

	    Pipeline: extract the two speaker names, preprocess the text, replace
	    the names with placeholders, tokenize, generate, decode, and put the
	    original names back into the summary.
	    Args:
	        txt (str): The raw dialogue text.
	    Returns:
	        str: The generated summary.
	    """
	    speaker_1, speaker_2 = extract_speakers(txt)
	    # extract_speakers returns (None, None) on failure (e.g. text without
	    # "name:" turns). Skip the placeholder steps then instead of passing
	    # None to str.replace, which raises TypeError.
	    have_speakers = bool(speaker_1) and bool(speaker_2)

	    txt = preprocess_text(txt)
	    if have_speakers:
	        # preprocess_text lowercases the text, so the names are lowercased
	        # too; otherwise capitalised names never match and no placeholder
	        # is inserted.
	        # NOTE(review): this makes placeholders actually reach the model;
	        # confirm that matches how the model was fine-tuned.
	        txt = anonymize_speakers(txt, speaker_1.lower(), speaker_2.lower())

	    # Truncate to the model's position limit so long dialogues do not
	    # exceed the size of its positional embeddings.
	    inputs = tokenizer(
	        txt,
	        return_tensors="pt",
	        truncation=True,
	        max_length=model.config.max_position_embeddings,
	    )
	    inputs = {k: v.cpu() for k, v in inputs.items()}
	    summary_ids = model.generate(**inputs)
	    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

	    if have_speakers:
	        summary = deanonymize_speakers(summary, speaker_1, speaker_2)
	    return summary

	# Page heading and tagline
	st.markdown("<h1 style='color:#9d4edd;'>Smart Summarizer</h1>", unsafe_allow_html=True)
	st.markdown("A clean, purple-themed summarization app powered by a fine-tuned Transformers model.")

	# Dialogue entry box
	text_input = st.text_area(
	    "Enter the text to summarize:",
	    height=300,
	    placeholder="Paste a dialogue here...",
	)

	# On click: warn about blank input, otherwise run the model and show the
	# summary.
	summarize_clicked = st.button("Summarize")
	if summarize_clicked and not text_input.strip():
	    st.warning("Please enter text to summarize.")
	elif summarize_clicked:
	    with st.spinner("Generating summary..."):
	        summary = summarize_text(text_input)
	    st.markdown("<div class='section-header'>Summary</div>", unsafe_allow_html=True)
	    st.write(summary)

	# Static "Model Performance" section: the ROUGE values below are hard-coded
	# strings, rendered as a two-column table indexed by metric name.
	st.markdown("<div class='section-header'>Model Performance</div>", unsafe_allow_html=True)

	metrics = {"ROUGE-1": "0.4193", "ROUGE-2": "0.2064", "ROUGE-L": "0.3469"}

	metrics_df = pd.DataFrame(
	    list(metrics.items()),
	    columns=["Metric", "Value"],
	).set_index("Metric")
	st.table(metrics_df)