# Spaces: saharM / dialogue_summarizer (Sleeping) - File size: 9,479 Bytes

import streamlit as st
import re
import contractions
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration

# Page-level settings: browser-tab title, tab icon, and a centered layout.
st.set_page_config(page_title="Smart Summarizer", page_icon="✂️", layout="centered")

# Stylesheet injected into the page to force a dark, purple-accented theme.
# NOTE(review): this stylesheet defines no rule for the `section-header`
# class that the "Summary" / "Model Performance" <div>s use further down.
_DARK_THEME_CSS = """
    <style>
        /* Force everything to dark mode */
        html, body, .main, .stApp {
            background-color: #0f0f11 !important;
            color: #ffffff !important;
        }
        /* Universal text color */
        * {
            color: #e0e0e0 !important;
        }
        /* Text area styling */
        .stTextArea textarea {
            background-color: #1e1e22 !important;
            color: #ffffff !important;
            font-size: 16px !important;
            border: 1px solid #444 !important;
        }
        textarea:focus, .stTextArea textarea:focus {
            border: 2px solid #89CFF0 !important;
            box-shadow: 0 0 0 0.2rem rgba(137, 207, 240, 0.4);
        }
        /* Button styling */
        .stButton>button {
            background-color: #7b2cbf;
            color: white;
            font-weight: bold;
            border: none;
            border-radius: 6px;
            padding: 0.5rem 1rem;
        }
        .stButton>button:hover {
            background-color: #5a189a;
            color: #add8e6 !important;
        }
        /* Sidebar */
        section[data-testid="stSidebar"] {
            background-color: #1e1e22 !important;
        }
        /* Header / white band fix */
        header[data-testid="stHeader"] {
            background: transparent !important;
        }
        /* Table styling */
        .stTable td, .stTable th {
            color: #f4f4f4 !important;
            border-color: #333 !important;
        }
        /* Markdown headers */
        h1, h2, h3, h4 {
            color: #9d4edd !important;
        }
    </style>
"""

# unsafe_allow_html is needed so the <style> element is emitted as raw HTML.
st.markdown(_DARK_THEME_CSS, unsafe_allow_html=True)

# Load model and tokenizer
MODEL_PATH = "./models/fine-tuned_bart_base"


@st.cache_resource
def _load_model_and_tokenizer(model_path):
    """Load the fine-tuned BART model and its tokenizer from ``model_path``.

    Streamlit re-executes this script on every user interaction; caching the
    loaded objects as a shared resource avoids re-reading the weights from
    disk each time the "Summarize" button is pressed.

    Args:
        model_path (str): Directory holding the saved model and tokenizer.
    Returns:
        tuple: ``(model, tokenizer)``, with the model placed on the CPU.
    """
    loaded_model = BartForConditionalGeneration.from_pretrained(model_path)
    loaded_model = loaded_model.cpu()
    loaded_tokenizer = BartTokenizer.from_pretrained(model_path)
    return loaded_model, loaded_tokenizer


# Module-level names used by summarize_text().
model, tokenizer = _load_model_and_tokenizer(MODEL_PATH)


# Helper functions
def extract_speakers(dialogue):
    """
    Extracts the names of the first two distinct speakers in a dialogue.
    The dialogue is scanned line by line; on each line the text before the
    first colon (stripped of surrounding whitespace) is taken as a speaker
    label. Lines without a colon, lines with an empty label, and labels
    already seen are skipped, so a speaker name never spans several lines
    and the two returned names are always different and non-empty.
    Any line containing a colon is treated as a speaker turn; no attempt is
    made to tell a speaker label from other text that happens to contain ':'.
    Args:
        dialogue (str): The dialogue text containing speaker names and conversation.
    Returns:
        tuple: A tuple of (speaker_1, speaker_2) or (None, None) if fewer
        than two distinct speakers are found.
    """
    speakers = []
    for line in dialogue.splitlines():
        label, colon, _ = line.partition(':')
        label = label.strip()
        # Skip lines that are not "<name>: ..." turns and repeated speakers.
        if not colon or not label or label in speakers:
            continue
        speakers.append(label)
        if len(speakers) == 2:
            return speakers[0], speakers[1]

    return None, None

# Chat abbreviation -> expansion table. preprocess_text() looks tokens up by
# their upper-cased form, so keys must be upper case.
# Each key appears exactly once; for keys that used to be listed twice
# (LOL, LMAO, IDC) the later definition -- the one that was effective at
# runtime -- is the value kept here.
# NOTE(review): several keys are also ordinary words (e.g. "TIME", "KISS",
# "U"), so those words are expanded as well; confirm this matches the
# preprocessing the model was trained with before pruning any entry.
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laughing my a** off",
    "LOL": "Laughing out loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laughter",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don’t care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can’t stop laughing",
}

def preprocess_text(text):
    """
    Cleans a piece of text by running these steps, in this order:
    1. Lowercase the text.
    2. Expand contractions with ``contractions.fix``.
    3. Remove URLs (``http://``, ``https://`` and ``www.`` forms).
    4. Remove characters matched by the emoji ranges below.
    5. Replace whitespace-separated tokens found in ``chat_words`` (looked up
       by their upper-cased form) with the dictionary's expansion. The
       expansions are inserted as written in the dictionary, so they keep
       their own capitalisation.
    6. Remove ``<...>`` tags.
    7. Collapse runs of a repeated ``!``, ``?``, ``.`` or ``,`` to one
       character (e.g. "!!" -> "!", "???" -> "?").
    8. Put a single space on each side of ``. , ! ? ' " -``.
    9. Collapse all whitespace (newlines included) to single spaces and trim.
    Args:
        text (str): The input text string to preprocess.
    Returns:
        str: The cleaned and preprocessed text.
    """
    cleaned = contractions.fix(text.lower())

    # URLs
    cleaned = re.sub(r'https?://\S+|www\.\S+', '', cleaned)

    # Emoji code-point ranges.
    # NOTE(review): the last range runs from U+24C2 up to U+1F251, which is
    # far wider than emoji alone (it also covers e.g. CJK ideographs) --
    # confirm that stripping those characters is intended.
    emoji_ranges = (
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
    )
    cleaned = re.sub("[" + emoji_ranges + "]+", r'', cleaned, flags=re.UNICODE)

    # Chat abbreviations: split() + join() also flattens newlines to spaces.
    cleaned = " ".join(
        chat_words.get(token.upper(), token) for token in cleaned.split()
    )

    # Remaining regex clean-ups, applied strictly in this order.
    cleanup_steps = (
        (r'<.*?>', ''),                          # HTML-style tags
        (r'([!?.,])\1{1,}', r'\1'),              # e.g., "!!!" → "!"
        (r'\.{2,}', '.'),                        # "..." → "." (already handled by the step above)
        (r'\s*([.,!?\'"-])\s*', r' \1 '),        # pad punctuation with spaces
        (r'\s+', ' '),                           # squeeze whitespace
    )
    for pattern, replacement in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def anonymize_speakers(text: str, speaker_1: str, speaker_2: str) -> str:
    """Replace speaker names with placeholders.

    ``speaker_1`` becomes ``<speaker1>`` and ``speaker_2`` becomes
    ``<speaker2>``. Matching is exact (case-sensitive) and substring-based.

    All replacements happen in one pass, trying the longer name first, so a
    name that is a prefix of the other (e.g. "Ann" / "Anna") cannot corrupt
    it, and a placeholder that was just inserted is never rewritten. Names
    that are ``None`` or empty are ignored; if both are, ``text`` is returned
    unchanged. If both names are equal, ``<speaker1>`` is used.

    Args:
        text: The text in which to hide the speaker names.
        speaker_1: Name of the first speaker (``None``/empty is tolerated).
        speaker_2: Name of the second speaker (``None``/empty is tolerated).
    Returns:
        The text with every occurrence of the given names replaced.
    """
    placeholders = {}
    for name, tag in ((speaker_1, "<speaker1>"), (speaker_2, "<speaker2>")):
        # An empty pattern would match between every character; skip it.
        if name:
            placeholders.setdefault(name, tag)
    if not placeholders:
        return text

    longest_first = sorted(placeholders, key=len, reverse=True)
    pattern = "|".join(re.escape(name) for name in longest_first)
    return re.sub(pattern, lambda match: placeholders[match.group(0)], text)

def deanonymize_speakers(text: str, speaker_1: str, speaker_2: str) -> str:
    """Replace placeholders with original speaker names.

    ``<speaker1>`` is replaced by ``speaker_1`` and ``<speaker2>`` by
    ``speaker_2`` in a single pass, so text inserted for one placeholder is
    never rewritten by the other. A placeholder whose name is ``None`` is
    left in the text as-is instead of raising ``TypeError``.

    Args:
        text: Text that may contain ``<speaker1>`` / ``<speaker2>``.
        speaker_1: Name to substitute for ``<speaker1>``, or ``None``.
        speaker_2: Name to substitute for ``<speaker2>``, or ``None``.
    Returns:
        The text with the known placeholders substituted.
    """
    names_by_tag = {
        tag: name
        for tag, name in (("<speaker1>", speaker_1), ("<speaker2>", speaker_2))
        if name is not None
    }
    if not names_by_tag:
        return text

    pattern = "|".join(re.escape(tag) for tag in names_by_tag)
    # A callable replacement keeps backslashes in names literal.
    return re.sub(pattern, lambda match: names_by_tag[match.group(0)], text)


# Inference function
def summarize_text(txt):
    """Summarize a dialogue with the module-level BART ``model``/``tokenizer``.

    Steps: extract the two speaker names, preprocess the text, swap the names
    for placeholders, generate a summary, and swap the names back in.

    When no pair of speakers can be extracted (plain text, or a single
    speaker) the anonymization steps are skipped rather than passing ``None``
    to ``str.replace``. The input is truncated to the model's maximum number
    of positions so long inputs do not overflow the position embeddings.

    Args:
        txt (str): Raw text or dialogue entered by the user.
    Returns:
        str: The generated summary.
    """
    speaker_1, speaker_2 = extract_speakers(txt)
    have_speakers = bool(speaker_1) and bool(speaker_2)

    txt = preprocess_text(txt)
    if have_speakers:
        # NOTE(review): preprocess_text lowercases its input, so names that
        # contain upper-case letters will generally not match here. Confirm
        # how the training pipeline anonymized speakers before changing this.
        txt = anonymize_speakers(txt, speaker_1, speaker_2)

    inputs = tokenizer(
        txt,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    inputs = {k: v.cpu() for k, v in inputs.items()}
    summary_ids = model.generate(**inputs)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    if have_speakers:
        summary = deanonymize_speakers(summary, speaker_1, speaker_2)
    return summary

# Page heading and tagline
st.markdown("<h1 style='color:#9d4edd;'>Smart Summarizer</h1>", unsafe_allow_html=True)
st.markdown("A clean, purple-themed summarization app powered by a fine-tuned Transformers model.")

# Multi-line box where the user pastes the dialogue
text_input = st.text_area(
    "Enter the text to summarize:",
    height=300,
    placeholder="Paste a dialogue here...",
)

# Run the summarizer when the button is pressed; blank input only gets a warning
if st.button("Summarize"):
    if not text_input.strip():
        st.warning("Please enter text to summarize.")
    else:
        with st.spinner("Generating summary..."):
            summary = summarize_text(text_input)
        st.markdown("<div class='section-header'>Summary</div>", unsafe_allow_html=True)
        st.write(summary)

# Static "Model Performance" section: the ROUGE scores are hard-coded strings
# shown as a table indexed by metric name.
st.markdown("<div class='section-header'>Model Performance</div>", unsafe_allow_html=True)

metrics = {
    "ROUGE-1": "0.4193",
    "ROUGE-2": "0.2064",
    "ROUGE-L": "0.3469",
}

metrics_df = pd.DataFrame(
    metrics.items(),
    columns=["Metric", "Value"],
).set_index("Metric")
st.table(metrics_df)