File size: 9,479 Bytes
2f5849c
 
 
 
dc44059
2f5849c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
030743b
02ed743
49f382c
dc44059
2f5849c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4081b3
2f5849c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71885e4
2f5849c
 
 
 
 
 
 
 
 
10aeeef
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
import streamlit as st
import re
import contractions
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration

# Page-level settings (browser tab title, icon, layout). Kept ahead of every
# other Streamlit call in this script.
st.set_page_config(page_title="Smart Summarizer", page_icon="✂️", layout="centered")

# Inject the dark/purple theme. The stylesheet is raw HTML, so it has to be
# rendered with unsafe_allow_html=True.
st.markdown("""
    <style>
        /* Force everything to dark mode */
        html, body, .main, .stApp {
            background-color: #0f0f11 !important;
            color: #ffffff !important;
        }
        /* Universal text color */
        * {
            color: #e0e0e0 !important;
        }
        /* Text area styling */
        .stTextArea textarea {
            background-color: #1e1e22 !important;
            color: #ffffff !important;
            font-size: 16px !important;
            border: 1px solid #444 !important;
        }
        textarea:focus, .stTextArea textarea:focus {
            border: 2px solid #89CFF0 !important;
            box-shadow: 0 0 0 0.2rem rgba(137, 207, 240, 0.4);
        }
        /* Button styling */
        .stButton>button {
            background-color: #7b2cbf;
            color: white;
            font-weight: bold;
            border: none;
            border-radius: 6px;
            padding: 0.5rem 1rem;
        }
        .stButton>button:hover {
            background-color: #5a189a;
            color: #add8e6 !important;
        }
        /* Sidebar */
        section[data-testid="stSidebar"] {
            background-color: #1e1e22 !important;
        }
        /* Header / white band fix */
        header[data-testid="stHeader"] {
            background: transparent !important;
        }
        /* Table styling */
        .stTable td, .stTable th {
            color: #f4f4f4 !important;
            border-color: #333 !important;
        }
        /* Markdown headers */
        h1, h2, h3, h4 {
            color: #9d4edd !important;
        }
        /* Section headers rendered as <div class='section-header'> */
        .section-header {
            color: #9d4edd !important;
            font-size: 1.5rem;
            font-weight: 600;
            margin: 1.5rem 0 0.5rem 0;
        }
    </style>
""", unsafe_allow_html=True)

# Load model and tokenizer
MODEL_PATH = "./models/fine-tuned_bart_base"


@st.cache_resource
def _load_model_and_tokenizer(model_path):
    """Load the fine-tuned BART model and its tokenizer from ``model_path``.

    Streamlit re-executes this script on every widget interaction; caching the
    loaded objects as a shared resource avoids re-reading the weights from
    disk on each rerun.

    Args:
        model_path (str): Directory containing the saved model and tokenizer.
    Returns:
        tuple: ``(model, tokenizer)`` with the model placed on the CPU.
    """
    loaded_model = BartForConditionalGeneration.from_pretrained(model_path).cpu()
    loaded_tokenizer = BartTokenizer.from_pretrained(model_path)
    return loaded_model, loaded_tokenizer


model, tokenizer = _load_model_and_tokenizer(MODEL_PATH)


# Helper functions
def extract_speakers(dialogue):
    """
    Extracts the names of the first two distinct speakers in a dialogue.
    A speaker label is the text before the first colon on a line
    (e.g. "Amanda: hi" -> "Amanda"). Lines without a colon, or with nothing
    before the colon, are treated as continuation text and skipped, and a
    speaker who talks several turns in a row is only counted once.
    Args:
        dialogue (str): The dialogue text containing speaker names and conversation.
    Returns:
        tuple: A tuple of (speaker_1, speaker_2) or (None, None) if fewer than
        two distinct speakers are found.
    """
    if not dialogue:
        return None, None

    speakers = []
    # splitlines() copes with both "\n" and "\r\n" line endings.
    for line in dialogue.splitlines():
        label, colon, _ = line.partition(':')
        label = label.strip()
        if not colon or not label:
            continue
        if label not in speakers:
            speakers.append(label)
            if len(speakers) == 2:
                return speakers[0], speakers[1]

    return None, None

# Chat abbreviation -> expansion table used by preprocess_text.
# Keys are upper case because the lookup is done on the upper-cased token.
# Each key appears exactly once; where the table used to list a key twice, the
# value that actually took effect (the later one) is the one kept here.
# NOTE(review): because the text is lowercased and then each token is
# upper-cased for lookup, keys such as "TIME", "U", "KISS", "IC" or "STATS"
# also match the ordinary words "time", "u", "kiss", ... - confirm this is
# intended.
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laughing my a** off",
    "LOL": "Laughing out loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laughter",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don’t care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can’t stop laughing",
}

# Emoji / pictograph code points removed by preprocess_text. Compiled once at
# import time. The ranges are listed block by block rather than as one span
# from U+24C2 to U+1F251, because such a span also covers CJK, Hangul and most
# other scripts above U+24C2 and would delete that text as well.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F000-\U0001FAFF"  # emoticons, pictographs, transport, flags, supplemental symbols
    "\u2600-\u27BF"          # miscellaneous symbols and dingbats
    "\u2B00-\u2BFF"          # miscellaneous symbols and arrows (stars, squares)
    "\u25A0-\u25FF"          # geometric shapes (play/stop buttons, squares)
    "\u24C2"                 # circled M
    "\uFE0F"                 # emoji variation selector
    "]+"
)


def preprocess_text(text):
    """
    Preprocesses input text by applying the following cleaning operations, in order:
    - Lowercases the text
    - Expands contractions with contractions.fix
    - Removes URLs (http://, https:// and www. prefixes)
    - Removes emojis
    - Converts chat abbreviations to full forms using the chat_words dictionary
      (whole whitespace-separated tokens only; expansions keep the casing
      stored in chat_words)
    - Removes HTML-style tags (anything between "<" and the next ">")
    - Collapses repeated punctuation (e.g., "!!" -> "!", "..." -> ".")
    - Pads the punctuation marks . , ! ? ' " - with a single space on each side
    - Collapses all whitespace runs (including newlines) to single spaces
    Args:
        text (str): The input text string to preprocess.
    Returns:
        str: The cleaned and preprocessed text.
    """
    # Lowercase the text
    text = text.lower()

    # Expand contractions
    text = contractions.fix(text)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove emojis
    text = _EMOJI_PATTERN.sub('', text)

    # Convert chat abbreviations; chat_words keys are upper case.
    text = " ".join(chat_words.get(word.upper(), word) for word in text.split())

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Collapse runs of the same punctuation mark, e.g. "!!!" -> "!", "..." -> "."
    text = re.sub(r'([!?.,])\1+', r'\1', text)

    # Normalize spacing around punctuation
    text = re.sub(r'\s*([.,!?\'"-])\s*', r' \1 ', text)

    # Normalize whitespace
    return re.sub(r'\s+', ' ', text).strip()

def anonymize_speakers(text: str, speaker_1: str, speaker_2: str) -> str:
    """Replace speaker names with the placeholders <speaker1> / <speaker2>.

    Matching is case-insensitive and limited to whole names, so a name such as
    "Tom" is replaced in "tom: hi" but not inside "tomorrow". Both names are
    substituted in a single pass, which keeps one name from being rewritten
    inside the other (or inside an already inserted placeholder).

    Args:
        text: The dialogue text to anonymize.
        speaker_1: Name mapped to "<speaker1>"; skipped when None or empty.
        speaker_2: Name mapped to "<speaker2>"; skipped when None or empty.
    Returns:
        The text with every matched name replaced; unchanged when neither
        name is usable.
    """
    targets = [
        (name, tag)
        for name, tag in ((speaker_1, "<speaker1>"), (speaker_2, "<speaker2>"))
        if name
    ]
    if not targets:
        return text

    # Longest name first so "Anna" wins over "Ann" where both could match.
    targets.sort(key=lambda item: len(item[0]), reverse=True)
    alternatives = "|".join(f"({re.escape(name)})" for name, _ in targets)
    # Lookarounds instead of \b so names that start or end with punctuation
    # (e.g. "Dr.") still match.
    pattern = re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)", re.IGNORECASE)

    # Each name is its own capture group; lastindex says which one matched.
    return pattern.sub(lambda match: targets[match.lastindex - 1][1], text)

def deanonymize_speakers(text: str, speaker_1: str, speaker_2: str) -> str:
    """Replace the <speaker1> / <speaker2> placeholders with speaker names.

    Both placeholders are substituted in a single pass, so text inserted for
    one speaker is never scanned again for the other placeholder.

    Args:
        text: Text that may contain the placeholders.
        speaker_1: Name substituted for "<speaker1>". When None, that
            placeholder is left in place.
        speaker_2: Name substituted for "<speaker2>". When None, that
            placeholder is left in place.
    Returns:
        The text with the placeholders replaced.
    """
    names = {"<speaker1>": speaker_1, "<speaker2>": speaker_2}

    def _restore(match):
        name = names[match.group(0)]
        return match.group(0) if name is None else name

    # A function replacement also keeps backslashes in names literal.
    return re.sub(r"<speaker[12]>", _restore, text)


# Inference function
def summarize_text(txt):
    """Summarize a dialogue with the fine-tuned BART model.

    Steps: extract the two speaker names, preprocess the text, swap the names
    for placeholders, generate a summary, then put the original names back.

    Args:
        txt (str): Raw dialogue text as entered by the user.
    Returns:
        str: The generated summary.
    """
    speaker_1, speaker_2 = extract_speakers(txt)
    # extract_speakers returns (None, None) when it cannot find two speakers;
    # in that case the text is summarized without anonymization.
    have_speakers = bool(speaker_1) and bool(speaker_2)

    txt = preprocess_text(txt)
    if have_speakers:
        # preprocess_text lowercases the dialogue, so the names have to be
        # lowercased too or they would never be found in the text.
        txt = anonymize_speakers(txt, speaker_1.lower(), speaker_2.lower())

    # Truncate to the model's position limit; longer inputs cannot be encoded.
    inputs = tokenizer(
        txt,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    summary_ids = model.generate(**inputs)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    if have_speakers:
        # Restore the names in their original casing.
        summary = deanonymize_speakers(summary, speaker_1, speaker_2)
    return summary

# Page heading and tagline
st.markdown("<h1 style='color:#9d4edd;'>Smart Summarizer</h1>", unsafe_allow_html=True)
st.markdown("A clean, purple-themed summarization app powered by a fine-tuned Transformers model.")

# Dialogue entry box
text_input = st.text_area(
    "Enter the text to summarize:",
    height=300,
    placeholder="Paste a dialogue here...",
)

# Run the summarizer when the button is pressed; blank input only gets a warning.
summarize_clicked = st.button("Summarize")
if summarize_clicked and not text_input.strip():
    st.warning("Please enter text to summarize.")
elif summarize_clicked:
    with st.spinner("Generating summary..."):
        summary = summarize_text(text_input)
    st.markdown("<div class='section-header'>Summary</div>", unsafe_allow_html=True)
    st.write(summary)

# Static table of the model's ROUGE scores (hard-coded strings, not computed here)
st.markdown("<div class='section-header'>Model Performance</div>", unsafe_allow_html=True)

metrics = {"ROUGE-1": "0.4193", "ROUGE-2": "0.2064", "ROUGE-L": "0.3469"}

# One row per metric, with the metric name as the index.
metrics_df = (
    pd.DataFrame(list(metrics.items()), columns=["Metric", "Value"])
    .set_index("Metric")
)
st.table(metrics_df)