tahamueed23 committed on
Commit
80f282b
·
verified ·
1 Parent(s): 7861917

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -492
app.py CHANGED
@@ -1,535 +1,163 @@
1
  import gradio as gr
2
- from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
3
  import pandas as pd
 
 
 
4
  import os
5
- import re
6
- from filelock import FileLock
7
- import torch
8
-
9
- # -----------------------------
10
- # Load Models with Error Handling
11
- # -----------------------------
12
- try:
13
- # English sentiment model
14
- english_model = pipeline(
15
- "sentiment-analysis",
16
- model="siebert/sentiment-roberta-large-english"
17
- )
18
-
19
- # Urdu sentiment model
20
- urdu_model = pipeline(
21
- "sentiment-analysis",
22
- model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
23
- )
24
-
25
- # Roman Urdu sentiment model
26
- roman_urdu_model = pipeline(
27
- "sentiment-analysis",
28
- model="tahamueed23/roman-urdu-sentiment"
29
- )
30
-
31
- # Language detection model
32
- lang_detector = pipeline(
33
- "text-classification",
34
- model="papluca/xlm-roberta-base-language-detection"
35
- )
36
-
37
- print("✅ All models loaded successfully!")
38
-
39
- except Exception as e:
40
- print(f"❌ Error loading models: {e}")
41
- raise
42
-
43
- # -----------------------------
44
- # Roman Urdu Word Databases
45
- # -----------------------------
46
- ROMAN_URDU_POSITIVE_WORDS = {
47
- 'acha', 'achy', 'achay', 'achi', 'behtar', 'zabardast', 'shandaar', 'umdah', 'umda',
48
- 'behtareen', 'kamaal', 'lajawab', 'mazedar', 'khush', 'khushi', 'pasand', 'pasandida',
49
- 'pyaara', 'pyaari', 'dilchasp', 'mufeed', 'pursukoon', 'roshan', 'saaf', 'suthri',
50
- 'tareef', 'targheeb', 'madadgar', 'dostana', 'jawab', 'khoob', 'khoobsurat', 'heran',
51
- 'mast', 'rangeen', 'sundar', 'sohna', 'sohni', 'pyara', 'pyari', 'meetha', 'meethi',
52
- 'mitha', 'mithi', 'azhar', 'badtameez', 'accha', 'acchi', 'acche'
53
- }
54
-
55
- ROMAN_URDU_NEGATIVE_WORDS = {
56
- 'kharab', 'bura', 'ganda', 'sust', 'kamzor', 'mushkil', 'naqis', 'namukammal',
57
- 'mayus', 'nakara', 'bekaar', 'bemisi', 'bepanah', 'beparwah', 'behos', 'bekhauf',
58
- 'bekhudi', 'bekhabar', 'bekasoor', 'bekar', 'bemari', 'bezaar', 'badsurat', 'badtameez',
59
- 'kameena', 'nalaiq', 'nakara', 'ghatiya', 'bakwas', 'bewakoof', 'ahmaq', 'murda',
60
- 'zaleel', 'kambakht', 'laanat', 'harami', 'bad', 'worst', 'waste', 'rubbish'
61
- }
62
-
63
- ROMAN_URDU_NEUTRAL_WORDS = {
64
- 'hai', 'hain', 'tha', 'thi', 'ho', 'hun', 'hein', 'main', 'tum', 'wo', 'ye', 'unhon',
65
- 'inhon', 'sath', 'lekin', 'kyun', 'jaisa', 'waisa', 'jese', 'wese', 'phir', 'ab', 'toh',
66
- 'ka', 'ki', 'ke', 'ko', 'se', 'mein', 'par', 'aur', 'ya', 'kya', 'kuch', 'sab', 'apna'
67
- }
68
-
69
- # Compile regex patterns for faster matching
70
- roman_urdu_positive_pattern = re.compile(r'\b(' + '|'.join(ROMAN_URDU_POSITIVE_WORDS) + r')\b', re.IGNORECASE)
71
- roman_urdu_negative_pattern = re.compile(r'\b(' + '|'.join(ROMAN_URDU_NEGATIVE_WORDS) + r')\b', re.IGNORECASE)
72
-
73
- # -----------------------------
74
- # Enhanced Language Detection
75
- # -----------------------------
76
- def detect_language_advanced(text):
77
- """Advanced language detection using model + rules"""
78
- if not text.strip():
79
- return "English"
80
-
81
- text_clean = text.strip()
82
-
83
- # Step 1: Urdu script detection (most reliable)
84
- if re.search(r'[\u0600-\u06FF]', text_clean):
85
- return "Urdu"
86
-
87
- # Step 2: Use transformer model for language detection
88
- try:
89
- # Truncate very long texts to avoid model limits
90
- truncated_text = text_clean[:250]
91
- lang_result = lang_detector(truncated_text)[0]
92
- lang_label = lang_result['label'].upper()
93
- lang_score = lang_result['score']
94
-
95
- # Map model outputs to our language categories
96
- lang_map = {
97
- 'UR': 'Urdu',
98
- 'EN': 'English',
99
- 'Ro-Ur': 'English', # Hindi often mixed with Roman Urdu
100
- }
101
-
102
- detected_lang = lang_map.get(lang_label, 'English')
103
-
104
- # Step 3: For Urdu/English detection, apply Roman Urdu rules
105
- if detected_lang in ['Urdu', 'English']:
106
- if is_likely_roman_urdu(text_clean):
107
- return "Roman Urdu"
108
-
109
- return detected_lang
110
-
111
- except Exception as e:
112
- print(f"Language detection model error: {e}")
113
- # Fallback to rule-based detection
114
- return detect_language_fallback(text_clean)
115
-
116
- def is_likely_roman_urdu(text):
117
- """Check if text is likely Roman Urdu using comprehensive rules"""
118
- text_lower = text.lower()
119
-
120
- # Count Roman Urdu specific words
121
- positive_hits = len(roman_urdu_positive_pattern.findall(text_lower))
122
- negative_hits = len(roman_urdu_negative_pattern.findall(text_lower))
123
- total_hits = positive_hits + negative_hits
124
-
125
- # Count total words
126
- words = re.findall(r'\b\w+\b', text_lower)
127
- total_words = len(words)
128
-
129
- if total_words == 0:
130
- return False
131
-
132
- # Rule 1: High percentage of Roman Urdu words
133
- roman_urdu_ratio = total_hits / total_words
134
- if roman_urdu_ratio > 0.3: # 30% threshold
135
- return True
136
-
137
- # Rule 2: Specific Roman Urdu sentence structures
138
- roman_urdu_patterns = [
139
- r"^[a-z ]*(hai|hain|tha|thi|ho|hun|hein)[\s\.\!]*$",
140
- r"^[a-z ]*(main|tum|wo|ye|unhon|inhon)[a-z ]*(hun|hein|ho|hai)[a-z ]*$",
141
- r"^[a-z ]*(acha|bura|kharab|behtar|zabardast)[a-z ]*(hai|hain|tha)[a-z ]*$",
142
- r"^[a-z ]*(kyun|kese|kaise|kisne|kisliye)[a-z ]*\?$",
143
- r"^[a-z ]*(bohat|bahut|zyada|zyda)[a-z ]+(acha|bura|kharab|behtar)"
144
- ]
145
-
146
- for pattern in roman_urdu_patterns:
147
- if re.search(pattern, text_lower):
148
- return True
149
-
150
- # Rule 3: Presence of key Roman Urdu function words
151
- function_words = ['hai', 'hain', 'tha', 'thi', 'ka', 'ki', 'ke', 'ko', 'se', 'ne']
152
- function_word_count = sum(1 for word in words if word in function_words)
153
-
154
- if function_word_count >= 2 and total_words <= 8:
155
- return True
156
-
157
- return False
158
-
159
- def detect_language_fallback(text):
160
- """Rule-based fallback language detection"""
161
- text_lower = text.lower()
162
-
163
- # Urdu script check
164
- if re.search(r'[\u0600-\u06FF]', text):
165
- return "Urdu"
166
-
167
- # Roman Urdu detection
168
- if is_likely_roman_urdu(text):
169
- return "Roman Urdu"
170
-
171
- return "English"
172
-
173
- # -----------------------------
174
- # Roman Urdu Text Processing
175
- # -----------------------------
176
- def normalize_roman_urdu(text):
177
- """Normalize Roman Urdu text variations"""
178
- text = text.lower().strip()
179
-
180
- # Common Roman Urdu spelling variations
181
- variations = {
182
- r'\bhy\b': 'hai', r'\bh\b': 'hai', r'\bhe\b': 'hai',
183
- r'\bnhi\b': 'nahi', r'\bnai\b': 'nahi', r'\bna\b': 'nahi',
184
- r'\bboht\b': 'bohot', r'\bbhot\b': 'bohot', r'\bbahut\b': 'bohot',
185
- r'\bzyada\b': 'zyada', r'\bzada\b': 'zyada', r'\bzyda\b': 'zyada',
186
- r'\bacha\b': 'acha', r'\bachay\b': 'achay', r'\bacchi\b': 'achi',
187
- r'\bacche\b': 'achay', r'\bthy\b': 'thay', r'\bthi\b': 'thi',
188
- r'\btha\b': 'tha', r'\bmje\b': 'mujhe', r'\btuje\b': 'tujhe',
189
- r'\busi\b': 'ussi', r'\besi\b': 'essi', r'\bwohi\b': 'wohi',
190
- r'\bkisi\b': 'kisi', r'\bkuch\b': 'kuch', r'\bsab\b': 'sab',
191
- r'\bme\b': 'main', r'\bmai\b': 'main', r'\btu\b': 'tum',
192
- r'\buss\b': 'us', r'\biss\b': 'is'
193
- }
194
-
195
- for pattern, replacement in variations.items():
196
- text = re.sub(pattern, replacement, text)
197
-
198
- return text
199
-
200
- # -----------------------------
201
- # Roman Urdu Sentiment Correction
202
- # -----------------------------
203
- def correct_roman_urdu_sentiment(text, current_sentiment, current_score):
204
- """Apply Roman Urdu specific sentiment corrections"""
205
- text_lower = text.lower()
206
- normalized_text = normalize_roman_urdu(text_lower)
207
-
208
- # Count positive and negative words
209
- positive_matches = roman_urdu_positive_pattern.findall(normalized_text)
210
- negative_matches = roman_urdu_negative_pattern.findall(normalized_text)
211
-
212
- positive_count = len(positive_matches)
213
- negative_count = len(negative_matches)
214
-
215
- # Strong positive indicators
216
- strong_positive_indicators = ['acha', 'achy', 'achay', 'achi', 'zabardast', 'shandaar', 'kamaal']
217
- strong_negative_indicators = ['kharab', 'bura', 'ganda', 'bekaar', 'badtameez']
218
-
219
- # Rule 1: If text contains strong positive words but model says negative, correct it
220
- has_strong_positive = any(indicator in normalized_text for indicator in strong_positive_indicators)
221
- has_strong_negative = any(indicator in normalized_text for indicator in strong_negative_indicators)
222
-
223
- if has_strong_positive and current_sentiment == "Negative":
224
- return "Positive", max(current_score, 0.85)
225
-
226
- if has_strong_negative and current_sentiment == "Positive":
227
- return "Negative", max(current_score, 0.85)
228
-
229
- # Rule 2: Word count based correction
230
- if positive_count > negative_count and current_sentiment == "Negative":
231
- new_score = min(0.8 + (positive_count * 0.05), 0.95)
232
- return "Positive", new_score
233
-
234
- if negative_count > positive_count and current_sentiment == "Positive":
235
- new_score = min(0.8 + (negative_count * 0.05), 0.95)
236
- return "Negative", new_score
237
-
238
- # Rule 3: Mixed sentiments with clear majority
239
- total_sentiment_words = positive_count + negative_count
240
- if total_sentiment_words >= 2:
241
- positive_ratio = positive_count / total_sentiment_words
242
-
243
- if positive_ratio >= 0.7 and current_sentiment != "Positive":
244
- return "Positive", 0.8
245
- elif positive_ratio <= 0.3 and current_sentiment != "Negative":
246
- return "Negative", 0.8
247
-
248
- return current_sentiment, current_score
249
-
250
- # -----------------------------
251
- # Enhanced Ensemble for Roman Urdu
252
- # -----------------------------
253
- def ensemble_roman_urdu_sentiment(text):
254
- """Advanced ensemble method for Roman Urdu sentiment analysis"""
255
- normalized_text = normalize_roman_urdu(text)
256
-
257
- try:
258
- # Get predictions from both Roman Urdu and Urdu models
259
- ru_result = roman_urdu_model(normalized_text)[0]
260
- ur_result = urdu_model(normalized_text)[0]
261
-
262
- # Normalize labels
263
- ru_sentiment = normalize_sentiment_label(ru_result["label"])
264
- ur_sentiment = normalize_sentiment_label(ur_result["label"])
265
- ru_score = ru_result["score"]
266
- ur_score = ur_result["score"]
267
-
268
- # Apply Roman Urdu corrections to both results
269
- ru_sentiment_corrected, ru_score_corrected = correct_roman_urdu_sentiment(text, ru_sentiment, ru_score)
270
- ur_sentiment_corrected, ur_score_corrected = correct_roman_urdu_sentiment(text, ur_sentiment, ur_score)
271
-
272
- # If both models agree after correction
273
- if ru_sentiment_corrected == ur_sentiment_corrected:
274
- final_score = max(ru_score_corrected, ur_score_corrected)
275
- return {"label": ru_sentiment_corrected, "score": final_score}
276
-
277
- # Weighted voting with higher weight for Roman Urdu model
278
- ru_weight = ru_score_corrected * 1.6 # Higher weight for Roman Urdu model
279
- ur_weight = ur_score_corrected * 1.2
280
-
281
- if ru_weight > ur_weight:
282
- return {"label": ru_sentiment_corrected, "score": ru_score_corrected}
283
- else:
284
- return {"label": ur_sentiment_corrected, "score": ur_score_corrected}
285
-
286
- except Exception as e:
287
- print(f"Ensemble error: {e}")
288
- # Fallback to Roman Urdu model with correction
289
- try:
290
- result = roman_urdu_model(normalize_roman_urdu(text))[0]
291
- corrected_sentiment, corrected_score = correct_roman_urdu_sentiment(
292
- text, normalize_sentiment_label(result["label"]), result["score"]
293
- )
294
- return {"label": corrected_sentiment, "score": corrected_score}
295
- except:
296
- return {"label": "Neutral", "score": 0.5}
297
-
298
- # -----------------------------
299
- # Sentiment Analysis Core Functions
300
- # -----------------------------
301
- def normalize_sentiment_label(label):
302
- """Normalize sentiment labels from different models"""
303
- label = str(label).lower()
304
-
305
- if any(word in label for word in ["pos", "positive", "positive", "lab"]):
306
- return "Positive"
307
- elif any(word in label for word in ["neg", "negative", "negative"]):
308
- return "Negative"
309
- else:
310
- return "Neutral"
311
-
312
- def get_strong_sentiment_words(text, language):
313
- """Extract strong sentiment-bearing words"""
314
- text_lower = text.lower()
315
- strong_words = []
316
-
317
- if language == "Roman Urdu":
318
- # Use our Roman Urdu word databases
319
- positive_matches = roman_urdu_positive_pattern.findall(text_lower)
320
- negative_matches = roman_urdu_negative_pattern.findall(text_lower)
321
- strong_words = positive_matches + negative_matches
322
- elif language == "Urdu":
323
- # Urdu strong words (you can expand this list)
324
- urdu_positive = ['زبردست', 'شاندار', 'عمدہ', 'بہترین', 'اچھا']
325
- urdu_negative = ['خراب', 'برا', 'مایوس کن', 'بیکار']
326
- for word in urdu_positive + urdu_negative:
327
- if word in text:
328
- strong_words.append(word)
329
- else: # English
330
- english_positive = ['excellent', 'outstanding', 'amazing', 'wonderful', 'perfect', 'great']
331
- english_negative = ['terrible', 'awful', 'horrible', 'disappointing', 'poor', 'bad']
332
- for word in english_positive + english_negative:
333
- if re.search(r'\b' + re.escape(word) + r'\b', text_lower):
334
- strong_words.append(word)
335
-
336
- return list(set(strong_words))[:5] # Return unique words, max 5
337
-
338
- def generate_detailed_explanation(text, sentiment, score, language, strong_words):
339
- """Generate detailed explanation for sentiment analysis"""
340
-
341
- confidence_level = "High" if score >= 0.8 else "Medium" if score >= 0.6 else "Low"
342
-
343
- base_explanations = {
344
- "Positive": {
345
- "High": "Strong positive sentiment with clear positive expressions.",
346
- "Medium": "Moderately positive sentiment with favorable tone.",
347
- "Low": "Slightly positive leaning with some positive indicators."
348
- },
349
- "Negative": {
350
- "High": "Strong negative sentiment with clear criticism.",
351
- "Medium": "Moderately negative sentiment with critical tone.",
352
- "Low": "Slightly negative leaning with some concerning indicators."
353
- },
354
- "Neutral": {
355
- "High": "Clearly neutral or factual statement.",
356
- "Medium": "Mostly neutral with balanced perspective.",
357
- "Low": "Weak sentiment leaning neutral."
358
- }
359
- }
360
-
361
- explanation = base_explanations[sentiment][confidence_level]
362
-
363
- # Add language specific notes
364
- if language == "Roman Urdu":
365
- explanation += " Analyzed with Roman Urdu specific rules."
366
-
367
- # Special note for common corrections
368
- if any(word in text.lower() for word in ['acha', 'achy', 'achay', 'achi']):
369
- if sentiment == "Positive":
370
- explanation += " Words like 'acha' correctly identified as positive."
371
-
372
- # Add strong words information
373
- if strong_words:
374
- explanation += f" Key sentiment words: {', '.join(strong_words)}."
375
-
376
- explanation += f" Confidence: {score:.3f}"
377
-
378
- return explanation
379
-
380
- # -----------------------------
381
- # Main Analysis Function
382
- # -----------------------------
383
- SAVE_FILE = "sentiment_logs.csv"
384
- LOCK_FILE = SAVE_FILE + ".lock"
385
-
386
- if not os.path.exists(SAVE_FILE):
387
- pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]).to_csv(
388
- SAVE_FILE, index=False, encoding="utf-8-sig"
389
- )
390
 
391
- def analyze_sentiment_complete(text, lang_hint):
392
- """Complete sentiment analysis pipeline"""
393
- if not text.strip():
394
- return "⚠️ Please enter a sentence.", "", "", SAVE_FILE, ""
395
-
396
- # Detect language
397
- language = lang_hint if lang_hint != "Auto Detect" else detect_language_advanced(text)
398
-
399
- try:
400
- # Perform sentiment analysis based on language
401
- if language == "English":
402
- result = english_model(text[:512])[0]
403
- sentiment = normalize_sentiment_label(result["label"])
404
- score = round(float(result["score"]), 3)
405
-
406
- elif language == "Urdu":
407
- result = urdu_model(text[:512])[0]
408
- sentiment = normalize_sentiment_label(result["label"])
409
- score = round(float(result["score"]), 3)
410
-
411
- else: # Roman Urdu
412
- result = ensemble_roman_urdu_sentiment(text)
413
- sentiment = result["label"]
414
- score = round(float(result["score"]), 3)
415
-
416
- # Get strong words
417
- strong_words = get_strong_sentiment_words(text, language)
418
- strong_words_str = ", ".join(strong_words) if strong_words else "None"
419
-
420
- # Generate explanation
421
- explanation = generate_detailed_explanation(text, sentiment, score, language, strong_words)
422
-
423
- # Save to CSV
424
- with FileLock(LOCK_FILE):
425
- df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig") if os.path.exists(SAVE_FILE) else pd.DataFrame(
426
- columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]
427
- )
428
- new_row = pd.DataFrame([[
429
- text, language, sentiment, score, strong_words_str, pd.Timestamp.now()
430
- ]], columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"])
431
- df = pd.concat([df, new_row], ignore_index=True)
432
- df.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")
433
-
434
- return sentiment, str(score), explanation, SAVE_FILE, strong_words_str
435
-
436
- except Exception as e:
437
- error_msg = f"Analysis error: {str(e)}"
438
- return "Error", "0", error_msg, SAVE_FILE, ""
439
-
440
- # -----------------------------
441
- # Gradio Interface
442
- # -----------------------------
443
  def show_logs():
444
- if os.path.exists(SAVE_FILE):
445
- df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
446
- return df.tail(20)
447
- else:
448
- return pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"])
449
 
450
  def clear_logs():
451
- if os.path.exists(SAVE_FILE):
452
- os.remove(SAVE_FILE)
453
- return pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
 
455
  with gr.Blocks(title="Multilingual Sentiment Analysis") as demo:
 
456
  gr.Markdown("""
457
- # 🌍 Advanced Multilingual Sentiment Analysis
458
- **English Urdu Roman Urdu**
459
-
460
- Uses transformer models for accurate language detection and sentiment analysis with specialized Roman Urdu handling.
461
-
462
- **Used models:**
463
- - English: siebert/sentiment-roberta-large-english
464
- - Urdu: tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu
465
- - Roman Urdu: tahamueed23/roman-urdu-sentiment
466
- - Language detection: papluca/xlm-roberta-base-language-detection
467
  """)
468
 
469
- # -----------------------------
470
- # TOP ROW (Two Blocks)
471
- # -----------------------------
472
  with gr.Row():
473
- # Left block (Text input + buttons)
 
474
  with gr.Column(scale=1):
475
  user_text = gr.Textbox(
476
- label="✍️ Enter Text",
477
- placeholder="Type in English, Urdu, or Roman Urdu...",
478
  lines=3
479
  )
 
480
  lang_dropdown = gr.Dropdown(
481
  ["Auto Detect", "English", "Urdu", "Roman Urdu"],
482
  value="Auto Detect",
483
- label="🌐 Language Selection"
484
  )
485
 
486
  with gr.Row():
487
- btn_analyze = gr.Button("🔍 Analyze Sentiment", variant="primary")
488
- btn_show = gr.Button("📂 Show Logs")
489
- btn_clear = gr.Button("🗑️ Clear Logs")
490
 
491
- # Right block (output results)
492
  with gr.Column(scale=1):
493
- out_sent = gr.Textbox(label="🎭 Sentiment")
494
- out_conf = gr.Textbox(label="📊 Confidence Score")
495
- out_exp = gr.Textbox(label="💡 Detailed Explanation", lines=4)
496
- out_strong = gr.Textbox(label="💪 Strong Words")
497
- out_file = gr.File(label="⬇️ Download Logs")
498
-
499
- # -----------------------------
500
- # BOTTOM ROW (Two Blocks)
501
- # -----------------------------
502
  with gr.Row():
503
- # Left block (analysis history)
 
504
  with gr.Column(scale=1):
505
  logs_df = gr.Dataframe(
506
  headers=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"],
507
- label="📋 Analysis History",
508
  interactive=False,
509
  wrap=True,
510
  height=350
511
  )
512
 
513
- # Right block can be empty or used later
514
  with gr.Column(scale=1):
515
  gr.Markdown("")
516
 
517
- # Event handlers
518
- out_exp = gr.Textbox(label="💡 Detailed Explanation")
519
- out_strong = gr.Textbox(label="🔥 Strong Sentiment Words")
520
- out_file = gr.File(label="📁 Log File")
521
-
522
  btn_analyze.click(
523
  analyze_sentiment_complete,
524
  inputs=[user_text, lang_dropdown],
525
- outputs=[out_sent, out_conf, out_exp, out_file, out_strong]
526
  )
527
-
528
- btn_show.click(show_logs, outputs=[gr.Dataframe()])
529
- btn_clear.click(clear_logs, outputs=[gr.Dataframe()])
530
 
531
- demo.launch()
 
532
 
533
-
534
- if __name__ == "__main__":
535
- demo.launch(share=False)
 
1
  import gradio as gr
 
2
  import pandas as pd
3
+ import numpy as np
4
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
5
+ from datetime import datetime
6
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ # ------------------------------------------------------------
9
+ # LOAD MODELS
10
+ # ------------------------------------------------------------
11
+
12
# Hugging Face Hub checkpoint identifiers, one per task.
lang_detector_name = "papluca/xlm-roberta-base-language-detection"
eng_model_name = "siebert/sentiment-roberta-large-english"
urdu_model_name = "tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
roman_model_name = "tahamueed23/roman-urdu-sentiment"

# Build the four pipelines in a fixed order (language detector, English,
# Urdu, Roman Urdu). Each pipeline uses the tokenizer that ships with its
# own checkpoint.
lang_pipe, eng_pipe, urdu_pipe, roman_pipe = (
    pipeline(task, model=checkpoint, tokenizer=checkpoint)
    for task, checkpoint in (
        ("text-classification", lang_detector_name),
        ("sentiment-analysis", eng_model_name),
        ("text-classification", urdu_model_name),
        ("text-classification", roman_model_name),
    )
)
21
+
22
+ # ------------------------------------------------------------
23
+ # LOG STORAGE
24
+ # ------------------------------------------------------------
25
# CSV file, relative to the working directory, that stores every analysis.
LOG_FILE = "analysis_logs.csv"

# On first run, seed the log with a header-only CSV so later reads succeed.
if not os.path.exists(LOG_FILE):
    df = pd.DataFrame(
        columns=[
            "Sentence",
            "Language",
            "Sentiment",
            "Confidence",
            "Strong_Words",
            "Timestamp",
        ]
    )
    df.to_csv(LOG_FILE, index=False)
30
+
31
def save_log(sentence, lang, sent, conf, strong_words, log_file=None):
    """Append one analysis record to the CSV log.

    The row is appended instead of re-reading and rewriting the whole file.
    A read/write round trip through ``pd.read_csv`` parses stored strings
    such as "None", "NA" or "null" as missing values and writes them back
    as empty cells, silently corrupting earlier rows.

    Args:
        sentence: The analysed text.
        lang: Language name used for the analysis.
        sent: Sentiment label.
        conf: Confidence score.
        strong_words: Comma-separated emphasised words, or "None".
        log_file: Optional CSV path; defaults to the module-level LOG_FILE.
    """
    path = LOG_FILE if log_file is None else log_file
    columns = ["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    row = pd.DataFrame(
        [[sentence, lang, sent, conf, strong_words, timestamp]],
        columns=columns,
    )

    # Write the header only when starting a new (or empty) file.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    row.to_csv(path, mode="a", header=needs_header, index=False)
35
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
def show_logs(log_file=None):
    """Return the analysis log as a DataFrame.

    Args:
        log_file: Optional CSV path; defaults to the module-level LOG_FILE.

    Returns:
        The logged rows. If the file is missing or empty, an empty frame
        with the log columns is returned instead of raising.
    """
    path = LOG_FILE if log_file is None else log_file
    try:
        # keep_default_na=False keeps literal strings such as "None" (the
        # usual Strong_Words value) from being displayed as NaN.
        return pd.read_csv(path, keep_default_na=False)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return pd.DataFrame(
            columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]
        )
 
 
 
 
38
 
39
def clear_logs():
    """Reset the log file to a header-only CSV and return the empty table."""
    header = ["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]
    empty_log = pd.DataFrame(columns=header)
    # Overwrite the existing file; only the column header row remains.
    empty_log.to_csv(LOG_FILE, index=False)
    return empty_log
43
+
44
+ # ------------------------------------------------------------
45
+ # SENTIMENT HELPERS
46
+ # ------------------------------------------------------------
47
+
48
def detect_language(text):
    """Classify text as "English", "Urdu" or "Roman Urdu".

    Args:
        text: Raw user input.

    Returns:
        "English" for blank input, "Urdu" for any text containing
        Arabic-script characters, otherwise the language detector's verdict.
        Every detector label other than Urdu or English is treated as
        "Roman Urdu".
    """
    # Nothing to classify; skip the model call.
    if not text or not text.strip():
        return "English"

    # Roman Urdu is written in Latin script, so Arabic-script input must go
    # to the Urdu model even if the detector would label it differently.
    if any("\u0600" <= ch <= "\u06FF" for ch in text):
        return "Urdu"

    # truncation=True keeps long inputs within the model's maximum length.
    label = lang_pipe(text, truncation=True)[0]["label"].lower()
    if label in ("ur", "urd"):
        return "Urdu"
    if label in ("en", "eng"):
        return "English"
    return "Roman Urdu"
55
+
56
def extract_strong_words(text):
    """Return the emphasised tokens of *text* as a comma-separated string.

    A whitespace-separated token counts as emphasised when it is written in
    capitals (with at least two letters) or ends with "!!!".

    Args:
        text: Raw user input.

    Returns:
        The matching tokens joined by ", " in input order, or "None" when
        there are none.
    """
    def is_emphatic(token):
        # Require two letters so ordinary one-letter capitals such as "I"
        # or "A" are not reported as shouting.
        shouted = token.isupper() and sum(ch.isalpha() for ch in token) >= 2
        return shouted or token.endswith("!!!")

    strong = [token for token in text.split() if is_emphatic(token)]
    return ", ".join(strong) if strong else "None"
60
+
61
+ # ------------------------------------------------------------
62
+ # MAIN ANALYSIS FUNCTION
63
+ # ------------------------------------------------------------
64
+
65
def analyze_sentiment_complete(text, selected_lang):
    """Run the sentiment pipeline for *text* and log the result.

    Args:
        text: Raw user input.
        selected_lang: "Auto Detect", or an explicit language name
            ("English", "Urdu", "Roman Urdu").

    Returns:
        A 5-tuple ``(sentiment, score, explanation, log_path, strong_words)``.
        For blank input a prompt message is returned in the first slot, the
        file slot is None, and nothing is sent to the models or logged.
    """
    # Guard: do not classify or log empty submissions.
    if not text or not text.strip():
        return "Please enter a sentence.", "", "", None, ""

    lang = detect_language(text) if selected_lang == "Auto Detect" else selected_lang

    # Any language other than English/Urdu falls through to the Roman Urdu model.
    classifier = {"English": eng_pipe, "Urdu": urdu_pipe}.get(lang, roman_pipe)

    # truncation=True keeps long inputs within the model's maximum length.
    result = classifier(text, truncation=True)[0]
    sentiment = result["label"]
    score = round(float(result["score"]), 4)

    strong_words = extract_strong_words(text)
    explanation = f"Language: {lang}\nStrong indicators: {strong_words}\nThe model predicts: {sentiment}"

    save_log(text, lang, sentiment, score, strong_words)

    return sentiment, score, explanation, LOG_FILE, strong_words
93
+
94
+ # ------------------------------------------------------------
95
+ # GRADIO UI LAYOUT (Final Updated Version)
96
+ # ------------------------------------------------------------
97
 
98
with gr.Blocks(title="Multilingual Sentiment Analysis") as demo:

    gr.Markdown("""
    # Multilingual Sentiment Analysis (English • Urdu • Roman Urdu)
    Transformer-based sentiment classification with auto language detection.
    """)

    # ---------------- TOP ROW ----------------
    with gr.Row():

        # LEFT: Input controls
        with gr.Column(scale=1):
            user_text = gr.Textbox(
                label="Enter text",
                placeholder="Type English, Urdu, or Roman Urdu...",
                lines=3
            )

            lang_dropdown = gr.Dropdown(
                ["Auto Detect", "English", "Urdu", "Roman Urdu"],
                value="Auto Detect",
                label="Language Selection"
            )

            with gr.Row():
                btn_analyze = gr.Button("Analyze Sentiment", variant="primary")
                btn_show = gr.Button("Show Logs")
                btn_clear = gr.Button("Clear Logs")

        # RIGHT: Output panel
        with gr.Column(scale=1):
            out_sent = gr.Textbox(label="Sentiment", interactive=False)
            out_score = gr.Textbox(label="Confidence Score", interactive=False)
            out_explain = gr.Textbox(label="Detailed Explanation", lines=5, interactive=False)
            out_file = gr.File(label="Download Logs", interactive=False)
            out_words = gr.Textbox(label="Strong Words", interactive=False)

    # ---------------- BOTTOM ROW ----------------
    with gr.Row():

        # LEFT: History table
        with gr.Column(scale=1):
            logs_df = gr.Dataframe(
                headers=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"],
                label="Analysis History",
                interactive=False,
                wrap=True,
                height=350
            )

        # RIGHT empty or for future extensions
        with gr.Column(scale=1):
            gr.Markdown("")

    # ---------------- BUTTON ACTIONS ----------------
    # The outputs list is positional: it must match the order of the tuple
    # returned by analyze_sentiment_complete.
    btn_analyze.click(
        analyze_sentiment_complete,
        inputs=[user_text, lang_dropdown],
        outputs=[out_sent, out_score, out_explain, out_file, out_words]
    )

    # Both log buttons refresh the history table with the returned DataFrame.
    btn_show.click(show_logs, outputs=[logs_df])
    btn_clear.click(clear_logs, outputs=[logs_df])

# Run app only when executed as a script, so importing this module
# does not start a server.
if __name__ == "__main__":
    demo.launch()