tahamueed23 committed on
Commit
16b4ab7
·
verified ·
1 Parent(s): 80f282b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +484 -136
app.py CHANGED
@@ -1,163 +1,511 @@
1
  import gradio as gr
 
2
  import pandas as pd
3
- import numpy as np
4
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
5
- from datetime import datetime
6
  import os
7
-
8
- # ------------------------------------------------------------
9
- # LOAD MODELS
10
- # ------------------------------------------------------------
11
-
12
- lang_detector_name = "papluca/xlm-roberta-base-language-detection"
13
- eng_model_name = "siebert/sentiment-roberta-large-english"
14
- urdu_model_name = "tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
15
- roman_model_name = "tahamueed23/roman-urdu-sentiment"
16
-
17
- lang_pipe = pipeline("text-classification", model=lang_detector_name, tokenizer=lang_detector_name)
18
- eng_pipe = pipeline("sentiment-analysis", model=eng_model_name, tokenizer=eng_model_name)
19
- urdu_pipe = pipeline("text-classification", model=urdu_model_name, tokenizer=urdu_model_name)
20
- roman_pipe = pipeline("text-classification", model=roman_model_name, tokenizer=roman_model_name)
21
-
22
- # ------------------------------------------------------------
23
- # LOG STORAGE
24
- # ------------------------------------------------------------
25
- LOG_FILE = "analysis_logs.csv"
26
-
27
- if not os.path.exists(LOG_FILE):
28
- df = pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"])
29
- df.to_csv(LOG_FILE, index=False)
30
-
31
- def save_log(sentence, lang, sent, conf, strong_words):
32
- df = pd.read_csv(LOG_FILE)
33
- df.loc[len(df)] = [sentence, lang, sent, conf, strong_words, datetime.now().strftime("%Y-%m-%d %H:%M:%S")]
34
- df.to_csv(LOG_FILE, index=False)
35
-
36
- def show_logs():
37
- return pd.read_csv(LOG_FILE)
38
-
39
- def clear_logs():
40
- df = pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"])
41
- df.to_csv(LOG_FILE, index=False)
42
- return df
43
-
44
- # ------------------------------------------------------------
45
- # SENTIMENT HELPERS
46
- # ------------------------------------------------------------
47
-
48
- def detect_language(text):
49
- res = lang_pipe(text)[0]['label']
50
- if res.lower() in ["ur", "urd"]:
51
- return "Urdu"
52
- if res.lower() in ["en", "eng"]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  return "English"
54
- return "Roman Urdu"
55
-
56
- def extract_strong_words(text):
57
- words = text.split()
58
- strong = [w for w in words if w.isupper() or w.endswith("!!!")]
59
- return ", ".join(strong) if strong else "None"
60
-
61
- # ------------------------------------------------------------
62
- # MAIN ANALYSIS FUNCTION
63
- # ------------------------------------------------------------
64
-
65
- def analyze_sentiment_complete(text, selected_lang):
66
-
67
- if selected_lang == "Auto Detect":
68
- lang = detect_language(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  else:
70
- lang = selected_lang
71
-
72
- if lang == "English":
73
- result = eng_pipe(text)[0]
74
- sentiment = result["label"]
75
- score = round(float(result["score"]), 4)
76
-
77
- elif lang == "Urdu":
78
- result = urdu_pipe(text)[0]
79
- sentiment = result["label"]
80
- score = round(float(result["score"]), 4)
81
-
82
- else: # Roman Urdu
83
- result = roman_pipe(text)[0]
84
- sentiment = result["label"]
85
- score = round(float(result["score"]), 4)
86
-
87
- strong_words = extract_strong_words(text)
88
- explanation = f"Language: {lang}\nStrong indicators: {strong_words}\nThe model predicts: {sentiment}"
89
-
90
- save_log(text, lang, sentiment, score, strong_words)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
- return sentiment, score, explanation, LOG_FILE, strong_words
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
- # ------------------------------------------------------------
95
- # GRADIO UI LAYOUT (Final Updated Version)
96
- # ------------------------------------------------------------
 
97
 
98
  with gr.Blocks(title="Multilingual Sentiment Analysis") as demo:
99
-
100
  gr.Markdown("""
101
- # Multilingual Sentiment Analysis (English • Urdu • Roman Urdu)
102
- Transformer-based sentiment classification with auto language detection.
 
 
 
 
 
 
 
 
103
  """)
104
-
105
- # ---------------- TOP ROW ----------------
106
  with gr.Row():
107
-
108
- # LEFT: Input controls
109
- with gr.Column(scale=1):
110
  user_text = gr.Textbox(
111
- label="Enter text",
112
- placeholder="Type English, Urdu, or Roman Urdu...",
113
  lines=3
114
  )
115
-
116
  lang_dropdown = gr.Dropdown(
117
  ["Auto Detect", "English", "Urdu", "Roman Urdu"],
118
  value="Auto Detect",
119
- label="Language Selection"
120
  )
121
-
122
  with gr.Row():
123
- btn_analyze = gr.Button("Analyze Sentiment", variant="primary")
124
- btn_show = gr.Button("Show Logs")
125
- btn_clear = gr.Button("Clear Logs")
126
-
127
- # RIGHT: Output panel
128
- with gr.Column(scale=1):
129
- out_sent = gr.Textbox(label="Sentiment", interactive=False)
130
- out_score = gr.Textbox(label="Confidence Score", interactive=False)
131
- out_explain = gr.Textbox(label="Detailed Explanation", lines=5, interactive=False)
132
- out_file = gr.File(label="Download Logs", interactive=False)
133
- out_words = gr.Textbox(label="Strong Words", interactive=False)
134
-
135
- # ---------------- BOTTOM ROW ----------------
136
- with gr.Row():
137
-
138
- # LEFT: History table
139
- with gr.Column(scale=1):
140
- logs_df = gr.Dataframe(
141
- headers=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"],
142
- label="Analysis History",
143
- interactive=False,
144
- wrap=True,
145
- height=350
146
- )
147
-
148
- # RIGHT empty or for future extensions
149
- with gr.Column(scale=1):
150
- gr.Markdown("")
151
-
152
- # ---------------- BUTTON ACTIONS ----------------
153
  btn_analyze.click(
154
  analyze_sentiment_complete,
155
  inputs=[user_text, lang_dropdown],
156
- outputs=[out_sent, out_score, out_explain, out_file, out_words]
157
  )
158
-
159
  btn_show.click(show_logs, outputs=[logs_df])
160
  btn_clear.click(clear_logs, outputs=[logs_df])
161
 
162
- # Run app
163
- demo.launch()
 
1
  import gradio as gr
2
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
3
  import pandas as pd
 
 
 
4
  import os
5
+ import re
6
+ from filelock import FileLock
7
+ import torch
8
+
9
+ # -----------------------------
10
+ # Load Models with Error Handling
11
+ # -----------------------------
12
# All four pipelines are built eagerly, in this order: English sentiment,
# Urdu sentiment, Roman Urdu sentiment, language detection. If any of them
# fails to load, the error is printed and re-raised so start-up aborts.
try:
    english_model, urdu_model, roman_urdu_model, lang_detector = (
        pipeline(task_name, model=checkpoint)
        for task_name, checkpoint in (
            ("sentiment-analysis", "siebert/sentiment-roberta-large-english"),
            ("sentiment-analysis", "tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"),
            ("sentiment-analysis", "tahamueed23/roman-urdu-sentiment"),
            ("text-classification", "papluca/xlm-roberta-base-language-detection"),
        )
    )
except Exception as e:
    print(f"❌ Error loading models: {e}")
    raise
else:
    print("✅ All models loaded successfully!")
42
+
43
+ # -----------------------------
44
+ # Roman Urdu Word Databases
45
+ # -----------------------------
46
# Hand-maintained Roman Urdu sentiment lexicons. They are used for language
# detection, sentiment correction and strong-word extraction.
# 'badtameez' used to be listed here as well as in the negative set, so it
# was counted on both sides; it now lives only in the negative set.
ROMAN_URDU_POSITIVE_WORDS = {
    'acha', 'achy', 'achay', 'achi', 'behtar', 'zabardast', 'shandaar', 'umdah', 'umda',
    'behtareen', 'kamaal', 'lajawab', 'mazedar', 'khush', 'khushi', 'pasand', 'pasandida',
    'pyaara', 'pyaari', 'dilchasp', 'mufeed', 'pursukoon', 'roshan', 'saaf', 'suthri',
    'tareef', 'targheeb', 'madadgar', 'dostana', 'jawab', 'khoob', 'khoobsurat', 'heran',
    'mast', 'rangeen', 'sundar', 'sohna', 'sohni', 'pyara', 'pyari', 'meetha', 'meethi',
    'mitha', 'mithi', 'azhar', 'accha', 'acchi', 'acche',
}

# NOTE(review): 'bad', 'worst', 'waste' and 'rubbish' are English words; they
# also feed Roman Urdu *detection*, so short English inputs containing them
# can be classified as Roman Urdu. Confirm this is intended.
ROMAN_URDU_NEGATIVE_WORDS = {
    'kharab', 'bura', 'ganda', 'sust', 'kamzor', 'mushkil', 'naqis', 'namukammal',
    'mayus', 'nakara', 'bekaar', 'bemisi', 'bepanah', 'beparwah', 'behos', 'bekhauf',
    'bekhudi', 'bekhabar', 'bekasoor', 'bekar', 'bemari', 'bezaar', 'badsurat', 'badtameez',
    'kameena', 'nalaiq', 'ghatiya', 'bakwas', 'bewakoof', 'ahmaq', 'murda',
    'zaleel', 'kambakht', 'laanat', 'harami', 'bad', 'worst', 'waste', 'rubbish',
}

# Common function words that carry no sentiment.
ROMAN_URDU_NEUTRAL_WORDS = {
    'hai', 'hain', 'tha', 'thi', 'ho', 'hun', 'hein', 'main', 'tum', 'wo', 'ye', 'unhon',
    'inhon', 'sath', 'lekin', 'kyun', 'jaisa', 'waisa', 'jese', 'wese', 'phir', 'ab', 'toh',
    'ka', 'ki', 'ke', 'ko', 'se', 'mein', 'par', 'aur', 'ya', 'kya', 'kuch', 'sab', 'apna',
}


def _compile_word_pattern(words):
    """Build a case-insensitive whole-word regex matching any of *words*.

    Words are escaped and sorted so the compiled pattern is identical on
    every run (set iteration order is not stable across processes).
    """
    alternatives = '|'.join(re.escape(word) for word in sorted(words))
    return re.compile(r'\b(' + alternatives + r')\b', re.IGNORECASE)


# Compiled once at import time; `findall` returns the matched words.
roman_urdu_positive_pattern = _compile_word_pattern(ROMAN_URDU_POSITIVE_WORDS)
roman_urdu_negative_pattern = _compile_word_pattern(ROMAN_URDU_NEGATIVE_WORDS)
72
+
73
+ # -----------------------------
74
+ # Enhanced Language Detection
75
+ # -----------------------------
76
def detect_language_advanced(text):
    """Classify *text* as "Urdu", "Roman Urdu" or "English".

    Order of checks:
      1. Empty / missing input is reported as "English".
      2. Any character in the Arabic Unicode block (U+0600-U+06FF) means "Urdu".
      3. Otherwise the language-detection pipeline is run on the first 250
         characters; a 'UR' label maps to "Urdu", everything else to "English".
      4. Whatever the model said, the Roman Urdu rules get the final word.

    If the model step raises, the error is printed and the rule-based
    fallback decides instead.
    """
    if not text or not text.strip():
        return "English"

    text_clean = text.strip()

    # Script detection is the most reliable signal, so it short-circuits.
    if re.search(r'[\u0600-\u06FF]', text_clean):
        return "Urdu"

    try:
        # Only a prefix is sent to the detector to keep the input short.
        lang_label = lang_detector(text_clean[:250])[0]['label'].upper()

        # The previous lookup table had a mixed-case 'Ro-Ur' key that could
        # never match an upper-cased label; only 'UR' ever mapped away from
        # the "English" default, which is what this expression preserves.
        detected_lang = 'Urdu' if lang_label == 'UR' else 'English'

        # Latin-script Urdu is not a label of its own here, so the rule-based
        # check overrides the model result when it fires.
        if is_likely_roman_urdu(text_clean):
            return "Roman Urdu"

        return detected_lang

    except Exception as e:
        print(f"Language detection model error: {e}")
        # Fallback to rule-based detection
        return detect_language_fallback(text_clean)
115
+
116
# Sentence-shape heuristics for Roman Urdu, compiled once at import time.
# Every keyword is anchored with a leading \b (and a trailing one where the
# keyword is followed by free text) so that English words which merely
# *contain* a keyword -- "thai", "show", "those" -- no longer trigger a match.
_ROMAN_URDU_STRUCTURE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r"^[a-z ]*\b(hai|hain|tha|thi|ho|hun|hein)[\s\.\!]*$",
    r"^[a-z ]*\b(main|tum|wo|ye|unhon|inhon)\b[a-z ]*\b(hun|hein|ho|hai)\b[a-z ]*$",
    r"^[a-z ]*\b(acha|bura|kharab|behtar|zabardast)\b[a-z ]*\b(hai|hain|tha)\b[a-z ]*$",
    r"^[a-z ]*\b(kyun|kese|kaise|kisne|kisliye)\b[a-z ]*\?$",
    # No trailing \b on the adjective: inflected forms such as "achay" or
    # "behtareen" should still match by prefix, as they did before.
    r"^[a-z ]*\b(bohat|bahut|zyada|zyda)\b[a-z ]+\b(acha|bura|kharab|behtar)",
))

_ROMAN_URDU_FUNCTION_WORDS = frozenset(
    {'hai', 'hain', 'tha', 'thi', 'ka', 'ki', 'ke', 'ko', 'se', 'ne'}
)


def is_likely_roman_urdu(text):
    """Return True when *text* looks like Urdu written in Latin script.

    Three rules are tried in order; the first that fires wins:
      1. More than 30% of the words are hits in the positive/negative
         Roman Urdu lexicons.
      2. The lower-cased text matches one of the sentence-shape patterns.
      3. At least two Roman Urdu function words occur in a text of at most
         eight words.

    Text with no word characters is never Roman Urdu.
    """
    text_lower = text.lower()

    lexicon_hits = (
        len(roman_urdu_positive_pattern.findall(text_lower))
        + len(roman_urdu_negative_pattern.findall(text_lower))
    )

    words = re.findall(r'\b\w+\b', text_lower)
    total_words = len(words)
    if total_words == 0:
        return False

    # Rule 1: high share of lexicon words.
    if lexicon_hits / total_words > 0.3:
        return True

    # Rule 2: typical sentence structures.
    if any(pattern.search(text_lower) for pattern in _ROMAN_URDU_STRUCTURE_PATTERNS):
        return True

    # Rule 3: several function words in a short sentence.
    function_word_count = sum(1 for word in words if word in _ROMAN_URDU_FUNCTION_WORDS)
    return function_word_count >= 2 and total_words <= 8
158
+
159
def detect_language_fallback(text):
    """Rule-based language detection used when the model step fails.

    Returns "Urdu" if *text* contains any character in U+0600-U+06FF,
    "Roman Urdu" if the Roman Urdu heuristics fire, and "English" otherwise.
    """
    # Urdu script check
    if re.search(r'[\u0600-\u06FF]', text):
        return "Urdu"

    # Roman Urdu detection
    if is_likely_roman_urdu(text):
        return "Roman Urdu"

    return "English"
172
+
173
+ # -----------------------------
174
+ # Roman Urdu Text Processing
175
+ # -----------------------------
176
# Spelling variant -> canonical form. Only entries that actually change the
# word are kept; the earlier table also carried identity mappings
# ('acha' -> 'acha', 'kuch' -> 'kuch', ...) that did nothing.
_ROMAN_URDU_SPELLING_FIXES = {
    'hy': 'hai', 'h': 'hai', 'he': 'hai',
    'nhi': 'nahi', 'nai': 'nahi', 'na': 'nahi',
    'boht': 'bohot', 'bhot': 'bohot', 'bahut': 'bohot',
    'zada': 'zyada', 'zyda': 'zyada',
    'acchi': 'achi', 'acche': 'achay',
    'thy': 'thay',
    'mje': 'mujhe', 'tuje': 'tujhe',
    'usi': 'ussi', 'esi': 'essi',
    'me': 'main', 'mai': 'main', 'tu': 'tum',
    'uss': 'us', 'iss': 'is',
}

# One whole-word alternation over every variant, compiled once. No canonical
# form is itself a variant, so a single pass gives the same result as
# applying the substitutions one after another.
_ROMAN_URDU_SPELLING_PATTERN = re.compile(
    r'\b(' + '|'.join(re.escape(variant) for variant in sorted(_ROMAN_URDU_SPELLING_FIXES)) + r')\b'
)


def normalize_roman_urdu(text):
    """Lower-case and trim *text*, then rewrite common Roman Urdu spelling
    variants (whole words only) to one canonical spelling.

    Returns the normalized string; words not in the table are left as is.
    """
    cleaned = text.lower().strip()
    return _ROMAN_URDU_SPELLING_PATTERN.sub(
        lambda match: _ROMAN_URDU_SPELLING_FIXES[match.group(1)], cleaned
    )
199
+
200
+ # -----------------------------
201
+ # Roman Urdu Sentiment Correction
202
+ # -----------------------------
203
# Words treated as decisive on their own. Matched as whole words: the earlier
# substring test fired on unrelated words that merely contain one of them
# (for example "bachao" contains "acha").
_STRONG_POSITIVE_INDICATOR_RE = re.compile(
    r'\b(?:acha|achy|achay|achi|zabardast|shandaar|kamaal)\b'
)
_STRONG_NEGATIVE_INDICATOR_RE = re.compile(
    r'\b(?:kharab|bura|ganda|bekaar|badtameez)\b'
)


def correct_roman_urdu_sentiment(text, current_sentiment, current_score):
    """Override a model prediction when the Roman Urdu lexicon disagrees.

    Args:
        text: The raw input sentence.
        current_sentiment: "Positive", "Negative" or "Neutral" from the model.
        current_score: The model's confidence for that label.

    Returns:
        A ``(sentiment, score)`` tuple: either the inputs unchanged or a
        corrected label with a rule-derived score.
    """
    normalized_text = normalize_roman_urdu(text.lower())

    positive_count = len(roman_urdu_positive_pattern.findall(normalized_text))
    negative_count = len(roman_urdu_negative_pattern.findall(normalized_text))

    has_strong_positive = bool(_STRONG_POSITIVE_INDICATOR_RE.search(normalized_text))
    has_strong_negative = bool(_STRONG_NEGATIVE_INDICATOR_RE.search(normalized_text))

    # Rule 1: a strong indicator contradicts the model. This applies only
    # when the strong words all point the same way. Previously a sentence
    # containing both kinds had its label inverted no matter what the model
    # said; such mixed sentences are now left to the count-based rules below.
    if has_strong_positive and not has_strong_negative and current_sentiment == "Negative":
        return "Positive", max(current_score, 0.85)

    if has_strong_negative and not has_strong_positive and current_sentiment == "Positive":
        return "Negative", max(current_score, 0.85)

    # Rule 2: the lexicon majority contradicts the model. Score grows with
    # the number of supporting words, capped at 0.95.
    if positive_count > negative_count and current_sentiment == "Negative":
        return "Positive", min(0.8 + (positive_count * 0.05), 0.95)

    if negative_count > positive_count and current_sentiment == "Positive":
        return "Negative", min(0.8 + (negative_count * 0.05), 0.95)

    # Rule 3: at least two sentiment words with a clear (>= 70%) majority
    # override any remaining disagreement (in practice a "Neutral" label).
    total_sentiment_words = positive_count + negative_count
    if total_sentiment_words >= 2:
        positive_ratio = positive_count / total_sentiment_words

        if positive_ratio >= 0.7 and current_sentiment != "Positive":
            return "Positive", 0.8
        if positive_ratio <= 0.3 and current_sentiment != "Negative":
            return "Negative", 0.8

    return current_sentiment, current_score
249
+
250
+ # -----------------------------
251
+ # Enhanced Ensemble for Roman Urdu
252
+ # -----------------------------
253
# Vote multipliers used when the two models disagree; the Roman Urdu model
# is trusted more than the Urdu model.
_ROMAN_URDU_MODEL_WEIGHT = 1.6
_URDU_MODEL_WEIGHT = 1.2


def ensemble_roman_urdu_sentiment(text):
    """Combine the Roman Urdu and Urdu models into one prediction.

    Both models score the normalized text, each result is passed through the
    lexicon-based correction, and then:
      * if the corrected labels agree, that label is returned with the higher
        of the two scores;
      * otherwise the label whose weighted score is larger wins.

    If anything in that path raises, the Roman Urdu model alone is tried;
    if that fails too, a neutral result with score 0.5 is returned.

    Returns:
        dict with keys ``"label"`` and ``"score"``.
    """
    normalized_text = normalize_roman_urdu(text)

    try:
        # Get predictions from both Roman Urdu and Urdu models
        ru_result = roman_urdu_model(normalized_text)[0]
        ur_result = urdu_model(normalized_text)[0]

        # Corrections receive the raw text; they normalize it themselves.
        ru_sentiment, ru_score = correct_roman_urdu_sentiment(
            text, normalize_sentiment_label(ru_result["label"]), ru_result["score"]
        )
        ur_sentiment, ur_score = correct_roman_urdu_sentiment(
            text, normalize_sentiment_label(ur_result["label"]), ur_result["score"]
        )

        # If both models agree after correction
        if ru_sentiment == ur_sentiment:
            return {"label": ru_sentiment, "score": max(ru_score, ur_score)}

        # Weighted voting; a tie goes to the Urdu model.
        if ru_score * _ROMAN_URDU_MODEL_WEIGHT > ur_score * _URDU_MODEL_WEIGHT:
            return {"label": ru_sentiment, "score": ru_score}
        return {"label": ur_sentiment, "score": ur_score}

    except Exception as e:
        print(f"Ensemble error: {e}")
        # Fallback to Roman Urdu model with correction
        try:
            result = roman_urdu_model(normalized_text)[0]
            corrected_sentiment, corrected_score = correct_roman_urdu_sentiment(
                text, normalize_sentiment_label(result["label"]), result["score"]
            )
            return {"label": corrected_sentiment, "score": corrected_score}
        except Exception as fallback_error:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt /
            # SystemExit and hid the reason for the neutral answer.
            print(f"Roman Urdu fallback error: {fallback_error}")
            return {"label": "Neutral", "score": 0.5}
297
+
298
+ # -----------------------------
299
+ # Sentiment Analysis Core Functions
300
+ # -----------------------------
301
def normalize_sentiment_label(label):
    """Map a model-specific label to "Positive", "Negative" or "Neutral".

    Named labels are matched by substring ("pos..." / "neg..."). Generic
    ``LABEL_<n>`` outputs are decoded by index instead; the previous "lab"
    substring test turned every ``LABEL_<n>`` into "Positive".

    Anything unrecognised is reported as "Neutral".
    """
    text = str(label).strip().lower()

    index_match = re.fullmatch(r'label[_\-\s]?(\d+)', text)
    if index_match:
        # NOTE(review): assumes the 3-class cardiffnlp convention
        # (0 = negative, 1 = neutral, 2 = positive) -- confirm against each
        # checkpoint's id2label before relying on it.
        return {0: "Negative", 1: "Neutral", 2: "Positive"}.get(
            int(index_match.group(1)), "Neutral"
        )

    if "pos" in text:
        return "Positive"
    if "neg" in text:
        return "Negative"
    return "Neutral"
311
+
312
def get_strong_sentiment_words(text, language):
    """Return up to five distinct sentiment-bearing words found in *text*.

    Args:
        text: The sentence to scan.
        language: "Roman Urdu" uses the Roman Urdu lexicon patterns, "Urdu"
            a small Urdu word list (substring match), anything else a small
            English word list (whole-word match).

    Returns:
        A list of at most five unique words in a stable order: first
        occurrence for Roman Urdu, word-list order otherwise. The earlier
        ``list(set(...))[:5]`` made both the order and the choice of which
        five survive vary between runs.
    """
    text_lower = text.lower()

    if language == "Roman Urdu":
        # Use our Roman Urdu word databases
        strong_words = (
            roman_urdu_positive_pattern.findall(text_lower)
            + roman_urdu_negative_pattern.findall(text_lower)
        )
    elif language == "Urdu":
        # Urdu strong words (you can expand this list)
        urdu_positive = ['زبردست', 'شاندار', 'عمدہ', 'بہترین', 'اچھا']
        urdu_negative = ['خراب', 'برا', 'مایوس کن', 'بیکار']
        strong_words = [word for word in urdu_positive + urdu_negative if word in text]
    else:  # English
        english_positive = ['excellent', 'outstanding', 'amazing', 'wonderful', 'perfect', 'great']
        english_negative = ['terrible', 'awful', 'horrible', 'disappointing', 'poor', 'bad']
        strong_words = [
            word
            for word in english_positive + english_negative
            if re.search(r'\b' + re.escape(word) + r'\b', text_lower)
        ]

    # dict.fromkeys removes duplicates while keeping insertion order.
    return list(dict.fromkeys(strong_words))[:5]
337
+
338
def generate_detailed_explanation(text, sentiment, score, language, strong_words):
    """Build the human-readable explanation shown next to a prediction.

    Args:
        text: The analysed sentence.
        sentiment: "Positive", "Negative" or "Neutral"; any other value is
            described with the neutral wording instead of raising KeyError.
        score: Confidence in [0, 1]; >= 0.8 is "High", >= 0.6 "Medium",
            otherwise "Low".
        language: Detected or selected language name.
        strong_words: Sentiment words to list in the explanation (may be empty).

    Returns:
        A single-paragraph explanation string ending with the confidence
        formatted to three decimals.
    """
    if score >= 0.8:
        confidence_level = "High"
    elif score >= 0.6:
        confidence_level = "Medium"
    else:
        confidence_level = "Low"

    base_explanations = {
        "Positive": {
            "High": "Strong positive sentiment with clear positive expressions.",
            "Medium": "Moderately positive sentiment with favorable tone.",
            "Low": "Slightly positive leaning with some positive indicators.",
        },
        "Negative": {
            "High": "Strong negative sentiment with clear criticism.",
            "Medium": "Moderately negative sentiment with critical tone.",
            "Low": "Slightly negative leaning with some concerning indicators.",
        },
        "Neutral": {
            "High": "Clearly neutral or factual statement.",
            "Medium": "Mostly neutral with balanced perspective.",
            "Low": "Weak sentiment leaning neutral.",
        },
    }

    per_level = base_explanations.get(sentiment, base_explanations["Neutral"])
    explanation = per_level[confidence_level]

    # Add language specific notes
    if language == "Roman Urdu":
        explanation += " Analyzed with Roman Urdu specific rules."

        # NOTE(review): the 'acha' note is kept inside the Roman Urdu branch;
        # the original nesting could not be recovered -- confirm.
        # Whole-word match so that e.g. "bachao" does not count as "acha".
        if sentiment == "Positive" and re.search(r'\b(?:acha|achy|achay|achi)\b', text.lower()):
            explanation += " Words like 'acha' correctly identified as positive."

    # Add strong words information
    if strong_words:
        explanation += f" Key sentiment words: {', '.join(strong_words)}."

    explanation += f" Confidence: {score:.3f}"

    return explanation
379
+
380
+ # -----------------------------
381
+ # Main Analysis Function
382
+ # -----------------------------
383
SAVE_FILE = "sentiment_logs.csv"
LOCK_FILE = SAVE_FILE + ".lock"

# Column layout of the CSV log.
_LOG_COLUMNS = ["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]

# Make sure a header-only log exists at start-up.
if not os.path.exists(SAVE_FILE):
    pd.DataFrame(columns=_LOG_COLUMNS).to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")


def analyze_sentiment_complete(text, lang_hint):
    """Run the full pipeline for one sentence and append the result to the log.

    Args:
        text: The sentence to analyse; ``None`` or whitespace-only input is
            rejected with a prompt instead of raising.
        lang_hint: "Auto Detect" to detect the language, otherwise the
            language name to use as-is.

    Returns:
        A 5-tuple ``(sentiment, score_text, explanation, log_path,
        strong_words_text)``. If analysis or logging raises, the tuple is
        ``("Error", "0", "Analysis error: ...", log_path, "")``.
    """
    if not text or not text.strip():
        return "⚠️ Please enter a sentence.", "", "", SAVE_FILE, ""

    # Detect language
    language = lang_hint if lang_hint != "Auto Detect" else detect_language_advanced(text)

    try:
        # Perform sentiment analysis based on language. The [:512] cut is
        # by characters, not tokens.
        if language == "English":
            result = english_model(text[:512])[0]
            sentiment = normalize_sentiment_label(result["label"])
        elif language == "Urdu":
            result = urdu_model(text[:512])[0]
            sentiment = normalize_sentiment_label(result["label"])
        else:  # Roman Urdu -- the ensemble already returns a normalized label
            result = ensemble_roman_urdu_sentiment(text)
            sentiment = result["label"]
        score = round(float(result["score"]), 3)

        # Get strong words
        strong_words = get_strong_sentiment_words(text, language)
        strong_words_str = ", ".join(strong_words) if strong_words else "None"

        # Generate explanation
        explanation = generate_detailed_explanation(text, sentiment, score, language, strong_words)

        # Save to CSV. The lock serialises the read-modify-write cycle.
        new_row = pd.DataFrame(
            [[text, language, sentiment, score, strong_words_str, pd.Timestamp.now()]],
            columns=_LOG_COLUMNS,
        )
        with FileLock(LOCK_FILE):
            try:
                history = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
            except (FileNotFoundError, pd.errors.EmptyDataError):
                # Missing or zero-byte log: start a fresh one rather than fail.
                history = pd.DataFrame(columns=_LOG_COLUMNS)
            updated = new_row if history.empty else pd.concat([history, new_row], ignore_index=True)
            updated.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")

        return sentiment, str(score), explanation, SAVE_FILE, strong_words_str

    except Exception as e:
        # UI boundary: report the failure in the explanation field.
        error_msg = f"Analysis error: {str(e)}"
        return "Error", "0", error_msg, SAVE_FILE, ""
439
+
440
+ # -----------------------------
441
+ # Gradio Interface
442
+ # -----------------------------
443
def show_logs():
    """Return the 20 most recent log rows as a DataFrame.

    The log is read under the same file lock the writer uses, so a read
    cannot observe a half-rewritten file. A missing or zero-byte log yields
    an empty frame with the expected columns.
    """
    try:
        with FileLock(LOCK_FILE):
            df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return pd.DataFrame(
            columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]
        )
    return df.tail(20)
449
 
450
def clear_logs():
    """Reset the log to a header-only CSV and return the empty frame.

    The file is rewritten rather than deleted: the analysis handler returns
    the log path as its download target even when it saved nothing, so the
    file should keep existing after a clear. The rewrite happens under the
    writer's lock so it cannot interleave with an append.
    """
    empty_log = pd.DataFrame(
        columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]
    )
    with FileLock(LOCK_FILE):
        empty_log.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")
    return empty_log
454
 
455
  with gr.Blocks(title="Multilingual Sentiment Analysis") as demo:
 
456
  gr.Markdown("""
457
+ # 🌍 Advanced Multilingual Sentiment Analysis
458
+ **English Urdu Roman Urdu**
459
+
460
+ Uses transformer models for accurate language detection and sentiment analysis with specialized Roman Urdu handling.
461
+
462
+ **Used models:**
463
+ - English: siebert/sentiment-roberta-large-english
464
+ - Urdu: tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu
465
+ - Roman Urdu: tahamueed23/roman-urdu-sentiment
466
+ - Language detection: papluca/xlm-roberta-base-language-detection
467
  """)
468
+
 
469
  with gr.Row():
470
+ with gr.Column():
 
 
471
  user_text = gr.Textbox(
472
+ label="✍️ Enter Text",
473
+ placeholder="Type in English, Urdu, or Roman Urdu...",
474
  lines=3
475
  )
 
476
  lang_dropdown = gr.Dropdown(
477
  ["Auto Detect", "English", "Urdu", "Roman Urdu"],
478
  value="Auto Detect",
479
+ label="🌐 Language Selection"
480
  )
481
+
482
  with gr.Row():
483
+ btn_analyze = gr.Button("🔍 Analyze Sentiment", variant="primary")
484
+ btn_show = gr.Button("📂 Show Logs")
485
+ btn_clear = gr.Button("🗑️ Clear Logs")
486
+
487
+ with gr.Column():
488
+ out_sent = gr.Textbox(label="🎭 Sentiment")
489
+ out_conf = gr.Textbox(label="📊 Confidence Score")
490
+ out_exp = gr.Textbox(label="💡 Detailed Explanation")
491
+ out_strong = gr.Textbox(label="💪 Strong Words")
492
+ out_file = gr.File(label="��️ Download Logs")
493
+
494
+ logs_df = gr.Dataframe(
495
+ headers=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"],
496
+ label="📋 Analysis History",
497
+ interactive=False,
498
+ wrap=True
499
+ )
500
+
501
+ # Event handlers - FIXED: Added missing closing parenthesis
 
 
 
 
 
 
 
 
 
 
 
502
  btn_analyze.click(
503
  analyze_sentiment_complete,
504
  inputs=[user_text, lang_dropdown],
505
+ outputs=[out_sent, out_conf, out_exp, out_file, out_strong]
506
  )
 
507
  btn_show.click(show_logs, outputs=[logs_df])
508
  btn_clear.click(clear_logs, outputs=[logs_df])
509
 
510
+ if __name__ == "__main__":
511
+ demo.launch(share=False)