File size: 13,268 Bytes
ea7b9be
55d0499
fd36e32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54e0dbf
 
 
 
 
fd36e32
 
 
 
 
55d0499
fd36e32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80f282b
fd36e32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16b4ab7
fd36e32
 
 
16b4ab7
fd36e32
 
80f282b
fd36e32
 
 
 
 
 
 
cd458ad
fd36e32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16b4ab7
fd36e32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdb29da
fd36e32
 
bdb29da
 
 
fd36e32
 
bdb29da
 
fd36e32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdb29da
fd36e32
 
 
bdb29da
fd36e32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d000f2a
 
 
fd36e32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7861917
fd36e32
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
import gradio as gr
from transformers import pipeline
import pandas as pd
import os
import re
from datetime import datetime
from filelock import FileLock
import unicodedata

# ==========================================
# MODEL LOADING
# ==========================================
print("๐Ÿ”„ Loading models...")

try:
    # English feedback is scored by its own fine-tuned checkpoint.
    english_model = pipeline(
        "sentiment-analysis",
        model="tahamueed23/sentiment_roberta_english_finetuned"
    )

    # Urdu-script and Roman Urdu feedback share a single checkpoint.
    urdu_roman_model = pipeline(
        "sentiment-analysis",
        model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
    )

    # `urdu_model` names the very same checkpoint, so alias the pipeline that
    # is already loaded instead of instantiating it a second time; this keeps
    # the module-level name available while holding the weights only once.
    urdu_model = urdu_roman_model
    print("โœ… All models loaded successfully!")

except Exception as e:
    # Without the models the app cannot work, so report and abort start-up.
    print(f"โŒ Error loading models: {e}")
    raise

# ==========================================
# LANGUAGE DETECTION
# ==========================================

def contains_urdu_script(text):
    """Return True if *text* contains at least one Arabic-script character.

    The character class covers Arabic (U+0600-U+06FF), Arabic Supplement
    (U+0750-U+077F), Arabic Presentation Forms-A (U+FB50-U+FDFF) and
    Arabic Presentation Forms-B (U+FE70-U+FEFE).

    U+FEFF is deliberately left out of the last range: it is the byte-order
    mark / zero-width no-break space, not a letter, and pasted or
    file-sourced English text frequently starts with it.
    """
    urdu_pattern = re.compile(r'[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFE]')
    return bool(urdu_pattern.search(text))

# Words that signal Roman Urdu. Built once at import time rather than on
# every call.
_ROMAN_URDU_MARKERS = frozenset({
    # Common verbs and helping verbs
    'hai', 'hain', 'tha', 'thi', 'thay', 'ho', 'hun', 'hoon', 'hein', 'he', 'hy',
    # Pronouns
    'main', 'mein', 'mai', 'tum', 'wo', 'woh', 'ye', 'yeh', 'ap', 'aap',
    # Prepositions
    'ka', 'ki', 'ke', 'ko', 'se', 'ne', 'par', 'pe',
    # Common words
    'nahi', 'nhi', 'nahin', 'kya', 'kyun', 'kaise', 'kese', 'kahan', 'kab',
    # Sentiment words
    'acha', 'achy', 'achha', 'accha', 'achi', 'bura', 'kharab', 'behtar',
    'zabardast', 'bekar', 'bekaar', 'bohot', 'bohat', 'bahut', 'bhot',
    # Action words
    'karo', 'karna', 'karein', 'kiya', 'kia', 'gaya', 'gayi', 'gaye',
    'dena', 'lena', 'dekho', 'dekha', 'suno', 'suna', 'samjho', 'samjha',
    # Conjunctions
    'aur', 'or', 'lekin', 'magar', 'ya', 'phir', 'to', 'toh',
    # Time words
    'ab', 'abhi', 'kal', 'parso', 'aj', 'aaj',
    # Common expressions
    'sath', 'saath', 'pas', 'paas', 'dur', 'door', 'sab', 'kuch', 'koi',
})

# Markers that are also ordinary English words. On their own they say nothing
# about the language ("he is good", "I want to go"), so they only count once
# an unambiguous marker is present as well.
_AMBIGUOUS_MARKERS = frozenset({
    'he', 'to', 'or', 'main', 'par', 'door', 'ho', 'ya', 'ye', 'pas', 'ab', 'pe',
})

_WORD_PATTERN = re.compile(r'\b\w+\b')


def is_roman_urdu(text):
    """Detect Roman Urdu by the share of known marker words in *text*.

    The text is lower-cased and split into word tokens. It is treated as
    Roman Urdu when it contains at least one unambiguous marker and the
    marker count meets a length-dependent threshold:

    * up to 3 words: at least one marker;
    * 4 to 8 words: at least 25% markers;
    * more than 8 words: at least 20% markers.

    Markers that double as English words count towards the ratio but can
    never trigger detection by themselves, which prevents plain English
    sentences from being misclassified.

    Returns:
        True if the text looks like Roman Urdu, otherwise False (including
        for text with no word tokens).
    """
    words = _WORD_PATTERN.findall(text.lower().strip())

    if not words:
        return False

    marker_count = sum(1 for word in words if word in _ROMAN_URDU_MARKERS)
    has_unambiguous_marker = any(
        word in _ROMAN_URDU_MARKERS and word not in _AMBIGUOUS_MARKERS
        for word in words
    )

    # Only English-lookalike markers (or none at all): not enough evidence.
    if not has_unambiguous_marker:
        return False

    marker_ratio = marker_count / len(words)

    # Detection thresholds
    if len(words) <= 3:
        # For very short text, one marker is enough
        return marker_count >= 1
    elif len(words) <= 8:
        # For short text, need 25% markers
        return marker_ratio >= 0.25
    else:
        # For longer text, need 20% markers
        return marker_ratio >= 0.20

def detect_language(text):
    """
    Classify feedback text as 'English', 'Urdu', or 'Roman Urdu'.

    Empty or whitespace-only input is reported as 'English'. Otherwise the
    Urdu-script check runs first, then the Roman Urdu word check, and
    'English' is the fallback when neither matches.
    """
    stripped = text.strip() if text else ""
    if not stripped:
        return "English"

    # Script detection takes precedence over the word-based heuristic.
    if contains_urdu_script(stripped):
        return "Urdu"

    return "Roman Urdu" if is_roman_urdu(stripped) else "English"

# ==========================================
# SENTIMENT ANALYSIS
# ==========================================

def normalize_label(label):
    """Map a model's raw label onto 'Positive', 'Negative' or 'Neutral'.

    Matching is case-insensitive and substring-based, checked in the order
    positive, negative, neutral. A label that matches none of them is
    returned unchanged.
    """
    lowered = str(label).lower()

    # 'pos' / 'neg' / 'neu' are prefixes of the full words, so one fragment
    # per sentiment covers both the short and the long spelling.
    for fragment, canonical in (
        ("pos", "Positive"),
        ("neg", "Negative"),
        ("neu", "Neutral"),
    ):
        if fragment in lowered:
            return canonical

    return label

def get_sentiment_emoji(sentiment):
    """Look up the emoji string for a sentiment; unknown values give ''."""
    emoji_by_sentiment = {
        "Positive": "๐Ÿ˜Š",
        "Negative": "๐Ÿ˜ž",
        "Neutral": "๐Ÿ˜"
    }
    try:
        return emoji_by_sentiment[sentiment]
    except KeyError:
        # Anything other than the three known sentiments has no emoji.
        return ""

def analyze_sentiment(text, language):
    """
    Run the sentiment model that matches the detected language.

    Args:
        text: Feedback text to classify.
        language: 'English' selects the English model; any other value
            (Urdu or Roman Urdu) selects the shared Urdu/Roman Urdu model.

    Returns:
        A ``(sentiment, confidence)`` tuple: the normalized label and the
        model score rounded to 4 decimals. Any failure is printed and
        reported as ``("Error", 0.0)`` instead of being raised.
    """
    try:
        # Choose model based on language
        model = english_model if language == "English" else urdu_roman_model

        # Let the tokenizer cap the input at 512 *tokens*. Cutting the string
        # at 512 characters does not bound the token count (one Urdu character
        # may become several tokens) and throws away English text that would
        # still fit into the model.
        result = model(text, truncation=True, max_length=512)[0]

        # Extract and normalize results
        sentiment = normalize_label(result['label'])
        confidence = round(float(result['score']), 4)

        return sentiment, confidence

    except Exception as e:
        print(f"Error in sentiment analysis: {e}")
        return "Error", 0.0

# ==========================================
# CSV LOGGING
# ==========================================

# Path of the CSV file that every analysis result is logged to.
CSV_FILE = "sentiment_analysis_logs.csv"
# Lock-file path passed to FileLock when the log is written.
LOCK_FILE = CSV_FILE + ".lock"

def initialize_csv():
    """Create the log CSV with only its header row when it is missing."""
    if os.path.exists(CSV_FILE):
        # Never overwrite an existing log.
        return

    header_only = pd.DataFrame(
        columns=["Timestamp", "Text", "Language", "Sentiment", "Confidence"]
    )
    header_only.to_csv(CSV_FILE, index=False, encoding='utf-8-sig')

def save_to_csv(text, language, sentiment, confidence):
    """Append one analysis result to the CSV log under the file lock.

    The row is appended instead of re-reading and rewriting the whole file.
    That keeps each save independent of the log size and avoids the
    read/write round trip through pandas, which would parse feedback such as
    "NA", "null" or "None" as missing values and write them back as blanks.

    Args:
        text: The analysed feedback text.
        language: Detected language label.
        sentiment: Normalized sentiment label.
        confidence: Model confidence score.

    Returns:
        True if the row was written, False if locking or writing failed
        (the error is printed, not raised).
    """
    columns = ["Timestamp", "Text", "Language", "Sentiment", "Confidence"]
    try:
        with FileLock(LOCK_FILE, timeout=10):
            new_row = pd.DataFrame([{
                "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "Text": text,
                "Language": language,
                "Sentiment": sentiment,
                "Confidence": confidence
            }], columns=columns)

            # A missing or zero-byte file needs the header (and the BOM that
            # 'utf-8-sig' emits); otherwise append plain UTF-8 so no second
            # BOM can end up in the middle of the file.
            needs_header = (
                not os.path.exists(CSV_FILE)
                or os.path.getsize(CSV_FILE) == 0
            )
            new_row.to_csv(
                CSV_FILE,
                mode='w' if needs_header else 'a',
                header=needs_header,
                index=False,
                encoding='utf-8-sig' if needs_header else 'utf-8'
            )

            return True
    except Exception as e:
        print(f"Error saving to CSV: {e}")
        return False

def load_logs():
    """Load the 50 most recent log entries, newest first.

    Returns:
        A DataFrame with the last 50 rows of the CSV log in reverse order.
        If the file does not exist or cannot be read (the error is printed),
        an empty DataFrame with the log columns is returned instead.
    """
    try:
        if os.path.exists(CSV_FILE):
            # keep_default_na=False: feedback that literally reads "NA",
            # "N/A", "null" or "None" must be shown as typed, not as NaN.
            df = pd.read_csv(
                CSV_FILE, encoding='utf-8-sig', keep_default_na=False
            )
            # Return last 50 entries, most recent first
            return df.tail(50).iloc[::-1]
    except Exception as e:
        print(f"Error loading logs: {e}")

    # Missing file or read failure: fall back to an empty table.
    return pd.DataFrame(columns=[
        "Timestamp", "Text", "Language", "Sentiment", "Confidence"
    ])

def clear_logs():
    """Delete all logged analyses and recreate an empty log file.

    The remove-and-recreate step runs under the same file lock that
    save_to_csv uses, so a concurrent save cannot write between the deletion
    and the re-initialisation.

    Returns:
        An empty DataFrame with the log columns on success. If clearing
        fails (including a lock timeout) the error is printed and the
        current logs are returned instead.
    """
    try:
        with FileLock(LOCK_FILE, timeout=10):
            if os.path.exists(CSV_FILE):
                os.remove(CSV_FILE)
            initialize_csv()
        return pd.DataFrame(columns=[
            "Timestamp", "Text", "Language", "Sentiment", "Confidence"
        ])
    except Exception as e:
        print(f"Error clearing logs: {e}")
        return load_logs()

# ==========================================
# MAIN ANALYSIS FUNCTION
# ==========================================

def process_sentiment(text):
    """
    Analyse one piece of feedback and return the values for the UI outputs.

    Returns a 6-tuple: sentiment text (emoji + label), confidence as a
    percentage string, detected language, a Markdown detail summary, the
    recent-logs DataFrame and the CSV path. For empty or whitespace-only
    input the four text fields are empty strings and nothing is logged.
    """
    # Nothing to analyse: blank the result fields but keep the logs visible.
    if not (text and text.strip()):
        return "", "", "", "", load_logs(), CSV_FILE

    language = detect_language(text)
    sentiment, confidence = analyze_sentiment(text, language)

    # Build the display strings.
    result_text = f"{get_sentiment_emoji(sentiment)} {sentiment}"
    confidence_text = f"{confidence:.2%}"
    detail = f"**Language:** {language}\n**Sentiment:** {sentiment}\n**Confidence:** {confidence:.4f}"

    # Persist first, then reload so the table already includes this entry.
    save_to_csv(text, language, sentiment, confidence)
    recent_logs = load_logs()

    return result_text, confidence_text, language, detail, recent_logs, CSV_FILE

# ==========================================
# GRADIO INTERFACE
# ==========================================

# Initialize CSV on startup so the log file exists before the UI below
# reads it (logs table) and offers it for download (export file).
initialize_csv()

# Create Gradio interface. Components are declared in the order they appear
# on the page: header, input/result columns, history table, export row,
# model information, and finally the event wiring.
with gr.Blocks(title="Sentiment Analysis - Student Feedback") as demo:
    
    # Header
    gr.Markdown("""
    # ๐ŸŽ“ Student Feedback Sentiment Analysis
    ### Multilingual Support: English โ€ข ุงุฑุฏูˆ โ€ข Roman Urdu
    """)
    
    gr.Markdown("---")
    
    # Main content: two equally weighted columns side by side
    with gr.Row():
        # Left column - Input
        with gr.Column(scale=1):
            gr.Markdown("### ๐Ÿ“ Enter Feedback")
            
            # Free-text feedback box; its submit event is wired below.
            input_text = gr.Textbox(
                label="Student Feedback",
                placeholder="Enter feedback in English, Urdu, or Roman Urdu...\nPress Enter or click Analyze",
                lines=5,
                max_lines=10
            )
            
            # Action buttons: analyze the text / wipe the stored logs
            with gr.Row():
                analyze_btn = gr.Button("๐Ÿ” Analyze Sentiment", variant="primary", scale=2)
                clear_btn = gr.Button("๐Ÿ—‘๏ธ Clear Logs", variant="secondary", scale=1)
        
        # Right column - Results (all read-only, filled by process_sentiment)
        with gr.Column(scale=1):
            gr.Markdown("### ๐Ÿ“Š Analysis Results")
            
            with gr.Row():
                sentiment_output = gr.Textbox(
                    label="Sentiment",
                    interactive=False
                )
                confidence_output = gr.Textbox(
                    label="Confidence",
                    interactive=False
                )
            
            language_output = gr.Textbox(
                label="Detected Language",
                interactive=False
            )
            
            # Markdown summary of language, sentiment and confidence
            detail_output = gr.Markdown(
                label="Details",
                value=""
            )
    
    # Bottom section - Logs and Export
    gr.Markdown("---")
    gr.Markdown("### ๐Ÿ“‹ Analysis History")
    
    # History table, pre-populated from the CSV log at build time
    with gr.Row():
        logs_display = gr.Dataframe(
            headers=["Timestamp", "Text", "Language", "Sentiment", "Confidence"],
            datatype=["str", "str", "str", "str", "number"],
            label="Recent Analyses",
            wrap=True,
            interactive=False,
            value=load_logs()
        )
    
    # Download widget for the CSV log, next to the usage tips
    with gr.Row():
        export_file = gr.File(
            label="๐Ÿ“ฅ Download Complete Logs (CSV)",
            value=CSV_FILE,
            interactive=False
        )
        gr.Markdown("""
        **๐Ÿ’ก Tips:**
        - Type your feedback and press **Enter** or click **Analyze**
        - Supports English, Urdu (ุงุฑุฏูˆ), and Roman Urdu
        - All analyses are automatically saved
        - Download CSV for complete history
        """)
    
    # Model information (collapsed by default)
    gr.Markdown("---")
    with gr.Accordion("โ„น๏ธ Model Information", open=False):
        gr.Markdown("""
        **Models Used:**
        - **English:** tahamueed23/sentiment_roberta_english_finetuned
        - **Urdu & Roman Urdu:** tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu
        
        **Features:**
        - Automatic language detection
        - High-accuracy sentiment classification
        - Real-time analysis
        - CSV export for data analysis
        - Support for mixed feedback in different languages

        **Important Note:**
        - If youโ€™re facing problems, itโ€™s because you didnโ€™t learn, so go educate yourself before others๐Ÿ˜Š.
        """)
    
    # Event handlers
    def process_and_update(text):
        """Forward the feedback text to process_sentiment and return its 6-tuple."""
        return process_sentiment(text)
    
    # Click event: the six outputs are listed in the order of the tuple
    # returned by process_sentiment.
    analyze_btn.click(
        fn=process_and_update,
        inputs=[input_text],
        outputs=[
            sentiment_output,
            confidence_output,
            language_output,
            detail_output,
            logs_display,
            export_file
        ]
    )
    
    # Enter key event: same handler and outputs as the Analyze button
    input_text.submit(
        fn=process_and_update,
        inputs=[input_text],
        outputs=[
            sentiment_output,
            confidence_output,
            language_output,
            detail_output,
            logs_display,
            export_file
        ]
    )
    
    # Clear logs event: only the history table is refreshed
    clear_btn.click(
        fn=clear_logs,
        inputs=[],
        outputs=[logs_display]
    )

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    banner = "=" * 50
    print(f"\n{banner}")
    print("๐Ÿš€ Starting Sentiment Analysis Application")
    print(f"{banner}\n")

    # Listen on all interfaces, port 7860, without a public share link.
    launch_options = {
        "share": False,
        "show_error": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.launch(**launch_options)