# Hugging Face Space: tahamueed23/Sentiment-Analyzer (status: Sleeping)
# File size: 13,268 Bytes

import gradio as gr
from transformers import pipeline
import pandas as pd
import os
import re
from datetime import datetime
from filelock import FileLock
import unicodedata

# ==========================================
# MODEL LOADING
# ==========================================
print("🔄 Loading models...")

try:
    # Classifier used for feedback detected as English.
    english_model = pipeline(
        "sentiment-analysis",
        model="tahamueed23/sentiment_roberta_english_finetuned"
    )

    # A single checkpoint serves both Urdu script and Roman Urdu, so it is
    # built exactly once; loading it a second time would only duplicate the
    # weights in memory and lengthen startup.
    urdu_roman_model = pipeline(
        "sentiment-analysis",
        model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
    )

    # Kept as a module-level name for backward compatibility; it refers to
    # the very same pipeline object as ``urdu_roman_model``.
    urdu_model = urdu_roman_model
    print("✅ All models loaded successfully!")

except Exception as e:
    # Nothing in the app can work without the models: report and abort.
    print(f"❌ Error loading models: {e}")
    raise

# ==========================================
# LANGUAGE DETECTION
# ==========================================

def contains_urdu_script(text):
    """Return True if *text* contains at least one Arabic-script character.

    The character class covers the Arabic block (U+0600-U+06FF), Arabic
    Supplement (U+0750-U+077F), Arabic Presentation Forms-A
    (U+FB50-U+FDFF) and Arabic Presentation Forms-B up to its last letter
    (U+FE70-U+FEFC).

    The final range deliberately stops before U+FEFF: that code point is the
    byte-order mark / zero-width no-break space, not a letter, and including
    it made BOM-prefixed Latin text look like Urdu.
    """
    urdu_pattern = re.compile(
        r'[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFC]'
    )
    return urdu_pattern.search(text) is not None

def is_roman_urdu(text):
    """Heuristically decide whether *text* is Urdu written in Latin letters.

    The text is lower-cased and split into word tokens; each token is looked
    up in a fixed vocabulary of common Roman Urdu words. The verdict depends
    on how many tokens there are:

    * no tokens          -> False
    * 1 to 3 tokens      -> True if at least one token is in the vocabulary
    * 4 to 8 tokens      -> True if at least 25% of tokens are in it
    * more than 8 tokens -> True if at least 20% of tokens are in it

    NOTE(review): several vocabulary entries are also ordinary English words
    ('he', 'to', 'or', 'main', ...), so very short English inputs that use
    them are reported as Roman Urdu.
    """
    vocabulary = frozenset((
        # Common verbs and helping verbs
        'hai', 'hain', 'tha', 'thi', 'thay', 'ho', 'hun', 'hoon', 'hein',
        'he', 'hy',
        # Pronouns
        'main', 'mein', 'mai', 'tum', 'wo', 'woh', 'ye', 'yeh', 'ap', 'aap',
        # Prepositions
        'ka', 'ki', 'ke', 'ko', 'se', 'ne', 'par', 'pe',
        # Common words
        'nahi', 'nhi', 'nahin', 'kya', 'kyun', 'kaise', 'kese', 'kahan',
        'kab',
        # Sentiment words
        'acha', 'achy', 'achha', 'accha', 'achi', 'bura', 'kharab', 'behtar',
        'zabardast', 'bekar', 'bekaar', 'bohot', 'bohat', 'bahut', 'bhot',
        # Action words
        'karo', 'karna', 'karein', 'kiya', 'kia', 'gaya', 'gayi', 'gaye',
        'dena', 'lena', 'dekho', 'dekha', 'suno', 'suna', 'samjho', 'samjha',
        # Conjunctions
        'aur', 'or', 'lekin', 'magar', 'ya', 'phir', 'to', 'toh',
        # Time words
        'ab', 'abhi', 'kal', 'parso', 'aj', 'aaj',
        # Common expressions
        'sath', 'saath', 'pas', 'paas', 'dur', 'door', 'sab', 'kuch', 'koi',
    ))

    tokens = re.findall(r'\b\w+\b', text.lower().strip())
    total = len(tokens)
    if total == 0:
        return False

    hits = len([token for token in tokens if token in vocabulary])

    # Very short inputs: a single recognised word is enough.
    if total <= 3:
        return hits >= 1

    # Otherwise require a minimum share of recognised words; the bar is
    # slightly lower once the text is longer than eight tokens.
    required_share = 0.25 if total <= 8 else 0.20
    return hits / total >= required_share

def detect_language(text):
    """Classify *text* as 'English', 'Urdu' or 'Roman Urdu'.

    Empty, whitespace-only or otherwise falsy input is reported as
    'English'. Otherwise the checks run in this order:

    1. any Urdu/Arabic-script character -> 'Urdu'
    2. the Roman Urdu word heuristic matches -> 'Roman Urdu'
    3. anything else -> 'English'
    """
    stripped = text.strip() if text else ""
    if not stripped:
        return "English"

    # Script detection comes first; the Roman Urdu heuristic only runs on
    # text that has no Urdu/Arabic-script characters.
    if contains_urdu_script(stripped):
        return "Urdu"

    return "Roman Urdu" if is_roman_urdu(stripped) else "English"

# ==========================================
# SENTIMENT ANALYSIS
# ==========================================

def normalize_label(label):
    """Map a raw model label onto 'Positive', 'Negative' or 'Neutral'.

    The label is stringified and lower-cased, then searched for the
    fragments 'pos', 'neg' and 'neu' in that order; the first fragment found
    decides the result. A label containing none of them is returned
    unchanged (the original object, not its string form).
    """
    lowered = str(label).lower()
    fragment_to_name = (
        ("pos", "Positive"),
        ("neg", "Negative"),
        ("neu", "Neutral"),
    )
    for fragment, canonical in fragment_to_name:
        if fragment in lowered:
            return canonical
    return label

def get_sentiment_emoji(sentiment):
    """Return the emoji for a normalized sentiment, or '' if there is none."""
    faces = dict(Positive="😊", Negative="😞", Neutral="😐")
    try:
        return faces[sentiment]
    except KeyError:
        # Unknown values (for example "Error") get no emoji.
        return ""

def analyze_sentiment(text, language):
    """Classify the sentiment of *text* with the model for *language*.

    Args:
        text: The feedback text to classify.
        language: 'English' selects ``english_model``; any other value
            (Urdu or Roman Urdu) selects ``urdu_roman_model``.

    Returns:
        A ``(sentiment, confidence)`` tuple: the normalized label and the
        model score rounded to four decimals. If anything raises, the error
        is printed and ``("Error", 0.0)`` is returned instead.
    """
    try:
        # Cheap character cap, kept from the original behaviour.
        text_input = text[:512]

        classifier = english_model if language == "English" else urdu_roman_model

        # 512 characters can still tokenize to more than 512 tokens
        # (non-Latin script, emoji), which would overflow the model's input
        # size; ask the tokenizer to truncate as well.
        result = classifier(text_input, truncation=True, max_length=512)[0]

        sentiment = normalize_label(result['label'])
        confidence = round(float(result['score']), 4)

        return sentiment, confidence

    except Exception as e:
        print(f"Error in sentiment analysis: {e}")
        return "Error", 0.0

# ==========================================
# CSV LOGGING
# ==========================================

# Path of the CSV that stores every analysis result.
CSV_FILE = "sentiment_analysis_logs.csv"
# Companion lock file used to serialize access to CSV_FILE.
LOCK_FILE = f"{CSV_FILE}.lock"

def initialize_csv():
    """Create CSV_FILE with only its header row, unless it already exists."""
    if os.path.exists(CSV_FILE):
        return

    header_only = pd.DataFrame(
        columns=["Timestamp", "Text", "Language", "Sentiment", "Confidence"]
    )
    # utf-8-sig writes a BOM at the start of the file.
    header_only.to_csv(CSV_FILE, index=False, encoding='utf-8-sig')

def save_to_csv(text, language, sentiment, confidence):
    """Append one analysis result to CSV_FILE under the file lock.

    Only the new row is written. Previously the whole file was read and
    rewritten on every call, which cost O(n) per save and pushed existing
    rows through pandas type inference on each round trip (so a stored text
    such as "NA" or "null" came back as a missing value and was rewritten
    as an empty cell).

    Args:
        text: The analysed feedback text.
        language: Detected language name.
        sentiment: Normalized sentiment label.
        confidence: Model score for the label.

    Returns:
        True if the row was written, False if anything failed (including a
        lock timeout); the error is printed.
    """
    columns = ["Timestamp", "Text", "Language", "Sentiment", "Confidence"]
    try:
        with FileLock(LOCK_FILE, timeout=10):
            new_row = pd.DataFrame(
                [{
                    "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "Text": text,
                    "Language": language,
                    "Sentiment": sentiment,
                    "Confidence": confidence
                }],
                columns=columns,
            )

            # A missing or zero-byte file has no header yet.
            has_header = (
                os.path.exists(CSV_FILE) and os.path.getsize(CSV_FILE) > 0
            )

            if has_header:
                # Plain utf-8 here so no second BOM is written mid-file.
                new_row.to_csv(
                    CSV_FILE, mode='a', header=False, index=False,
                    encoding='utf-8'
                )
            else:
                new_row.to_csv(CSV_FILE, index=False, encoding='utf-8-sig')

            return True
    except Exception as e:
        print(f"Error saving to CSV: {e}")
        return False

def load_logs():
    """Return up to the 50 most recent log rows, newest first.

    If CSV_FILE does not exist, or reading it fails (the error is printed),
    an empty DataFrame with the log columns is returned instead.
    """
    log_columns = ["Timestamp", "Text", "Language", "Sentiment", "Confidence"]
    try:
        if not os.path.exists(CSV_FILE):
            return pd.DataFrame(columns=log_columns)

        history = pd.read_csv(CSV_FILE, encoding='utf-8-sig')
        # Keep the last 50 rows, then reverse so the newest row is on top.
        newest_first = history.tail(50).iloc[::-1]
        return newest_first
    except Exception as e:
        print(f"Error loading logs: {e}")
        return pd.DataFrame(columns=log_columns)

def clear_logs():
    """Delete all logged analyses and recreate an empty, header-only CSV.

    The delete-and-recreate step runs under the same file lock that
    ``save_to_csv`` takes, so a save running at the same moment cannot
    interleave with the removal of the file.

    Returns:
        An empty DataFrame with the log columns on success. If anything
        fails (including a lock timeout) the error is printed and the
        current logs, as returned by ``load_logs()``, are returned instead.
    """
    try:
        with FileLock(LOCK_FILE, timeout=10):
            if os.path.exists(CSV_FILE):
                os.remove(CSV_FILE)
            initialize_csv()
        return pd.DataFrame(columns=[
            "Timestamp", "Text", "Language", "Sentiment", "Confidence"
        ])
    except Exception as e:
        print(f"Error clearing logs: {e}")
        return load_logs()

# ==========================================
# MAIN ANALYSIS FUNCTION
# ==========================================

def process_sentiment(text):
    """Run the full pipeline for one piece of feedback.

    Detects the language, classifies the sentiment, appends the result to
    the CSV log and reloads the recent history.

    Returns:
        A 6-tuple ``(sentiment text, confidence text, language, markdown
        detail, recent logs DataFrame, CSV path)``. For empty or
        whitespace-only input the four text fields are empty strings and
        nothing is analysed or saved.
    """
    if not (text and text.strip()):
        return ("", "", "", "", load_logs(), CSV_FILE)

    language = detect_language(text)
    sentiment, confidence = analyze_sentiment(text, language)

    # Strings shown in the result widgets.
    headline = f"{get_sentiment_emoji(sentiment)} {sentiment}"
    confidence_text = f"{confidence:.2%}"
    detail = (
        f"**Language:** {language}\n"
        f"**Sentiment:** {sentiment}\n"
        f"**Confidence:** {confidence:.4f}"
    )

    # Persist first, then reload so the history includes this analysis.
    save_to_csv(text, language, sentiment, confidence)
    recent_logs = load_logs()

    return (headline, confidence_text, language, detail, recent_logs, CSV_FILE)

# ==========================================
# GRADIO INTERFACE
# ==========================================

# Create the header-only log CSV at import time if it is missing, so the
# components below that are given CSV_FILE / load_logs() find a file.
initialize_csv()

# Create Gradio interface: input on the left, results on the right, then the
# analysis history, CSV download and model information underneath.
with gr.Blocks(title="Sentiment Analysis - Student Feedback") as demo:
    
    # Header
    gr.Markdown("""
    # 🎓 Student Feedback Sentiment Analysis
    ### Multilingual Support: English • اردو • Roman Urdu
    """)
    
    gr.Markdown("---")
    
    # Main content
    with gr.Row():
        # Left column - Input
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Enter Feedback")
            
            # Multi-line textbox: plain Enter inserts a newline, so the
            # hint tells the user to press Shift+Enter to submit.
            input_text = gr.Textbox(
                label="Student Feedback",
                placeholder="Enter feedback in English, Urdu, or Roman Urdu...\nPress Shift+Enter or click Analyze",
                lines=5,
                max_lines=10
            )
            
            with gr.Row():
                analyze_btn = gr.Button("🔍 Analyze Sentiment", variant="primary", scale=2)
                clear_btn = gr.Button("🗑️ Clear Logs", variant="secondary", scale=1)
        
        # Right column - Results
        with gr.Column(scale=1):
            gr.Markdown("### 📊 Analysis Results")
            
            with gr.Row():
                sentiment_output = gr.Textbox(
                    label="Sentiment",
                    interactive=False
                )
                confidence_output = gr.Textbox(
                    label="Confidence",
                    interactive=False
                )
            
            language_output = gr.Textbox(
                label="Detected Language",
                interactive=False
            )
            
            detail_output = gr.Markdown(
                label="Details",
                value=""
            )
    
    # Bottom section - Logs and Export
    gr.Markdown("---")
    gr.Markdown("### 📋 Analysis History")
    
    with gr.Row():
        logs_display = gr.Dataframe(
            headers=["Timestamp", "Text", "Language", "Sentiment", "Confidence"],
            datatype=["str", "str", "str", "str", "number"],
            label="Recent Analyses",
            wrap=True,
            interactive=False,
            value=load_logs()
        )
    
    with gr.Row():
        export_file = gr.File(
            label="📥 Download Complete Logs (CSV)",
            value=CSV_FILE,
            interactive=False
        )
        gr.Markdown("""
        **💡 Tips:**
        - Type your feedback and press **Shift+Enter** or click **Analyze**
        - Supports English, Urdu (اردو), and Roman Urdu
        - All analyses are automatically saved
        - Download CSV for complete history
        """)
    
    # Model information
    gr.Markdown("---")
    with gr.Accordion("ℹ️ Model Information", open=False):
        # NOTE(review): the "Important Note" below is dismissive toward
        # users; consider replacing it with actual troubleshooting guidance.
        # Left unchanged here because it is user-facing text.
        gr.Markdown("""
        **Models Used:**
        - **English:** tahamueed23/sentiment_roberta_english_finetuned
        - **Urdu & Roman Urdu:** tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu
        
        **Features:**
        - Automatic language detection
        - High-accuracy sentiment classification
        - Real-time analysis
        - CSV export for data analysis
        - Support for mixed feedback in different languages

        **Important Note:**
        - If you’re facing problems, it’s because you didn’t learn, so go educate yourself before others😊.
        """)
    
    # Event handlers
    def process_and_update(text):
        """Gradio callback: delegate to process_sentiment unchanged."""
        return process_sentiment(text)
    
    # The button click and the textbox submit event run the same analysis
    # and refresh the same six components, so they are wired from one place.
    analysis_outputs = [
        sentiment_output,
        confidence_output,
        language_output,
        detail_output,
        logs_display,
        export_file
    ]
    for register in (analyze_btn.click, input_text.submit):
        register(
            fn=process_and_update,
            inputs=[input_text],
            outputs=analysis_outputs
        )
    
    # Clear logs event: only the history table is refreshed.
    clear_btn.click(
        fn=clear_logs,
        inputs=[],
        outputs=[logs_display]
    )

# Launch the app
if __name__ == "__main__":
    # Startup banner: a 50-character rule above and below the title.
    rule = "=" * 50
    print(f"\n{rule}")
    print("🚀 Starting Sentiment Analysis Application")
    print(f"{rule}\n")

    # Listen on all interfaces, port 7860, without a public share link.
    launch_options = dict(
        share=False,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860,
    )
    demo.launch(**launch_options)