Spaces:
Sleeping
Sleeping
The initial deployment
Browse files- Dockerfile +41 -7
- README.md +36 -12
- requirements.txt +34 -3
- scripts/__init__.py +0 -0
- scripts/captum_explainer.py +710 -0
- scripts/explainability.py +884 -0
- scripts/transformer_data_preprocessing.py +786 -0
- src/streamlit_app.py +1421 -35
Dockerfile
CHANGED
|
@@ -1,20 +1,54 @@
|
|
| 1 |
-
FROM python:3.
|
| 2 |
|
|
|
|
| 3 |
WORKDIR /app
|
| 4 |
|
|
|
|
|
|
|
| 5 |
RUN apt-get update && apt-get install -y \
|
| 6 |
build-essential \
|
| 7 |
curl \
|
| 8 |
git \
|
|
|
|
| 9 |
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
|
| 11 |
-
|
| 12 |
-
COPY
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
|
|
|
| 17 |
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Base image: slim Python 3.11 keeps the final image small
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies
# build-essential/curl/git support building and fetching pip packages;
# fontconfig is needed to register and cache the Kalimati font
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    git \
    fontconfig \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better Docker layer caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy all app files (includes fonts/Kalimati.ttf)
COPY . .

# Register Kalimati font with the system font cache
# This allows matplotlib to find it system-wide as a fallback
RUN if [ -f /app/fonts/Kalimati.ttf ]; then \
    mkdir -p /usr/local/share/fonts/nepali && \
    cp /app/fonts/Kalimati.ttf /usr/local/share/fonts/nepali/ && \
    fc-cache -fv; \
    fi

# Set environment variables.
# Caches are redirected to /tmp so the runtime user can write them.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases in favor of HF_HOME (already set) — kept for older versions.
ENV PYTHONPATH=/app
ENV HF_HOME=/tmp/huggingface
ENV TRANSFORMERS_CACHE=/tmp/huggingface
ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
ENV STREAMLIT_SERVER_PORT=7860
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
ENV STREAMLIT_SERVER_HEADLESS=true
ENV MPLCONFIGDIR=/tmp/matplotlib

# Create necessary temp directories
RUN mkdir -p /tmp/huggingface /tmp/matplotlib

# Expose the port HF Spaces expects
EXPOSE 7860

# Run the Streamlit app
# NOTE(review): entrypoint is main_app.py, but this commit modifies
# src/streamlit_app.py — confirm main_app.py exists at the image root.
CMD ["streamlit", "run", "main_app.py", \
    "--server.port=7860", \
    "--server.address=0.0.0.0", \
    "--server.headless=true", \
    "--browser.gatherUsageStats=false"]
|
README.md
CHANGED
|
@@ -1,20 +1,44 @@
|
|
| 1 |
---
|
| 2 |
-
title: Nepali Hate Classification
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: red
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
-
|
| 8 |
-
tags:
|
| 9 |
-
- streamlit
|
| 10 |
-
pinned: false
|
| 11 |
-
short_description: Multi-class Nepali hate content classification system suppor
|
| 12 |
license: mit
|
|
|
|
| 13 |
---
|
| 14 |
|
| 15 |
-
#
|
| 16 |
|
| 17 |
-
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Nepali Hate Content Classification
|
| 3 |
+
emoji: 🔍
|
| 4 |
colorFrom: red
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: docker
|
| 7 |
+
pinned: true
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
license: mit
|
| 9 |
+
short_description: Multi-class hate content classifier for Nepali social media text
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# Nepali Hate Content Classification System
|
| 13 |
|
| 14 |
+
An interactive web application for classifying hate content in Nepali social media text. Supports **Devanagari script**, **Romanized Nepali**, **English**, and **code-mixed** inputs.
|
| 15 |
|
| 16 |
+
## Models
|
| 17 |
+
|
| 18 |
+
| Model | HF Repo |
|
| 19 |
+
|-------|---------|
|
| 20 |
+
| XLM-RoBERTa Large | [UDHOV/xlm-roberta-large-nepali-hate-classification](https://huggingface.co/UDHOV/xlm-roberta-large-nepali-hate-classification) |
|
| 21 |
+
| NepaliBERT | [UDHOV/nepalibert-nepali-hate-classification](https://huggingface.co/UDHOV/nepalibert-nepali-hate-classification) |
|
| 22 |
+
|
| 23 |
+
## Classes
|
| 24 |
+
|
| 25 |
+
| Label | Description |
|
| 26 |
+
|-------|-------------|
|
| 27 |
+
| 🟢 NON_OFFENSIVE | No offensive content |
|
| 28 |
+
| 🟡 OTHER_OFFENSIVE | General offensive content |
|
| 29 |
+
| 🔴 OFFENSIVE_RACIST | Targets ethnicity, race, or caste |
|
| 30 |
+
| 🔴 OFFENSIVE_SEXIST | Targets gender |
|
| 31 |
+
|
| 32 |
+
## Features
|
| 33 |
+
|
| 34 |
+
- Single text and batch (CSV) classification
|
| 35 |
+
- Automatic script detection and preprocessing
|
| 36 |
+
- Emoji semantic mapping (180+ emojis)
|
| 37 |
+
- Confidence scores with visualization
|
| 38 |
+
- Explainability via LIME, SHAP, and Integrated Gradients (Captum)
|
| 39 |
+
- Prediction history tracking
|
| 40 |
+
|
| 41 |
+
## Project
|
| 42 |
+
|
| 43 |
+
Bachelor of Computer Engineering Final Project
|
| 44 |
+
Khwopa College of Engineering, Tribhuvan University, Nepal (2026)
|
requirements.txt
CHANGED
|
@@ -1,3 +1,34 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core
|
| 2 |
+
streamlit>=1.32.0
|
| 3 |
+
pandas>=1.5.0
|
| 4 |
+
numpy>=1.23.0
|
| 5 |
+
|
| 6 |
+
# Deep Learning
|
| 7 |
+
torch>=2.0.0
|
| 8 |
+
transformers>=4.38.0
|
| 9 |
+
sentencepiece>=0.1.99
|
| 10 |
+
protobuf>=3.20.0
|
| 11 |
+
|
| 12 |
+
# HuggingFace
|
| 13 |
+
huggingface-hub>=0.21.0
|
| 14 |
+
accelerate>=0.27.0
|
| 15 |
+
|
| 16 |
+
# Preprocessing
|
| 17 |
+
deep-translator>=1.11.4
|
| 18 |
+
indic-transliteration>=2.3.0
|
| 19 |
+
emoji>=2.10.0
|
| 20 |
+
|
| 21 |
+
# Explainability
|
| 22 |
+
lime>=0.2.0.1
|
| 23 |
+
shap>=0.44.0
|
| 24 |
+
captum>=0.7.0
|
| 25 |
+
|
| 26 |
+
# Visualization
|
| 27 |
+
plotly>=5.18.0
|
| 28 |
+
matplotlib>=3.7.0
|
| 29 |
+
|
| 30 |
+
# Utilities
|
| 31 |
+
scikit-learn>=1.3.0
|
| 32 |
+
joblib>=1.3.0
|
| 33 |
+
scipy>=1.10.0
|
| 34 |
+
tqdm>=4.65.0
|
scripts/__init__.py
ADDED
|
File without changes
|
scripts/captum_explainer.py
ADDED
|
@@ -0,0 +1,710 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Captum Explainer Module
|
| 3 |
+
========================
|
| 4 |
+
Gradient-based explainability using Captum's Integrated Gradients.
|
| 5 |
+
|
| 6 |
+
This module provides:
|
| 7 |
+
- Layer Integrated Gradients attribution
|
| 8 |
+
- Token-level importance visualization
|
| 9 |
+
- Emoji-aware visualization with Nepali font support
|
| 10 |
+
- Heatmap and bar chart visualizations
|
| 11 |
+
|
| 12 |
+
Usage:
|
| 13 |
+
------
|
| 14 |
+
from scripts.captum_explainer import CaptumExplainer, explain_with_captum
|
| 15 |
+
|
| 16 |
+
# Create explainer
|
| 17 |
+
explainer = CaptumExplainer(model, tokenizer, label_encoder, preprocessor)
|
| 18 |
+
|
| 19 |
+
# Explain prediction
|
| 20 |
+
result = explainer.explain(
|
| 21 |
+
original_text="Your text here",
|
| 22 |
+
n_steps=50,
|
| 23 |
+
nepali_font=font
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# Visualize
|
| 27 |
+
explainer.visualize_bar_chart(result, save_path="ig_bar.png")
|
| 28 |
+
explainer.visualize_heatmap(result, save_path="ig_heatmap.png")
|
| 29 |
+
|
| 30 |
+
# All-in-one
|
| 31 |
+
result = explainer.explain_and_visualize(
|
| 32 |
+
original_text="Your text",
|
| 33 |
+
save_dir="./explanations",
|
| 34 |
+
show=True
|
| 35 |
+
)
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
import os
|
| 39 |
+
import numpy as np
|
| 40 |
+
import torch
|
| 41 |
+
import re
|
| 42 |
+
import emoji
|
| 43 |
+
import regex
|
| 44 |
+
import warnings
|
| 45 |
+
warnings.filterwarnings("ignore")
|
| 46 |
+
|
| 47 |
+
from typing import Dict, List, Tuple, Optional
|
| 48 |
+
from matplotlib import pyplot as plt, cm
|
| 49 |
+
from matplotlib.font_manager import FontProperties
|
| 50 |
+
import matplotlib.colors as mcolors
|
| 51 |
+
|
| 52 |
+
# Captum
|
| 53 |
+
try:
|
| 54 |
+
from captum.attr import LayerIntegratedGradients
|
| 55 |
+
CAPTUM_AVAILABLE = True
|
| 56 |
+
except ImportError:
|
| 57 |
+
CAPTUM_AVAILABLE = False
|
| 58 |
+
print("⚠️ Captum not installed. Install with: pip install captum")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# ============================================================================
|
| 62 |
+
# TOKEN ALIGNMENT WITH EMOJI PRESERVATION
|
| 63 |
+
# ============================================================================
|
| 64 |
+
|
| 65 |
+
def create_display_tokens_from_subwords(
    original_text: str,
    preprocessed_text: str,
    tokenizer_tokens: List[str],
    emoji_to_nepali_map: Dict[str, str],
    remove_special: bool = True
) -> List[str]:
    """
    Create display tokens that preserve emojis from the original text.

    Maps preprocessed tokens (with emoji translations) back to original
    tokens (with actual emojis).

    Args:
        original_text: Original text with emojis (e.g., "तेरी कसम 😀😀")
        preprocessed_text: Preprocessed text (e.g., "तेरी कसम खुशी खुशी").
            NOTE(review): currently unused in the body — kept for API stability.
        tokenizer_tokens: Tokenized output from the model tokenizer.
        emoji_to_nepali_map: Emoji → Nepali translation dictionary.
        remove_special: Whether to drop special tokens (<s>, [CLS], ...).

    Returns:
        List of display tokens with emojis preserved (e.g., ["तेरी", "कसम", "😀", "😀"])
    """
    # Build reverse emoji mapping (Nepali text → emoji).
    # Multi-word translations like "ठूलो रिस" are tracked separately so the
    # whole phrase can later be collapsed back into a single emoji token.
    reverse_emoji_map = {}
    multi_word_emoji_map = {}  # For phrases like "ठूलो रिस"

    for emoji_char, nepali_text in emoji_to_nepali_map.items():
        if ' ' in nepali_text:
            # Multi-word translation
            multi_word_emoji_map[nepali_text] = emoji_char
            # Also map individual words (as fallback); first mapping wins
            for word in nepali_text.split():
                if word not in reverse_emoji_map:
                    reverse_emoji_map[word] = emoji_char
        else:
            # Single word translation
            reverse_emoji_map[nepali_text] = emoji_char

    # Clean and group tokenizer output into words.
    # NOTE(review): the "▁" prefix is SentencePiece-specific; BERT-style "##"
    # continuation tokens are not handled here — confirm tokenizer family.
    word_pieces = []
    current_word = ""

    for tok in tokenizer_tokens:
        # Skip special tokens if requested
        if remove_special and tok in ['<s>', '</s>', '[CLS]', '[SEP]', '<pad>', '[PAD]']:
            continue

        if tok.startswith("▁"):
            # New word begins — flush the previous one
            if current_word:
                word_pieces.append(current_word)
            current_word = tok.replace("▁", "")
        else:
            # Continue current word
            current_word += tok.replace("▁", "")

    if current_word:
        word_pieces.append(current_word)

    # Get original words
    original_words = original_text.split()

    # Map word_pieces back to original with emojis
    display_tokens = []
    orig_idx = 0
    word_idx = 0

    while word_idx < len(word_pieces):
        word = word_pieces[word_idx]

        # Check for multi-word emoji phrases first (two-word lookahead only)
        if word_idx < len(word_pieces) - 1:
            two_word_phrase = f"{word} {word_pieces[word_idx + 1]}"
            if two_word_phrase in multi_word_emoji_map:
                # Found a multi-word emoji translation - show emoji once
                display_tokens.append(multi_word_emoji_map[two_word_phrase])
                word_idx += 2  # Skip both words
                continue

        # Check if this single word is an emoji translation
        if word in reverse_emoji_map:
            # This is a Nepali emoji translation → use the actual emoji
            display_tokens.append(reverse_emoji_map[word])
            word_idx += 1
        else:
            # Regular word - try to match with original
            matched = False

            # Look for matching word in original (orig_idx only moves forward,
            # so overall matching is linear in the original word count)
            while orig_idx < len(original_words):
                orig_word = original_words[orig_idx]

                # Skip emojis in original (they're handled by reverse_emoji_map)
                if any(c in emoji.EMOJI_DATA for c in orig_word):
                    orig_idx += 1
                    continue

                # Check if words match; substring match in either direction
                # tolerates tokenizer normalization differences
                orig_clean = emoji.replace_emoji(orig_word, replace="").strip()
                if orig_clean and (word in orig_clean or orig_clean in word or word == orig_clean):
                    display_tokens.append(orig_word)
                    matched = True
                    orig_idx += 1
                    break

                orig_idx += 1

            if not matched:
                # Couldn't match - use the reconstructed word as-is
                display_tokens.append(word)

            word_idx += 1

    return display_tokens
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
# ============================================================================
|
| 183 |
+
# FONT HANDLING
|
| 184 |
+
# ============================================================================
|
| 185 |
+
|
| 186 |
+
def apply_nepali_font(ax_or_text, nepali_font: Optional[FontProperties] = None,
                      is_axis: bool = True):
    """
    Apply a Nepali font to text that contains Devanagari but no emojis.

    Mixed emoji/Devanagari strings are left on the default font, since the
    Nepali font cannot render emoji glyphs.

    Args:
        ax_or_text: Matplotlib axis (is_axis=True) or text object (False).
        nepali_font: Font properties to apply; no-op when None.
        is_axis: Selects whether tick labels or a single text object are styled.
    """
    if nepali_font is None:
        return

    def _wants_nepali_font(content: str) -> bool:
        # Eligible only when Devanagari is present and no emoji is mixed in
        if not regex.search(r'\p{Devanagari}', content):
            return False
        return not any(ch in emoji.EMOJI_DATA for ch in content)

    if is_axis:
        # Restyle each x tick label that qualifies
        for label in ax_or_text.get_xticklabels():
            if _wants_nepali_font(label.get_text()):
                label.set_fontproperties(nepali_font)
                label.set_fontsize(11)
    elif _wants_nepali_font(ax_or_text.get_text()):
        # Single text object: font only, size untouched
        ax_or_text.set_fontproperties(nepali_font)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
# ============================================================================
|
| 221 |
+
# CAPTUM EXPLAINER CLASS
|
| 222 |
+
# ============================================================================
|
| 223 |
+
|
| 224 |
+
class CaptumExplainer:
|
| 225 |
+
"""
|
| 226 |
+
Captum Integrated Gradients explainer with emoji support
|
| 227 |
+
"""
|
| 228 |
+
|
| 229 |
+
def __init__(self, model, tokenizer, label_encoder, preprocessor,
|
| 230 |
+
emoji_to_nepali_map: Optional[Dict[str, str]] = None,
|
| 231 |
+
device=None, max_length: int = 256):
|
| 232 |
+
"""
|
| 233 |
+
Args:
|
| 234 |
+
model: Trained model
|
| 235 |
+
tokenizer: Model tokenizer
|
| 236 |
+
label_encoder: Label encoder
|
| 237 |
+
preprocessor: HateSpeechPreprocessor instance
|
| 238 |
+
emoji_to_nepali_map: Emoji to Nepali mapping (optional)
|
| 239 |
+
device: torch device (auto-detected if None)
|
| 240 |
+
max_length: Maximum sequence length
|
| 241 |
+
"""
|
| 242 |
+
if not CAPTUM_AVAILABLE:
|
| 243 |
+
raise ImportError("Captum not installed. Install with: pip install captum")
|
| 244 |
+
|
| 245 |
+
self.model = model
|
| 246 |
+
self.tokenizer = tokenizer
|
| 247 |
+
self.label_encoder = label_encoder
|
| 248 |
+
self.preprocessor = preprocessor
|
| 249 |
+
self.class_names = label_encoder.classes_.tolist()
|
| 250 |
+
self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 251 |
+
self.max_length = max_length
|
| 252 |
+
self.emoji_to_nepali_map = emoji_to_nepali_map or {}
|
| 253 |
+
|
| 254 |
+
self.model.to(self.device).eval()
|
| 255 |
+
|
| 256 |
+
# Get embedding layer (model-specific)
|
| 257 |
+
self.embedding_layer = self._get_embedding_layer()
|
| 258 |
+
|
| 259 |
+
def _get_embedding_layer(self):
|
| 260 |
+
"""Get the embedding layer from the model"""
|
| 261 |
+
# Try different model architectures
|
| 262 |
+
if hasattr(self.model, 'roberta'):
|
| 263 |
+
# XLM-RoBERTa
|
| 264 |
+
return self.model.roberta.embeddings.word_embeddings
|
| 265 |
+
elif hasattr(self.model, 'bert'):
|
| 266 |
+
# BERT-based
|
| 267 |
+
return self.model.bert.embeddings.word_embeddings
|
| 268 |
+
elif hasattr(self.model, 'transformer'):
|
| 269 |
+
# Generic transformer
|
| 270 |
+
return self.model.transformer.wte
|
| 271 |
+
else:
|
| 272 |
+
raise AttributeError("Could not find embedding layer. Please specify manually.")
|
| 273 |
+
|
| 274 |
+
    def explain(self, original_text: str, target: Optional[int] = None,
                n_steps: int = 50) -> Dict:
        """
        Generate an Integrated Gradients explanation for one input text.

        Args:
            original_text: Original text with emojis.
            target: Class index to attribute toward (None = predicted class).
            n_steps: Number of IG interpolation steps; more steps tightens
                convergence at higher compute cost.

        Returns:
            Dict with keys: original_text, preprocessed_text, emoji_features,
            predicted_label, predicted_index, confidence, probabilities,
            word_attributions, convergence_delta, tokens, display_tokens.

        Raises:
            ValueError: if preprocessing yields an empty string.
        """
        # Preprocess (emoji → Nepali translation, script handling, etc.)
        preprocessed, emoji_features = self.preprocessor.preprocess(original_text, verbose=False)

        if not preprocessed:
            raise ValueError("Preprocessing resulted in empty text")

        # Tokenize to a fixed-length, padded batch of size 1
        encoding = self.tokenizer(
            preprocessed,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=self.max_length
        )
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        # Get prediction (no gradients needed for this forward pass)
        with torch.no_grad():
            out = self.model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.softmax(out.logits, dim=-1)[0].cpu().numpy()
        pred_idx = int(np.argmax(probs))
        pred_label = self.class_names[pred_idx]
        pred_conf = float(probs[pred_idx])

        # Default: explain the class the model actually predicted
        if target is None:
            target = pred_idx

        # Forward function for Captum: returns the target-class logit
        def forward_func(input_ids_arg, attention_mask_arg):
            """Forward function that takes input_ids"""
            return self.model(input_ids=input_ids_arg, attention_mask=attention_mask_arg).logits[:, target]

        # Initialize Integrated Gradients over the embedding layer
        lig = LayerIntegratedGradients(forward_func, self.embedding_layer)

        # Baseline: all pad tokens ("absence of input" reference point)
        baseline_ids = torch.full_like(input_ids, self.tokenizer.pad_token_id)

        # Calculate attributions plus the convergence delta diagnostic
        attributions, delta = lig.attribute(
            input_ids,
            baselines=baseline_ids,
            additional_forward_args=(attention_mask,),
            return_convergence_delta=True,
            n_steps=n_steps
        )

        # Sum across embedding dimension → one scalar per token position
        attributions_sum = attributions.sum(dim=-1).squeeze(0)

        # Get tokens; special tokens kept so indices align with attributions
        tokens = self.tokenizer.convert_ids_to_tokens(
            input_ids[0].cpu().tolist(),
            skip_special_tokens=False
        )

        # Create display tokens with emojis preserved
        display_tokens = create_display_tokens_from_subwords(
            original_text,
            preprocessed,
            tokens,
            self.emoji_to_nepali_map,
            remove_special=True
        )

        # Aggregate subword scores to word-level attributions
        word_attributions = self._aggregate_word_attributions(
            tokens, attributions_sum, display_tokens
        )

        return {
            "original_text": original_text,
            "preprocessed_text": preprocessed,
            "emoji_features": emoji_features,
            "predicted_label": pred_label,
            "predicted_index": pred_idx,
            "confidence": pred_conf,
            "probabilities": {label: float(prob) for label, prob in zip(self.class_names, probs)},
            "word_attributions": word_attributions,
            "convergence_delta": float(delta.sum().cpu().numpy()),
            "tokens": tokens,
            "display_tokens": display_tokens
        }
|
| 371 |
+
|
| 372 |
+
    def _aggregate_word_attributions(self, tokens: List[str], attributions_sum: torch.Tensor,
                                     display_tokens: List[str]) -> List[Tuple[str, float, float]]:
        """
        Aggregate subword attributions to word level.

        Subwords are grouped via the SentencePiece "▁" word-start marker; each
        group is reduced to an absolute-sum magnitude and a signed sum. Groups
        are then relabeled with emoji-preserving display tokens (when counts
        line up) and consecutive words forming a multi-word emoji translation
        are merged back into a single emoji entry.

        Args:
            tokens: Raw tokenizer tokens (special tokens included).
            attributions_sum: Per-token scalar attributions aligned with tokens.
            display_tokens: Emoji-preserving display tokens from
                create_display_tokens_from_subwords().

        Returns:
            List of (word, abs_score, signed_score) tuples
        """
        word_attributions = []
        current_indices = []

        for i, tok in enumerate(tokens):
            # Skip special tokens
            if tok in ['<s>', '</s>', '[CLS]', '[SEP]', '<pad>', '[PAD]']:
                continue

            if tok.startswith("▁"):
                # New word starts
                if current_indices:
                    # Save previous word: abs-sum magnitude + signed sum
                    grp_vals = attributions_sum[current_indices].detach().cpu().numpy()
                    score = float(np.sum(np.abs(grp_vals)))
                    signed_score = float(np.sum(grp_vals))
                    word = "".join([tokens[j].replace("▁", "") for j in current_indices])
                    word_attributions.append((word, score, signed_score))

                current_indices = [i]
            else:
                # Continue current word
                current_indices.append(i)

        # Don't forget last word
        if current_indices:
            grp_vals = attributions_sum[current_indices].detach().cpu().numpy()
            score = float(np.sum(np.abs(grp_vals)))
            signed_score = float(np.sum(grp_vals))
            word = "".join([tokens[j].replace("▁", "") for j in current_indices])
            word_attributions.append((word, score, signed_score))

        # Align with display tokens — only safe when counts agree, otherwise
        # fall back to the reconstructed subword words
        if len(display_tokens) == len(word_attributions):
            aligned_attributions = [
                (display_tok, score, signed_score)
                for display_tok, (_, score, signed_score) in zip(display_tokens, word_attributions)
            ]
        else:
            aligned_attributions = word_attributions

        # Post-process: merge attributions for multi-word emoji translations.
        # Collect the multi-word phrases present in the emoji map.
        multi_word_phrases = set()
        for emoji_char, nepali_text in self.emoji_to_nepali_map.items():
            if ' ' in nepali_text:
                multi_word_phrases.add(nepali_text)

        # Merge consecutive words that form a multi-word emoji phrase
        merged_attributions = []
        i = 0
        while i < len(aligned_attributions):
            word, score, signed_score = aligned_attributions[i]

            # Check if this word + next word(s) form a multi-word emoji phrase
            merged = False
            for phrase in multi_word_phrases:
                phrase_words = phrase.split()
                if i + len(phrase_words) <= len(aligned_attributions):
                    # Check if consecutive words match the phrase
                    candidate_words = [aligned_attributions[i + j][0] for j in range(len(phrase_words))]
                    candidate_phrase = ' '.join(candidate_words)

                    # Also check if any word is already the emoji (from display_tokens fix)
                    has_emoji = any(c in emoji.EMOJI_DATA for c in word)

                    # NOTE(review): the "has_emoji and len(phrase_words) > 1" arm merges
                    # the next len(phrase_words)-1 entries whenever the current entry is
                    # an emoji, regardless of which phrase is being tested — confirm this
                    # over-merge is intended.
                    if candidate_phrase == phrase or (has_emoji and len(phrase_words) > 1):
                        # Found a multi-word emoji phrase - merge their scores
                        total_abs_score = sum(aligned_attributions[i + j][1] for j in range(len(phrase_words)))
                        total_signed_score = sum(aligned_attributions[i + j][2] for j in range(len(phrase_words)))

                        # Find the corresponding emoji (phrase came from the map,
                        # so at least one match exists)
                        emoji_char = [e for e, n in self.emoji_to_nepali_map.items() if n == phrase][0]

                        merged_attributions.append((emoji_char, total_abs_score, total_signed_score))
                        i += len(phrase_words)  # Skip all words in the phrase
                        merged = True
                        break

            if not merged:
                merged_attributions.append((word, score, signed_score))
                i += 1

        return merged_attributions
|
| 463 |
+
|
| 464 |
+
def visualize_bar_chart(self, explanation: Dict, save_path: Optional[str] = None,
|
| 465 |
+
show: bool = True, nepali_font: Optional[FontProperties] = None,
|
| 466 |
+
figsize: Tuple[int, int] = None):
|
| 467 |
+
"""
|
| 468 |
+
Create bar chart visualization
|
| 469 |
+
|
| 470 |
+
Args:
|
| 471 |
+
explanation: Explanation dictionary from explain()
|
| 472 |
+
save_path: Path to save figure
|
| 473 |
+
show: Whether to display figure
|
| 474 |
+
nepali_font: Nepali font properties
|
| 475 |
+
figsize: Figure size (auto if None)
|
| 476 |
+
|
| 477 |
+
Returns:
|
| 478 |
+
matplotlib figure
|
| 479 |
+
"""
|
| 480 |
+
word_attributions = explanation['word_attributions']
|
| 481 |
+
pred_label = explanation['predicted_label']
|
| 482 |
+
pred_conf = explanation['confidence']
|
| 483 |
+
|
| 484 |
+
scores = [s for _, s, _ in word_attributions]
|
| 485 |
+
words = [w.replace('_', ' ') for w, _, _ in word_attributions] # Replace underscores
|
| 486 |
+
signed_scores = [ss for _, _, ss in word_attributions]
|
| 487 |
+
|
| 488 |
+
if figsize is None:
|
| 489 |
+
figsize = (max(8, 0.6 * len(words)), 5)
|
| 490 |
+
|
| 491 |
+
fig, ax = plt.subplots(figsize=figsize)
|
| 492 |
+
colors = ['green' if ss > 0 else 'red' for ss in signed_scores]
|
| 493 |
+
|
| 494 |
+
ax.bar(range(len(words)), scores, tick_label=words, color=colors, alpha=0.7)
|
| 495 |
+
ax.set_ylabel("Attribution (sum abs)", fontsize=12)
|
| 496 |
+
ax.set_title(
|
| 497 |
+
f"Integrated Gradients → Pred: {pred_label} ({pred_conf:.2%})",
|
| 498 |
+
fontsize=14,
|
| 499 |
+
fontweight='bold'
|
| 500 |
+
)
|
| 501 |
+
|
| 502 |
+
# Apply Nepali font
|
| 503 |
+
if nepali_font:
|
| 504 |
+
apply_nepali_font(ax, nepali_font, is_axis=True)
|
| 505 |
+
|
| 506 |
+
plt.xticks(rotation=45, ha='right')
|
| 507 |
+
plt.tight_layout()
|
| 508 |
+
|
| 509 |
+
if save_path:
|
| 510 |
+
plt.savefig(save_path, dpi=300, bbox_inches='tight')
|
| 511 |
+
print(f"✓ Bar chart saved to: {save_path}")
|
| 512 |
+
|
| 513 |
+
if show:
|
| 514 |
+
plt.show()
|
| 515 |
+
else:
|
| 516 |
+
plt.close(fig)
|
| 517 |
+
|
| 518 |
+
return fig
|
| 519 |
+
|
| 520 |
+
def visualize_heatmap(self, explanation: Dict, save_path: Optional[str] = None,
                      show: bool = True, nepali_font: Optional[FontProperties] = None,
                      figsize: Tuple[int, int] = None):
    """
    Create heatmap visualization with colored text boxes.

    Each token is drawn as a rounded box whose color encodes the sign of its
    Integrated Gradients attribution (green = supports the prediction,
    red = opposes it) and whose intensity encodes the magnitude.

    Args:
        explanation: Explanation dictionary from explain()
        save_path: Path to save figure (PNG, 300 dpi)
        show: Whether to display figure
        nepali_font: Nepali font properties for Devanagari tokens
        figsize: Figure size (auto-scaled to token count if None)

    Returns:
        matplotlib figure
    """
    word_attributions = explanation['word_attributions']
    pred_label = explanation['predicted_label']

    scores = [s for _, s, _ in word_attributions]
    # Guard against an empty attribution list to avoid max()/division errors
    max_score = max(scores) if scores else 1.0

    # FIX: cm.get_cmap() was deprecated in matplotlib 3.7 and removed in 3.9;
    # plt.get_cmap() is the supported equivalent.
    cmap = plt.get_cmap("RdYlGn")

    if figsize is None:
        figsize = (max(10, 0.6 * len(word_attributions)), 3)

    fig, ax = plt.subplots(figsize=figsize)
    ax.axis('off')

    # Cursor position in axes-fraction coordinates; tokens flow left-to-right
    # and wrap onto new rows below.
    x, y = 0.01, 0.6
    text_objs = []

    for word, score, signed_score in word_attributions:
        # Replace underscores with spaces for display
        display_word = word.replace('_', ' ')

        # Normalize magnitude into [0, 1] for color intensity
        intensity = min(score / max_score, 1.0) if max_score > 0 else 0.0

        # Color based on signed score: upper half of RdYlGn is green,
        # lower half is red
        if signed_score > 0:
            color = cmap(0.5 + intensity * 0.5)  # Green side
        else:
            color = cmap(0.5 - intensity * 0.5)  # Red side

        txt = ax.text(
            x, y, f" {display_word} ",
            fontsize=13,
            bbox=dict(
                facecolor=mcolors.to_hex(color),
                alpha=0.8,
                boxstyle="round,pad=0.3",
                edgecolor='gray'
            )
        )

        # Apply Nepali font only to Devanagari text (but not if it contains
        # emojis — the Nepali font has no emoji glyphs and would render boxes)
        has_emoji = any(c in emoji.EMOJI_DATA for c in display_word)
        has_devanagari = bool(regex.search(r'\p{Devanagari}', display_word))

        if nepali_font and has_devanagari and not has_emoji:
            txt.set_fontproperties(nepali_font)

        text_objs.append(txt)

        # Update position - emojis take less horizontal space
        char_width = 0.025 if any(c in emoji.EMOJI_DATA for c in display_word) else 0.04
        x += char_width * len(display_word) + 0.01

        # Wrap to the next row once the current one is nearly full
        if x > 0.92:
            x = 0.01
            y -= 0.35

    # Title
    ax.text(
        0.5, 0.95,
        f"Token Attributions (Predicted: {pred_label})",
        ha='center',
        va='top',
        fontsize=14,
        fontweight='bold',
        transform=ax.transAxes
    )

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"✓ Heatmap saved to: {save_path}")

    if show:
        plt.show()
    else:
        plt.close(fig)

    return fig
|
| 617 |
+
|
| 618 |
+
def explain_and_visualize(self, original_text: str, target: Optional[int] = None,
                          n_steps: int = 50, save_dir: Optional[str] = None,
                          show: bool = True, nepali_font: Optional[FontProperties] = None):
    """
    Explain and visualize in one step.

    Args:
        original_text: Original text with emojis
        target: Target class index (None = predicted)
        n_steps: Number of IG steps
        save_dir: Directory to save figures (created if missing)
        show: Whether to display figures
        nepali_font: Nepali font properties

    Returns:
        Dictionary with keys 'explanation', 'bar_chart', 'heatmap'
    """
    import hashlib

    # Generate explanation
    explanation = self.explain(original_text, target, n_steps)

    # Generate file paths if save_dir provided
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
        # FIX: the built-in hash() is salted per process (PYTHONHASHSEED),
        # so the same text produced a different filename on every run.
        # An md5 content digest gives a stable, reproducible suffix.
        digest = hashlib.md5(original_text.encode('utf-8')).hexdigest()[:8]
        hash_suffix = int(digest, 16)
        bar_path = os.path.join(save_dir, f"ig_bar_{explanation['predicted_label']}_{hash_suffix}.png")
        heatmap_path = os.path.join(save_dir, f"ig_heatmap_{explanation['predicted_label']}_{hash_suffix}.png")
    else:
        bar_path = None
        heatmap_path = None

    # Visualize (both figures share font/show settings)
    bar_fig = self.visualize_bar_chart(explanation, bar_path, show, nepali_font)
    heatmap_fig = self.visualize_heatmap(explanation, heatmap_path, show, nepali_font)

    return {
        'explanation': explanation,
        'bar_chart': bar_fig,
        'heatmap': heatmap_fig
    }
|
| 657 |
+
|
| 658 |
+
|
| 659 |
+
# ============================================================================
|
| 660 |
+
# CONVENIENCE FUNCTIONS
|
| 661 |
+
# ============================================================================
|
| 662 |
+
|
| 663 |
+
def explain_with_captum(text: str, model, tokenizer, label_encoder, preprocessor,
                        emoji_to_nepali_map: Optional[Dict[str, str]] = None,
                        n_steps: int = 50, nepali_font: Optional[FontProperties] = None,
                        save_dir: Optional[str] = None, show: bool = True) -> Dict:
    """
    One-shot convenience helper: build a CaptumExplainer for the given model
    and run its full explain-and-visualize pipeline on a single text.

    Args:
        text: Input text (may contain emojis)
        model: Trained model
        tokenizer: Model tokenizer
        label_encoder: Label encoder
        preprocessor: HateSpeechPreprocessor instance
        emoji_to_nepali_map: Emoji-to-Nepali mapping dictionary
        n_steps: Number of Integrated Gradients steps
        nepali_font: Nepali font properties for the visualizations
        save_dir: Directory to save figures (None = do not save)
        show: Whether to display figures

    Returns:
        Dictionary with explanation and visualizations
    """
    ig_explainer = CaptumExplainer(
        model,
        tokenizer,
        label_encoder,
        preprocessor,
        emoji_to_nepali_map=emoji_to_nepali_map,
    )

    return ig_explainer.explain_and_visualize(
        text,
        n_steps=n_steps,
        save_dir=save_dir,
        show=show,
        nepali_font=nepali_font,
    )
|
| 693 |
+
|
| 694 |
+
|
| 695 |
+
def check_availability() -> bool:
    """Report whether the optional Captum dependency imported successfully."""
    return bool(CAPTUM_AVAILABLE)
|
| 698 |
+
|
| 699 |
+
|
| 700 |
+
# ============================================================================
|
| 701 |
+
# DEFAULT EMOJI MAPPING (For standalone usage)
|
| 702 |
+
# ============================================================================
|
| 703 |
+
|
| 704 |
+
# Fallback emoji → Nepali-word mapping used when no custom mapping is supplied
# to the explainer (standalone usage). Keys are emoji characters; values are
# the Nepali gloss substituted into the model input.
DEFAULT_EMOJI_TO_NEPALI = {
    # Happiness / laughter
    '😀': 'खुशी', '😁': 'खुशी', '😂': 'हाँसो', '😃': 'खुशी', '😄': 'खुशी',
    '😅': 'नर्भस हाँसो', '😆': 'हाँसो', '😊': 'मुस्कान', '😍': 'माया',
    # Anger / abuse
    '😠': 'रिस', '😡': 'ठूलो रिस', '🤬': 'गाली', '😈': 'खराब',
    # Gestures (positive and negative)
    '🖕': 'अपमान', '👎': 'नकारात्मक', '👍': 'सकारात्मक', '🙏': 'नमस्कार',
    # Hearts and emphasis
    '❤️': 'माया', '💔': 'टुटेको मन', '🔥': 'आगो', '💯': 'पूर्ण',
}
|
scripts/explainability.py
ADDED
|
@@ -0,0 +1,884 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Explainability Module - LIME & SHAP
|
| 3 |
+
===================================
|
| 4 |
+
Model-agnostic explainability for Nepali hate speech classification.
|
| 5 |
+
|
| 6 |
+
This module provides:
|
| 7 |
+
- LIME (Local Interpretable Model-agnostic Explanations)
|
| 8 |
+
- SHAP (SHapley Additive exPlanations)
|
| 9 |
+
- Emoji-aware visualization with Nepali font support
|
| 10 |
+
|
| 11 |
+
Usage:
|
| 12 |
+
------
|
| 13 |
+
from scripts.explainability import LIMEExplainer, SHAPExplainer, create_explainer_wrapper
|
| 14 |
+
|
| 15 |
+
# Create model wrapper
|
| 16 |
+
wrapper = create_explainer_wrapper(model, tokenizer, label_encoder, preprocessor)
|
| 17 |
+
|
| 18 |
+
# LIME explanation
|
| 19 |
+
lime = LIMEExplainer(wrapper, nepali_font=font)
|
| 20 |
+
lime.explain_and_visualize(original_text, preprocessed_text, save_path="lime.png")
|
| 21 |
+
|
| 22 |
+
# SHAP explanation
|
| 23 |
+
shap_exp = SHAPExplainer(wrapper, nepali_font=font)
|
| 24 |
+
shap_exp.explain_and_visualize(original_text, preprocessed_text, save_path="shap.png")
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
import os
|
| 28 |
+
import numpy as np
|
| 29 |
+
import torch
|
| 30 |
+
import re
|
| 31 |
+
import emoji
|
| 32 |
+
import regex
|
| 33 |
+
import warnings
|
| 34 |
+
warnings.filterwarnings("ignore")
|
| 35 |
+
|
| 36 |
+
from typing import Optional, Tuple, Dict, List
|
| 37 |
+
import matplotlib.pyplot as plt
|
| 38 |
+
from matplotlib.font_manager import FontProperties
|
| 39 |
+
|
| 40 |
+
# Explainability libraries
|
| 41 |
+
try:
|
| 42 |
+
from lime.lime_text import LimeTextExplainer
|
| 43 |
+
LIME_AVAILABLE = True
|
| 44 |
+
except ImportError:
|
| 45 |
+
LIME_AVAILABLE = False
|
| 46 |
+
print("⚠️ LIME not installed. Install with: pip install lime")
|
| 47 |
+
|
| 48 |
+
try:
|
| 49 |
+
import shap
|
| 50 |
+
from shap import Explainer, maskers
|
| 51 |
+
SHAP_AVAILABLE = True
|
| 52 |
+
except ImportError:
|
| 53 |
+
SHAP_AVAILABLE = False
|
| 54 |
+
print("⚠️ SHAP not installed. Install with: pip install shap")
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# ============================================================================
|
| 58 |
+
# MODEL WRAPPER CLASS
|
| 59 |
+
# ============================================================================
|
| 60 |
+
|
| 61 |
+
class ModelExplainerWrapper:
    """
    Wrapper class for model + preprocessing.

    Adapts a transformer classifier (model + tokenizer) and the project's
    HateSpeechPreprocessor into the plain ``predict_proba(texts) -> ndarray``
    interface that LIME and SHAP expect.
    """

    def __init__(self, model, tokenizer, label_encoder, preprocessor, device=None):
        """
        Args:
            model: Trained model
            tokenizer: Model tokenizer
            label_encoder: Label encoder (provides .classes_)
            preprocessor: HateSpeechPreprocessor instance
            device: torch device (auto-detected if None)
        """
        self.model = model
        self.tokenizer = tokenizer
        self.class_names = label_encoder.classes_.tolist()
        self.preprocessor = preprocessor
        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device).eval()

    def preprocess_text(self, text: str) -> Tuple[str, Dict[str, int]]:
        """Preprocess text using the HateSpeechPreprocessor.

        Returns the (preprocessed_text, emoji_features) pair produced by
        the preprocessor.
        """
        return self.preprocessor.preprocess(text, verbose=False)

    def predict_proba(self, texts):
        """
        Predict class probabilities for one or more (already preprocessed) texts.

        FIX: LIME/SHAP require exactly one output row per input text, so
        empty / whitespace-only perturbations are no longer dropped from the
        batch (the previous implementation filtered them out, which
        desynchronised the returned rows from the input order). Empty inputs
        now receive a uniform probability row in their original position.

        Args:
            texts: Single text, list of texts, or numpy array of texts

        Returns:
            numpy array of probabilities, shape (len(texts), n_classes)
        """
        if isinstance(texts, str):
            texts = [texts]
        elif isinstance(texts, np.ndarray):
            texts = texts.tolist() if texts.ndim > 0 else [str(texts)]

        # Normalise every entry to a stripped string, preserving positions
        texts = [str(t).strip() for t in texts]

        n_classes = len(self.class_names)
        uniform_row = np.ones(n_classes, dtype=np.float64) / n_classes

        # Positions of texts that actually contain content
        non_empty = [i for i, t in enumerate(texts) if t]

        if not non_empty:
            # All-empty batch (or empty input): uniform probabilities,
            # at least one row so downstream indexing still works
            return np.tile(uniform_row, (max(len(texts), 1), 1))

        # Tokenize only the non-empty texts
        enc = self.tokenizer(
            [texts[i] for i in non_empty],
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        ).to(self.device)

        # Predict
        with torch.no_grad():
            probs = torch.softmax(self.model(**enc).logits, dim=-1).cpu().numpy()

        # Scatter predictions back to their original positions; empty
        # inputs keep the uniform row.
        out = np.tile(uniform_row, (len(texts), 1))
        for row, idx in zip(probs, non_empty):
            out[idx] = row
        return out

    def predict_with_analysis(self, text: str) -> Dict:
        """
        Predict with full analysis.

        Returns:
            Dictionary with the original text, preprocessed text, emoji
            features, predicted label, confidence, and the per-class
            probability map.
        """
        # Preprocess
        preprocessed, emoji_features = self.preprocess_text(text)

        # Predict
        probs = self.predict_proba(preprocessed)[0]
        pred_idx = int(np.argmax(probs))

        return {
            "original_text": text,
            "preprocessed_text": preprocessed,
            "emoji_features": emoji_features,
            "predicted_label": self.class_names[pred_idx],
            "confidence": float(probs[pred_idx]),
            "probabilities": {label: float(prob) for label, prob in zip(self.class_names, probs)}
        }
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# ============================================================================
|
| 149 |
+
# UTILITY FUNCTIONS
|
| 150 |
+
# ============================================================================
|
| 151 |
+
|
| 152 |
+
def apply_nepali_font(ax, nepali_font: Optional[FontProperties] = None,
                      texts: Optional[list] = None, is_tick_labels: bool = True):
    """
    Apply a Nepali font to Devanagari text while leaving emojis untouched.

    The Nepali font is only applied to labels that contain Devanagari and no
    emoji characters (the font lacks emoji glyphs).

    Args:
        ax: Matplotlib axes
        nepali_font: Nepali font properties (no-op when None)
        texts: Text objects to apply the font to (if not tick labels)
        is_tick_labels: Whether to apply to the y-axis tick labels
    """
    if nepali_font is None:
        return

    def _wants_nepali_font(content: str) -> bool:
        # Devanagari present and no emoji anywhere in the label
        if any(ch in emoji.EMOJI_DATA for ch in content):
            return False
        return bool(regex.search(r'\p{Devanagari}', content))

    if is_tick_labels or texts is None:
        for label in ax.get_yticklabels():
            if _wants_nepali_font(label.get_text()):
                label.set_fontproperties(nepali_font)
                label.set_fontsize(11)
    else:
        for label in texts:
            if _wants_nepali_font(label.get_text()):
                label.set_fontproperties(nepali_font)
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def create_display_text_with_emojis(original_text: str, preprocessed_text: str) -> Tuple[List[str], List[str]]:
    """
    Create aligned display tokens preserving emojis.

    Walks the original and preprocessed token streams in parallel with two
    cursors, producing one display token (with the emoji kept) and one model
    token (with the Nepali translation) per original token. Handles
    multi-word emoji translations like: 😡 → "ठूलो रिस" (2 words) by
    consuming the corresponding number of preprocessed tokens.

    Args:
        original_text: Original text with emojis
        preprocessed_text: Preprocessed text (emojis replaced with Nepali)

    Returns:
        Tuple of (display_tokens, model_tokens), equal-length aligned lists
    """
    from scripts.transformer_data_preprocessing import EMOJI_TO_NEPALI

    original_tokens = original_text.split()
    preprocessed_tokens = preprocessed_text.split()

    # Build emoji to word count mapping (how many words each emoji becomes)
    emoji_word_counts = {}
    for emoji_char, nepali_text in EMOJI_TO_NEPALI.items():
        word_count = len(nepali_text.split())
        emoji_word_counts[emoji_char] = word_count

    display_tokens = []
    model_tokens = []

    # Two independent cursors: orig_idx over original tokens, proc_idx over
    # preprocessed tokens (they advance at different rates around emojis).
    orig_idx = 0
    proc_idx = 0

    while orig_idx < len(original_tokens):
        orig_token = original_tokens[orig_idx]

        # Check if token contains emoji
        has_emoji = any(c in emoji.EMOJI_DATA for c in orig_token)

        if has_emoji:
            # Display: keep original emoji
            display_tokens.append(orig_token)

            # Model: use Nepali translation (may be multiple words!)
            # Count how many emojis in this token
            emojis_in_token = [c for c in orig_token if c in emoji.EMOJI_DATA]

            if emojis_in_token:
                # Calculate total words needed for all emojis in this token
                # (unknown emojis default to a single word)
                total_words_needed = sum(
                    emoji_word_counts.get(e, 1) for e in emojis_in_token
                )

                # Collect that many preprocessed tokens
                nepali_words = []
                for _ in range(total_words_needed):
                    if proc_idx < len(preprocessed_tokens):
                        nepali_words.append(preprocessed_tokens[proc_idx])
                        proc_idx += 1

                # Join them as the model token
                if nepali_words:
                    model_tokens.append(' '.join(nepali_words))
                else:
                    model_tokens.append(orig_token)
            else:
                # Shouldn't happen (has_emoji implies a non-empty list),
                # but fall back to consuming a single preprocessed token
                if proc_idx < len(preprocessed_tokens):
                    model_tokens.append(preprocessed_tokens[proc_idx])
                    proc_idx += 1
                else:
                    model_tokens.append(orig_token)
        else:
            # No emoji: use preprocessed for both streams (the preprocessed
            # form is what the model actually sees)
            if proc_idx < len(preprocessed_tokens):
                display_tokens.append(preprocessed_tokens[proc_idx])
                model_tokens.append(preprocessed_tokens[proc_idx])
                proc_idx += 1
            else:
                # Preprocessed stream exhausted; fall back to the original
                display_tokens.append(orig_token)
                model_tokens.append(orig_token)

        orig_idx += 1

    # Handle remaining preprocessed tokens (e.g. tokens the preprocessor
    # inserted without an original-token counterpart)
    while proc_idx < len(preprocessed_tokens):
        token = preprocessed_tokens[proc_idx]
        display_tokens.append(token)
        model_tokens.append(token)
        proc_idx += 1

    return display_tokens, model_tokens
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
# ============================================================================
|
| 278 |
+
# LIME EXPLAINER
|
| 279 |
+
# ============================================================================
|
| 280 |
+
|
| 281 |
+
class LIMEExplainer:
    """LIME explainer with emoji support.

    Runs LimeTextExplainer on the preprocessed text, then maps the token
    weights back onto display tokens that keep the original emojis, and
    merges attributions for multi-word emoji translations.
    """

    def __init__(self, model_wrapper: ModelExplainerWrapper, nepali_font: Optional[FontProperties] = None):
        """
        Args:
            model_wrapper: ModelExplainerWrapper instance
            nepali_font: Nepali font properties for visualization

        Raises:
            ImportError: If the optional `lime` package is not installed.
        """
        if not LIME_AVAILABLE:
            raise ImportError("LIME not installed. Install with: pip install lime")

        self.model_wrapper = model_wrapper
        self.nepali_font = nepali_font
        # Fixed random_state keeps LIME's perturbation sampling reproducible
        self.explainer = LimeTextExplainer(
            class_names=model_wrapper.class_names,
            random_state=42
        )

    def explain(self, original_text: str, preprocessed_text: str, num_samples: int = 200) -> Dict:
        """
        Generate LIME explanation.

        Args:
            original_text: Original text with emojis
            preprocessed_text: Preprocessed text for model
            num_samples: Number of perturbation samples for LIME

        Returns:
            Dictionary with keys 'word_scores', 'display_tokens',
            'model_tokens', and the raw 'lime_explanation' object
        """
        # Get LIME explanation (for the top predicted class by default)
        exp = self.explainer.explain_instance(
            preprocessed_text,
            self.model_wrapper.predict_proba,
            num_samples=num_samples
        )

        # Get token weights as {token: weight}
        token_weights = dict(exp.as_list())

        # Create aligned tokens (display keeps emojis, model keeps Nepali)
        display_tokens, model_tokens = create_display_text_with_emojis(
            original_text, preprocessed_text
        )

        # Map weights to display tokens.
        # NOTE: matching is by bidirectional substring containment, so a
        # LIME token may contribute its weight to more than one display
        # token when words overlap.
        word_scores = []
        for display_tok, model_tok in zip(display_tokens, model_tokens):
            score = 0.0
            for lime_token, weight in token_weights.items():
                if lime_token in model_tok or model_tok in lime_token:
                    score += weight
            word_scores.append((display_tok, score))

        # Merge multi-word emoji attributions back into a single emoji entry
        word_scores = self._merge_multi_word_emojis(word_scores)

        return {
            'word_scores': word_scores,
            'display_tokens': display_tokens,
            'model_tokens': model_tokens,
            'lime_explanation': exp
        }

    def _merge_multi_word_emojis(self, word_scores: List[Tuple[str, float]]) -> List[Tuple[str, float]]:
        """
        Merge attributions for multi-word emoji translations like: ठूलो रिस → 😡

        Consecutive words that exactly match a known multi-word Nepali
        translation are collapsed into the originating emoji, with their
        scores summed.

        Args:
            word_scores: List of (word, score) tuples

        Returns:
            Merged list with multi-word emojis combined
        """
        from scripts.transformer_data_preprocessing import EMOJI_TO_NEPALI

        # Build set of multi-word phrases (phrase text → emoji character)
        multi_word_phrases = {}
        for emoji_char, nepali_text in EMOJI_TO_NEPALI.items():
            if ' ' in nepali_text:
                multi_word_phrases[nepali_text] = emoji_char

        # Merge consecutive words that form multi-word emoji phrases
        merged_scores = []
        i = 0
        while i < len(word_scores):
            word, score = word_scores[i]

            # Check if this word + next word(s) form a multi-word emoji phrase
            merged = False
            for phrase, emoji_char in multi_word_phrases.items():
                phrase_words = phrase.split()
                if i + len(phrase_words) <= len(word_scores):
                    # Check if consecutive words match the phrase
                    candidate_words = [word_scores[i + j][0] for j in range(len(phrase_words))]
                    candidate_phrase = ' '.join(candidate_words)

                    if candidate_phrase == phrase:
                        # Found a multi-word emoji phrase - merge their scores
                        total_score = sum(word_scores[i + j][1] for j in range(len(phrase_words)))

                        merged_scores.append((emoji_char, total_score))
                        i += len(phrase_words)  # Skip all words in the phrase
                        merged = True
                        break

            if not merged:
                merged_scores.append((word, score))
                i += 1

        return merged_scores

    def visualize(self, word_scores: List[Tuple[str, float]], save_path: Optional[str] = None,
                  show: bool = True, figsize: Tuple[int, int] = None):
        """
        Visualize LIME explanation as a horizontal bar chart.

        Args:
            word_scores: List of (word, score) tuples
            save_path: Path to save figure (PNG, 300 dpi)
            show: Whether to display figure
            figsize: Figure size (auto-scaled to word count if None)

        Returns:
            matplotlib figure, or None when there is nothing to plot
        """
        if not word_scores:
            print("⚠️ No words to visualize")
            return None

        # Replace underscores with spaces for display
        word_scores_display = [(w.replace('_', ' '), score) for w, score in word_scores]

        features, weights = zip(*word_scores_display)
        y_pos = range(len(features))

        if figsize is None:
            figsize = (10, max(6, len(features) * 0.4))

        fig, ax = plt.subplots(figsize=figsize)
        # Green bars push toward the predicted class, red bars push away
        colors = ['red' if w < 0 else 'green' for w in weights]
        ax.barh(y_pos, weights, color=colors, alpha=0.6)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(features, fontsize=12)
        ax.invert_yaxis()
        ax.set_xlabel("Contribution to Prediction", fontsize=12)
        ax.set_title("LIME Feature Importance (Red=Against, Green=For)", fontsize=14)
        ax.axvline(x=0, color='black', linestyle='--', linewidth=0.8)

        # Apply Nepali font to Devanagari tick labels
        apply_nepali_font(ax, self.nepali_font)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            print(f"✓ LIME visualization saved to: {save_path}")

        if show:
            plt.show()
        else:
            plt.close(fig)

        return fig

    def explain_and_visualize(self, original_text: str, preprocessed_text: str,
                              save_path: Optional[str] = None, show: bool = True,
                              num_samples: int = 200):
        """
        Explain and visualize in one step.

        Args:
            original_text: Original text with emojis
            preprocessed_text: Preprocessed text for model
            save_path: Path to save figure
            show: Whether to display figure
            num_samples: Number of LIME samples

        Returns:
            Dictionary with keys 'explanation' and 'figure'
        """
        # Generate explanation
        explanation = self.explain(original_text, preprocessed_text, num_samples)

        # Visualize
        fig = self.visualize(explanation['word_scores'], save_path, show)

        return {
            'explanation': explanation,
            'figure': fig
        }
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
# ============================================================================
|
| 476 |
+
# SHAP EXPLAINER
|
| 477 |
+
# ============================================================================
|
| 478 |
+
|
| 479 |
+
class SHAPExplainer:
    """SHAP explainer with emoji support and fallback methods"""

    def __init__(self, model_wrapper: ModelExplainerWrapper, nepali_font: Optional[FontProperties] = None):
        """
        Args:
            model_wrapper: ModelExplainerWrapper instance
            nepali_font: Nepali font properties for visualization

        Raises:
            ImportError: If the optional `shap` dependency is not installed.
        """
        if not SHAP_AVAILABLE:
            raise ImportError("SHAP not installed. Install with: pip install shap")

        self.model_wrapper = model_wrapper
        self.nepali_font = nepali_font

    def explain(self, original_text: str, preprocessed_text: str, use_fallback: bool = True) -> Dict:
        """
        Generate SHAP explanation

        Args:
            original_text: Original text with emojis
            preprocessed_text: Preprocessed text for model
            use_fallback: Use fallback method if SHAP fails

        Returns:
            Dictionary with explanation data (word_scores, tokens, values,
            predicted class index, and which method actually ran:
            "shap" or the occlusion "gradient" fallback)
        """
        try:
            # Try SHAP with text masker
            def predict_masked(masked_texts):
                # SHAP may hand back a 1-D/2-D ndarray of token strings, a
                # bare string, or a list — normalize all to List[str].
                if isinstance(masked_texts, np.ndarray):
                    if masked_texts.ndim == 1:
                        texts = [' '.join(str(t) for t in masked_texts if str(t).strip())]
                    else:
                        texts = [' '.join(str(t) for t in row if str(t).strip()) for row in masked_texts]
                elif isinstance(masked_texts, str):
                    texts = [masked_texts]
                elif isinstance(masked_texts, list):
                    texts = masked_texts
                else:
                    texts = [str(masked_texts)]

                return self.model_wrapper.predict_proba(texts)

            explainer = Explainer(predict_masked, maskers.Text(preprocessed_text))
            sv = explainer([preprocessed_text])[0]

            shap_tokens = list(sv.data)
            values_array = np.array(sv.values)

            # Validate that we got meaningful results
            if len(shap_tokens) == 0 or values_array.size == 0:
                raise ValueError("SHAP returned empty results")

            method_used = "shap"

        except Exception as e:
            if not use_fallback:
                raise e

            # Use fallback silently (only show in debug mode)
            import logging
            logging.debug(f"SHAP failed: {e}, using gradient fallback")

            shap_tokens, values_array = self._gradient_based_attribution(preprocessed_text)
            method_used = "gradient"

        # Get predicted class
        pred_probs = self.model_wrapper.predict_proba([preprocessed_text])[0]
        class_idx = int(np.argmax(pred_probs))

        # Extract values for predicted class.
        # NOTE(review): SHAP value shape varies with explainer/masker version
        # (tokens,), (tokens, classes) or (1, tokens, classes) — handle all.
        if values_array.ndim == 1:
            token_values = values_array
        elif values_array.ndim == 2:
            token_values = values_array[:, class_idx]
        elif values_array.ndim == 3:
            token_values = values_array[0, :, class_idx]
        else:
            token_values = values_array.flatten()[:len(shap_tokens)]

        # Create aligned tokens (display tokens keep emojis, model tokens
        # are what the model actually consumed)
        display_tokens, model_tokens = create_display_text_with_emojis(
            original_text, preprocessed_text
        )

        # Map SHAP values to display tokens
        word_scores = self._align_shap_values(
            display_tokens, model_tokens, shap_tokens, token_values
        )

        # Merge multi-word emoji attributions
        word_scores = self._merge_multi_word_emojis(word_scores)

        return {
            'word_scores': word_scores,
            'display_tokens': display_tokens,
            'model_tokens': model_tokens,
            'shap_tokens': shap_tokens,
            'token_values': token_values,
            'class_idx': class_idx,
            'method_used': method_used
        }

    def _gradient_based_attribution(self, text: str) -> Tuple[List[str], np.ndarray]:
        """
        Fallback: Word-level attribution using occlusion

        Masks each word and measures prediction change.
        (Despite the name, no gradients are computed — this is pure
        leave-one-word-out occlusion.)
        """
        words = text.split()
        base_probs = self.model_wrapper.predict_proba([text])[0]
        base_pred_idx = int(np.argmax(base_probs))
        base_score = base_probs[base_pred_idx]

        attributions = []
        for i in range(len(words)):
            # Mask the word
            masked_words = words[:i] + words[i+1:]
            masked_text = ' '.join(masked_words)

            if not masked_text.strip():
                # Removing the only word leaves nothing to score; credit it
                # with the full base score.
                attributions.append(base_score)
                continue

            # Get prediction without this word
            masked_probs = self.model_wrapper.predict_proba([masked_text])[0]
            masked_score = masked_probs[base_pred_idx]

            # Attribution = score drop when word removed
            attribution = base_score - masked_score
            attributions.append(attribution)

        # Ensure we have at least one attribution
        if len(attributions) == 0:
            attributions = [0.0] * len(words)

        return words, np.array(attributions)

    def _merge_multi_word_emojis(self, word_scores: List[Tuple[str, float]]) -> List[Tuple[str, float]]:
        """
        Merge attributions for multi-word emoji translations like: ठूलो रिस → 😡

        Args:
            word_scores: List of (word, score) tuples

        Returns:
            Merged list with multi-word emojis combined (scores summed)
        """
        from scripts.transformer_data_preprocessing import EMOJI_TO_NEPALI

        # Build set of multi-word phrases (reverse lookup: phrase → emoji)
        multi_word_phrases = {}
        for emoji_char, nepali_text in EMOJI_TO_NEPALI.items():
            if ' ' in nepali_text:
                multi_word_phrases[nepali_text] = emoji_char

        # Merge consecutive words that form multi-word emoji phrases
        merged_scores = []
        i = 0
        while i < len(word_scores):
            word, score = word_scores[i]

            # Check if this word + next word(s) form a multi-word emoji phrase
            merged = False
            for phrase, emoji_char in multi_word_phrases.items():
                phrase_words = phrase.split()
                if i + len(phrase_words) <= len(word_scores):
                    # Check if consecutive words match the phrase
                    candidate_words = [word_scores[i + j][0] for j in range(len(phrase_words))]
                    candidate_phrase = ' '.join(candidate_words)

                    if candidate_phrase == phrase:
                        # Found a multi-word emoji phrase - merge their scores
                        total_score = sum(word_scores[i + j][1] for j in range(len(phrase_words)))

                        merged_scores.append((emoji_char, total_score))
                        i += len(phrase_words)  # Skip all words in the phrase
                        merged = True
                        break

            if not merged:
                merged_scores.append((word, score))
                i += 1

        return merged_scores

    def _align_shap_values(self, display_tokens: List[str], model_tokens: List[str],
                           shap_tokens: List[str], token_values: np.ndarray) -> List[Tuple[str, float]]:
        """Align SHAP values with display tokens.

        Uses loose substring matching between SHAP tokens and model tokens;
        when the display/model token counts disagree, falls back to spreading
        the mean attribution over every display token.
        """
        word_scores = []

        if len(display_tokens) == len(model_tokens):
            # Direct alignment
            for display_tok, model_tok in zip(display_tokens, model_tokens):
                score = 0.0
                for j, shap_tok in enumerate(shap_tokens):
                    # Substring match in either direction handles subword
                    # pieces produced by the SHAP text masker.
                    if j < len(token_values) and (shap_tok in model_tok or model_tok in shap_tok):
                        score += float(token_values[j])
                word_scores.append((display_tok, score))
        else:
            # Fallback: distribute evenly
            for display_tok in display_tokens:
                score = np.mean(token_values) if len(token_values) > 0 else 0.0
                word_scores.append((display_tok, score))

        return word_scores

    def visualize(self, word_scores: List[Tuple[str, float]], class_name: str,
                  save_path: Optional[str] = None, show: bool = True,
                  figsize: Optional[Tuple[int, int]] = None):
        """
        Visualize SHAP explanation with highlighted text

        Args:
            word_scores: List of (word, score) tuples
            class_name: Predicted class name
            save_path: Path to save figure
            show: Whether to display figure
            figsize: Figure size (auto if None)

        Returns:
            matplotlib figure, or None when word_scores is empty
        """
        if not word_scores:
            print("⚠️ No words to visualize")
            return None

        # Safe max calculation with fallback
        abs_vals = [abs(v) for _, v in word_scores]
        if not abs_vals or all(v == 0 for v in abs_vals):
            max_val = 1.0  # Default to 1.0 if all values are zero
        else:
            max_val = max(abs_vals) + 1e-6

        if figsize is None:
            figsize = (max(10, 0.5 * len(word_scores)), 3)

        fig, ax = plt.subplots(figsize=figsize)
        ax.axis("off")

        # Text is laid out manually in axes coordinates, wrapping at x≈0.92.
        x, y = 0.01, 0.5
        text_objs = []

        for word, val in word_scores:
            # Replace underscores with spaces for display
            display_word = word.replace('_', ' ')

            # Color intensity
            intensity = min(abs(val) / max_val, 1.0)

            # Red=negative, Green=positive
            if val < 0:
                color = (1.0, 1.0 - intensity * 0.7, 1.0 - intensity * 0.7)
            else:
                color = (1.0 - intensity * 0.7, 1.0, 1.0 - intensity * 0.7)

            txt = ax.text(
                x, y, f" {display_word} ",
                fontsize=14,
                bbox=dict(
                    facecolor=color,
                    edgecolor='gray',
                    alpha=0.8,
                    boxstyle="round,pad=0.4"
                )
            )
            text_objs.append(txt)

            # Update position (emojis take less space)
            char_width = 0.025 if any(c in emoji.EMOJI_DATA for c in display_word) else 0.04
            x += char_width * len(display_word) + 0.01

            if x > 0.92:
                x = 0.01
                y -= 0.35

        # Apply Nepali font
        apply_nepali_font(ax, self.nepali_font, texts=text_objs, is_tick_labels=False)

        ax.text(0.5, 0.95, f"SHAP Explanation (Predicted: {class_name})",
                ha='center', va='top', fontsize=14, fontweight='bold',
                transform=ax.transAxes)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            print(f"✓ SHAP visualization saved to: {save_path}")

        if show:
            plt.show()
        else:
            plt.close(fig)

        return fig

    def explain_and_visualize(self, original_text: str, preprocessed_text: str,
                              save_path: Optional[str] = None, show: bool = True,
                              use_fallback: bool = True):
        """
        Explain and visualize in one step

        Args:
            original_text: Original text with emojis
            preprocessed_text: Preprocessed text for model
            save_path: Path to save figure
            show: Whether to display figure
            use_fallback: Use fallback if SHAP fails

        Returns:
            Dictionary with explanation and figure
        """
        # Generate explanation
        explanation = self.explain(original_text, preprocessed_text, use_fallback)

        # Get class name
        class_name = self.model_wrapper.class_names[explanation['class_idx']]

        # Visualize
        fig = self.visualize(explanation['word_scores'], class_name, save_path, show)

        return {
            'explanation': explanation,
            'figure': fig
        }
|
| 805 |
+
|
| 806 |
+
|
| 807 |
+
# ============================================================================
|
| 808 |
+
# CONVENIENCE FUNCTIONS
|
| 809 |
+
# ============================================================================
|
| 810 |
+
|
| 811 |
+
def create_explainer_wrapper(model, tokenizer, label_encoder, preprocessor, device=None):
    """
    Build a ModelExplainerWrapper around a trained model.

    Args:
        model: Trained model
        tokenizer: Model tokenizer
        label_encoder: Label encoder
        preprocessor: HateSpeechPreprocessor instance
        device: torch device (auto-detected when None)

    Returns:
        ModelExplainerWrapper instance
    """
    wrapper = ModelExplainerWrapper(model, tokenizer, label_encoder, preprocessor, device)
    return wrapper
|
| 826 |
+
|
| 827 |
+
|
| 828 |
+
def explain_prediction(text: str, model_wrapper: ModelExplainerWrapper,
                       method: str = "both", nepali_font: Optional[FontProperties] = None,
                       save_dir: Optional[str] = None, show: bool = True) -> Dict:
    """
    Explain a prediction using LIME and/or SHAP

    Args:
        text: Input text
        model_wrapper: ModelExplainerWrapper instance
        method: "lime", "shap", or "both"
        nepali_font: Nepali font for visualization
        save_dir: Directory to save figures
        show: Whether to display figures

    Returns:
        Dictionary with keys 'analysis', 'lime', 'shap'. The 'lime'/'shap'
        entries stay None when that method is skipped or its library is
        unavailable.
    """
    import hashlib

    # FIX: the builtin hash() is salted per interpreter run (PYTHONHASHSEED),
    # so figure filenames were non-reproducible across runs. Use a stable
    # digest of the text instead, keeping the same 8-digit filename shape.
    text_id = int(hashlib.sha1(text.encode('utf-8')).hexdigest(), 16) % 10**8

    # Get analysis (original + preprocessed form of the input)
    analysis = model_wrapper.predict_with_analysis(text)
    original_text = analysis['original_text']
    preprocessed_text = analysis['preprocessed_text']

    results = {
        'analysis': analysis,
        'lime': None,
        'shap': None
    }

    # LIME
    if method in ["lime", "both"] and LIME_AVAILABLE:
        lime = LIMEExplainer(model_wrapper, nepali_font)
        save_path = os.path.join(save_dir, f"lime_{text_id}.png") if save_dir else None
        results['lime'] = lime.explain_and_visualize(
            original_text, preprocessed_text, save_path, show
        )

    # SHAP
    if method in ["shap", "both"] and SHAP_AVAILABLE:
        shap_exp = SHAPExplainer(model_wrapper, nepali_font)
        save_path = os.path.join(save_dir, f"shap_{text_id}.png") if save_dir else None
        results['shap'] = shap_exp.explain_and_visualize(
            original_text, preprocessed_text, save_path, show
        )

    return results
|
| 873 |
+
|
| 874 |
+
|
| 875 |
+
# ============================================================================
|
| 876 |
+
# AVAILABILITY CHECK
|
| 877 |
+
# ============================================================================
|
| 878 |
+
|
| 879 |
+
def check_availability() -> Dict[str, bool]:
    """Report which optional explainability backends can be used."""
    availability = {}
    availability['lime'] = LIME_AVAILABLE
    availability['shap'] = SHAP_AVAILABLE
    return availability
|
scripts/transformer_data_preprocessing.py
ADDED
|
@@ -0,0 +1,786 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Transformer Data Preprocessing Module
|
| 3 |
+
======================================
|
| 4 |
+
Enhanced preprocessing pipeline for Nepali hate speech classification.
|
| 5 |
+
|
| 6 |
+
This module provides:
|
| 7 |
+
- Script detection (Devanagari/Romanized/English/Mixed)
|
| 8 |
+
- Transliteration (Romanized → Devanagari)
|
| 9 |
+
- Translation (English → Nepali)
|
| 10 |
+
- Emoji semantic mapping with feature extraction
|
| 11 |
+
- Text normalization
|
| 12 |
+
|
| 13 |
+
Usage:
|
| 14 |
+
------
|
| 15 |
+
from scripts.transformer_data_preprocessing import HateSpeechPreprocessor
|
| 16 |
+
|
| 17 |
+
# Initialize preprocessor
|
| 18 |
+
preprocessor = HateSpeechPreprocessor(
|
| 19 |
+
model_type="xlmr",
|
| 20 |
+
translate_english=True
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
# Preprocess single text
|
| 24 |
+
processed_text, emoji_features = preprocessor.preprocess("Your text here")
|
| 25 |
+
|
| 26 |
+
# Preprocess batch
|
| 27 |
+
texts_list = ["text1", "text2", "text3"]
|
| 28 |
+
processed_texts, features_list = preprocessor.preprocess_batch(texts_list)
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
import re
|
| 32 |
+
import emoji
|
| 33 |
+
import regex
|
| 34 |
+
from typing import Any, Literal, Optional, Tuple, Dict, List
|
| 35 |
+
from deep_translator import GoogleTranslator
|
| 36 |
+
from functools import lru_cache
|
| 37 |
+
import logging
|
| 38 |
+
|
| 39 |
+
# Setup logging
|
| 40 |
+
logging.basicConfig(level=logging.WARNING)
|
| 41 |
+
logger = logging.getLogger(__name__)
|
| 42 |
+
|
| 43 |
+
# Try to import transliteration (optional)
|
| 44 |
+
try:
|
| 45 |
+
from indic_transliteration import sanscript
|
| 46 |
+
from indic_transliteration.sanscript import transliterate
|
| 47 |
+
TRANSLITERATION_AVAILABLE = True
|
| 48 |
+
except ImportError:
|
| 49 |
+
TRANSLITERATION_AVAILABLE = False
|
| 50 |
+
logger.warning("indic_transliteration not available. Transliteration disabled.")
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# ============================================================================
|
| 54 |
+
# COMPREHENSIVE EMOJI MAPPINGS
|
| 55 |
+
# ============================================================================
|
| 56 |
+
|
| 57 |
+
EMOJI_TO_NEPALI = {
    # Positive emotions
    '😂': 'हाँसो', '🤣': 'ठूलो_हाँसो', '😀': 'खुशी', '😁': 'खुशी', '😃': 'खुशी',
    '😄': 'खुशी', '😅': 'नर्भस_हाँसो', '😆': 'हाँसो', '😊': 'मुस्कान', '☺️': 'मुस्कान',
    '😉': 'आँखा_झिम्काउने', '🙂': 'मुस्कान', '🙃': 'उल्टो_मुस्कान', '😌': 'शान्त',
    '😍': 'माया', '🥰': 'माया', '😘': 'चुम्बन', '😗': 'चुम्बन', '😙': 'चुम्बन', '😚': 'चुम्बन',
    '🤗': 'अँगालो', '🤩': 'चकित', '🥳': 'उत्सव', '🤤': 'लालसा',

    # Mockery & Sarcasm
    '😏': 'व्यंग्य', '😜': 'जिब्रो_देखाउने', '😝': 'जिब्रो_देखाउने', '😛': 'जिब्रो',
    '🙄': 'आँखा_घुमाउने', '😤': 'निराश', '😑': 'अभिव्यक्तिहीन', '😐': 'तटस्थ',
    '😬': 'तनाव', '🤨': 'शंकास्पद', '🤫': 'चुपचाप', '🤭': 'हात_छोप्ने',
    '🤥': 'झूठ', '😶': 'मौन',

    # Anger & Hate
    '😠': 'रिस', '😡': 'ठूलो_रिस', '🤬': 'गाली', '😈': 'खराब', '👿': 'खराब',
    '💢': 'क्रोध', '🔪': 'हिंसा', '💣': 'हिंसा', '🗡️': 'तरवार', '⚔️': 'युद्ध',
    '💥': 'विस्फोट', '🔫': 'बन्दुक', '🧨': 'विस्फोटक', '☠️': 'मृत्यु', '💀': 'खोपडी',
    '👹': 'राक्षस', '👺': 'दानव', '🤡': 'जोकर', '🖤': 'कालो_मन',
    '😾': 'रिसाएको', '👊': 'मुक्का', '✊': 'मुक्का',

    # Offensive Gestures (skin-tone variants listed explicitly)
    '🖕': 'अपमान', '👎': 'नकारात्मक', '👎🏻': 'नकारात्मक', '👎🏼': 'नकारात्मक',
    '👎🏽': 'नकारात्मक', '👎🏾': 'नकारात्मक', '👎🏿': 'नकारात्मक',
    '🖕🏻': 'अपमान', '🖕🏼': 'अपमान', '🖕🏽': 'अपमान', '🖕🏾': 'अपमान', '🖕🏿': 'अपमान',

    # Sadness
    '😭': 'रुवाइ', '😢': 'रुवाइ', '😿': 'रुवाइ', '😔': 'उदास', '😞': 'उदास',
    '😒': 'उदास', '😓': 'चिन्तित', '😟': 'चिन्तित', '😕': 'अलमलिएको',
    '🙁': 'तल्लो_मुख', '☹️': 'दुःखी', '😩': 'थकित', '😫': 'थकित',
    '😖': 'भ्रमित', '😣': 'अडिग', '😥': 'निराश', '🥺': 'बिन्ती',

    # Fear & Shock
    '😨': 'डर', '😰': 'चिन्तित_पसिना', '😱': 'चिच्याउने', '😳': 'लजाउने',
    '🤯': 'मन_उडेको', '😵': 'चक्कर', '😲': 'चकित', '😯': 'छक्क',

    # Disgust
    '🤢': 'बान्ता', '🤮': 'बान्ता', '🤧': 'हाच्छ्यूँ', '😷': 'बिरामी',
    '🤒': 'ज्वरो', '🤕': 'घाइते', '🥴': 'मात्तिएको', '😪': 'निद्रा',

    # Positive Gestures
    '👍': 'सकारात्मक', '👍🏻': 'सकारात्मक', '👍🏼': 'सकारात्मक',
    '👍🏽': 'सकारात्मक', '👍🏾': 'सकारात्मक', '👍🏿': 'सकारात्मक',
    '👏': 'तालि', '🙌': 'उत्सव', '👌': 'ठीक_छ', '🤝': 'हात_मिलाउनु',
    '🙏': 'नमस्कार', '🤲': 'प्रार्थना', '💪': 'शक्ति', '✌️': 'शान्ति',

    # Hearts
    '❤️': 'माया', '🧡': 'माया', '💛': 'माया', '💚': 'माया', '💙': 'माया',
    '💜': 'माया', '🤍': 'सेतो_मन', '🤎': 'खैरो_मन', '❣️': 'माया',
    '💕': 'माया', '💞': 'माया', '💓': 'माया', '💗': 'माया',
    '💖': 'माया', '💘': 'माया', '💝': 'माया', '💔': 'टुटेको_मन',

    # Symbols
    # FIX: '💯' was mapped here to 'पूर्ण' AND again under
    # "Miscellaneous Common" to 'सय'. The first entry was silently
    # shadowed by the later one, so the dead entry has been removed
    # (runtime mapping is unchanged: '💯' → 'सय').
    '🔥': 'आगो', '💨': 'हावा', '💫': 'चमक',
    '⭐': 'तारा', '✨': 'चमक', '🌟': 'चम्किलो_तारा',
    '🚫': 'निषेध', '⛔': 'प्रवेश_निषेध', '❌': 'रद्द', '❎': 'गलत',

    # People
    '👫': 'जोडी', '👬': 'पुरुष_जोडी', '👭': 'महिला_जोडी', '👨\u200d👩\u200d👧\u200d👦': 'परिवार',
    '👶': 'बच्चा', '👦': 'केटा', '👧': 'केटी', '👨': 'पुरुष', '👩': 'महिला',
    '👴': 'बूढो', '👵': 'बूढी', '🧒': 'बालक', '👱': 'गोरो', '🧔': 'दाह्री',

    # Country
    '🇳🇵': 'नेपाल', '🇮🇳': 'भारत', '🇵🇰': 'पाकिस्तान', '🇧🇩': 'बंगलादेश',
    '🇨🇳': 'चीन', '🇺🇸': 'अमेरिका', '🏴': 'झण्डा',

    # Animals
    '🐕': 'कुकुर', '🐖': 'सुँगुर', '🐀': 'मुसा', '🐍': 'सर्प', '🦂': 'बिच्छी',
    '🐒': 'बाँदर', '🐵': 'बाँदर_अनुहार', '🦍': 'गोरिल्ला', '🐗': 'जङ्गली_सुँगुर',

    # Other
    '🤔': 'सोच', '🧐': 'अनुसन्धान', '😴': 'सुत्ने', '💩': 'मल',
    '👻': 'भूत', '🤖': 'रोबोट', '👽': 'विदेशी', '🎭': 'मुखौटा',

    # === EXPANDED COMMON EMOJIS ===

    # Celebrations & Party
    '🎉': 'उत्सव', '🎊': 'पार्टी', '🎈': 'बेलुन', '🎁': 'उपहार',
    '🎂': 'केक', '🍰': 'मिठाई', '🥂': 'चश्मा', '🍾': 'शराब',

    # Food & Drink (common in casual/hate contexts)
    '🍕': 'पिज्जा', '🍔': 'बर्गर', '🍗': 'चिकन', '🍖': 'मासु',
    '🍺': 'बियर', '🍻': 'पार्टी', '☕': 'चिया', '🍵': 'चिया',
    '🍜': 'नूडल', '🍛': 'करी', '🍲': 'खाना', '🥘': 'परिकार',

    # Sports & Activities
    '⚽': 'फुटबल', '🏏': 'क्रिकेट', '🏀': 'बास्केटबल', '🎮': 'खेल',
    '🏆': 'ट्रफी', '🥇': 'स्वर्ण', '🥈': 'रजत', '🥉': 'कांस्य',

    # Weather & Nature
    '☀️': 'घाम', '🌙': 'चन्द्रमा', '🌧️': 'पानी', '⛈️': 'आँधी',
    '❄️': 'हिउँ', '🌈': 'इन्द्रेणी', '⚡': 'बिजुली', '🌪️': 'बतास',

    # Technology & Modern
    '📱': 'मोबाइल', '💻': 'कम्प्युटर', '📷': 'क्यामेरा', '🎥': 'भिडियो',
    '🖥️': 'कम्प्युटर', '⌨️': 'किबोर्ड', '🖱️': 'माउस', '📡': 'एन्टेना',

    # Time & Clock
    '⏰': 'घडी', '⏳': 'समय', '⌛': 'बालुवा_घडी', '🕐': 'एक_बजे',

    # Objects
    '📚': 'किताब', '📖': 'खुल्ला_किताब', '✏️': 'पेन्सिल', '📝': 'लेख',
    '🎤': 'माइक', '🎧': 'हेडफोन', '📢': 'घोषणा', '📣': 'चिल्लाउने',

    # Miscellaneous Common
    '✅': 'ठीक', '☑️': 'जाँच', '💯': 'सय', '🆗': 'ठीक',
    '🆕': 'नयाँ', '🆓': 'मुक्त', '🔴': 'रातो', '🟢': 'हरियो',
}

# Emoji categories for feature extraction
HATE_RELATED_EMOJIS = {
    '😠', '😡', '🤬', '😈', '👿', '💢', '👊', '✊',
    '🔪', '💣', '🗡️', '⚔️', '💥', '🔫', '🧨', '☠️', '💀',
    '🖕', '🖕🏻', '🖕🏼', '🖕🏽', '🖕🏾', '🖕🏿',
    '👎', '👎🏻', '👎🏼', '👎🏽', '👎🏾', '👎🏿',
    '👹', '👺', '🤡', '🖤', '💔',
    '🐕', '🐖', '🐀', '🐍', '🦂', '🐒', '🐵', '🦍', '🐗',
    '💩', '😾',
}

MOCKERY_EMOJIS = {
    '😏', '😜', '😝', '😛', '🙄', '😤', '🙃',
    '😑', '😐', '😬', '🤨', '🤫', '🤭', '🤥',
    '🤡', '👻', '🎭',
}

POSITIVE_EMOJIS = {
    '😊', '😀', '😁', '😃', '😄', '☺️', '🙂', '😌', '🥰', '😍',
    '❤️', '🧡', '💛', '💚', '💙', '💜', '🤍', '🤎',
    '💕', '💞', '💓', '💗', '💖', '💘', '💝', '❣️',
    '👍', '👍🏻', '👍🏼', '👍🏽', '👍🏾', '👍🏿',
    '🙏', '👏', '🙌', '👌', '🤝', '✌️',
    '🥳', '🎉', '🎊', '⭐', '✨', '🌟',
}

SADNESS_EMOJIS = {
    '😭', '😢', '😿', '😔', '😞', '😒', '😓', '😟', '😕',
    '🙁', '☹️', '😩', '😫', '😖', '😣', '😥', '🥺',
}

FEAR_EMOJIS = {
    '😨', '😰', '😱', '😳', '🤯', '😵', '😲', '😯',
}

DISGUST_EMOJIS = {
    '🤢', '🤮', '🤧', '😷', '🤒', '🤕', '🥴',
}


# ============================================================================
# NORMALIZATION MAPPINGS
# ============================================================================

DIRGHIKARAN_MAP = {
    "\u200d": "",  # Zero-width joiner
    "\u200c": "",  # Zero-width non-joiner
    "।": ".",      # Devanagari danda
    "॥": ".",      # Double danda
}


# ============================================================================
# TYPE DEFINITIONS
# ============================================================================

ScriptType = Literal["devanagari", "romanized_nepali", "english", "mixed", "other"]
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
# ============================================================================
|
| 226 |
+
# EMOJI FEATURE EXTRACTION
|
| 227 |
+
# ============================================================================
|
| 228 |
+
|
| 229 |
+
def extract_emoji_features(text: str) -> Dict[str, int]:
    """
    Extract comprehensive emoji-based semantic features.

    Returns 18 features:
    - 6 binary flags (has_X_emoji)
    - 6 count features (X_emoji_count)
    - 6 derived features (total, ratio, mixed_sentiment, unknown tracking)
    """
    found = [ch for ch in text if ch in emoji.EMOJI_DATA]

    # Tally each semantic category with a single table instead of six
    # hand-written sum() lines; order matters for the output dict.
    category_table = (
        ('hate', HATE_RELATED_EMOJIS),
        ('mockery', MOCKERY_EMOJIS),
        ('positive', POSITIVE_EMOJIS),
        ('sadness', SADNESS_EMOJIS),
        ('fear', FEAR_EMOJIS),
        ('disgust', DISGUST_EMOJIS),
    )
    counts = {name: sum(1 for e in found if e in members)
              for name, members in category_table}

    # Emojis present in the text but absent from our translation mapping.
    unknown = [e for e in found if e not in EMOJI_TO_NEPALI]

    features = {}
    # Binary flags
    for name, _ in category_table:
        features[f'has_{name}_emoji'] = 1 if counts[name] > 0 else 0
    # Count features
    for name, _ in category_table:
        features[f'{name}_emoji_count'] = counts[name]
    features['total_emoji_count'] = len(found)
    # Derived features
    features['hate_to_positive_ratio'] = counts['hate'] / max(counts['positive'], 1)
    features['has_mixed_sentiment'] = 1 if (counts['hate'] > 0 and counts['positive'] > 0) else 0
    # Unknown emoji tracking
    features['unknown_emoji_count'] = len(unknown)
    features['has_unknown_emoji'] = 1 if unknown else 0
    features['known_emoji_ratio'] = (len(found) - len(unknown)) / max(len(found), 1)

    return features
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def remove_emojis_for_detection(text: str) -> str:
    """Strip all emojis from *text* so script detection sees letters only."""
    stripped = emoji.replace_emoji(text, replace="")
    return stripped
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
# ============================================================================
|
| 287 |
+
# SCRIPT DETECTION
|
| 288 |
+
# ============================================================================
|
| 289 |
+
|
| 290 |
+
def detect_script_type(text: str) -> Tuple[ScriptType, dict]:
    """
    Detect the dominant script type of *text*, ignoring emojis.

    Combines character-class ratios (Devanagari vs Latin letters) with
    keyword heuristics for Romanized Nepali and English.

    Args:
        text: Input text; emojis are stripped before counting letters.

    Returns:
        Tuple of (script_type, detection_details). The details dict carries
        counts, ratios, a heuristic confidence, and a short reason code.
    """
    if not text or not text.strip():
        return "other", {"confidence": 0.0, "reason": "empty_text"}

    # Remove emojis first so they don't skew letter counts
    text_no_emoji = remove_emojis_for_detection(text)

    if not text_no_emoji.strip():
        return "other", {"confidence": 0.5, "reason": "emoji_only"}

    letters = regex.findall(r"\p{L}", text_no_emoji)
    letter_count = len(letters)

    if letter_count == 0:
        return "other", {"confidence": 0.0, "reason": "no_letters"}

    devanagari_chars = regex.findall(r"\p{Devanagari}", text_no_emoji)
    dev_count = len(devanagari_chars)
    dev_ratio = dev_count / letter_count

    latin_chars = regex.findall(r"[a-zA-Z]", text_no_emoji)
    latin_count = len(latin_chars)
    latin_ratio = latin_count / letter_count

    # Romanized Nepali patterns.
    # NOTE(review): a few patterns appear twice (e.g. r'\bko\b', r'\bma\b',
    # r'\bxu\b'), so matching words are double-counted in
    # romanized_indicators. Kept as-is because the scoring thresholds below
    # were presumably tuned against this behavior — confirm before deduping.
    romanized_nepali_patterns = [
        # Common words
        r'\b[xX]u\b', r'\b[xX]um?\b', r'\bhajur\b', r'\bdai\b', r'\bbhai\b', r'\bdidi\b',
        r'\bbahini\b', r'\bsanghai\b', r'\bsunu\b', r'\bhera\b', r'\bsun\b',

        # Particles & Postpositions
        r'\bko\b', r'\bki\b', r'\bka\b', r'\bho\b', r'\btyo\b', r'\byo\b', r'\bta\b',
        r'\bma\b', r'\bma?i\b', r'\bla[ie]?\b', r'\bnai?\b', r'\bpani\b', r'\bni\b',

        # Verbs
        r'\bhun[ae]\b', r'\bhunchha\b', r'\bhunuhunchha\b', r'\bgar\w+\b', r'\bgarna\b',
        r'\bx[ao]\b', r'\bxa\b', r'\bxan\b', r'\bxaina\b', r'\bxu\b',
        r'\bchain\b', r'\bchaina\b', r'\bthiy[oe]\b', r'\bhola\b', r'\bhos\b',
        r'\bbhan\w*\b', r'\bbol\w*\b', r'\bher\w*\b',

        # Common adjectives/states
        r'\bkh[ou]s[hi]?\b', r'\bkhusi\b', r'\bkhushi\b', r'\bramro\b', r'\bnaramro\b',
        r'\bthulo\b', r'\bsano\b', r'\brasilo\b', r'\bmitho\b', r'\btikhi\b',
        r'\bdherei\b', r'\baliali\b', r'\bastai\b', r'\blastai\b',

        # Question words
        r'\bkina\b', r'\bkasari\b', r'\bkahile\b', r'\bkaha[n]?\b', r'\bke\b', r'\bko\b',

        # Pronouns
        r'\bma\b', r'\btimi\b', r'\btapai\b', r'\buha\b', r'\buni\b', r'\byini\b',
        r'\bmero\b', r'\btimro\b', r'\buhako\b', r'\buniko\b', r'\bhamro\b',

        # Common nouns
        r'\bmanxe\b', r'\bmanchhe\b', r'\bmanche\b', r'\bharu\b', r'\bdes[ha]?\b',
        r'\bgha?r\b', r'\bthau\b', r'\bsamay\b', r'\bbela\b',

        # Nepali-specific endings (transliterated)
        r'\w+[ae]ko\b', r'\w+[ae]ki\b', r'\w+dai\b', r'\w+lai\b',
        r'\w+ma\b', r'\w+xa\b', r'\w+hun[ae]\b', r'\w+thiyo\b',
    ]

    romanized_indicators = sum(1 for pattern in romanized_nepali_patterns
                               if re.search(pattern, text_no_emoji, re.IGNORECASE))

    # Calculate Romanized Nepali score
    romanized_score = 0.0
    if latin_ratio > 0.5 and dev_ratio < 0.3:
        if romanized_indicators > 0:
            romanized_score = min(0.5 + (romanized_indicators * 0.15), 0.95)
        else:
            # Fall back to typical Romanized Nepali word-ending shapes
            romanized_patterns = re.findall(r'\b\w*[aeiou](?:h)?\b', text_no_emoji.lower())
            if any(word.endswith(('xu', 'ro', 'no', 'lo', 'ko', 'ho'))
                   for word in romanized_patterns):
                romanized_score = 0.4
            else:
                romanized_score = 0.3

    # English indicator vocabulary.
    # FIX: was a plain list (with duplicate entries), giving an O(n) scan per
    # word below; a frozenset has identical membership semantics at O(1).
    english_indicators = frozenset({
        # Articles & Determiners
        'the', 'a', 'an', 'this', 'that', 'these', 'those', 'some', 'any', 'all', 'every',

        # Pronouns
        'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
        'my', 'your', 'his', 'its', 'our', 'their', 'mine', 'yours', 'ours', 'theirs',
        'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'themselves',

        # Common verbs (be, have, do)
        'is', 'am', 'are', 'was', 'were', 'be', 'been', 'being',
        'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'done',
        'will', 'would', 'shall', 'should', 'can', 'could', 'may', 'might', 'must',

        # Common verbs (action)
        'get', 'got', 'go', 'went', 'gone', 'make', 'made', 'take', 'took', 'taken',
        'come', 'came', 'see', 'saw', 'seen', 'know', 'knew', 'known', 'say', 'said',
        'tell', 'told', 'think', 'thought', 'give', 'gave', 'given', 'find', 'found',

        # Question words
        'what', 'which', 'who', 'whom', 'whose', 'when', 'where', 'why', 'how',

        # Prepositions
        'in', 'on', 'at', 'to', 'for', 'of', 'with', 'from', 'by', 'about', 'as',
        'into', 'through', 'over', 'under', 'after', 'before', 'between', 'among',

        # Conjunctions
        'and', 'or', 'but', 'so', 'yet', 'nor', 'because', 'if', 'while',
        'although', 'though', 'unless', 'since', 'until', 'whether',

        # Negations
        'not', 'no', 'never', 'none', 'nothing', 'nobody', 'nowhere', 'neither',

        # Common adjectives
        'good', 'bad', 'great', 'big', 'small', 'long', 'short', 'high', 'low',
        'old', 'new', 'young', 'early', 'late', 'right', 'wrong', 'true', 'false',
        'hot', 'cold', 'happy', 'sad', 'angry', 'nice', 'beautiful', 'ugly',

        # Sentiment words (hate speech relevant)
        'hate', 'love', 'like', 'dislike', 'stupid', 'dumb', 'idiot', 'fool',
        'kill', 'die', 'dead', 'death', 'fuck', 'shit', 'ass', 'damn', 'hell',
        'worst', 'terrible', 'horrible', 'awful', 'disgusting', 'pathetic',

        # Common nouns
        'man', 'woman', 'people', 'person', 'thing', 'time', 'day', 'year',
        'way', 'work', 'life', 'world', 'country', 'place', 'home', 'hand',

        # Very & Adverbs
        'very', 'really', 'quite', 'too', 'just', 'only', 'even', 'also',
        'well', 'much', 'more', 'most', 'less', 'least', 'still', 'already',
    })
    english_words = [w.lower() for w in re.findall(r'\b\w+\b', text_no_emoji)]
    english_count = sum(1 for w in english_words if w in english_indicators)
    english_ratio = english_count / len(english_words) if english_words else 0

    # Detection details returned to the caller for display/debugging
    details = {
        "devanagari_count": dev_count,
        "devanagari_ratio": dev_ratio,
        "latin_count": latin_count,
        "latin_ratio": latin_ratio,
        "romanized_indicators": romanized_indicators,
        "english_ratio": english_ratio,
        "letter_count": letter_count
    }

    # Decision logic — ordered from strongest to weakest signal
    if dev_ratio >= 0.8:
        return "devanagari", {**details, "confidence": dev_ratio, "reason": "dominant_devanagari"}

    elif dev_ratio >= 0.4:
        return "mixed", {**details, "confidence": 0.7, "reason": "mixed_with_devanagari"}

    elif romanized_score > 0.5 and dev_ratio < 0.2:
        return "romanized_nepali", {**details, "confidence": romanized_score, "reason": "romanized_nepali_detected"}

    elif english_ratio > 0.2 and romanized_score < 0.4:
        return "english", {**details, "confidence": min(english_ratio + 0.3, 0.9), "reason": "english_detected"}

    elif latin_ratio > 0.5 and romanized_score > 0.3:
        return "romanized_nepali", {**details, "confidence": romanized_score, "reason": "likely_romanized_nepali"}

    elif latin_ratio > 0.8:
        if english_ratio > 0.1:
            return "english", {**details, "confidence": 0.6, "reason": "likely_english"}
        else:
            return "romanized_nepali", {**details, "confidence": 0.5, "reason": "ambiguous_latin_script"}

    else:
        return "other", {**details, "confidence": 0.3, "reason": "insufficient_indicators"}
|
| 465 |
+
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
# ============================================================================
|
| 469 |
+
# TEXT PROCESSING FUNCTIONS
|
| 470 |
+
# ============================================================================
|
| 471 |
+
|
| 472 |
+
def clean_text_basic(text: str) -> str:
    """
    Basic text cleaning: strip URLs, @mentions, '#' marks, and quotes.

    Args:
        text: Raw input string.

    Returns:
        Cleaned string with collapsed whitespace.
    """
    # Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)
    # Remove @mentions
    text = re.sub(r"@\w+", "", text)
    # Remove hashtag symbol but keep the tag text
    text = re.sub(r"#(\w+)", r"\1", text)
    # Remove straight and smart quotes in one pass.
    # FIX: the original chained replace() calls were garbled (mojibake turned
    # the smart-quote literals into duplicated/invalid arguments); the escapes
    # below cover " ' \u2018 \u2019 \u201C \u201D unambiguously.
    text = re.sub(r"[\"'\u2018\u2019\u201C\u201D]", "", text)
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
def normalize_devanagari(text: str) -> str:
    """Apply Devanagari-specific normalization (dirghikaran substitutions)."""
    for source_char, target_char in DIRGHIKARAN_MAP.items():
        text = text.replace(source_char, target_char)
    return text
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
def replace_emojis_semantic(text: str, preserve_spacing: bool = True,
                            preserve_unknown: bool = True) -> str:
    """
    Replace known emojis with their Nepali text equivalents.

    Args:
        text: Input text with emojis
        preserve_spacing: Surround each Nepali replacement with spaces
        preserve_unknown: Keep emojis absent from EMOJI_TO_NEPALI (default: True);
            when False, unmapped emojis are replaced by a space

    Returns:
        Text with known emojis replaced (unknown ones preserved or stripped)
    """
    # Pick the replacement template once instead of branching per emoji
    template = " {} " if preserve_spacing else "{}"
    for emoji_char, nepali_text in EMOJI_TO_NEPALI.items():
        text = text.replace(emoji_char, template.format(nepali_text))

    if not preserve_unknown:
        # Legacy behavior: strip every emoji that has no mapping
        text = emoji.replace_emoji(text, replace=" ")
    # Otherwise unknown emojis are left intact for the model to consume

    return text
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
def transliterate_romanized_nepali(text: str) -> str:
    """Transliterate Romanized Nepali (ITRANS) to Devanagari; pass input through on failure."""
    if not TRANSLITERATION_AVAILABLE:
        return text

    try:
        converted = transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
    except Exception as e:
        logger.warning(f"Transliteration failed: {e}")
        return text
    # Fall back to the original text when the library returns an empty result
    return converted if converted else text
|
| 536 |
+
|
| 537 |
+
|
| 538 |
+
# ============================================================================
|
| 539 |
+
# CACHED TRANSLATOR
|
| 540 |
+
# ============================================================================
|
| 541 |
+
|
| 542 |
+
class CachedNepaliTranslator:
    """English-to-Nepali translator wrapping GoogleTranslator with a per-instance LRU cache."""

    def __init__(self, cache_size: int = 2000):
        self.translator = GoogleTranslator(source='en', target='ne')
        self.cache_size = cache_size
        # Memoize single-string translations; bound per instance on purpose
        self._translate_cached = lru_cache(maxsize=cache_size)(self._translate_single)

    def _translate_single(self, text: str) -> str:
        """Translate one string via the API; on failure, log and return the input unchanged."""
        if not text or not text.strip():
            return ""
        try:
            translated = self.translator.translate(text.strip())
        except Exception as e:
            logger.error(f"Translation failed: {str(e)}")
            return text
        return translated if translated else text

    def translate(self, text: str, fallback_to_original: bool = True) -> str:
        """Translate *text* with caching; optionally fall back to the input on error."""
        if not text or not text.strip():
            return ""
        try:
            return self._translate_cached(text.strip())
        except Exception as e:
            if not fallback_to_original:
                raise
            logger.warning(f"Translation failed, using original: {str(e)}")
            return text

    def get_cache_info(self) -> dict:
        """Get cache statistics"""
        info = self._translate_cached.cache_info()
        attempts = info.hits + info.misses
        return {
            'hits': info.hits,
            'misses': info.misses,
            'size': info.currsize,
            'max_size': info.maxsize,
            'hit_rate': info.hits / attempts if attempts > 0 else 0.0,
        }
|
| 582 |
+
|
| 583 |
+
|
| 584 |
+
def translate_latin_spans(text: str, translator: CachedNepaliTranslator) -> str:
    """Translate each run of Latin letters (3+ chars, may include whitespace) embedded in *text*."""
    def _substitute(match):
        latin_span = match.group(0)
        rendered = translator.translate(latin_span, fallback_to_original=True)
        return f" {rendered} "

    return re.sub(r"[A-Za-z][A-Za-z\s]{2,}", _substitute, text)
|
| 592 |
+
|
| 593 |
+
|
| 594 |
+
# ============================================================================
|
| 595 |
+
# MAIN PREPROCESSOR CLASS
|
| 596 |
+
# ============================================================================
|
| 597 |
+
|
| 598 |
+
class HateSpeechPreprocessor:
    """
    Main preprocessing pipeline for Nepali hate speech classification

    Pipeline:
        1. Extract emoji features (before any processing)
        2. Detect script type (ignoring emojis)
        3. Apply script-specific processing
        4. Replace emojis with Nepali text
        5. Normalize Devanagari
    """

    def __init__(
        self,
        model_type: Literal["xlmr", "mbert", "nepalibert"] = "xlmr",
        translate_english: bool = True,
        cache_size: int = 2000
    ):
        # Target model family (kept for downstream, model-specific handling)
        self.model_type = model_type
        self.translate_english = translate_english
        # Translator is only created when English translation is enabled
        self.translator = CachedNepaliTranslator(cache_size) if translate_english else None

    @staticmethod
    def _empty_emoji_features() -> Dict[str, int]:
        """
        Zero-valued feature dict whose keys match extract_emoji_features.

        FIX: the empty-input path previously omitted 'unknown_emoji_count',
        'has_unknown_emoji' and 'known_emoji_ratio', so empty and non-empty
        texts produced feature dicts with different schemas (which breaks
        tabular downstream use, e.g. building a DataFrame of features).
        """
        return {
            'has_hate_emoji': 0, 'has_mockery_emoji': 0, 'has_positive_emoji': 0,
            'has_sadness_emoji': 0, 'has_fear_emoji': 0, 'has_disgust_emoji': 0,
            'hate_emoji_count': 0, 'mockery_emoji_count': 0, 'positive_emoji_count': 0,
            'sadness_emoji_count': 0, 'fear_emoji_count': 0, 'disgust_emoji_count': 0,
            'total_emoji_count': 0, 'hate_to_positive_ratio': 0.0, 'has_mixed_sentiment': 0,
            'unknown_emoji_count': 0, 'has_unknown_emoji': 0, 'known_emoji_ratio': 0.0,
        }

    def preprocess(self, text: str, verbose: bool = False) -> Tuple[str, Dict[str, int]]:
        """
        Preprocess a single text

        Args:
            text: Input text
            verbose: Print processing steps

        Returns:
            Tuple of (preprocessed_text, emoji_features)
        """
        if not isinstance(text, str) or not text.strip():
            # Empty/non-string input: empty text plus a full zeroed feature dict
            return "", self._empty_emoji_features()

        original_text = text

        # Step 1: Extract emoji features (from the raw text, before cleaning)
        emoji_features = extract_emoji_features(original_text)

        # Step 2: Detect script type
        script_type, details = detect_script_type(text)

        if verbose:
            print(f"Script detected: {script_type} (confidence: {details.get('confidence', 0):.2%})")

        # Step 3: Basic cleaning (URLs, mentions, hashtags, quotes)
        text = clean_text_basic(text)

        # Step 4: Script-specific processing
        if script_type == "devanagari":
            processed = text
            if self.translate_english and self.translator:
                # Translate any embedded Latin spans into Nepali
                processed = translate_latin_spans(processed, self.translator)

        elif script_type == "romanized_nepali":
            processed = transliterate_romanized_nepali(text)

        elif script_type == "english":
            if self.translate_english and self.translator:
                processed = self.translator.translate(text, fallback_to_original=True)
            else:
                processed = text

        elif script_type == "mixed":
            # Transliterate first, then translate whatever Latin text remains
            processed = transliterate_romanized_nepali(text)
            if self.translate_english and self.translator:
                processed = translate_latin_spans(processed, self.translator)
        else:
            processed = text

        # Step 5: Replace emojis with Nepali text
        processed = replace_emojis_semantic(processed)

        # Step 6: Normalize Devanagari and collapse whitespace
        final = normalize_devanagari(processed)
        final = re.sub(r"\s+", " ", final).strip()

        if verbose:
            print(f"Original: {original_text}")
            print(f"Processed: {final}")
            print(f"Emoji features: {emoji_features}")

        return final, emoji_features

    def preprocess_batch(self, texts: List[str], verbose: bool = False, show_progress: bool = False) -> Tuple[List[str], List[Dict[str, int]]]:
        """
        Preprocess multiple texts

        Args:
            texts: List of input texts
            verbose: Print processing steps for each text
            show_progress: Show progress bar (requires tqdm)

        Returns:
            Tuple of (preprocessed_texts, emoji_features_list)
        """
        if show_progress:
            try:
                from tqdm import tqdm
                results = [self.preprocess(text, verbose=verbose) for text in tqdm(texts, desc="Preprocessing")]
            except ImportError:
                # tqdm not installed: process silently instead of failing
                results = [self.preprocess(text, verbose=verbose) for text in texts]
        else:
            results = [self.preprocess(text, verbose=verbose) for text in texts]

        texts_processed = [r[0] for r in results]
        features = [r[1] for r in results]
        return texts_processed, features

    def get_stats(self) -> dict:
        """Get preprocessor statistics (configuration plus translator cache info)"""
        stats = {
            'model_type': self.model_type,
            'translation_enabled': self.translate_english,
            'transliteration_available': TRANSLITERATION_AVAILABLE,
        }
        if self.translator:
            stats['cache_info'] = self.translator.get_cache_info()
        return stats
|
| 725 |
+
|
| 726 |
+
|
| 727 |
+
# ============================================================================
|
| 728 |
+
# CONVENIENCE FUNCTIONS FOR STREAMLIT
|
| 729 |
+
# ============================================================================
|
| 730 |
+
|
| 731 |
+
def preprocess_text(
    text: str,
    model_type: str = "xlmr",
    translate_english: bool = True,
    verbose: bool = False
) -> Tuple[str, Dict[str, int]]:
    """
    Quick preprocessing function for a single text (Streamlit-friendly).

    Note: builds a fresh HateSpeechPreprocessor on every call; reuse an
    instance directly when processing many texts.

    Args:
        text: Input text
        model_type: Model type (xlmr, mbert, nepalibert)
        translate_english: Whether to translate English
        verbose: Print processing steps

    Returns:
        Tuple of (preprocessed_text, emoji_features)
    """
    pipeline = HateSpeechPreprocessor(
        model_type=model_type,
        translate_english=translate_english,
    )
    return pipeline.preprocess(text, verbose=verbose)
|
| 754 |
+
|
| 755 |
+
|
| 756 |
+
def get_script_info(text: str) -> Dict[str, any]:
    """
    Get detailed script detection info (useful for Streamlit display).

    Returns:
        Dictionary with script type, confidence, and the raw detection details.
    """
    detected_type, meta = detect_script_type(text)
    return {
        'script_type': detected_type,
        'confidence': meta.get('confidence', 0),
        'details': meta,
    }
|
| 769 |
+
|
| 770 |
+
|
| 771 |
+
def get_emoji_info(text: str) -> Dict[str, Any]:
    """Summarize the emojis in *text*: totals, which are mapped to Nepali, and coverage."""
    found = [ch for ch in text if ch in emoji.EMOJI_DATA]
    mapped = set(EMOJI_TO_NEPALI.keys())
    known = [e for e in found if e in mapped]
    unknown = [e for e in found if e not in mapped]

    return {
        'emojis_found': found,
        'total_count': len(found),
        'known_emojis': known,
        'known_count': len(known),
        'unknown_emojis': unknown,
        'unknown_count': len(unknown),
        # Fraction of found emojis we can translate; 1.0 when there are none
        'coverage': len(known) / len(found) if found else 1.0,
    }
|
src/streamlit_app.py
CHANGED
|
@@ -1,40 +1,1426 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
|
|
|
|
| 6 |
"""
|
| 7 |
-
# Welcome to Streamlit!
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
In the meantime, below is an example of what you can do with just a few lines of code:
|
| 14 |
-
"""
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
indices = np.linspace(0, 1, num_points)
|
| 20 |
-
theta = 2 * np.pi * num_turns * indices
|
| 21 |
-
radius = indices
|
| 22 |
-
|
| 23 |
-
x = radius * np.cos(theta)
|
| 24 |
-
y = radius * np.sin(theta)
|
| 25 |
-
|
| 26 |
-
df = pd.DataFrame({
|
| 27 |
-
"x": x,
|
| 28 |
-
"y": y,
|
| 29 |
-
"idx": indices,
|
| 30 |
-
"rand": np.random.randn(num_points),
|
| 31 |
-
})
|
| 32 |
-
|
| 33 |
-
st.altair_chart(alt.Chart(df, height=700, width=700)
|
| 34 |
-
.mark_point(filled=True)
|
| 35 |
-
.encode(
|
| 36 |
-
x=alt.X("x", axis=None),
|
| 37 |
-
y=alt.Y("y", axis=None),
|
| 38 |
-
color=alt.Color("idx", legend=None, scale=alt.Scale()),
|
| 39 |
-
size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
|
| 40 |
-
))
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Nepali Hate Speech Detection - Streamlit Application
|
| 3 |
+
=====================================================
|
| 4 |
+
Complete application with preprocessing, prediction, and explainability (LIME/SHAP/Captum)
|
| 5 |
|
| 6 |
+
Run with: streamlit run main_app.py
|
| 7 |
"""
|
|
|
|
| 8 |
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
import streamlit as st
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import numpy as np
|
| 14 |
+
import torch
|
| 15 |
+
import plotly.graph_objects as go
|
| 16 |
+
import plotly.express as px
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
import json
|
| 19 |
+
import warnings
|
| 20 |
+
warnings.filterwarnings('ignore')
|
| 21 |
+
|
| 22 |
+
# Matplotlib for Nepali font support
|
| 23 |
+
import matplotlib.pyplot as plt
|
| 24 |
+
from matplotlib.font_manager import FontProperties, fontManager
|
| 25 |
+
|
| 26 |
+
# ============================================================================
# HF SPACES COMPATIBILITY — paths and environment
# ============================================================================

# Detect if running on HF Spaces (the platform sets SPACE_ID in the env).
IS_HF_SPACES = bool(os.environ.get('SPACE_ID'))

# Use /tmp for writable storage on HF Spaces (the app dir is read-only there),
# local 'data/' otherwise.
DATA_DIR = '/tmp/data' if IS_HF_SPACES else 'data'
os.makedirs(DATA_DIR, exist_ok=True)
# JSON file where per-prediction records are appended (see save_prediction_to_history).
HISTORY_FILE = os.path.join(DATA_DIR, 'prediction_history.json')

# ============================================================================
# SCRIPT PATH SETUP
# ============================================================================

# Anchor paths to this file so imports work regardless of the working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SCRIPTS_DIR = os.path.join(BASE_DIR, 'scripts')
if SCRIPTS_DIR not in sys.path:
    # Both entries are added so `scripts.foo` and bare `foo` imports resolve.
    sys.path.insert(0, BASE_DIR)
    sys.path.insert(0, SCRIPTS_DIR)

# ============================================================================
# CUSTOM MODULE IMPORTS
# ============================================================================

# Import project modules defensively: if any are missing, the app still starts
# and downstream features check CUSTOM_MODULES_AVAILABLE before using them.
try:
    from scripts.transformer_data_preprocessing import (
        HateSpeechPreprocessor,
        preprocess_text,
        get_script_info,
        get_emoji_info,
        EMOJI_TO_NEPALI
    )
    from scripts.explainability import (
        create_explainer_wrapper,
        LIMEExplainer,
        SHAPExplainer,
        check_availability as check_explainability
    )
    from scripts.captum_explainer import (
        CaptumExplainer,
        check_availability as check_captum_availability
    )
    CUSTOM_MODULES_AVAILABLE = True
except ImportError as e:
    st.error(f"⚠️ Custom modules not found: {e}")
    CUSTOM_MODULES_AVAILABLE = False
|
| 74 |
+
|
| 75 |
+
# ============================================================================
|
| 76 |
+
# PAGE CONFIGURATION
|
| 77 |
+
# ============================================================================
|
| 78 |
+
|
| 79 |
+
# Page metadata/layout. Streamlit requires set_page_config to be the first
# st.* command executed in the script.
st.set_page_config(
    page_title="Nepali Hate Content Detector",
    page_icon="🛡️",
    layout="wide",
    initial_sidebar_state="expanded"
)
|
| 85 |
+
|
| 86 |
+
# ============================================================================
|
| 87 |
+
# CUSTOM CSS
|
| 88 |
+
# ============================================================================
|
| 89 |
+
|
| 90 |
+
st.markdown("""
|
| 91 |
+
<style>
|
| 92 |
+
/* Main header */
|
| 93 |
+
.main-header {
|
| 94 |
+
font-size: 2.8rem;
|
| 95 |
+
font-weight: 700;
|
| 96 |
+
color: #1f77b4;
|
| 97 |
+
text-align: center;
|
| 98 |
+
margin-bottom: 0.5rem;
|
| 99 |
+
text-shadow: 2px 2px 4px rgba(0,0,0,0.1);
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
.sub-header {
|
| 103 |
+
text-align: center;
|
| 104 |
+
color: #666;
|
| 105 |
+
font-size: 1.1rem;
|
| 106 |
+
margin-bottom: 2rem;
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
/* Prediction boxes */
|
| 110 |
+
.prediction-box {
|
| 111 |
+
padding: 1.5rem;
|
| 112 |
+
border-radius: 15px;
|
| 113 |
+
margin: 1rem 0;
|
| 114 |
+
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
|
| 115 |
+
color: white;
|
| 116 |
+
font-weight: 600;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
.no-box { background: linear-gradient(135deg, #28a745 0%, #1e7e34 100%); }
|
| 120 |
+
.oo-box { background: linear-gradient(135deg, #ffc107 0%, #e0a800 100%); }
|
| 121 |
+
.or-box { background: linear-gradient(135deg, #dc3545 0%, #a71d2a 100%); }
|
| 122 |
+
.os-box { background: linear-gradient(135deg, #6f42c1 0%, #4a1f9e 100%); }
|
| 123 |
+
|
| 124 |
+
/* Info boxes */
|
| 125 |
+
.info-box {
|
| 126 |
+
padding: 1rem;
|
| 127 |
+
border-radius: 10px;
|
| 128 |
+
background: #f8f9fa;
|
| 129 |
+
border-left: 4px solid #007bff;
|
| 130 |
+
margin: 1rem 0;
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
/* Metrics */
|
| 134 |
+
.metric-card {
|
| 135 |
+
background: white;
|
| 136 |
+
padding: 1rem;
|
| 137 |
+
border-radius: 10px;
|
| 138 |
+
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
| 139 |
+
text-align: center;
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
/* Buttons */
|
| 143 |
+
.stButton>button {
|
| 144 |
+
border-radius: 8px;
|
| 145 |
+
font-weight: 600;
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
/* Expander */
|
| 149 |
+
.streamlit-expanderHeader {
|
| 150 |
+
font-weight: 600;
|
| 151 |
+
font-size: 1.1rem;
|
| 152 |
+
}
|
| 153 |
+
</style>
|
| 154 |
+
""", unsafe_allow_html=True)
|
| 155 |
+
|
| 156 |
+
# ============================================================================
|
| 157 |
+
# NEPALI FONT LOADING
|
| 158 |
+
# ============================================================================
|
| 159 |
+
|
| 160 |
+
@st.cache_resource
def load_nepali_font():
    """Locate and register a Devanagari-capable font for matplotlib.

    Candidate paths are tried in priority order; the first one that exists
    and loads cleanly is registered with matplotlib's font manager and
    returned as a ``FontProperties`` object for use in plot labels.

    Returns:
        FontProperties for the first usable font, or None when no candidate
        is found (a Streamlit warning is shown in that case).
    """
    font_paths = [
        # HF Spaces Docker image: the Dockerfile copies Kalimati.ttf here
        # and runs fc-cache — this path was missing from the original list.
        '/usr/local/share/fonts/nepali/Kalimati.ttf',
        # HF Spaces / Linux (installed via apt fonts-noto in Dockerfile)
        '/usr/share/fonts/truetype/noto/NotoSansDevanagari-Regular.ttf',
        '/usr/share/fonts/truetype/noto/NotoSans-Regular.ttf',
        # Local development — anchor to this file so the lookup does not
        # depend on the current working directory; keep the bare relative
        # path as a fallback for backward compatibility.
        os.path.join(BASE_DIR, 'fonts', 'Kalimati.ttf'),
        'fonts/Kalimati.ttf',
        # macOS
        '/System/Library/Fonts/Supplemental/DevanagariSangamMN.ttc',
        # Windows
        'C:\\Windows\\Fonts\\NirmalaUI.ttf',
    ]

    for font_path in font_paths:
        if os.path.exists(font_path):
            try:
                # Register with the global manager so the font also resolves
                # by family name, not just via the returned FontProperties.
                fontManager.addfont(font_path)
                return FontProperties(fname=font_path)
            except Exception:
                # Corrupt or unreadable font file — try the next candidate.
                continue

    st.warning("⚠️ Nepali font not found. Devanagari text may display as squares. "
               "Place Kalimati.ttf in 'fonts/' directory for proper display.")
    return None
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
# ============================================================================
|
| 190 |
+
# SESSION STATE INITIALIZATION
|
| 191 |
+
# ============================================================================
|
| 192 |
+
|
| 193 |
+
# Create every session-state key up front so later code can read/write them
# unconditionally (keys persist per browser session across reruns).
if 'last_prediction' not in st.session_state:
    # Result dict of the most recent single-text prediction.
    st.session_state.last_prediction = None
if 'last_text' not in st.session_state:
    # Raw input text of the most recent single-text prediction.
    st.session_state.last_text = ""
if 'batch_results' not in st.session_state:
    # Results of the last batch run — presumably a DataFrame set by the
    # batch tab; verify against the (unseen) caller.
    st.session_state.batch_results = None
if 'batch_mode' not in st.session_state:
    # How the last batch was supplied (CSV upload vs. text area) — TODO confirm.
    st.session_state.batch_mode = None
if 'csv_text_column' not in st.session_state:
    # Name of the text column chosen from an uploaded CSV.
    st.session_state.csv_text_column = None
if 'explainability_results' not in st.session_state:
    # Cached output of the last explanation run.
    st.session_state.explainability_results = None
if 'preprocessor' not in st.session_state:
    # HateSpeechPreprocessor instance, created lazily.
    st.session_state.preprocessor = None
if 'model_wrapper' not in st.session_state:
    # Explainer wrapper built lazily via create_explainer_wrapper (see
    # render_batch_explainability).
    st.session_state.model_wrapper = None
if 'nepali_font' not in st.session_state:
    # FontProperties from load_nepali_font(), or None when unavailable.
    st.session_state.nepali_font = None
if 'session_predictions' not in st.session_state:
    # Total number of predictions made in this browser session.
    st.session_state.session_predictions = 0
if 'session_class_counts' not in st.session_state:
    # Per-class tallies of predictions made in this browser session.
    st.session_state.session_class_counts = {'NO': 0, 'OO': 0, 'OR': 0, 'OS': 0}
|
| 215 |
+
|
| 216 |
+
# ============================================================================
|
| 217 |
+
# MODEL LOADING
|
| 218 |
+
# ============================================================================
|
| 219 |
+
|
| 220 |
+
@st.cache_resource(show_spinner="Loading model... this may take a minute on first run.")
def load_model_and_preprocessor():
    """Load the classification model, tokenizer, label encoder, and preprocessor.

    Resolution order:
      1. Local checkpoint — only when not on HF Spaces and the path exists.
      2. HuggingFace Hub — also the fallback when the local load fails.

    A default LabelEncoder fitted on ['NO', 'OO', 'OR', 'OS'] is used whenever
    a pickled encoder cannot be loaded from disk or the Hub.

    Returns:
        Tuple (model, tokenizer, label_encoder, preprocessor, device);
        preprocessor is None when the custom modules failed to import.
    """
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    import joblib

    hf_model_id = "UDHOV/xlm-roberta-large-nepali-hate-classification"
    local_model_path = 'models/saved_models/xlm_roberta_results/large_final'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Default label encoder used as a fallback when none can be loaded.
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(['NO', 'OO', 'OR', 'OS'])

    def _load_from_hub():
        """Load model/tokenizer from the Hub.

        Returns (model, tokenizer, encoder) where encoder is the unpickled
        label encoder or None when it could not be fetched (keeps default).
        Factored out because the original duplicated this block verbatim in
        both the fallback and the direct-Hub branches.
        """
        tok = AutoTokenizer.from_pretrained(hf_model_id)
        mdl = AutoModelForSequenceClassification.from_pretrained(hf_model_id)
        mdl.to(device).eval()

        encoder = None
        try:
            from huggingface_hub import hf_hub_download
            le_file = hf_hub_download(repo_id=hf_model_id, filename="label_encoder.pkl")
            encoder = joblib.load(le_file)
        except Exception:
            pass  # Use default label encoder

        st.success(f"✅ Model loaded from HuggingFace Hub on {device}")
        return mdl, tok, encoder

    # Try local model first (only relevant for local dev), then HF Hub.
    if not IS_HF_SPACES and os.path.exists(local_model_path):
        try:
            tokenizer = AutoTokenizer.from_pretrained(local_model_path)
            model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
            model.to(device).eval()

            le_path = os.path.join(local_model_path, 'label_encoder.pkl')
            if os.path.exists(le_path):
                le = joblib.load(le_path)

            st.success(f"✅ Model loaded from local path on {device}")
        except Exception as e:
            st.warning(f"⚠️ Local model failed: {e}. Falling back to HuggingFace Hub...")
            model, tokenizer, hub_le = _load_from_hub()
            if hub_le is not None:
                le = hub_le
    else:
        # HF Spaces or local path not found — load directly from the Hub.
        model, tokenizer, hub_le = _load_from_hub()
        if hub_le is not None:
            le = hub_le

    # Preprocessor handles script detection, transliteration, translation and
    # emoji mapping before tokenization.
    if CUSTOM_MODULES_AVAILABLE:
        preprocessor = HateSpeechPreprocessor(
            model_type="xlmr",
            translate_english=True,
            cache_size=2000
        )
    else:
        preprocessor = None

    return model, tokenizer, le, preprocessor, device
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
# ============================================================================
|
| 291 |
+
# PREDICTION FUNCTIONS
|
| 292 |
+
# ============================================================================
|
| 293 |
+
|
| 294 |
+
def predict_text(text, model, tokenizer, label_encoder, preprocessor, max_length=256):
    """Preprocess *text* and classify it with the fine-tuned model.

    Args:
        text: Raw input string.
        model: Sequence-classification model (used on its own device).
        tokenizer: HuggingFace tokenizer matching the model.
        label_encoder: Fitted encoder exposing ``classes_``.
        preprocessor: Optional HateSpeechPreprocessor; when None the raw
            text is tokenized as-is.
        max_length: Tokenizer truncation/padding length.

    Returns:
        Dict with prediction, confidence, per-class probabilities, the
        preprocessed text, and emoji features. When preprocessing yields an
        empty string, a zero-confidence 'NO' result with an 'error' key is
        returned instead of running the model.
    """
    device = next(model.parameters()).device

    # Normalization step (script detection, transliteration, emoji mapping).
    if preprocessor:
        preprocessed, emoji_features = preprocessor.preprocess(text, verbose=False)
    else:
        preprocessed, emoji_features = text, {}

    # Guard: nothing left to classify after preprocessing.
    if not preprocessed.strip():
        return {
            'prediction': 'NO',
            'confidence': 0.0,
            'probabilities': {cls: 0.0 for cls in label_encoder.classes_},
            'preprocessed_text': '',
            'emoji_features': emoji_features,
            'error': 'Empty text after preprocessing',
        }

    # Encode to fixed-length tensors on the model's device.
    encoded = tokenizer(
        preprocessed,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # Forward pass without gradients; softmax over the single batch row.
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
        class_probs = torch.softmax(logits, dim=-1)[0]

    scores = class_probs.cpu().numpy()
    best = np.argmax(scores)

    return {
        'prediction': label_encoder.classes_[best],
        'confidence': float(scores[best]),
        'probabilities': {
            cls: float(p) for cls, p in zip(label_encoder.classes_, scores)
        },
        'preprocessed_text': preprocessed,
        'emoji_features': emoji_features,
    }
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
# ============================================================================
|
| 350 |
+
# VISUALIZATION FUNCTIONS
|
| 351 |
+
# ============================================================================
|
| 352 |
+
|
| 353 |
+
def plot_probabilities(probabilities):
    """Render the per-class probability distribution as a Plotly bar chart.

    Args:
        probabilities: Mapping of class label -> probability in [0, 1].

    Returns:
        plotly.graph_objects.Figure with one colored bar per class.
    """
    # Fixed color per known label; unknown labels fall back to neutral grey.
    class_colors = {
        'NO': '#28a745',
        'OO': '#ffc107',
        'OR': '#dc3545',
        'OS': '#6f42c1'
    }

    labels = list(probabilities)
    values = [probabilities[lbl] for lbl in labels]

    bars = go.Bar(
        x=labels,
        y=values,
        marker_color=[class_colors.get(lbl, '#6c757d') for lbl in labels],
        text=[f'{v:.2%}' for v in values],
        textposition='outside',
        hovertemplate='%{x}<br>Probability: %{y:.4f}<extra></extra>'
    )

    fig = go.Figure(data=[bars])
    fig.update_layout(
        title="Class Probabilities",
        xaxis_title="Class",
        yaxis_title="Probability",
        yaxis_range=[0, 1.1],  # headroom so the outside text labels fit
        height=400,
        showlegend=False,
        template='plotly_white'
    )

    return fig
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def get_label_description(label):
    """Map a predicted class label to a short human-readable explanation.

    Args:
        label: One of 'NO', 'OO', 'OR', 'OS' (anything else is tolerated).

    Returns:
        Description string, or 'Unknown category' for unrecognized labels.
    """
    label_info = {
        'NO': '✅ Non-Offensive: The text does not contain hate speech or offensive content.',
        'OO': '⚠️ Other-Offensive: Contains general offensive language but not targeted hate.',
        'OR': '🚫 Offensive-Racist: Contains hate speech targeting race, ethnicity, or religion.',
        'OS': '🚫 Offensive-Sexist: Contains hate speech targeting gender or sexuality.'
    }
    try:
        return label_info[label]
    except KeyError:
        return 'Unknown category'
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
# ============================================================================
|
| 402 |
+
# HISTORY MANAGEMENT
|
| 403 |
+
# ============================================================================
|
| 404 |
+
|
| 405 |
+
def save_prediction_to_history(text, result, feedback=None):
    """Append one prediction record to the JSON history file.

    Args:
        text: Original input text.
        result: Prediction dict as returned by predict_text().
        feedback: Optional user feedback to store with the record.

    Returns:
        True when the record was written, False on write failure (the error
        is surfaced via st.error rather than raised).
    """
    entry = {
        'timestamp': datetime.now().isoformat(),
        'text': text,
        'prediction': result.get('prediction'),
        'confidence': result.get('confidence'),
        'probabilities': result.get('probabilities'),
        'preprocessed_text': result.get('preprocessed_text'),
        'emoji_features': result.get('emoji_features', {}),
        'feedback': feedback
    }

    # Load existing history; an unreadable file is treated as empty.
    history = []
    if os.path.exists(HISTORY_FILE):
        try:
            with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
                history = json.load(f)
        except Exception:
            history = []

    # Fix: valid JSON that is not a list (e.g. a dict from a corrupt or
    # foreign file) would previously crash on .append — start fresh instead.
    if not isinstance(history, list):
        history = []

    # Append and rewrite the whole file.
    history.append(entry)

    try:
        with open(HISTORY_FILE, 'w', encoding='utf-8') as f:
            json.dump(history, f, ensure_ascii=False, indent=2)
        return True
    except Exception as e:
        st.error(f"Failed to save history: {e}")
        return False
|
| 437 |
+
|
| 438 |
+
|
| 439 |
+
# ============================================================================
|
| 440 |
+
# BATCH EXPLAINABILITY HELPER
|
| 441 |
+
# ============================================================================
|
| 442 |
+
|
| 443 |
+
def render_batch_explainability(results_df, text_column, model, tokenizer, label_encoder,
                                preprocessor, nepali_font, explainability_available,
                                captum_available, mode_key="batch"):
    """Render an expander that explains one selected row of a batch result.

    Args:
        results_df: DataFrame of batch predictions; must contain *text_column*
            and a 'Prediction' column.
        text_column: Name of the column holding the original texts.
        model, tokenizer, label_encoder, preprocessor: Loaded inference stack
            (see load_model_and_preprocessor).
        nepali_font: FontProperties for Devanagari rendering, or None.
        explainability_available: Dict with boolean 'lime' and 'shap' keys.
        captum_available: Whether the Captum explainer can be used.
        mode_key: Prefix for Streamlit widget keys so multiple instances of
            this UI on one page do not collide.
    """
    # Bail out early when the project modules or all methods are missing.
    if not CUSTOM_MODULES_AVAILABLE:
        st.warning("⚠️ Explainability not available.")
        return

    if not (explainability_available['lime'] or explainability_available['shap'] or captum_available):
        st.warning("⚠️ No explainability methods available.")
        return

    with st.expander("💡 Explain Individual Results", expanded=False):
        st.markdown("**Select a text from the batch to explain:**")

        # Truncated previews keep the selectbox readable for long texts.
        text_options = [f"Row {idx}: {str(row[text_column])[:50]}..." for idx, row in results_df.iterrows()]
        selected_idx = st.selectbox(
            "Choose text:",
            range(len(text_options)),
            format_func=lambda x: text_options[x],
            key=f"{mode_key}_select"
        )

        # NOTE(review): selectbox returns a positional index, so .iloc is
        # used here even though the preview labels show the DataFrame index.
        selected_text = str(results_df.iloc[selected_idx][text_column])
        selected_pred = results_df.iloc[selected_idx]['Prediction']

        st.write(f"**Selected:** {selected_text}")
        st.write(f"**Prediction:** {selected_pred}")

        # Offer only the methods whose backing libraries imported cleanly.
        available_methods = []
        if explainability_available['lime']:
            available_methods.append("LIME")
        if explainability_available['shap']:
            available_methods.append("SHAP")
        if captum_available:
            available_methods.append("Captum (IG)")

        if not available_methods:
            st.warning("⚠️ No explainability methods available.")
            return

        explain_method = st.selectbox(
            "Explanation method:",
            available_methods,
            key=f"{mode_key}_method"
        )

        if st.button("🔍 Generate Explanation", key=f"{mode_key}_explain_btn"):
            with st.spinner("Generating explanation..."):
                try:
                    # The wrapper is expensive to build, so it is cached in
                    # session state and shared across reruns.
                    if st.session_state.model_wrapper is None:
                        st.session_state.model_wrapper = create_explainer_wrapper(
                            model, tokenizer, label_encoder, preprocessor
                        )

                    wrapper = st.session_state.model_wrapper
                    # Strip straight and curly quotes — presumably they break
                    # the explainers' tokenization; confirm against the
                    # explainability module.
                    clean_selected = selected_text.replace('"', '').replace("'", '').replace('\u201c', '').replace('\u201d', '')
                    # NOTE(review): preprocessed/emoji_features are unused —
                    # the analysis dict below is what feeds the explainers.
                    preprocessed, emoji_features = preprocessor.preprocess(clean_selected)
                    analysis = wrapper.predict_with_analysis(clean_selected)

                    if explain_method == "LIME":
                        lime_exp = LIMEExplainer(wrapper, nepali_font=nepali_font)
                        # num_samples=200 trades fidelity for interactive speed.
                        result = lime_exp.explain_and_visualize(
                            analysis['original_text'],
                            analysis['preprocessed_text'],
                            save_path=None,
                            show=False,
                            num_samples=200
                        )
                        st.subheader("LIME Explanation")
                        st.pyplot(result['figure'])
                        st.markdown("---")
                        st.markdown("**📊 Feature Importance Details:**")
                        word_scores = result['explanation']['word_scores']
                        if word_scores:
                            df = pd.DataFrame(word_scores, columns=['Word', 'Score'])
                            df = df.sort_values('Score', ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)
                        else:
                            st.warning("No word scores available")

                    elif explain_method == "SHAP":
                        shap_exp = SHAPExplainer(wrapper, nepali_font=nepali_font)
                        # use_fallback lets the explainer pick an alternative
                        # backend when exact SHAP values are unavailable.
                        result = shap_exp.explain_and_visualize(
                            analysis['original_text'],
                            analysis['preprocessed_text'],
                            save_path=None,
                            show=False,
                            use_fallback=True
                        )
                        st.subheader("SHAP Explanation")
                        st.pyplot(result['figure'])
                        st.markdown("---")
                        st.markdown("**📊 Attribution Details:**")
                        st.write(f"**Method used:** {result['explanation']['method_used']}")
                        word_scores = result['explanation']['word_scores']
                        if word_scores:
                            df = pd.DataFrame(word_scores, columns=['Word', 'Score'])
                            # Sort by magnitude — SHAP scores are signed.
                            df = df.sort_values('Score', key=lambda x: abs(x), ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)
                        else:
                            st.warning("No word scores available")

                    elif explain_method == "Captum (IG)":
                        # Captum works on the raw model, not the wrapper.
                        captum_exp = CaptumExplainer(
                            model, tokenizer, label_encoder, preprocessor,
                            emoji_to_nepali_map=EMOJI_TO_NEPALI
                        )
                        # target=None: explain the predicted class;
                        # n_steps=50: integration steps for IG.
                        result = captum_exp.explain_and_visualize(
                            analysis['original_text'],
                            target=None,
                            n_steps=50,
                            save_dir=None,
                            show=False,
                            nepali_font=nepali_font
                        )
                        st.subheader("Captum Integrated Gradients")
                        col1, col2 = st.columns(2)
                        with col1:
                            st.markdown("**Bar Chart**")
                            st.pyplot(result['bar_chart'])
                        with col2:
                            st.markdown("**Heatmap**")
                            st.pyplot(result['heatmap'])
                        st.markdown("---")
                        st.markdown("**📊 Attribution Details:**")
                        # Small delta indicates the IG approximation converged.
                        st.write(f"**Convergence Delta:** {result['explanation']['convergence_delta']:.6f}")
                        word_attrs = result['explanation']['word_attributions']
                        if word_attrs:
                            df = pd.DataFrame(word_attrs, columns=['Word', 'Abs Score', 'Signed Score'])
                            df = df.sort_values('Abs Score', ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)
                        else:
                            st.warning("No word attributions available")

                except Exception as e:
                    # Surface the full traceback in the UI for debugging.
                    st.error(f"❌ Explanation failed: {str(e)}")
                    st.markdown("**🐛 Error Details:**")
                    import traceback
                    st.code(traceback.format_exc())
|
| 583 |
+
|
| 584 |
+
|
| 585 |
+
# ============================================================================
|
| 586 |
+
# MAIN APPLICATION
|
| 587 |
+
# ============================================================================
|
| 588 |
+
|
| 589 |
+
def main():
    """Main application entry point.

    Renders the full Streamlit UI: loads the Nepali font and the
    XLM-RoBERTa model/preprocessor, builds the sidebar (about, features,
    font info, session/all-time statistics), then four tabs:
    single prediction, explainability (LIME/SHAP/Captum), batch analysis
    (text area or CSV upload), and prediction history.

    NOTE(review): relies on module-level globals that are defined outside
    this view (st, pd, px, os, json, datetime, HISTORY_FILE,
    CUSTOM_MODULES_AVAILABLE, load_nepali_font, load_model_and_preprocessor,
    predict_text, the explainer classes, etc.) — confirm against the file top.
    """

    # Load Nepali font once and cache it in session state so reruns reuse it.
    if st.session_state.nepali_font is None:
        st.session_state.nepali_font = load_nepali_font()

    nepali_font = st.session_state.nepali_font

    # Header
    st.markdown('<h1 class="main-header">🛡️ Nepali Hate Content Detector</h1>', unsafe_allow_html=True)
    st.markdown("""
    <div class="sub-header">
    AI-powered hate speech detection for Nepali text with advanced explainability
    <br>
    <strong>XLM-RoBERTa Large</strong> fine-tuned on Nepali social media data
    </div>
    """, unsafe_allow_html=True)

    # ========================================================================
    # SIDEBAR
    # ========================================================================

    with st.sidebar:
        st.header("ℹ️ About")
        st.markdown("""
        **Model**: XLM-RoBERTa Large
        **Task**: Multi-class hate speech detection
        **Language**: Nepali (Devanagari & Romanized)

        **Classes:**
        - **NO**: Non-offensive
        - **OO**: General offensive
        - **OR**: Racist/ethnic hate
        - **OS**: Sexist/gender hate
        """)

        st.markdown("---")

        st.header("🔧 Features")
        st.markdown("""
        ✅ **Preprocessing**
        - Script detection
        - Transliteration
        - Translation
        - Emoji mapping

        ✅ **Explainability**
        - LIME
        - SHAP
        - Captum (IG)

        ✅ **Batch Analysis**
        - CSV upload
        - Text area input
        """)

        st.markdown("---")

        st.header("🎨 Font Settings")
        with st.expander("Nepali Font Info", expanded=False):
            st.markdown(f"""
            **Status:** {'✅ Loaded' if nepali_font else '❌ Not loaded'}

            **Fix squares in Devanagari:**
            1. Download Kalimati.ttf
            2. Create `fonts/` directory
            3. Place font file there
            4. Restart app
            """)

        st.markdown("---")

        st.header("📊 Statistics")

        # Session Statistics — counters live only for this browser session.
        st.subheader("🔄 Current Session")
        if st.session_state.session_predictions > 0:
            st.metric("Predictions", st.session_state.session_predictions)
            session_counts = st.session_state.session_class_counts
            if any(count > 0 for count in session_counts.values()):
                st.write("**Session Distribution:**")
                for label in ['NO', 'OO', 'OR', 'OS']:
                    count = session_counts.get(label, 0)
                    if count > 0:
                        pct = (count / st.session_state.session_predictions) * 100
                        st.write(f"• {label}: {count} ({pct:.0f}%)")
        else:
            st.info("No predictions in this session yet.")

        st.markdown("---")

        # History Statistics — persisted to HISTORY_FILE on disk (JSON list).
        st.subheader("📚 All Time")
        if os.path.exists(HISTORY_FILE):
            try:
                with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
                    history = json.load(f)
                if history:
                    st.metric("Total Saved", len(history))
                    pred_counts = pd.Series([h['prediction'] for h in history]).value_counts()
                    st.write("**Distribution:**")
                    for label, count in pred_counts.items():
                        st.write(f"• {label}: {count}")
                else:
                    st.info("No saved predictions yet.")
            except Exception as e:
                # Corrupt/unreadable history should not break the sidebar.
                st.warning("⚠️ History file error")
                with st.expander("Error details"):
                    st.code(str(e))
        else:
            st.info("📝 No history file\n\nEnable 'Save to history' in Tab 1 to track predictions.")

        st.markdown("---")
        st.markdown("""
        <div style='text-align: center; font-size: 0.9rem; color: #666;'>
        <a href='https://huggingface.co/UDHOV/xlm-roberta-large-nepali-hate-classification' target='_blank'>
        Model on HuggingFace 🤗
        </a>
        </div>
        """, unsafe_allow_html=True)

    # ========================================================================
    # LOAD MODEL
    # ========================================================================

    with st.spinner("Loading model..."):
        model, tokenizer, label_encoder, preprocessor, device = load_model_and_preprocessor()

    if model is None:
        st.error("❌ Failed to load model!")
        st.stop()

    # Check explainability availability; degrade gracefully when the custom
    # scripts package could not be imported.
    explainability_available = check_explainability() if CUSTOM_MODULES_AVAILABLE else {'lime': False, 'shap': False}
    captum_available = check_captum_availability() if CUSTOM_MODULES_AVAILABLE else False

    # ========================================================================
    # TABS
    # ========================================================================

    tabs = st.tabs([
        "🔍 Single Prediction",
        "💡 Explainability",
        "📝 Batch Analysis",
        "📈 History"
    ])

    # ========================================================================
    # TAB 1: SINGLE PREDICTION
    # ========================================================================

    with tabs[0]:
        st.subheader("🔍 Single Text Analysis")

        col1, col2 = st.columns([2, 1])

        with col1:
            text_input = st.text_area(
                "Enter Nepali Text",
                height=200,
                placeholder="यहाँ आफ्नो पाठ लेख्नुहोस्...\nOr enter romanized Nepali: ma khusi xu\nOr English: This is a test",
                help="Enter text in Devanagari, Romanized Nepali, or English."
            )

            col_a, col_b = st.columns(2)
            with col_a:
                analyze_button = st.button("🔍 Analyze Text", type="primary", use_container_width=True)
            with col_b:
                save_to_history = st.checkbox("Save to history", value=True)

        with col2:
            st.markdown("##### 💡 Quick Info")
            st.info("""
            **Supported:**
            - Devanagari: नेपाली
            - Romanized: ma nepali xu
            - English: I am Nepali
            - Mixed scripts
            - Emojis: 😀😡🙏

            **Auto-processing:**
            - Script detection
            - Transliteration
            - Translation
            - Emoji → Nepali words
            - URL/mention removal
            """)

        if analyze_button and text_input.strip():
            with st.spinner("🔄 Analyzing text..."):
                result = predict_text(
                    text_input, model, tokenizer,
                    label_encoder, preprocessor
                )

            # Keep the latest prediction around so the Explainability tab can
            # pre-fill its text area with the same input.
            st.session_state.last_prediction = result
            st.session_state.last_text = text_input

            if 'prediction' in result:
                st.session_state.session_predictions += 1
                pred_label = result['prediction']
                if pred_label in st.session_state.session_class_counts:
                    st.session_state.session_class_counts[pred_label] += 1

            if save_to_history:
                save_prediction_to_history(text_input, result)

            if 'error' in result:
                st.warning(f"⚠️ {result['error']}")
                st.stop()

            st.markdown("---")
            st.subheader("📊 Analysis Results")

            pred_label = result['prediction']
            confidence = result['confidence']

            # Map class label → CSS box style; unknown labels fall back to 'no-box'.
            box_class = {
                'NO': 'no-box',
                'OO': 'oo-box',
                'OR': 'or-box',
                'OS': 'os-box'
            }.get(pred_label, 'no-box')

            st.markdown(f"""
            <div class='prediction-box {box_class}'>
            <h2 style='margin:0;'>Prediction: {pred_label}</h2>
            <p style='font-size:1.3rem; margin:0.5rem 0;'>
            Confidence: <strong>{confidence:.2%}</strong>
            </p>
            <p style='margin:0; font-size:1rem;'>{get_label_description(pred_label)}</p>
            </div>
            """, unsafe_allow_html=True)

            st.plotly_chart(plot_probabilities(result['probabilities']), use_container_width=True)

            with st.expander("🔍 Preprocessing Details", expanded=False):
                col1, col2, col3 = st.columns(3)

                with col1:
                    st.markdown("**Original Text:**")
                    st.code(text_input, language=None)

                with col2:
                    st.markdown("**Preprocessed:**")
                    st.code(result['preprocessed_text'], language=None)

                with col3:
                    if CUSTOM_MODULES_AVAILABLE and preprocessor:
                        script_info = get_script_info(text_input)
                        st.markdown("**Script Detected:**")
                        st.write(f"• Type: {script_info['script_type']}")
                        # Clamp to 100% for display in case the detector
                        # reports a confidence slightly above 1.0.
                        confidence_pct = min(script_info['confidence'] * 100, 100.0)
                        st.write(f"• Confidence: {confidence_pct:.1f}%")

            # Show emoji breakdown only when at least one emoji was found.
            if result.get('emoji_features', {}).get('total_emoji_count', 0) > 0:
                with st.expander("😊 Emoji Analysis", expanded=False):
                    features = result['emoji_features']

                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Total Emojis", features['total_emoji_count'])
                        st.metric("Hate Emojis", features['hate_emoji_count'])
                    with col2:
                        st.metric("Positive Emojis", features['positive_emoji_count'])
                        st.metric("Mockery Emojis", features['mockery_emoji_count'])
                    with col3:
                        st.metric("Sadness Emojis", features['sadness_emoji_count'])
                        st.metric("Fear Emojis", features['fear_emoji_count'])

                    if CUSTOM_MODULES_AVAILABLE:
                        emoji_info = get_emoji_info(text_input)
                        if emoji_info['emojis_found']:
                            st.markdown("**Emojis Found:**")
                            st.write(" ".join(emoji_info['emojis_found']))

            with st.expander("📊 Detailed Probabilities", expanded=False):
                prob_df = pd.DataFrame({
                    'Class': list(result['probabilities'].keys()),
                    'Probability': list(result['probabilities'].values())
                })
                prob_df['Probability'] = prob_df['Probability'].apply(lambda x: f"{x:.4f}")
                st.dataframe(prob_df, hide_index=True, use_container_width=True)

    # ========================================================================
    # TAB 2: EXPLAINABILITY
    # ========================================================================

    with tabs[1]:
        st.subheader("💡 Model Explainability")

        if not CUSTOM_MODULES_AVAILABLE:
            st.error("❌ Explainability modules not available. Please check scripts directory.")
            st.stop()

        st.info(f"""
        **Available Methods:**
        - LIME: {'✅' if explainability_available['lime'] else '❌ (install: pip install lime)'}
        - SHAP: {'✅' if explainability_available['shap'] else '❌ (install: pip install shap)'}
        - Captum: {'✅' if captum_available else '❌ (install: pip install captum)'}
        """)

        explain_text = st.text_area(
            "Enter text to explain",
            height=150,
            value=st.session_state.last_text if st.session_state.last_text else "",
            placeholder="Enter Nepali text..."
        )

        # Build the method list from whatever libraries actually imported.
        available_methods = []
        if explainability_available['lime']:
            available_methods.append("LIME")
        if explainability_available['shap']:
            available_methods.append("SHAP")
        if captum_available:
            available_methods.append("Captum (IG)")

        if not available_methods:
            st.warning("⚠️ No explainability methods available. Please install required packages.")
            st.code("pip install lime shap captum", language="bash")
            st.stop()

        method = st.selectbox("Select explanation method", available_methods)

        # Per-method configuration. NOTE: each config variable (num_samples,
        # use_fallback, n_steps) is only defined when its method is selected;
        # the branches below rely on the same `method` value so this is safe.
        with st.expander("⚙️ Configuration", expanded=False):
            if method == "LIME":
                num_samples = st.slider("Number of samples", 100, 500, 200, 50)
            elif method == "SHAP":
                use_fallback = st.checkbox("Use fallback if SHAP fails", value=True)
            elif method == "Captum (IG)":
                n_steps = st.slider("Integration steps", 10, 100, 50, 10)

        explain_button = st.button("🔍 Generate Explanation", type="primary", use_container_width=True)

        if explain_button and explain_text.strip():
            with st.spinner("Generating explanation..."):
                # The explainer wrapper is expensive to build; cache it in
                # session state across reruns.
                if st.session_state.model_wrapper is None:
                    st.session_state.model_wrapper = create_explainer_wrapper(
                        model, tokenizer, label_encoder, preprocessor
                    )

                wrapper = st.session_state.model_wrapper
                preprocessed, emoji_features = preprocessor.preprocess(explain_text)
                analysis = wrapper.predict_with_analysis(explain_text)

                st.success(f"**Prediction:** {analysis['predicted_label']} ({analysis['confidence']:.2%})")

                col1, col2 = st.columns(2)
                with col1:
                    st.write("**Original:**", explain_text)
                with col2:
                    st.write("**Preprocessed:**", preprocessed)

                st.markdown("---")

                try:
                    if method == "LIME":
                        lime_exp = LIMEExplainer(wrapper, nepali_font=nepali_font)
                        result = lime_exp.explain_and_visualize(
                            analysis['original_text'],
                            analysis['preprocessed_text'],
                            save_path=None,
                            show=False,
                            num_samples=num_samples
                        )
                        st.subheader("LIME Explanation")
                        st.pyplot(result['figure'])
                        with st.expander("📊 Feature Importance Details"):
                            word_scores = result['explanation']['word_scores']
                            df = pd.DataFrame(word_scores, columns=['Word', 'Score'])
                            df = df.sort_values('Score', ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)

                    elif method == "SHAP":
                        shap_exp = SHAPExplainer(wrapper, nepali_font=nepali_font)
                        result = shap_exp.explain_and_visualize(
                            analysis['original_text'],
                            analysis['preprocessed_text'],
                            save_path=None,
                            show=False,
                            use_fallback=use_fallback
                        )
                        st.subheader("SHAP Explanation")
                        st.pyplot(result['figure'])
                        with st.expander("📊 Attribution Details"):
                            st.write(f"**Method used:** {result['explanation']['method_used']}")
                            word_scores = result['explanation']['word_scores']
                            df = pd.DataFrame(word_scores, columns=['Word', 'Score'])
                            # Sort by magnitude so strong negative scores rank too.
                            df = df.sort_values('Score', key=lambda x: abs(x), ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)

                    elif method == "Captum (IG)":
                        captum_exp = CaptumExplainer(
                            model, tokenizer, label_encoder, preprocessor,
                            emoji_to_nepali_map=EMOJI_TO_NEPALI
                        )
                        result = captum_exp.explain_and_visualize(
                            analysis['original_text'],
                            target=None,
                            n_steps=n_steps,
                            save_dir=None,
                            show=False,
                            nepali_font=nepali_font
                        )
                        st.subheader("Captum Integrated Gradients")
                        col1, col2 = st.columns(2)
                        with col1:
                            st.markdown("**Bar Chart**")
                            st.pyplot(result['bar_chart'])
                        with col2:
                            st.markdown("**Heatmap**")
                            st.pyplot(result['heatmap'])
                        with st.expander("📊 Attribution Details"):
                            st.write(f"**Convergence Delta:** {result['explanation']['convergence_delta']:.6f}")
                            word_attrs = result['explanation']['word_attributions']
                            df = pd.DataFrame(word_attrs, columns=['Word', 'Abs Score', 'Signed Score'])
                            df = df.sort_values('Abs Score', ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)

                except Exception as e:
                    st.error(f"❌ Explanation failed: {str(e)}")
                    with st.expander("🐛 Error Details"):
                        st.exception(e)

    # ========================================================================
    # TAB 3: BATCH ANALYSIS
    # ========================================================================

    with tabs[2]:
        st.subheader("📝 Batch Analysis")

        st.markdown("### 📥 Download Example Files")
        col1, col2 = st.columns(2)

        with col1:
            example_csv_data = {
                'text': [
                    'यो राम्रो छ',
                    'तिमी मुर्ख हौ',
                    'मुस्लिम हरु सबै खराब छन्',
                    'केटीहरु घरमा बस्नु पर्छ',
                    'नमस्ते, कस्तो छ?'
                ]
            }
            example_csv = pd.DataFrame(example_csv_data).to_csv(index=False)
            st.download_button(
                label="📄 Download Example CSV",
                data=example_csv,
                file_name="example_batch.csv",
                mime="text/csv",
                use_container_width=True
            )

        with col2:
            example_text = "यो राम्रो छ\nतिमी मुर्ख हौ\nमुस्लिम हरु सबै खराब छन्\nकेटीहरु घरमा बस्नु पर्छ\nनमस्ते, कस्तो छ?"
            st.download_button(
                label="📝 Download Example Text",
                data=example_text,
                file_name="example_batch.txt",
                mime="text/plain",
                use_container_width=True
            )

        st.markdown("---")

        input_method = st.radio("Input method:", ["Text Area", "CSV Upload"])

        # ---- TEXT AREA ----
        if input_method == "Text Area":
            st.info("💡 Enter one text per line")

            batch_text = st.text_area(
                "Enter texts (one per line)",
                height=250,
                placeholder="यो राम्रो छ\nतिमी मुर्ख हौ\n..."
            )

            if st.button("🚀 Analyze Batch", type="primary"):
                if batch_text.strip():
                    texts = [line.strip() for line in batch_text.split('\n') if line.strip()]

                    with st.spinner(f"Analyzing {len(texts)} texts..."):
                        results = []
                        progress_bar = st.progress(0)

                        for idx, text in enumerate(texts):
                            try:
                                result = predict_text(
                                    text, model, tokenizer,
                                    label_encoder, preprocessor
                                )
                                results.append({
                                    'Text': text[:60] + '...' if len(text) > 60 else text,
                                    'Full_Text': text,
                                    'Prediction': result['prediction'],
                                    'Confidence': result['confidence'],
                                    'Preprocessed': result['preprocessed_text']
                                })
                            except Exception as e:
                                # A failed row is recorded, not fatal — the
                                # rest of the batch continues.
                                results.append({
                                    'Text': text[:60],
                                    'Full_Text': text,
                                    'Prediction': 'Error',
                                    'Confidence': 0.0,
                                    'Preprocessed': str(e)
                                })
                            progress_bar.progress((idx + 1) / len(texts))

                        # Stash results in session state so they survive the
                        # st.rerun() below and render outside the button block.
                        st.session_state.batch_results = pd.DataFrame(results)
                        st.session_state.batch_mode = 'text_area'

                        for result in results:
                            if result['Prediction'] != 'Error':
                                st.session_state.session_predictions += 1
                                pred_label = result['Prediction']
                                if pred_label in st.session_state.session_class_counts:
                                    st.session_state.session_class_counts[pred_label] += 1

                        st.rerun()
                else:
                    st.warning("Please enter some texts.")

            # Display results outside button block
            if (st.session_state.batch_results is not None and
                    st.session_state.get('batch_mode') == 'text_area'):

                results_df = st.session_state.batch_results

                st.success(f"✅ Analyzed {len(results_df)} texts!")

                display_df = results_df[['Text', 'Prediction', 'Confidence']].copy()
                display_df['Confidence'] = display_df['Confidence'].apply(lambda x: f"{x:.2%}")
                st.dataframe(display_df, use_container_width=True, hide_index=True, height=400)

                st.markdown("---")
                st.subheader("📊 Summary Statistics")

                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("Total Texts", len(results_df))
                    st.metric("Avg Confidence", f"{results_df['Confidence'].mean():.2%}")
                with col2:
                    summary = results_df['Prediction'].value_counts()
                    fig = px.pie(
                        values=summary.values,
                        names=summary.index,
                        title="Prediction Distribution",
                        color_discrete_sequence=px.colors.qualitative.Set2
                    )
                    st.plotly_chart(fig, use_container_width=True)
                with col3:
                    st.markdown("**Class Breakdown:**")
                    for label, count in summary.items():
                        pct = count / len(results_df) * 100
                        st.write(f"• {label}: {count} ({pct:.1f}%)")

                st.markdown("---")
                # Download carries the untruncated text column.
                download_df = results_df[['Full_Text', 'Prediction', 'Confidence', 'Preprocessed']].copy()
                download_df.columns = ['Text', 'Prediction', 'Confidence', 'Preprocessed']
                csv = download_df.to_csv(index=False)

                col_download, col_explain = st.columns(2)
                with col_download:
                    st.download_button(
                        label="📥 Download Results CSV",
                        data=csv,
                        file_name=f"batch_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                        mime="text/csv",
                        use_container_width=True,
                        key="download_batch_text"
                    )
                with col_explain:
                    if st.button("💡 Explain Selected", use_container_width=True, key="hint_batch_text"):
                        st.info("👇 Select a text below to explain")

                render_batch_explainability(
                    results_df=results_df,
                    text_column='Full_Text',
                    model=model,
                    tokenizer=tokenizer,
                    label_encoder=label_encoder,
                    preprocessor=preprocessor,
                    nepali_font=nepali_font,
                    explainability_available=explainability_available,
                    captum_available=captum_available,
                    mode_key="text_area"
                )

        # ---- CSV UPLOAD ----
        else:
            st.info("💡 Upload CSV with a 'text' column")

            uploaded_file = st.file_uploader("Choose CSV file", type=['csv'])

            if uploaded_file:
                try:
                    df = pd.read_csv(uploaded_file)
                    st.write("📄 **File Preview:**")
                    st.dataframe(df.head(10), use_container_width=True)

                    text_column = st.selectbox("Select text column:", df.columns)

                    if st.button("🚀 Analyze CSV", type="primary"):
                        texts = df[text_column].astype(str).tolist()

                        with st.spinner(f"Analyzing {len(texts)} texts..."):
                            predictions = []
                            confidences = []
                            preprocessed_list = []
                            progress_bar = st.progress(0)

                            for idx, text in enumerate(texts):
                                try:
                                    result = predict_text(
                                        str(text), model, tokenizer,
                                        label_encoder, preprocessor
                                    )
                                    predictions.append(result['prediction'])
                                    confidences.append(result['confidence'])
                                    preprocessed_list.append(result['preprocessed_text'])
                                except Exception as e:
                                    predictions.append('Error')
                                    confidences.append(0.0)
                                    preprocessed_list.append(str(e))
                                progress_bar.progress((idx + 1) / len(texts))

                            # Annotate the uploaded frame in place with results.
                            df['Prediction'] = predictions
                            df['Confidence'] = confidences
                            df['Preprocessed'] = preprocessed_list

                            st.session_state.batch_results = df
                            st.session_state.batch_mode = 'csv'
                            st.session_state.csv_text_column = text_column

                            for pred in predictions:
                                if pred != 'Error':
                                    st.session_state.session_predictions += 1
                                    if pred in st.session_state.session_class_counts:
                                        st.session_state.session_class_counts[pred] += 1

                            st.rerun()

                    # Display results outside button block
                    if (st.session_state.batch_results is not None and
                            st.session_state.get('batch_mode') == 'csv'):

                        df_results = st.session_state.batch_results
                        text_col = st.session_state.get('csv_text_column', text_column)

                        st.success("✅ Analysis complete!")
                        st.dataframe(df_results, use_container_width=True, height=400)

                        st.markdown("---")
                        st.subheader("📊 Summary")

                        col1, col2 = st.columns(2)
                        with col1:
                            summary = df_results['Prediction'].value_counts()
                            fig = px.bar(
                                x=summary.index,
                                y=summary.values,
                                title="Prediction Distribution",
                                labels={'x': 'Class', 'y': 'Count'},
                                color=summary.index,
                                color_discrete_map={
                                    'NO': '#28a745',
                                    'OO': '#ffc107',
                                    'OR': '#dc3545',
                                    'OS': '#6f42c1'
                                }
                            )
                            st.plotly_chart(fig, use_container_width=True)
                        with col2:
                            st.metric("Total Texts", len(df_results))
                            st.metric("Avg Confidence", f"{df_results['Confidence'].mean():.2%}")
                            st.markdown("**Class Distribution:**")
                            for label, count in summary.items():
                                st.write(f"• {label}: {count}")

                        st.markdown("---")
                        csv_data = df_results.to_csv(index=False)

                        col_download, col_explain = st.columns(2)
                        with col_download:
                            st.download_button(
                                label="📥 Download Results CSV",
                                data=csv_data,
                                file_name=f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                                mime="text/csv",
                                use_container_width=True,
                                key="download_csv_results"
                            )
                        with col_explain:
                            if st.button("💡 Explain Selected", use_container_width=True, key="csv_explain_hint"):
                                st.info("👇 Use expander below to explain")

                        render_batch_explainability(
                            results_df=df_results,
                            text_column=text_col,
                            model=model,
                            tokenizer=tokenizer,
                            label_encoder=label_encoder,
                            preprocessor=preprocessor,
                            nepali_font=nepali_font,
                            explainability_available=explainability_available,
                            captum_available=captum_available,
                            mode_key="csv"
                        )

                except Exception as e:
                    st.error(f"❌ Error processing file: {str(e)}")
                    with st.expander("🐛 Error Details"):
                        st.exception(e)

    # ========================================================================
    # TAB 4: HISTORY
    # ========================================================================

    with tabs[3]:
        st.subheader("📈 Prediction History")

        col1, col2 = st.columns([3, 1])
        with col1:
            st.write("View and analyze your prediction history")
        with col2:
            if st.button("🔄 Refresh", use_container_width=True):
                st.rerun()

        if os.path.exists(HISTORY_FILE):
            try:
                with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
                    history = json.load(f)

                if history:
                    history_df = pd.DataFrame(history)
                    history_df['timestamp'] = pd.to_datetime(history_df['timestamp'])

                    st.markdown("### 📊 Overview")
                    col1, col2, col3, col4 = st.columns(4)
                    with col1:
                        st.metric("Total Predictions", len(history_df))
                    with col2:
                        st.metric("Avg Confidence", f"{history_df['confidence'].mean():.2%}")
                    with col3:
                        # Older history entries may predate emoji_features;
                        # sum defensively over dict entries only.
                        if 'emoji_features' in history_df.columns:
                            total_emojis = sum(
                                e.get('total_emoji_count', 0)
                                for e in history_df['emoji_features']
                                if isinstance(e, dict)
                            )
                            st.metric("Total Emojis", total_emojis)
                        else:
                            st.metric("Total Emojis", "N/A")
                    with col4:
                        most_common = history_df['prediction'].mode()[0]
                        st.metric("Most Common", most_common)

                    st.markdown("---")
                    st.markdown("### 📈 Trends")

                    col1, col2 = st.columns(2)
                    with col1:
                        daily_counts = history_df.groupby(
                            history_df['timestamp'].dt.date
                        ).size().reset_index(name='count')
                        fig = px.line(
                            daily_counts,
                            x='timestamp',
                            y='count',
                            title="Predictions Over Time",
                            labels={'timestamp': 'Date', 'count': 'Count'}
                        )
                        st.plotly_chart(fig, use_container_width=True)
                    with col2:
                        class_dist = history_df['prediction'].value_counts()
                        fig = px.pie(
                            values=class_dist.values,
                            names=class_dist.index,
                            title="Class Distribution",
                            color=class_dist.index,
                            color_discrete_map={
                                'NO': '#28a745',
                                'OO': '#ffc107',
                                'OR': '#dc3545',
                                'OS': '#6f42c1'
                            }
                        )
                        st.plotly_chart(fig, use_container_width=True)

                    st.markdown("---")
                    st.markdown("### 📋 Recent Predictions")

                    num_to_show = st.slider("Number to show", 5, 50, 20, 5)
                    recent = history_df.tail(num_to_show).sort_values('timestamp', ascending=False)
                    display = recent[['timestamp', 'text', 'prediction', 'confidence']].copy()
                    display['confidence'] = display['confidence'].apply(lambda x: f"{x:.2%}")
                    display['text'] = display['text'].apply(lambda x: x[:80] + '...' if len(x) > 80 else x)
                    display['timestamp'] = display['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')
                    st.dataframe(display, use_container_width=True, hide_index=True, height=400)

                    st.markdown("---")
                    col1, col2 = st.columns(2)
                    with col1:
                        csv = history_df.to_csv(index=False)
                        st.download_button(
                            label="📥 Download Full History",
                            data=csv,
                            file_name=f"history_{datetime.now().strftime('%Y%m%d')}.csv",
                            mime="text/csv",
                            use_container_width=True
                        )
                    with col2:
                        if st.button("🗑️ Clear History", type="secondary", use_container_width=True):
                            if os.path.exists(HISTORY_FILE):
                                os.remove(HISTORY_FILE)
                            st.success("✅ History cleared!")
                            st.rerun()
                else:
                    st.info("📝 No predictions in history yet.")

            except Exception as e:
                st.error(f"❌ Error loading history: {str(e)}")
                with st.expander("🐛 Error Details"):
                    st.exception(e)
        else:
            st.info("📝 No history file found yet.")
            st.markdown("""
            ### How to Build History:
            1. Go to **Single Prediction** tab
            2. Enable "Save to history" checkbox
            3. Analyze some text
            4. Your predictions will appear here!
            """)
|
| 1423 |
|
|
|
|
|
|
|
| 1424 |
|
| 1425 |
+
# Script entry point: run the Streamlit app only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|