Spaces:

UDHOV
/

Nepali-hate-classification

Sleeping

File size: 62,601 Bytes

"""
Nepali Hate Speech Detection - Streamlit Application
=====================================================
Complete application with preprocessing, prediction, and explainability (LIME/SHAP/Captum)

Run with: streamlit run main_app.py
"""

import os
import sys
import streamlit as st
import pandas as pd
import numpy as np
import torch
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import json
import warnings
warnings.filterwarnings('ignore')

# Matplotlib for Nepali font support
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties, fontManager

# ============================================================================
# HF SPACES COMPATIBILITY — paths and environment
# ============================================================================

# Detect if running on HF Spaces
IS_HF_SPACES = bool(os.environ.get('SPACE_ID'))

# Use /tmp for writable storage on HF Spaces, local 'data/' otherwise
DATA_DIR = '/tmp/data' if IS_HF_SPACES else 'data'
os.makedirs(DATA_DIR, exist_ok=True)
HISTORY_FILE = os.path.join(DATA_DIR, 'prediction_history.json')

# ============================================================================
# SCRIPT PATH SETUP
# ============================================================================

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SCRIPTS_DIR = os.path.join(BASE_DIR, 'scripts')
if SCRIPTS_DIR not in sys.path:
    sys.path.insert(0, BASE_DIR)
    sys.path.insert(0, SCRIPTS_DIR)

# ============================================================================
# CUSTOM MODULE IMPORTS
# ============================================================================

try:
    from scripts.transformer_data_preprocessing import (
        HateSpeechPreprocessor,
        preprocess_text,
        get_script_info,
        get_emoji_info,
        EMOJI_TO_NEPALI
    )
    from scripts.explainability import (
        create_explainer_wrapper,
        LIMEExplainer,
        SHAPExplainer,
        check_availability as check_explainability
    )
    from scripts.captum_explainer import (
        CaptumExplainer,
        check_availability as check_captum_availability
    )
    CUSTOM_MODULES_AVAILABLE = True
except MemoryError:
    st.warning("⚠️ Captum not available due to memory constraints.")
    CUSTOM_MODULES_AVAILABLE = False
    captum_available = False
except ImportError as e:
    st.error(f"⚠️ Custom modules not found: {e}")
    CUSTOM_MODULES_AVAILABLE = False

# ============================================================================
# PAGE CONFIGURATION
# ============================================================================

st.set_page_config(
    page_title="Nepali Hate Content Detector",
    page_icon="🛡️",
    layout="wide",
    initial_sidebar_state="expanded"
)

# ============================================================================
# CUSTOM CSS
# ============================================================================

st.markdown("""
    <style>
    /* Main header */
    .main-header {
        font-size: 2.8rem;
        font-weight: 700;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 0.5rem;
        text-shadow: 2px 2px 4px rgba(0,0,0,0.1);
    }

    .sub-header {
        text-align: center;
        color: #666;
        font-size: 1.1rem;
        margin-bottom: 2rem;
    }

    /* Prediction boxes */
    .prediction-box {
        padding: 1.5rem;
        border-radius: 15px;
        margin: 1rem 0;
        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
        color: white;
        font-weight: 600;
    }

    .no-box { background: linear-gradient(135deg, #28a745 0%, #1e7e34 100%); }
    .oo-box { background: linear-gradient(135deg, #ffc107 0%, #e0a800 100%); }
    .or-box { background: linear-gradient(135deg, #dc3545 0%, #a71d2a 100%); }
    .os-box { background: linear-gradient(135deg, #6f42c1 0%, #4a1f9e 100%); }

    /* Info boxes */
    .info-box {
        padding: 1rem;
        border-radius: 10px;
        background: #f8f9fa;
        border-left: 4px solid #007bff;
        margin: 1rem 0;
    }

    /* Metrics */
    .metric-card {
        background: white;
        padding: 1rem;
        border-radius: 10px;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        text-align: center;
    }

    /* Buttons */
    .stButton>button {
        border-radius: 8px;
        font-weight: 600;
    }

    /* Expander */
    .streamlit-expanderHeader {
        font-weight: 600;
        font-size: 1.1rem;
    }
    </style>
""", unsafe_allow_html=True)

# ============================================================================
# NEPALI FONT LOADING
# ============================================================================

@st.cache_resource
def load_nepali_font():
    """Load Nepali font for matplotlib visualizations.
    
    Tries multiple font paths in order of preference:
    1. Kalimati (primary — downloaded by Dockerfile)
    2. Noto Sans Devanagari (Linux/HF Spaces fallback)
    3. Other system Devanagari fonts (macOS, Windows)
    """
    font_paths = [
        # ── Kalimati (primary) ──────────────────────────────────────────
        # HF Spaces / Docker — downloaded by Dockerfile curl command
        '/app/fonts/Kalimati.ttf',
        # Registered system-wide by fc-cache in Dockerfile
        '/usr/local/share/fonts/nepali/Kalimati.ttf',
        # Local dev — absolute path relative to script location
        os.path.join(BASE_DIR, 'fonts', 'Kalimati.ttf'),
        # Local dev — relative path
        'fonts/Kalimati.ttf',

        # ── Noto Sans Devanagari (Linux / HF Spaces fallback) ───────────
        '/usr/share/fonts/truetype/noto/NotoSansDevanagari-Regular.ttf',
        '/usr/share/fonts/truetype/noto/NotoSansDevanagari[wdth,wght].ttf',
        '/usr/share/fonts/opentype/noto/NotoSansDevanagari-Regular.otf',
        '/usr/share/fonts/noto/NotoSansDevanagari-Regular.ttf',
        # Noto Serif Devanagari variant
        '/usr/share/fonts/truetype/noto/NotoSerifDevanagari-Regular.ttf',
        # Generic Noto fallback
        '/usr/share/fonts/truetype/noto/NotoSans-Regular.ttf',

        # ── macOS ────────────────────────────────────────────────────────
        '/System/Library/Fonts/Supplemental/DevanagariSangamMN.ttc',
        '/System/Library/Fonts/Supplemental/DevanagariMT.ttc',
        '/Library/Fonts/Devanagari Sangam MN.ttc',

        # ── Windows ─────────────────────────────────────────────────────
        r'C:\Windows\Fonts\NirmalaUI.ttf',
        r'C:\Windows\Fonts\NirmalaUI-Bold.ttf',
        r'C:\Windows\Fonts\mangal.ttf',
        r'C:\Windows\Fonts\Aparajita.ttf',
    ]

    for font_path in font_paths:
        if os.path.exists(font_path):
            try:
                fontManager.addfont(font_path)
                fp = FontProperties(fname=font_path)
                return fp
            except Exception:
                continue

    # Silent failure — charts still render, just without Devanagari-specific glyphs
    return None


# ============================================================================
# SESSION STATE INITIALIZATION
# ============================================================================

if 'last_prediction' not in st.session_state:
    st.session_state.last_prediction = None
if 'last_text' not in st.session_state:
    st.session_state.last_text = ""
if 'batch_results' not in st.session_state:
    st.session_state.batch_results = None
if 'batch_mode' not in st.session_state:
    st.session_state.batch_mode = None
if 'csv_text_column' not in st.session_state:
    st.session_state.csv_text_column = None
if 'explainability_results' not in st.session_state:
    st.session_state.explainability_results = None
if 'preprocessor' not in st.session_state:
    st.session_state.preprocessor = None
if 'model_wrapper' not in st.session_state:
    st.session_state.model_wrapper = None
if 'nepali_font' not in st.session_state:
    st.session_state.nepali_font = None
if 'session_predictions' not in st.session_state:
    st.session_state.session_predictions = 0
if 'session_class_counts' not in st.session_state:
    st.session_state.session_class_counts = {'NO': 0, 'OO': 0, 'OR': 0, 'OS': 0}

# ============================================================================
# MODEL LOADING
# ============================================================================

@st.cache_resource(show_spinner="Loading model... this may take a minute on first run.")
def load_model_and_preprocessor():
    """Load model, tokenizer, label encoder, and preprocessor."""
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    import joblib

    hf_model_id = "UDHOV/xlm-roberta-large-nepali-hate-classification"
    local_model_path = 'models/saved_models/xlm_roberta_results/large_final'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize default label encoder as fallback
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(['NO', 'OO', 'OR', 'OS'])

    # Try local model first (only relevant for local dev), then HF Hub
    if not IS_HF_SPACES and os.path.exists(local_model_path):
        try:
            tokenizer = AutoTokenizer.from_pretrained(local_model_path)
            model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
            model.to(device).eval()

            le_path = os.path.join(local_model_path, 'label_encoder.pkl')
            if os.path.exists(le_path):
                le = joblib.load(le_path)

            st.success(f"✅ Model loaded from local path on {device}")

        except Exception as e:
            st.warning(f"⚠️ Local model failed: {e}. Falling back to HuggingFace Hub...")
            tokenizer = AutoTokenizer.from_pretrained(hf_model_id)
            model = AutoModelForSequenceClassification.from_pretrained(hf_model_id)
            model.to(device).eval()

            try:
                from huggingface_hub import hf_hub_download
                le_file = hf_hub_download(repo_id=hf_model_id, filename="label_encoder.pkl")
                le = joblib.load(le_file)
            except Exception:
                pass  # Use default label encoder

            st.success(f"✅ Model loaded from HuggingFace Hub on {device}")
    else:
        # HF Spaces or local path not found — load directly from Hub
        tokenizer = AutoTokenizer.from_pretrained(hf_model_id)
        model = AutoModelForSequenceClassification.from_pretrained(hf_model_id)
        model.to(device).eval()

        try:
            from huggingface_hub import hf_hub_download
            le_file = hf_hub_download(repo_id=hf_model_id, filename="label_encoder.pkl")
            le = joblib.load(le_file)
        except Exception:
            pass  # Use default label encoder

        st.success(f"✅ Model loaded from HuggingFace Hub on {device}")

    # Initialize preprocessor
    if CUSTOM_MODULES_AVAILABLE:
        preprocessor = HateSpeechPreprocessor(
            model_type="xlmr",
            translate_english=True,
            cache_size=2000
        )
    else:
        preprocessor = None

    return model, tokenizer, le, preprocessor, device


# ============================================================================
# PREDICTION FUNCTIONS
# ============================================================================

def predict_text(text, model, tokenizer, label_encoder, preprocessor, max_length=256):
    """Make prediction with preprocessing."""
    device = next(model.parameters()).device

    # Preprocess
    if preprocessor:
        preprocessed, emoji_features = preprocessor.preprocess(text, verbose=False)
    else:
        preprocessed = text
        emoji_features = {}

    if not preprocessed.strip():
        return {
            'prediction': 'NO',
            'confidence': 0.0,
            'probabilities': {label: 0.0 for label in label_encoder.classes_},
            'preprocessed_text': '',
            'emoji_features': emoji_features,
            'error': 'Empty text after preprocessing'
        }

    # Tokenize
    inputs = tokenizer(
        preprocessed,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Predict
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probs = torch.softmax(outputs.logits, dim=-1)[0]

    probs_np = probs.cpu().numpy()
    pred_idx = np.argmax(probs_np)
    pred_label = label_encoder.classes_[pred_idx]
    confidence = probs_np[pred_idx]

    return {
        'prediction': pred_label,
        'confidence': float(confidence),
        'probabilities': {
            label_encoder.classes_[i]: float(probs_np[i])
            for i in range(len(label_encoder.classes_))
        },
        'preprocessed_text': preprocessed,
        'emoji_features': emoji_features
    }


# ============================================================================
# VISUALIZATION FUNCTIONS
# ============================================================================

def plot_probabilities(probabilities):
    """Create probability bar chart."""
    labels = list(probabilities.keys())
    probs = list(probabilities.values())

    colors = {
        'NO': '#28a745',
        'OO': '#ffc107',
        'OR': '#dc3545',
        'OS': '#6f42c1'
    }
    bar_colors = [colors.get(label, '#6c757d') for label in labels]

    fig = go.Figure(data=[
        go.Bar(
            x=labels,
            y=probs,
            marker_color=bar_colors,
            text=[f'{p:.2%}' for p in probs],
            textposition='outside',
            hovertemplate='%{x}<br>Probability: %{y:.4f}<extra></extra>'
        )
    ])

    fig.update_layout(
        title="Class Probabilities",
        xaxis_title="Class",
        yaxis_title="Probability",
        yaxis_range=[0, 1.1],
        height=400,
        showlegend=False,
        template='plotly_white'
    )

    return fig


def get_label_description(label):
    """Get description for each label."""
    descriptions = {
        'NO': '✅ Non-Offensive: The text does not contain hate speech or offensive content.',
        'OO': '⚠️ Other-Offensive: Contains general offensive language but not targeted hate.',
        'OR': '🚫 Offensive-Racist: Contains hate speech targeting race, ethnicity, or religion.',
        'OS': '🚫 Offensive-Sexist: Contains hate speech targeting gender or sexuality.'
    }
    return descriptions.get(label, 'Unknown category')


# ============================================================================
# HISTORY MANAGEMENT
# ============================================================================

def save_prediction_to_history(text, result, feedback=None):
    """Save prediction to history file."""
    entry = {
        'timestamp': datetime.now().isoformat(),
        'text': text,
        'prediction': result.get('prediction'),
        'confidence': result.get('confidence'),
        'probabilities': result.get('probabilities'),
        'preprocessed_text': result.get('preprocessed_text'),
        'emoji_features': result.get('emoji_features', {}),
        'feedback': feedback
    }

    # Load existing history
    history = []
    if os.path.exists(HISTORY_FILE):
        try:
            with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
                history = json.load(f)
        except Exception:
            history = []

    # Append and save
    history.append(entry)

    try:
        with open(HISTORY_FILE, 'w', encoding='utf-8') as f:
            json.dump(history, f, ensure_ascii=False, indent=2)
        return True
    except Exception as e:
        st.error(f"Failed to save history: {e}")
        return False


# ============================================================================
# BATCH EXPLAINABILITY HELPER
# ============================================================================

def render_batch_explainability(results_df, text_column, model, tokenizer, label_encoder,
                                preprocessor, nepali_font, explainability_available,
                                captum_available, mode_key="batch"):
    """Render explainability UI for batch results."""
    if not CUSTOM_MODULES_AVAILABLE:
        st.warning("⚠️ Explainability not available.")
        return

    if not (explainability_available['lime'] or explainability_available['shap'] or captum_available):
        st.warning("⚠️ No explainability methods available.")
        return

    with st.expander("💡 Explain Individual Results", expanded=False):
        st.markdown("**Select a text from the batch to explain:**")

        text_options = [f"Row {idx}: {str(row[text_column])[:50]}..." for idx, row in results_df.iterrows()]
        selected_idx = st.selectbox(
            "Choose text:",
            range(len(text_options)),
            format_func=lambda x: text_options[x],
            key=f"{mode_key}_select"
        )

        selected_text = str(results_df.iloc[selected_idx][text_column])
        selected_pred = results_df.iloc[selected_idx]['Prediction']

        st.write(f"**Selected:** {selected_text}")
        st.write(f"**Prediction:** {selected_pred}")

        available_methods = []
        if explainability_available['lime']:
            available_methods.append("LIME")
        if explainability_available['shap']:
            available_methods.append("SHAP")
        if captum_available:
            available_methods.append("Captum (IG)")

        if not available_methods:
            st.warning("⚠️ No explainability methods available.")
            return

        explain_method = st.selectbox(
            "Explanation method:",
            available_methods,
            key=f"{mode_key}_method"
        )

        if st.button("🔍 Generate Explanation", key=f"{mode_key}_explain_btn"):
            with st.spinner("Generating explanation..."):
                try:
                    if st.session_state.model_wrapper is None:
                        st.session_state.model_wrapper = create_explainer_wrapper(
                            model, tokenizer, label_encoder, preprocessor
                        )

                    wrapper = st.session_state.model_wrapper
                    clean_selected = selected_text.replace('"', '').replace("'", '').replace('\u201c', '').replace('\u201d', '')
                    preprocessed, emoji_features = preprocessor.preprocess(clean_selected)
                    analysis = wrapper.predict_with_analysis(clean_selected)

                    if explain_method == "LIME":
                        lime_exp = LIMEExplainer(wrapper, nepali_font=nepali_font)
                        result = lime_exp.explain_and_visualize(
                            analysis['original_text'],
                            analysis['preprocessed_text'],
                            save_path=None,
                            show=False,
                            num_samples=200
                        )
                        st.subheader("LIME Explanation")
                        st.pyplot(result['figure'])
                        st.markdown("---")
                        st.markdown("**📊 Feature Importance Details:**")
                        word_scores = result['explanation']['word_scores']
                        if word_scores:
                            df = pd.DataFrame(word_scores, columns=['Word', 'Score'])
                            df = df.sort_values('Score', ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)
                        else:
                            st.warning("No word scores available")

                    elif explain_method == "SHAP":
                        shap_exp = SHAPExplainer(wrapper, nepali_font=nepali_font)
                        result = shap_exp.explain_and_visualize(
                            analysis['original_text'],
                            analysis['preprocessed_text'],
                            save_path=None,
                            show=False,
                            use_fallback=True
                        )
                        st.subheader("SHAP Explanation")
                        st.pyplot(result['figure'])
                        st.markdown("---")
                        st.markdown("**📊 Attribution Details:**")
                        st.write(f"**Method used:** {result['explanation']['method_used']}")
                        word_scores = result['explanation']['word_scores']
                        if word_scores:
                            df = pd.DataFrame(word_scores, columns=['Word', 'Score'])
                            df = df.sort_values('Score', key=lambda x: abs(x), ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)
                        else:
                            st.warning("No word scores available")

                    elif explain_method == "Captum (IG)":
                        try:
                            captum_exp = CaptumExplainer(
                                model, tokenizer, label_encoder, preprocessor,
                                emoji_to_nepali_map=EMOJI_TO_NEPALI
                            )
                            result = captum_exp.explain_and_visualize(
                                analysis['original_text'],
                                target=None,
                                n_steps=50,
                                save_dir=None,
                                show=False,
                                nepali_font=nepali_font
                            )
                            st.subheader("Captum Integrated Gradients")
                            col1, col2 = st.columns(2)
                            with col1:
                                st.markdown("**Bar Chart**")
                                st.pyplot(result['bar_chart'])
                            with col2:
                                st.markdown("**Heatmap**")
                                st.pyplot(result['heatmap'])
                            st.markdown("---")
                            st.markdown("**📊 Attribution Details:**")
                            st.write(f"**Convergence Delta:** {result['explanation']['convergence_delta']:.6f}")
                            word_attrs = result['explanation']['word_attributions']
                            if word_attrs:
                                df = pd.DataFrame(word_attrs, columns=['Word', 'Abs Score', 'Signed Score'])
                                df = df.sort_values('Abs Score', ascending=False)
                                st.dataframe(df, hide_index=True, use_container_width=True)
                            else:
                                st.warning("No word attributions available")
                        except (MemoryError, RuntimeError):
                            st.error("❌ Captum (Integrated Gradients) requires more memory than available on this server.")
                            st.info("💡 **Tip:** Use LIME or SHAP instead — they work on cloud deployments. Captum works on local machines with more RAM/GPU.")

                except Exception as e:
                    st.error(f"❌ Explanation failed: {str(e)}")
                    st.markdown("**🐛 Error Details:**")
                    import traceback
                    st.code(traceback.format_exc())


# ============================================================================
# MAIN APPLICATION
# ============================================================================

def main():
    """Main application."""

    # Load Nepali font
    if st.session_state.nepali_font is None:
        st.session_state.nepali_font = load_nepali_font()

    nepali_font = st.session_state.nepali_font

    # Header
    st.markdown('<h1 class="main-header">🛡️ Nepali Hate Content Detector</h1>', unsafe_allow_html=True)
    st.markdown("""
    <div class="sub-header">
    AI-powered hate speech detection for Nepali text with advanced explainability
    <br>
    <strong>XLM-RoBERTa Large</strong> fine-tuned on Nepali social media data
    </div>
    """, unsafe_allow_html=True)

    # ========================================================================
    # SIDEBAR
    # ========================================================================

    with st.sidebar:
        st.header("ℹ️ About")
        st.markdown("""
        **Model**: XLM-RoBERTa Large
        **Task**: Multi-class hate speech detection
        **Language**: Nepali (Devanagari & Romanized)

        **Classes:**
        - **NO**: Non-offensive
        - **OO**: General offensive
        - **OR**: Racist/ethnic hate
        - **OS**: Sexist/gender hate
        """)

        st.markdown("---")

        st.header("🔧 Features")
        st.markdown("""
        ✅ **Preprocessing**
        - Script detection
        - Transliteration
        - Translation
        - Emoji mapping

        ✅ **Explainability**
        - LIME
        - SHAP
        - Captum (IG)

        ✅ **Batch Analysis**
        - CSV upload
        - Text area input
        """)

        st.markdown("---")

        st.header("🎨 Font Settings")
        with st.expander("Nepali Font Info", expanded=False):
            st.markdown(f"""
            **Status:** {'✅ Loaded' if nepali_font else '❌ Not loaded'}

            **Fix squares in Devanagari:**
            1. Download Kalimati.ttf
            2. Create `fonts/` directory
            3. Place font file there
            4. Restart app
            """)

        st.markdown("---")

        st.header("📊 Statistics")

        # Session Statistics
        st.subheader("🔄 Current Session")
        if st.session_state.session_predictions > 0:
            st.metric("Predictions", st.session_state.session_predictions)
            session_counts = st.session_state.session_class_counts
            if any(count > 0 for count in session_counts.values()):
                st.write("**Session Distribution:**")
                for label in ['NO', 'OO', 'OR', 'OS']:
                    count = session_counts.get(label, 0)
                    if count > 0:
                        pct = (count / st.session_state.session_predictions) * 100
                        st.write(f"• {label}: {count} ({pct:.0f}%)")
        else:
            st.info("No predictions in this session yet.")

        st.markdown("---")

        # History Statistics
        st.subheader("📚 All Time")
        if os.path.exists(HISTORY_FILE):
            try:
                with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
                    history = json.load(f)
                if history:
                    st.metric("Total Saved", len(history))
                    pred_counts = pd.Series([h['prediction'] for h in history]).value_counts()
                    st.write("**Distribution:**")
                    for label, count in pred_counts.items():
                        st.write(f"• {label}: {count}")
                else:
                    st.info("No saved predictions yet.")
            except Exception as e:
                st.warning("⚠️ History file error")
                with st.expander("Error details"):
                    st.code(str(e))
        else:
            st.info("📝 No history file\n\nEnable 'Save to history' in Tab 1 to track predictions.")

        st.markdown("---")
        st.markdown("""
        <div style='text-align: center; font-size: 0.9rem; color: #666;'>
        <a href='https://huggingface.co/UDHOV/xlm-roberta-large-nepali-hate-classification' target='_blank'>
        Model on HuggingFace 🤗
        </a>
        </div>
        """, unsafe_allow_html=True)

    # ========================================================================
    # LOAD MODEL
    # ========================================================================

    with st.spinner("Loading model..."):
        model, tokenizer, label_encoder, preprocessor, device = load_model_and_preprocessor()

    if model is None:
        st.error("❌ Failed to load model!")
        st.stop()

    # Check explainability availability
    explainability_available = check_explainability() if CUSTOM_MODULES_AVAILABLE else {'lime': False, 'shap': False}
    captum_available = check_captum_availability() if CUSTOM_MODULES_AVAILABLE else False

    # ========================================================================
    # TABS
    # ========================================================================

    tabs = st.tabs([
        "🔍 Single Prediction",
        "💡 Explainability",
        "📝 Batch Analysis",
        "📈 History"
    ])

    # ========================================================================
    # TAB 1: SINGLE PREDICTION
    # ========================================================================

    with tabs[0]:
        st.subheader("🔍 Single Text Analysis")

        col1, col2 = st.columns([2, 1])

        with col1:
            text_input = st.text_area(
                "Enter Nepali Text",
                height=200,
                placeholder="यहाँ आफ्नो पाठ लेख्नुहोस्...\nOr enter romanized Nepali: ma khusi xu\nOr English: This is a test",
                help="Enter text in Devanagari, Romanized Nepali, or English."
            )

            col_a, col_b = st.columns(2)
            with col_a:
                analyze_button = st.button("🔍 Analyze Text", type="primary", use_container_width=True)
            with col_b:
                save_to_history = st.checkbox("Save to history", value=True)

        with col2:
            st.markdown("##### 💡 Quick Info")
            st.info("""
            **Supported:**
            - Devanagari: नेपाली
            - Romanized: ma nepali xu
            - English: I am Nepali
            - Mixed scripts
            - Emojis: 😀😡🙏

            **Auto-processing:**
            - Script detection
            - Transliteration
            - Translation
            - Emoji → Nepali words
            - URL/mention removal
            """)

        if analyze_button and text_input.strip():
            with st.spinner("🔄 Analyzing text..."):
                result = predict_text(
                    text_input, model, tokenizer,
                    label_encoder, preprocessor
                )

                st.session_state.last_prediction = result
                st.session_state.last_text = text_input

                if 'prediction' in result:
                    st.session_state.session_predictions += 1
                    pred_label = result['prediction']
                    if pred_label in st.session_state.session_class_counts:
                        st.session_state.session_class_counts[pred_label] += 1

                if save_to_history:
                    save_prediction_to_history(text_input, result)

            if 'error' in result:
                st.warning(f"⚠️ {result['error']}")
                st.stop()

            st.markdown("---")
            st.subheader("📊 Analysis Results")

            pred_label = result['prediction']
            confidence = result['confidence']

            box_class = {
                'NO': 'no-box',
                'OO': 'oo-box',
                'OR': 'or-box',
                'OS': 'os-box'
            }.get(pred_label, 'no-box')

            st.markdown(f"""
            <div class='prediction-box {box_class}'>
                <h2 style='margin:0;'>Prediction: {pred_label}</h2>
                <p style='font-size:1.3rem; margin:0.5rem 0;'>
                    Confidence: <strong>{confidence:.2%}</strong>
                </p>
                <p style='margin:0; font-size:1rem;'>{get_label_description(pred_label)}</p>
            </div>
            """, unsafe_allow_html=True)

            st.plotly_chart(plot_probabilities(result['probabilities']), use_container_width=True)

            with st.expander("🔍 Preprocessing Details", expanded=False):
                col1, col2, col3 = st.columns(3)

                with col1:
                    st.markdown("**Original Text:**")
                    st.code(text_input, language=None)

                with col2:
                    st.markdown("**Preprocessed:**")
                    st.code(result['preprocessed_text'], language=None)

                with col3:
                    if CUSTOM_MODULES_AVAILABLE and preprocessor:
                        script_info = get_script_info(text_input)
                        st.markdown("**Script Detected:**")
                        st.write(f"• Type: {script_info['script_type']}")
                        confidence_pct = min(script_info['confidence'] * 100, 100.0)
                        st.write(f"• Confidence: {confidence_pct:.1f}%")

            if result.get('emoji_features', {}).get('total_emoji_count', 0) > 0:
                with st.expander("😊 Emoji Analysis", expanded=False):
                    features = result['emoji_features']

                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Total Emojis", features['total_emoji_count'])
                        st.metric("Hate Emojis", features['hate_emoji_count'])
                    with col2:
                        st.metric("Positive Emojis", features['positive_emoji_count'])
                        st.metric("Mockery Emojis", features['mockery_emoji_count'])
                    with col3:
                        st.metric("Sadness Emojis", features['sadness_emoji_count'])
                        st.metric("Fear Emojis", features['fear_emoji_count'])

                    if CUSTOM_MODULES_AVAILABLE:
                        emoji_info = get_emoji_info(text_input)
                        if emoji_info['emojis_found']:
                            st.markdown("**Emojis Found:**")
                            st.write(" ".join(emoji_info['emojis_found']))

            with st.expander("📊 Detailed Probabilities", expanded=False):
                prob_df = pd.DataFrame({
                    'Class': list(result['probabilities'].keys()),
                    'Probability': list(result['probabilities'].values())
                })
                prob_df['Probability'] = prob_df['Probability'].apply(lambda x: f"{x:.4f}")
                st.dataframe(prob_df, hide_index=True, use_container_width=True)

    # ========================================================================
    # TAB 2: EXPLAINABILITY
    # ========================================================================

    with tabs[1]:
        st.subheader("💡 Model Explainability")

        if not CUSTOM_MODULES_AVAILABLE:
            st.error("❌ Explainability modules not available. Please check scripts directory.")
            st.stop()

        st.info(f"""
        **Available Methods:**
        - LIME: {'✅' if explainability_available['lime'] else '❌ (install: pip install lime)'}
        - SHAP: {'✅' if explainability_available['shap'] else '❌ (install: pip install shap)'}
        - Captum: {'✅' if captum_available else '❌ (install: pip install captum)'}
        """)

        explain_text = st.text_area(
            "Enter text to explain",
            height=150,
            value=st.session_state.last_text if st.session_state.last_text else "",
            placeholder="Enter Nepali text..."
        )

        available_methods = []
        if explainability_available['lime']:
            available_methods.append("LIME")
        if explainability_available['shap']:
            available_methods.append("SHAP")
        if captum_available:
            available_methods.append("Captum (IG)")

        if not available_methods:
            st.warning("⚠️ No explainability methods available. Please install required packages.")
            st.code("pip install lime shap captum", language="bash")
            st.stop()

        method = st.selectbox("Select explanation method", available_methods)

        with st.expander("⚙️ Configuration", expanded=False):
            if method == "LIME":
                num_samples = st.slider("Number of samples", 100, 500, 200, 50)
            elif method == "SHAP":
                use_fallback = st.checkbox("Use fallback if SHAP fails", value=True)
            elif method == "Captum (IG)":
                n_steps = st.slider("Integration steps", 10, 100, 50, 10)

        explain_button = st.button("🔍 Generate Explanation", type="primary", use_container_width=True)

        if explain_button and explain_text.strip():
            with st.spinner("Generating explanation..."):
                if st.session_state.model_wrapper is None:
                    st.session_state.model_wrapper = create_explainer_wrapper(
                        model, tokenizer, label_encoder, preprocessor
                    )

                wrapper = st.session_state.model_wrapper
                preprocessed, emoji_features = preprocessor.preprocess(explain_text)
                analysis = wrapper.predict_with_analysis(explain_text)

                st.success(f"**Prediction:** {analysis['predicted_label']} ({analysis['confidence']:.2%})")

                col1, col2 = st.columns(2)
                with col1:
                    st.write("**Original:**", explain_text)
                with col2:
                    st.write("**Preprocessed:**", preprocessed)

                st.markdown("---")

                try:
                    if method == "LIME":
                        lime_exp = LIMEExplainer(wrapper, nepali_font=nepali_font)
                        result = lime_exp.explain_and_visualize(
                            analysis['original_text'],
                            analysis['preprocessed_text'],
                            save_path=None,
                            show=False,
                            num_samples=num_samples
                        )
                        st.subheader("LIME Explanation")
                        st.pyplot(result['figure'])
                        with st.expander("📊 Feature Importance Details"):
                            word_scores = result['explanation']['word_scores']
                            df = pd.DataFrame(word_scores, columns=['Word', 'Score'])
                            df = df.sort_values('Score', ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)

                    elif method == "SHAP":
                        shap_exp = SHAPExplainer(wrapper, nepali_font=nepali_font)
                        result = shap_exp.explain_and_visualize(
                            analysis['original_text'],
                            analysis['preprocessed_text'],
                            save_path=None,
                            show=False,
                            use_fallback=use_fallback
                        )
                        st.subheader("SHAP Explanation")
                        st.pyplot(result['figure'])
                        with st.expander("📊 Attribution Details"):
                            st.write(f"**Method used:** {result['explanation']['method_used']}")
                            word_scores = result['explanation']['word_scores']
                            df = pd.DataFrame(word_scores, columns=['Word', 'Score'])
                            df = df.sort_values('Score', key=lambda x: abs(x), ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)

                    elif method == "Captum (IG)":
                        try:
                            captum_exp = CaptumExplainer(
                                model, tokenizer, label_encoder, preprocessor,
                                emoji_to_nepali_map=EMOJI_TO_NEPALI
                            )
                            result = captum_exp.explain_and_visualize(
                                analysis['original_text'],
                                target=None,
                                n_steps=n_steps,
                                save_dir=None,
                                show=False,
                                nepali_font=nepali_font
                            )
                            st.subheader("Captum Integrated Gradients")
                            col1, col2 = st.columns(2)
                            with col1:
                                st.markdown("**Bar Chart**")
                                st.pyplot(result['bar_chart'])
                            with col2:
                                st.markdown("**Heatmap**")
                                st.pyplot(result['heatmap'])
                            with st.expander("📊 Attribution Details"):
                                st.write(f"**Convergence Delta:** {result['explanation']['convergence_delta']:.6f}")
                                word_attrs = result['explanation']['word_attributions']
                                df = pd.DataFrame(word_attrs, columns=['Word', 'Abs Score', 'Signed Score'])
                                df = df.sort_values('Abs Score', ascending=False)
                                st.dataframe(df, hide_index=True, use_container_width=True)
                        except (MemoryError, RuntimeError) as mem_err:
                            st.error("❌ Captum (Integrated Gradients) requires more memory than available on this server.")
                            st.info("💡 **Tip:** Use LIME or SHAP instead — they work great on cloud deployments. Captum works on local machines with more RAM/GPU.")

                except Exception as e:
                    st.error(f"❌ Explanation failed: {str(e)}")
                    with st.expander("🐛 Error Details"):
                        st.exception(e)

    # ========================================================================
    # TAB 3: BATCH ANALYSIS
    # ========================================================================

    with tabs[2]:
        st.subheader("📝 Batch Analysis")

        st.markdown("### 📥 Download Example Files")
        col1, col2 = st.columns(2)

        with col1:
            example_csv_data = {
                'text': [
                    'यो राम्रो छ',
                    'तिमी मुर्ख हौ',
                    'मुस्लिम हरु सबै खराब छन्',
                    'केटीहरु घरमा बस्नु पर्छ',
                    'नमस्ते, कस्तो छ?'
                ]
            }
            example_csv = pd.DataFrame(example_csv_data).to_csv(index=False)
            st.download_button(
                label="📄 Download Example CSV",
                data=example_csv,
                file_name="example_batch.csv",
                mime="text/csv",
                use_container_width=True
            )

        with col2:
            example_text = "यो राम्रो छ\nतिमी मुर्ख हौ\nमुस्लिम हरु सबै खराब छन्\nकेटीहरु घरमा बस्नु पर्छ\nनमस्ते, कस्तो छ?"
            st.download_button(
                label="📝 Download Example Text",
                data=example_text,
                file_name="example_batch.txt",
                mime="text/plain",
                use_container_width=True
            )

        st.markdown("---")

        input_method = st.radio("Input method:", ["Text Area", "CSV Upload"])

        # ---- TEXT AREA ----
        if input_method == "Text Area":
            st.info("💡 Enter one text per line")

            batch_text = st.text_area(
                "Enter texts (one per line)",
                height=250,
                placeholder="यो राम्रो छ\nतिमी मुर्ख हौ\n..."
            )

            if st.button("🚀 Analyze Batch", type="primary"):
                if batch_text.strip():
                    texts = [line.strip() for line in batch_text.split('\n') if line.strip()]

                    with st.spinner(f"Analyzing {len(texts)} texts..."):
                        results = []
                        progress_bar = st.progress(0)

                        for idx, text in enumerate(texts):
                            try:
                                result = predict_text(
                                    text, model, tokenizer,
                                    label_encoder, preprocessor
                                )
                                results.append({
                                    'Text': text[:60] + '...' if len(text) > 60 else text,
                                    'Full_Text': text,
                                    'Prediction': result['prediction'],
                                    'Confidence': result['confidence'],
                                    'Preprocessed': result['preprocessed_text']
                                })
                            except Exception as e:
                                results.append({
                                    'Text': text[:60],
                                    'Full_Text': text,
                                    'Prediction': 'Error',
                                    'Confidence': 0.0,
                                    'Preprocessed': str(e)
                                })
                            progress_bar.progress((idx + 1) / len(texts))

                        st.session_state.batch_results = pd.DataFrame(results)
                        st.session_state.batch_mode = 'text_area'

                        for result in results:
                            if result['Prediction'] != 'Error':
                                st.session_state.session_predictions += 1
                                pred_label = result['Prediction']
                                if pred_label in st.session_state.session_class_counts:
                                    st.session_state.session_class_counts[pred_label] += 1

                        st.rerun()
                else:
                    st.warning("Please enter some texts.")

            # Display results outside button block
            if (st.session_state.batch_results is not None and
                    st.session_state.get('batch_mode') == 'text_area'):

                results_df = st.session_state.batch_results

                st.success(f"✅ Analyzed {len(results_df)} texts!")

                display_df = results_df[['Text', 'Prediction', 'Confidence']].copy()
                display_df['Confidence'] = display_df['Confidence'].apply(lambda x: f"{x:.2%}")
                st.dataframe(display_df, use_container_width=True, hide_index=True, height=400)

                st.markdown("---")
                st.subheader("📊 Summary Statistics")

                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("Total Texts", len(results_df))
                    st.metric("Avg Confidence", f"{results_df['Confidence'].mean():.2%}")
                with col2:
                    summary = results_df['Prediction'].value_counts()
                    fig = px.pie(
                        values=summary.values,
                        names=summary.index,
                        title="Prediction Distribution",
                        color_discrete_sequence=px.colors.qualitative.Set2
                    )
                    st.plotly_chart(fig, use_container_width=True)
                with col3:
                    st.markdown("**Class Breakdown:**")
                    for label, count in summary.items():
                        pct = count / len(results_df) * 100
                        st.write(f"• {label}: {count} ({pct:.1f}%)")

                st.markdown("---")
                download_df = results_df[['Full_Text', 'Prediction', 'Confidence', 'Preprocessed']].copy()
                download_df.columns = ['Text', 'Prediction', 'Confidence', 'Preprocessed']
                csv = download_df.to_csv(index=False)

                col_download, col_explain = st.columns(2)
                with col_download:
                    st.download_button(
                        label="📥 Download Results CSV",
                        data=csv,
                        file_name=f"batch_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                        mime="text/csv",
                        use_container_width=True,
                        key="download_batch_text"
                    )
                with col_explain:
                    if st.button("💡 Explain Selected", use_container_width=True, key="hint_batch_text"):
                        st.info("👇 Select a text below to explain")

                render_batch_explainability(
                    results_df=results_df,
                    text_column='Full_Text',
                    model=model,
                    tokenizer=tokenizer,
                    label_encoder=label_encoder,
                    preprocessor=preprocessor,
                    nepali_font=nepali_font,
                    explainability_available=explainability_available,
                    captum_available=captum_available,
                    mode_key="text_area"
                )

        # ---- CSV UPLOAD ----
        else:
            st.info("💡 Upload CSV with a 'text' column")

            uploaded_file = st.file_uploader(
                "Choose CSV file", 
                type=['csv'],
                help="Max 200MB. Upload a CSV with a text column containing Nepali text."
            )

            if uploaded_file:
                try:
                    # Try multiple encodings for Nepali text compatibility
                    try:
                        df = pd.read_csv(uploaded_file, encoding='utf-8')
                    except UnicodeDecodeError:
                        uploaded_file.seek(0)
                        df = pd.read_csv(uploaded_file, encoding='latin-1')
                    st.write("📄 **File Preview:**")
                    st.dataframe(df.head(10), use_container_width=True)

                    text_column = st.selectbox("Select text column:", df.columns)

                    if st.button("🚀 Analyze CSV", type="primary"):
                        texts = df[text_column].astype(str).tolist()

                        with st.spinner(f"Analyzing {len(texts)} texts..."):
                            predictions = []
                            confidences = []
                            preprocessed_list = []
                            progress_bar = st.progress(0)

                            for idx, text in enumerate(texts):
                                try:
                                    result = predict_text(
                                        str(text), model, tokenizer,
                                        label_encoder, preprocessor
                                    )
                                    predictions.append(result['prediction'])
                                    confidences.append(result['confidence'])
                                    preprocessed_list.append(result['preprocessed_text'])
                                except Exception as e:
                                    predictions.append('Error')
                                    confidences.append(0.0)
                                    preprocessed_list.append(str(e))
                                progress_bar.progress((idx + 1) / len(texts))

                            df['Prediction'] = predictions
                            df['Confidence'] = confidences
                            df['Preprocessed'] = preprocessed_list

                            st.session_state.batch_results = df
                            st.session_state.batch_mode = 'csv'
                            st.session_state.csv_text_column = text_column

                            for pred in predictions:
                                if pred != 'Error':
                                    st.session_state.session_predictions += 1
                                    if pred in st.session_state.session_class_counts:
                                        st.session_state.session_class_counts[pred] += 1

                            st.rerun()

                    # Display results outside button block
                    if (st.session_state.batch_results is not None and
                            st.session_state.get('batch_mode') == 'csv'):

                        df_results = st.session_state.batch_results
                        text_col = st.session_state.get('csv_text_column', text_column)

                        st.success("✅ Analysis complete!")
                        st.dataframe(df_results, use_container_width=True, height=400)

                        st.markdown("---")
                        st.subheader("📊 Summary")

                        col1, col2 = st.columns(2)
                        with col1:
                            summary = df_results['Prediction'].value_counts()
                            fig = px.bar(
                                x=summary.index,
                                y=summary.values,
                                title="Prediction Distribution",
                                labels={'x': 'Class', 'y': 'Count'},
                                color=summary.index,
                                color_discrete_map={
                                    'NO': '#28a745',
                                    'OO': '#ffc107',
                                    'OR': '#dc3545',
                                    'OS': '#6f42c1'
                                }
                            )
                            st.plotly_chart(fig, use_container_width=True)
                        with col2:
                            st.metric("Total Texts", len(df_results))
                            st.metric("Avg Confidence", f"{df_results['Confidence'].mean():.2%}")
                            st.markdown("**Class Distribution:**")
                            for label, count in summary.items():
                                st.write(f"• {label}: {count}")

                        st.markdown("---")
                        csv_data = df_results.to_csv(index=False)

                        col_download, col_explain = st.columns(2)
                        with col_download:
                            st.download_button(
                                label="📥 Download Results CSV",
                                data=csv_data,
                                file_name=f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                                mime="text/csv",
                                use_container_width=True,
                                key="download_csv_results"
                            )
                        with col_explain:
                            if st.button("💡 Explain Selected", use_container_width=True, key="csv_explain_hint"):
                                st.info("👇 Use expander below to explain")

                        render_batch_explainability(
                            results_df=df_results,
                            text_column=text_col,
                            model=model,
                            tokenizer=tokenizer,
                            label_encoder=label_encoder,
                            preprocessor=preprocessor,
                            nepali_font=nepali_font,
                            explainability_available=explainability_available,
                            captum_available=captum_available,
                            mode_key="csv"
                        )

                except Exception as e:
                    st.error(f"❌ Error processing file: {str(e)}")
                    with st.expander("🐛 Error Details"):
                        st.exception(e)

    # ========================================================================
    # TAB 4: HISTORY
    # ========================================================================

    with tabs[3]:
        st.subheader("📈 Prediction History")

        col1, col2 = st.columns([3, 1])
        with col1:
            st.write("View and analyze your prediction history")
        with col2:
            if st.button("🔄 Refresh", use_container_width=True):
                st.rerun()

        if os.path.exists(HISTORY_FILE):
            try:
                with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
                    history = json.load(f)

                if history:
                    history_df = pd.DataFrame(history)
                    history_df['timestamp'] = pd.to_datetime(history_df['timestamp'])

                    st.markdown("### 📊 Overview")
                    col1, col2, col3, col4 = st.columns(4)
                    with col1:
                        st.metric("Total Predictions", len(history_df))
                    with col2:
                        st.metric("Avg Confidence", f"{history_df['confidence'].mean():.2%}")
                    with col3:
                        if 'emoji_features' in history_df.columns:
                            total_emojis = sum(
                                e.get('total_emoji_count', 0)
                                for e in history_df['emoji_features']
                                if isinstance(e, dict)
                            )
                            st.metric("Total Emojis", total_emojis)
                        else:
                            st.metric("Total Emojis", "N/A")
                    with col4:
                        most_common = history_df['prediction'].mode()[0]
                        st.metric("Most Common", most_common)

                    st.markdown("---")
                    st.markdown("### 📈 Trends")

                    col1, col2 = st.columns(2)
                    with col1:
                        daily_counts = history_df.groupby(
                            history_df['timestamp'].dt.date
                        ).size().reset_index(name='count')
                        fig = px.line(
                            daily_counts,
                            x='timestamp',
                            y='count',
                            title="Predictions Over Time",
                            labels={'timestamp': 'Date', 'count': 'Count'}
                        )
                        st.plotly_chart(fig, use_container_width=True)
                    with col2:
                        class_dist = history_df['prediction'].value_counts()
                        fig = px.pie(
                            values=class_dist.values,
                            names=class_dist.index,
                            title="Class Distribution",
                            color=class_dist.index,
                            color_discrete_map={
                                'NO': '#28a745',
                                'OO': '#ffc107',
                                'OR': '#dc3545',
                                'OS': '#6f42c1'
                            }
                        )
                        st.plotly_chart(fig, use_container_width=True)

                    st.markdown("---")
                    st.markdown("### 📋 Recent Predictions")

                    num_to_show = st.slider("Number to show", 5, 50, 20, 5)
                    recent = history_df.tail(num_to_show).sort_values('timestamp', ascending=False)
                    display = recent[['timestamp', 'text', 'prediction', 'confidence']].copy()
                    display['confidence'] = display['confidence'].apply(lambda x: f"{x:.2%}")
                    display['text'] = display['text'].apply(lambda x: x[:80] + '...' if len(x) > 80 else x)
                    display['timestamp'] = display['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')
                    st.dataframe(display, use_container_width=True, hide_index=True, height=400)

                    st.markdown("---")
                    col1, col2 = st.columns(2)
                    with col1:
                        csv = history_df.to_csv(index=False)
                        st.download_button(
                            label="📥 Download Full History",
                            data=csv,
                            file_name=f"history_{datetime.now().strftime('%Y%m%d')}.csv",
                            mime="text/csv",
                            use_container_width=True
                        )
                    with col2:
                        if st.button("🗑️ Clear History", type="secondary", use_container_width=True):
                            if os.path.exists(HISTORY_FILE):
                                os.remove(HISTORY_FILE)
                                st.success("✅ History cleared!")
                                st.rerun()
                else:
                    st.info("📝 No predictions in history yet.")

            except Exception as e:
                st.error(f"❌ Error loading history: {str(e)}")
                with st.expander("🐛 Error Details"):
                    st.exception(e)
        else:
            st.info("📝 No history file found yet.")
            st.markdown("""
            ### How to Build History:
            1. Go to **Single Prediction** tab
            2. Enable "Save to history" checkbox
            3. Analyze some text
            4. Your predictions will appear here!
            """)


if __name__ == "__main__":
    main()