Spaces:
Sleeping
Sleeping
The initial deployment
Browse files- Dockerfile +41 -7
- README.md +36 -12
- requirements.txt +34 -3
- scripts/__init__.py +0 -0
- scripts/captum_explainer.py +710 -0
- scripts/explainability.py +884 -0
- scripts/transformer_data_preprocessing.py +786 -0
- src/streamlit_app.py +1421 -35
Dockerfile
CHANGED
|
@@ -1,20 +1,54 @@
|
|
| 1 |
-
FROM python:3.
|
| 2 |
|
|
|
|
| 3 |
WORKDIR /app
|
| 4 |
|
|
|
|
|
|
|
| 5 |
RUN apt-get update && apt-get install -y \
|
| 6 |
build-essential \
|
| 7 |
curl \
|
| 8 |
git \
|
|
|
|
| 9 |
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
|
| 11 |
-
|
| 12 |
-
COPY
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
|
|
|
| 17 |
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Base image: slim Python 3.11 keeps the final image small
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies
# build-essential/curl/git support building and fetching pip packages;
# fontconfig is needed to register and cache the Kalimati font
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    git \
    fontconfig \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better Docker layer caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy all app files (includes fonts/Kalimati.ttf)
COPY . .

# Register Kalimati font with the system font cache
# This allows matplotlib to find it system-wide as a fallback
RUN if [ -f /app/fonts/Kalimati.ttf ]; then \
    mkdir -p /usr/local/share/fonts/nepali && \
    cp /app/fonts/Kalimati.ttf /usr/local/share/fonts/nepali/ && \
    fc-cache -fv; \
    fi

# Set environment variables.
# Caches are redirected to /tmp so the runtime user can write them.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases in favor of HF_HOME (already set) — kept for older versions.
ENV PYTHONPATH=/app
ENV HF_HOME=/tmp/huggingface
ENV TRANSFORMERS_CACHE=/tmp/huggingface
ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
ENV STREAMLIT_SERVER_PORT=7860
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
ENV STREAMLIT_SERVER_HEADLESS=true
ENV MPLCONFIGDIR=/tmp/matplotlib

# Create necessary temp directories
RUN mkdir -p /tmp/huggingface /tmp/matplotlib

# Expose the port HF Spaces expects
EXPOSE 7860

# Run the Streamlit app
# NOTE(review): entrypoint is main_app.py, but this commit modifies
# src/streamlit_app.py — confirm main_app.py exists at the image root.
CMD ["streamlit", "run", "main_app.py", \
    "--server.port=7860", \
    "--server.address=0.0.0.0", \
    "--server.headless=true", \
    "--browser.gatherUsageStats=false"]
|
README.md
CHANGED
|
@@ -1,20 +1,44 @@
|
|
| 1 |
---
|
| 2 |
-
title: Nepali Hate Classification
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: red
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
-
|
| 8 |
-
tags:
|
| 9 |
-
- streamlit
|
| 10 |
-
pinned: false
|
| 11 |
-
short_description: Multi-class Nepali hate content classification system suppor
|
| 12 |
license: mit
|
|
|
|
| 13 |
---
|
| 14 |
|
| 15 |
-
#
|
| 16 |
|
| 17 |
-
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Nepali Hate Content Classification
|
| 3 |
+
emoji: 🔍
|
| 4 |
colorFrom: red
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: docker
|
| 7 |
+
pinned: true
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
license: mit
|
| 9 |
+
short_description: Multi-class hate content classifier for Nepali social media text
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# Nepali Hate Content Classification System
|
| 13 |
|
| 14 |
+
An interactive web application for classifying hate content in Nepali social media text. Supports **Devanagari script**, **Romanized Nepali**, **English**, and **code-mixed** inputs.
|
| 15 |
|
| 16 |
+
## Models
|
| 17 |
+
|
| 18 |
+
| Model | HF Repo |
|
| 19 |
+
|-------|---------|
|
| 20 |
+
| XLM-RoBERTa Large | [UDHOV/xlm-roberta-large-nepali-hate-classification](https://huggingface.co/UDHOV/xlm-roberta-large-nepali-hate-classification) |
|
| 21 |
+
| NepaliBERT | [UDHOV/nepalibert-nepali-hate-classification](https://huggingface.co/UDHOV/nepalibert-nepali-hate-classification) |
|
| 22 |
+
|
| 23 |
+
## Classes
|
| 24 |
+
|
| 25 |
+
| Label | Description |
|
| 26 |
+
|-------|-------------|
|
| 27 |
+
| 🟢 NON_OFFENSIVE | No offensive content |
|
| 28 |
+
| 🟡 OTHER_OFFENSIVE | General offensive content |
|
| 29 |
+
| 🔴 OFFENSIVE_RACIST | Targets ethnicity, race, or caste |
|
| 30 |
+
| 🔴 OFFENSIVE_SEXIST | Targets gender |
|
| 31 |
+
|
| 32 |
+
## Features
|
| 33 |
+
|
| 34 |
+
- Single text and batch (CSV) classification
|
| 35 |
+
- Automatic script detection and preprocessing
|
| 36 |
+
- Emoji semantic mapping (180+ emojis)
|
| 37 |
+
- Confidence scores with visualization
|
| 38 |
+
- Explainability via LIME, SHAP, and Integrated Gradients (Captum)
|
| 39 |
+
- Prediction history tracking
|
| 40 |
+
|
| 41 |
+
## Project
|
| 42 |
+
|
| 43 |
+
Bachelor of Computer Engineering Final Project
|
| 44 |
+
Khwopa College of Engineering, Tribhuvan University, Nepal (2026)
|
requirements.txt
CHANGED
|
@@ -1,3 +1,34 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core
|
| 2 |
+
streamlit>=1.32.0
|
| 3 |
+
pandas>=1.5.0
|
| 4 |
+
numpy>=1.23.0
|
| 5 |
+
|
| 6 |
+
# Deep Learning
|
| 7 |
+
torch>=2.0.0
|
| 8 |
+
transformers>=4.38.0
|
| 9 |
+
sentencepiece>=0.1.99
|
| 10 |
+
protobuf>=3.20.0
|
| 11 |
+
|
| 12 |
+
# HuggingFace
|
| 13 |
+
huggingface-hub>=0.21.0
|
| 14 |
+
accelerate>=0.27.0
|
| 15 |
+
|
| 16 |
+
# Preprocessing
|
| 17 |
+
deep-translator>=1.11.4
|
| 18 |
+
indic-transliteration>=2.3.0
|
| 19 |
+
emoji>=2.10.0
|
| 20 |
+
|
| 21 |
+
# Explainability
|
| 22 |
+
lime>=0.2.0.1
|
| 23 |
+
shap>=0.44.0
|
| 24 |
+
captum>=0.7.0
|
| 25 |
+
|
| 26 |
+
# Visualization
|
| 27 |
+
plotly>=5.18.0
|
| 28 |
+
matplotlib>=3.7.0
|
| 29 |
+
|
| 30 |
+
# Utilities
|
| 31 |
+
scikit-learn>=1.3.0
|
| 32 |
+
joblib>=1.3.0
|
| 33 |
+
scipy>=1.10.0
|
| 34 |
+
tqdm>=4.65.0
|
scripts/__init__.py
ADDED
|
File without changes
|
scripts/captum_explainer.py
ADDED
|
@@ -0,0 +1,710 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Captum Explainer Module
|
| 3 |
+
========================
|
| 4 |
+
Gradient-based explainability using Captum's Integrated Gradients.
|
| 5 |
+
|
| 6 |
+
This module provides:
|
| 7 |
+
- Layer Integrated Gradients attribution
|
| 8 |
+
- Token-level importance visualization
|
| 9 |
+
- Emoji-aware visualization with Nepali font support
|
| 10 |
+
- Heatmap and bar chart visualizations
|
| 11 |
+
|
| 12 |
+
Usage:
|
| 13 |
+
------
|
| 14 |
+
from scripts.captum_explainer import CaptumExplainer, explain_with_captum
|
| 15 |
+
|
| 16 |
+
# Create explainer
|
| 17 |
+
explainer = CaptumExplainer(model, tokenizer, label_encoder, preprocessor)
|
| 18 |
+
|
| 19 |
+
# Explain prediction
|
| 20 |
+
result = explainer.explain(
|
| 21 |
+
original_text="Your text here",
|
| 22 |
+
n_steps=50,
|
| 23 |
+
nepali_font=font
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# Visualize
|
| 27 |
+
explainer.visualize_bar_chart(result, save_path="ig_bar.png")
|
| 28 |
+
explainer.visualize_heatmap(result, save_path="ig_heatmap.png")
|
| 29 |
+
|
| 30 |
+
# All-in-one
|
| 31 |
+
result = explainer.explain_and_visualize(
|
| 32 |
+
original_text="Your text",
|
| 33 |
+
save_dir="./explanations",
|
| 34 |
+
show=True
|
| 35 |
+
)
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
import os
|
| 39 |
+
import numpy as np
|
| 40 |
+
import torch
|
| 41 |
+
import re
|
| 42 |
+
import emoji
|
| 43 |
+
import regex
|
| 44 |
+
import warnings
|
| 45 |
+
warnings.filterwarnings("ignore")
|
| 46 |
+
|
| 47 |
+
from typing import Dict, List, Tuple, Optional
|
| 48 |
+
from matplotlib import pyplot as plt, cm
|
| 49 |
+
from matplotlib.font_manager import FontProperties
|
| 50 |
+
import matplotlib.colors as mcolors
|
| 51 |
+
|
| 52 |
+
# Captum
|
| 53 |
+
try:
|
| 54 |
+
from captum.attr import LayerIntegratedGradients
|
| 55 |
+
CAPTUM_AVAILABLE = True
|
| 56 |
+
except ImportError:
|
| 57 |
+
CAPTUM_AVAILABLE = False
|
| 58 |
+
print("⚠️ Captum not installed. Install with: pip install captum")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# ============================================================================
|
| 62 |
+
# TOKEN ALIGNMENT WITH EMOJI PRESERVATION
|
| 63 |
+
# ============================================================================
|
| 64 |
+
|
| 65 |
+
def create_display_tokens_from_subwords(
    original_text: str,
    preprocessed_text: str,
    tokenizer_tokens: List[str],
    emoji_to_nepali_map: Dict[str, str],
    remove_special: bool = True
) -> List[str]:
    """
    Create display tokens that preserve emojis from the original text.

    Maps preprocessed tokens (with emoji translations) back to original
    tokens (with actual emojis).

    Args:
        original_text: Original text with emojis (e.g., "तेरी कसम 😀😀")
        preprocessed_text: Preprocessed text (e.g., "तेरी कसम खुशी खुशी").
            NOTE(review): currently unused in the body — kept for API stability.
        tokenizer_tokens: Tokenized output from the model tokenizer.
        emoji_to_nepali_map: Emoji → Nepali translation dictionary.
        remove_special: Whether to drop special tokens (<s>, [CLS], ...).

    Returns:
        List of display tokens with emojis preserved (e.g., ["तेरी", "कसम", "😀", "😀"])
    """
    # Build reverse emoji mapping (Nepali text → emoji).
    # Multi-word translations like "ठूलो रिस" are tracked separately so the
    # whole phrase can later be collapsed back into a single emoji token.
    reverse_emoji_map = {}
    multi_word_emoji_map = {}  # For phrases like "ठूलो रिस"

    for emoji_char, nepali_text in emoji_to_nepali_map.items():
        if ' ' in nepali_text:
            # Multi-word translation
            multi_word_emoji_map[nepali_text] = emoji_char
            # Also map individual words (as fallback); first mapping wins
            for word in nepali_text.split():
                if word not in reverse_emoji_map:
                    reverse_emoji_map[word] = emoji_char
        else:
            # Single word translation
            reverse_emoji_map[nepali_text] = emoji_char

    # Clean and group tokenizer output into words.
    # NOTE(review): the "▁" prefix is SentencePiece-specific; BERT-style "##"
    # continuation tokens are not handled here — confirm tokenizer family.
    word_pieces = []
    current_word = ""

    for tok in tokenizer_tokens:
        # Skip special tokens if requested
        if remove_special and tok in ['<s>', '</s>', '[CLS]', '[SEP]', '<pad>', '[PAD]']:
            continue

        if tok.startswith("▁"):
            # New word begins — flush the previous one
            if current_word:
                word_pieces.append(current_word)
            current_word = tok.replace("▁", "")
        else:
            # Continue current word
            current_word += tok.replace("▁", "")

    if current_word:
        word_pieces.append(current_word)

    # Get original words
    original_words = original_text.split()

    # Map word_pieces back to original with emojis
    display_tokens = []
    orig_idx = 0
    word_idx = 0

    while word_idx < len(word_pieces):
        word = word_pieces[word_idx]

        # Check for multi-word emoji phrases first (two-word lookahead only)
        if word_idx < len(word_pieces) - 1:
            two_word_phrase = f"{word} {word_pieces[word_idx + 1]}"
            if two_word_phrase in multi_word_emoji_map:
                # Found a multi-word emoji translation - show emoji once
                display_tokens.append(multi_word_emoji_map[two_word_phrase])
                word_idx += 2  # Skip both words
                continue

        # Check if this single word is an emoji translation
        if word in reverse_emoji_map:
            # This is a Nepali emoji translation → use the actual emoji
            display_tokens.append(reverse_emoji_map[word])
            word_idx += 1
        else:
            # Regular word - try to match with original
            matched = False

            # Look for matching word in original (orig_idx only moves forward,
            # so overall matching is linear in the original word count)
            while orig_idx < len(original_words):
                orig_word = original_words[orig_idx]

                # Skip emojis in original (they're handled by reverse_emoji_map)
                if any(c in emoji.EMOJI_DATA for c in orig_word):
                    orig_idx += 1
                    continue

                # Check if words match; substring match in either direction
                # tolerates tokenizer normalization differences
                orig_clean = emoji.replace_emoji(orig_word, replace="").strip()
                if orig_clean and (word in orig_clean or orig_clean in word or word == orig_clean):
                    display_tokens.append(orig_word)
                    matched = True
                    orig_idx += 1
                    break

                orig_idx += 1

            if not matched:
                # Couldn't match - use the reconstructed word as-is
                display_tokens.append(word)

            word_idx += 1

    return display_tokens
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
# ============================================================================
|
| 183 |
+
# FONT HANDLING
|
| 184 |
+
# ============================================================================
|
| 185 |
+
|
| 186 |
+
def apply_nepali_font(ax_or_text, nepali_font: Optional[FontProperties] = None,
                      is_axis: bool = True):
    """
    Apply a Nepali font to text that contains Devanagari but no emojis.

    Mixed emoji/Devanagari strings are left on the default font, since the
    Nepali font cannot render emoji glyphs.

    Args:
        ax_or_text: Matplotlib axis (is_axis=True) or text object (False).
        nepali_font: Font properties to apply; no-op when None.
        is_axis: Selects whether tick labels or a single text object are styled.
    """
    if nepali_font is None:
        return

    def _wants_nepali_font(content: str) -> bool:
        # Eligible only when Devanagari is present and no emoji is mixed in
        if not regex.search(r'\p{Devanagari}', content):
            return False
        return not any(ch in emoji.EMOJI_DATA for ch in content)

    if is_axis:
        # Restyle each x tick label that qualifies
        for label in ax_or_text.get_xticklabels():
            if _wants_nepali_font(label.get_text()):
                label.set_fontproperties(nepali_font)
                label.set_fontsize(11)
    elif _wants_nepali_font(ax_or_text.get_text()):
        # Single text object: font only, size untouched
        ax_or_text.set_fontproperties(nepali_font)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
# ============================================================================
|
| 221 |
+
# CAPTUM EXPLAINER CLASS
|
| 222 |
+
# ============================================================================
|
| 223 |
+
|
| 224 |
+
class CaptumExplainer:
|
| 225 |
+
"""
|
| 226 |
+
Captum Integrated Gradients explainer with emoji support
|
| 227 |
+
"""
|
| 228 |
+
|
| 229 |
+
def __init__(self, model, tokenizer, label_encoder, preprocessor,
|
| 230 |
+
emoji_to_nepali_map: Optional[Dict[str, str]] = None,
|
| 231 |
+
device=None, max_length: int = 256):
|
| 232 |
+
"""
|
| 233 |
+
Args:
|
| 234 |
+
model: Trained model
|
| 235 |
+
tokenizer: Model tokenizer
|
| 236 |
+
label_encoder: Label encoder
|
| 237 |
+
preprocessor: HateSpeechPreprocessor instance
|
| 238 |
+
emoji_to_nepali_map: Emoji to Nepali mapping (optional)
|
| 239 |
+
device: torch device (auto-detected if None)
|
| 240 |
+
max_length: Maximum sequence length
|
| 241 |
+
"""
|
| 242 |
+
if not CAPTUM_AVAILABLE:
|
| 243 |
+
raise ImportError("Captum not installed. Install with: pip install captum")
|
| 244 |
+
|
| 245 |
+
self.model = model
|
| 246 |
+
self.tokenizer = tokenizer
|
| 247 |
+
self.label_encoder = label_encoder
|
| 248 |
+
self.preprocessor = preprocessor
|
| 249 |
+
self.class_names = label_encoder.classes_.tolist()
|
| 250 |
+
self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 251 |
+
self.max_length = max_length
|
| 252 |
+
self.emoji_to_nepali_map = emoji_to_nepali_map or {}
|
| 253 |
+
|
| 254 |
+
self.model.to(self.device).eval()
|
| 255 |
+
|
| 256 |
+
# Get embedding layer (model-specific)
|
| 257 |
+
self.embedding_layer = self._get_embedding_layer()
|
| 258 |
+
|
| 259 |
+
def _get_embedding_layer(self):
|
| 260 |
+
"""Get the embedding layer from the model"""
|
| 261 |
+
# Try different model architectures
|
| 262 |
+
if hasattr(self.model, 'roberta'):
|
| 263 |
+
# XLM-RoBERTa
|
| 264 |
+
return self.model.roberta.embeddings.word_embeddings
|
| 265 |
+
elif hasattr(self.model, 'bert'):
|
| 266 |
+
# BERT-based
|
| 267 |
+
return self.model.bert.embeddings.word_embeddings
|
| 268 |
+
elif hasattr(self.model, 'transformer'):
|
| 269 |
+
# Generic transformer
|
| 270 |
+
return self.model.transformer.wte
|
| 271 |
+
else:
|
| 272 |
+
raise AttributeError("Could not find embedding layer. Please specify manually.")
|
| 273 |
+
|
| 274 |
+
    def explain(self, original_text: str, target: Optional[int] = None,
                n_steps: int = 50) -> Dict:
        """
        Generate an Integrated Gradients explanation for one input text.

        Args:
            original_text: Original text with emojis.
            target: Class index to attribute toward (None = predicted class).
            n_steps: Number of IG interpolation steps; more steps tightens
                convergence at higher compute cost.

        Returns:
            Dict with keys: original_text, preprocessed_text, emoji_features,
            predicted_label, predicted_index, confidence, probabilities,
            word_attributions, convergence_delta, tokens, display_tokens.

        Raises:
            ValueError: if preprocessing yields an empty string.
        """
        # Preprocess (emoji → Nepali translation, script handling, etc.)
        preprocessed, emoji_features = self.preprocessor.preprocess(original_text, verbose=False)

        if not preprocessed:
            raise ValueError("Preprocessing resulted in empty text")

        # Tokenize to a fixed-length, padded batch of size 1
        encoding = self.tokenizer(
            preprocessed,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=self.max_length
        )
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        # Get prediction (no gradients needed for this forward pass)
        with torch.no_grad():
            out = self.model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.softmax(out.logits, dim=-1)[0].cpu().numpy()
        pred_idx = int(np.argmax(probs))
        pred_label = self.class_names[pred_idx]
        pred_conf = float(probs[pred_idx])

        # Default: explain the class the model actually predicted
        if target is None:
            target = pred_idx

        # Forward function for Captum: returns the target-class logit
        def forward_func(input_ids_arg, attention_mask_arg):
            """Forward function that takes input_ids"""
            return self.model(input_ids=input_ids_arg, attention_mask=attention_mask_arg).logits[:, target]

        # Initialize Integrated Gradients over the embedding layer
        lig = LayerIntegratedGradients(forward_func, self.embedding_layer)

        # Baseline: all pad tokens ("absence of input" reference point)
        baseline_ids = torch.full_like(input_ids, self.tokenizer.pad_token_id)

        # Calculate attributions plus the convergence delta diagnostic
        attributions, delta = lig.attribute(
            input_ids,
            baselines=baseline_ids,
            additional_forward_args=(attention_mask,),
            return_convergence_delta=True,
            n_steps=n_steps
        )

        # Sum across embedding dimension → one scalar per token position
        attributions_sum = attributions.sum(dim=-1).squeeze(0)

        # Get tokens; special tokens kept so indices align with attributions
        tokens = self.tokenizer.convert_ids_to_tokens(
            input_ids[0].cpu().tolist(),
            skip_special_tokens=False
        )

        # Create display tokens with emojis preserved
        display_tokens = create_display_tokens_from_subwords(
            original_text,
            preprocessed,
            tokens,
            self.emoji_to_nepali_map,
            remove_special=True
        )

        # Aggregate subword scores to word-level attributions
        word_attributions = self._aggregate_word_attributions(
            tokens, attributions_sum, display_tokens
        )

        return {
            "original_text": original_text,
            "preprocessed_text": preprocessed,
            "emoji_features": emoji_features,
            "predicted_label": pred_label,
            "predicted_index": pred_idx,
            "confidence": pred_conf,
            "probabilities": {label: float(prob) for label, prob in zip(self.class_names, probs)},
            "word_attributions": word_attributions,
            "convergence_delta": float(delta.sum().cpu().numpy()),
            "tokens": tokens,
            "display_tokens": display_tokens
        }
|
| 371 |
+
|
| 372 |
+
    def _aggregate_word_attributions(self, tokens: List[str], attributions_sum: torch.Tensor,
                                     display_tokens: List[str]) -> List[Tuple[str, float, float]]:
        """
        Aggregate subword attributions to word level.

        Subwords are grouped via the SentencePiece "▁" word-start marker; each
        group is reduced to an absolute-sum magnitude and a signed sum. Groups
        are then relabeled with emoji-preserving display tokens (when counts
        line up) and consecutive words forming a multi-word emoji translation
        are merged back into a single emoji entry.

        Args:
            tokens: Raw tokenizer tokens (special tokens included).
            attributions_sum: Per-token scalar attributions aligned with tokens.
            display_tokens: Emoji-preserving display tokens from
                create_display_tokens_from_subwords().

        Returns:
            List of (word, abs_score, signed_score) tuples
        """
        word_attributions = []
        current_indices = []

        for i, tok in enumerate(tokens):
            # Skip special tokens
            if tok in ['<s>', '</s>', '[CLS]', '[SEP]', '<pad>', '[PAD]']:
                continue

            if tok.startswith("▁"):
                # New word starts
                if current_indices:
                    # Save previous word: abs-sum magnitude + signed sum
                    grp_vals = attributions_sum[current_indices].detach().cpu().numpy()
                    score = float(np.sum(np.abs(grp_vals)))
                    signed_score = float(np.sum(grp_vals))
                    word = "".join([tokens[j].replace("▁", "") for j in current_indices])
                    word_attributions.append((word, score, signed_score))

                current_indices = [i]
            else:
                # Continue current word
                current_indices.append(i)

        # Don't forget last word
        if current_indices:
            grp_vals = attributions_sum[current_indices].detach().cpu().numpy()
            score = float(np.sum(np.abs(grp_vals)))
            signed_score = float(np.sum(grp_vals))
            word = "".join([tokens[j].replace("▁", "") for j in current_indices])
            word_attributions.append((word, score, signed_score))

        # Align with display tokens — only safe when counts agree, otherwise
        # fall back to the reconstructed subword words
        if len(display_tokens) == len(word_attributions):
            aligned_attributions = [
                (display_tok, score, signed_score)
                for display_tok, (_, score, signed_score) in zip(display_tokens, word_attributions)
            ]
        else:
            aligned_attributions = word_attributions

        # Post-process: merge attributions for multi-word emoji translations.
        # Collect the multi-word phrases present in the emoji map.
        multi_word_phrases = set()
        for emoji_char, nepali_text in self.emoji_to_nepali_map.items():
            if ' ' in nepali_text:
                multi_word_phrases.add(nepali_text)

        # Merge consecutive words that form a multi-word emoji phrase
        merged_attributions = []
        i = 0
        while i < len(aligned_attributions):
            word, score, signed_score = aligned_attributions[i]

            # Check if this word + next word(s) form a multi-word emoji phrase
            merged = False
            for phrase in multi_word_phrases:
                phrase_words = phrase.split()
                if i + len(phrase_words) <= len(aligned_attributions):
                    # Check if consecutive words match the phrase
                    candidate_words = [aligned_attributions[i + j][0] for j in range(len(phrase_words))]
                    candidate_phrase = ' '.join(candidate_words)

                    # Also check if any word is already the emoji (from display_tokens fix)
                    has_emoji = any(c in emoji.EMOJI_DATA for c in word)

                    # NOTE(review): the "has_emoji and len(phrase_words) > 1" arm merges
                    # the next len(phrase_words)-1 entries whenever the current entry is
                    # an emoji, regardless of which phrase is being tested — confirm this
                    # over-merge is intended.
                    if candidate_phrase == phrase or (has_emoji and len(phrase_words) > 1):
                        # Found a multi-word emoji phrase - merge their scores
                        total_abs_score = sum(aligned_attributions[i + j][1] for j in range(len(phrase_words)))
                        total_signed_score = sum(aligned_attributions[i + j][2] for j in range(len(phrase_words)))

                        # Find the corresponding emoji (phrase came from the map,
                        # so at least one match exists)
                        emoji_char = [e for e, n in self.emoji_to_nepali_map.items() if n == phrase][0]

                        merged_attributions.append((emoji_char, total_abs_score, total_signed_score))
                        i += len(phrase_words)  # Skip all words in the phrase
                        merged = True
                        break

            if not merged:
                merged_attributions.append((word, score, signed_score))
                i += 1

        return merged_attributions
|
| 463 |
+
|
| 464 |
+
def visualize_bar_chart(self, explanation: Dict, save_path: Optional[str] = None,
|
| 465 |
+
show: bool = True, nepali_font: Optional[FontProperties] = None,
|
| 466 |
+
figsize: Tuple[int, int] = None):
|
| 467 |
+
"""
|
| 468 |
+
Create bar chart visualization
|
| 469 |
+
|
| 470 |
+
Args:
|
| 471 |
+
explanation: Explanation dictionary from explain()
|
| 472 |
+
save_path: Path to save figure
|
| 473 |
+
show: Whether to display figure
|
| 474 |
+
nepali_font: Nepali font properties
|
| 475 |
+
figsize: Figure size (auto if None)
|
| 476 |
+
|
| 477 |
+
Returns:
|
| 478 |
+
matplotlib figure
|
| 479 |
+
"""
|
| 480 |
+
word_attributions = explanation['word_attributions']
|
| 481 |
+
pred_label = explanation['predicted_label']
|
| 482 |
+
pred_conf = explanation['confidence']
|
| 483 |
+
|
| 484 |
+
scores = [s for _, s, _ in word_attributions]
|
| 485 |
+
words = [w.replace('_', ' ') for w, _, _ in word_attributions] # Replace underscores
|
| 486 |
+
signed_scores = [ss for _, _, ss in word_attributions]
|
| 487 |
+
|
| 488 |
+
if figsize is None:
|
| 489 |
+
figsize = (max(8, 0.6 * len(words)), 5)
|
| 490 |
+
|
| 491 |
+
fig, ax = plt.subplots(figsize=figsize)
|
| 492 |
+
colors = ['green' if ss > 0 else 'red' for ss in signed_scores]
|
| 493 |
+
|
| 494 |
+
ax.bar(range(len(words)), scores, tick_label=words, color=colors, alpha=0.7)
|
| 495 |
+
ax.set_ylabel("Attribution (sum abs)", fontsize=12)
|
| 496 |
+
ax.set_title(
|
| 497 |
+
f"Integrated Gradients → Pred: {pred_label} ({pred_conf:.2%})",
|
| 498 |
+
fontsize=14,
|
| 499 |
+
fontweight='bold'
|
| 500 |
+
)
|
| 501 |
+
|
| 502 |
+
# Apply Nepali font
|
| 503 |
+
if nepali_font:
|
| 504 |
+
apply_nepali_font(ax, nepali_font, is_axis=True)
|
| 505 |
+
|
| 506 |
+
plt.xticks(rotation=45, ha='right')
|
| 507 |
+
plt.tight_layout()
|
| 508 |
+
|
| 509 |
+
if save_path:
|
| 510 |
+
plt.savefig(save_path, dpi=300, bbox_inches='tight')
|
| 511 |
+
print(f"✓ Bar chart saved to: {save_path}")
|
| 512 |
+
|
| 513 |
+
if show:
|
| 514 |
+
plt.show()
|
| 515 |
+
else:
|
| 516 |
+
plt.close(fig)
|
| 517 |
+
|
| 518 |
+
return fig
|
| 519 |
+
|
| 520 |
+
def visualize_heatmap(self, explanation: Dict, save_path: Optional[str] = None,
                      show: bool = True, nepali_font: Optional[FontProperties] = None,
                      figsize: Tuple[int, int] = None):
    """
    Create heatmap visualization with colored text boxes.

    Each token is drawn as a rounded box whose color encodes the sign of its
    Integrated Gradients attribution (green = supports the prediction,
    red = opposes it) and whose intensity encodes the magnitude.

    Args:
        explanation: Explanation dictionary from explain()
        save_path: Path to save figure (PNG, 300 dpi)
        show: Whether to display figure
        nepali_font: Nepali font properties for Devanagari tokens
        figsize: Figure size (auto-scaled to token count if None)

    Returns:
        matplotlib figure
    """
    word_attributions = explanation['word_attributions']
    pred_label = explanation['predicted_label']

    scores = [s for _, s, _ in word_attributions]
    # Guard against an empty attribution list to avoid max()/division errors
    max_score = max(scores) if scores else 1.0

    # FIX: cm.get_cmap() was deprecated in matplotlib 3.7 and removed in 3.9;
    # plt.get_cmap() is the supported equivalent.
    cmap = plt.get_cmap("RdYlGn")

    if figsize is None:
        figsize = (max(10, 0.6 * len(word_attributions)), 3)

    fig, ax = plt.subplots(figsize=figsize)
    ax.axis('off')

    # Cursor position in axes-fraction coordinates; tokens flow left-to-right
    # and wrap onto new rows below.
    x, y = 0.01, 0.6
    text_objs = []

    for word, score, signed_score in word_attributions:
        # Replace underscores with spaces for display
        display_word = word.replace('_', ' ')

        # Normalize magnitude into [0, 1] for color intensity
        intensity = min(score / max_score, 1.0) if max_score > 0 else 0.0

        # Color based on signed score: upper half of RdYlGn is green,
        # lower half is red
        if signed_score > 0:
            color = cmap(0.5 + intensity * 0.5)  # Green side
        else:
            color = cmap(0.5 - intensity * 0.5)  # Red side

        txt = ax.text(
            x, y, f" {display_word} ",
            fontsize=13,
            bbox=dict(
                facecolor=mcolors.to_hex(color),
                alpha=0.8,
                boxstyle="round,pad=0.3",
                edgecolor='gray'
            )
        )

        # Apply Nepali font only to Devanagari text (but not if it contains
        # emojis — the Nepali font has no emoji glyphs and would render boxes)
        has_emoji = any(c in emoji.EMOJI_DATA for c in display_word)
        has_devanagari = bool(regex.search(r'\p{Devanagari}', display_word))

        if nepali_font and has_devanagari and not has_emoji:
            txt.set_fontproperties(nepali_font)

        text_objs.append(txt)

        # Update position - emojis take less horizontal space
        char_width = 0.025 if any(c in emoji.EMOJI_DATA for c in display_word) else 0.04
        x += char_width * len(display_word) + 0.01

        # Wrap to the next row once the current one is nearly full
        if x > 0.92:
            x = 0.01
            y -= 0.35

    # Title
    ax.text(
        0.5, 0.95,
        f"Token Attributions (Predicted: {pred_label})",
        ha='center',
        va='top',
        fontsize=14,
        fontweight='bold',
        transform=ax.transAxes
    )

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"✓ Heatmap saved to: {save_path}")

    if show:
        plt.show()
    else:
        plt.close(fig)

    return fig
|
| 617 |
+
|
| 618 |
+
def explain_and_visualize(self, original_text: str, target: Optional[int] = None,
                          n_steps: int = 50, save_dir: Optional[str] = None,
                          show: bool = True, nepali_font: Optional[FontProperties] = None):
    """
    Explain and visualize in one step.

    Args:
        original_text: Original text with emojis
        target: Target class index (None = predicted)
        n_steps: Number of IG steps
        save_dir: Directory to save figures (created if missing)
        show: Whether to display figures
        nepali_font: Nepali font properties

    Returns:
        Dictionary with keys 'explanation', 'bar_chart', 'heatmap'
    """
    import hashlib

    # Generate explanation
    explanation = self.explain(original_text, target, n_steps)

    # Generate file paths if save_dir provided
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
        # FIX: the built-in hash() is salted per process (PYTHONHASHSEED),
        # so the same text produced a different filename on every run.
        # An md5 content digest gives a stable, reproducible suffix.
        digest = hashlib.md5(original_text.encode('utf-8')).hexdigest()[:8]
        hash_suffix = int(digest, 16)
        bar_path = os.path.join(save_dir, f"ig_bar_{explanation['predicted_label']}_{hash_suffix}.png")
        heatmap_path = os.path.join(save_dir, f"ig_heatmap_{explanation['predicted_label']}_{hash_suffix}.png")
    else:
        bar_path = None
        heatmap_path = None

    # Visualize (both figures share font/show settings)
    bar_fig = self.visualize_bar_chart(explanation, bar_path, show, nepali_font)
    heatmap_fig = self.visualize_heatmap(explanation, heatmap_path, show, nepali_font)

    return {
        'explanation': explanation,
        'bar_chart': bar_fig,
        'heatmap': heatmap_fig
    }
|
| 657 |
+
|
| 658 |
+
|
| 659 |
+
# ============================================================================
|
| 660 |
+
# CONVENIENCE FUNCTIONS
|
| 661 |
+
# ============================================================================
|
| 662 |
+
|
| 663 |
+
def explain_with_captum(text: str, model, tokenizer, label_encoder, preprocessor,
                        emoji_to_nepali_map: Optional[Dict[str, str]] = None,
                        n_steps: int = 50, nepali_font: Optional[FontProperties] = None,
                        save_dir: Optional[str] = None, show: bool = True) -> Dict:
    """
    One-shot convenience helper: build a CaptumExplainer for the given model
    and run its full explain-and-visualize pipeline on a single text.

    Args:
        text: Input text (may contain emojis)
        model: Trained model
        tokenizer: Model tokenizer
        label_encoder: Label encoder
        preprocessor: HateSpeechPreprocessor instance
        emoji_to_nepali_map: Emoji-to-Nepali mapping dictionary
        n_steps: Number of Integrated Gradients steps
        nepali_font: Nepali font properties for the visualizations
        save_dir: Directory to save figures (None = do not save)
        show: Whether to display figures

    Returns:
        Dictionary with explanation and visualizations
    """
    ig_explainer = CaptumExplainer(
        model,
        tokenizer,
        label_encoder,
        preprocessor,
        emoji_to_nepali_map=emoji_to_nepali_map,
    )

    return ig_explainer.explain_and_visualize(
        text,
        n_steps=n_steps,
        save_dir=save_dir,
        show=show,
        nepali_font=nepali_font,
    )
|
| 693 |
+
|
| 694 |
+
|
| 695 |
+
def check_availability() -> bool:
    """Report whether the optional Captum dependency imported successfully."""
    return bool(CAPTUM_AVAILABLE)
|
| 698 |
+
|
| 699 |
+
|
| 700 |
+
# ============================================================================
|
| 701 |
+
# DEFAULT EMOJI MAPPING (For standalone usage)
|
| 702 |
+
# ============================================================================
|
| 703 |
+
|
| 704 |
+
# Fallback emoji → Nepali-word mapping used when no custom mapping is supplied
# to the explainer (standalone usage). Keys are emoji characters; values are
# the Nepali gloss substituted into the model input.
DEFAULT_EMOJI_TO_NEPALI = {
    # Happiness / laughter
    '😀': 'खुशी', '😁': 'खुशी', '😂': 'हाँसो', '😃': 'खुशी', '😄': 'खुशी',
    '😅': 'नर्भस हाँसो', '😆': 'हाँसो', '😊': 'मुस्कान', '😍': 'माया',
    # Anger / abuse
    '😠': 'रिस', '😡': 'ठूलो रिस', '🤬': 'गाली', '😈': 'खराब',
    # Gestures (positive and negative)
    '🖕': 'अपमान', '👎': 'नकारात्मक', '👍': 'सकारात्मक', '🙏': 'नमस्कार',
    # Hearts and emphasis
    '❤️': 'माया', '💔': 'टुटेको मन', '🔥': 'आगो', '💯': 'पूर्ण',
}
|
scripts/explainability.py
ADDED
|
@@ -0,0 +1,884 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Explainability Module - LIME & SHAP
|
| 3 |
+
===================================
|
| 4 |
+
Model-agnostic explainability for Nepali hate speech classification.
|
| 5 |
+
|
| 6 |
+
This module provides:
|
| 7 |
+
- LIME (Local Interpretable Model-agnostic Explanations)
|
| 8 |
+
- SHAP (SHapley Additive exPlanations)
|
| 9 |
+
- Emoji-aware visualization with Nepali font support
|
| 10 |
+
|
| 11 |
+
Usage:
|
| 12 |
+
------
|
| 13 |
+
from scripts.explainability import LIMEExplainer, SHAPExplainer, create_explainer_wrapper
|
| 14 |
+
|
| 15 |
+
# Create model wrapper
|
| 16 |
+
wrapper = create_explainer_wrapper(model, tokenizer, label_encoder, preprocessor)
|
| 17 |
+
|
| 18 |
+
# LIME explanation
|
| 19 |
+
lime = LIMEExplainer(wrapper, nepali_font=font)
|
| 20 |
+
lime.explain_and_visualize(original_text, preprocessed_text, save_path="lime.png")
|
| 21 |
+
|
| 22 |
+
# SHAP explanation
|
| 23 |
+
shap_exp = SHAPExplainer(wrapper, nepali_font=font)
|
| 24 |
+
shap_exp.explain_and_visualize(original_text, preprocessed_text, save_path="shap.png")
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
import os
|
| 28 |
+
import numpy as np
|
| 29 |
+
import torch
|
| 30 |
+
import re
|
| 31 |
+
import emoji
|
| 32 |
+
import regex
|
| 33 |
+
import warnings
|
| 34 |
+
warnings.filterwarnings("ignore")
|
| 35 |
+
|
| 36 |
+
from typing import Optional, Tuple, Dict, List
|
| 37 |
+
import matplotlib.pyplot as plt
|
| 38 |
+
from matplotlib.font_manager import FontProperties
|
| 39 |
+
|
| 40 |
+
# Explainability libraries
|
| 41 |
+
try:
|
| 42 |
+
from lime.lime_text import LimeTextExplainer
|
| 43 |
+
LIME_AVAILABLE = True
|
| 44 |
+
except ImportError:
|
| 45 |
+
LIME_AVAILABLE = False
|
| 46 |
+
print("⚠️ LIME not installed. Install with: pip install lime")
|
| 47 |
+
|
| 48 |
+
try:
|
| 49 |
+
import shap
|
| 50 |
+
from shap import Explainer, maskers
|
| 51 |
+
SHAP_AVAILABLE = True
|
| 52 |
+
except ImportError:
|
| 53 |
+
SHAP_AVAILABLE = False
|
| 54 |
+
print("⚠️ SHAP not installed. Install with: pip install shap")
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# ============================================================================
|
| 58 |
+
# MODEL WRAPPER CLASS
|
| 59 |
+
# ============================================================================
|
| 60 |
+
|
| 61 |
+
class ModelExplainerWrapper:
    """
    Wrapper class for model + preprocessing.

    Adapts a transformer classifier (model + tokenizer) and the project's
    HateSpeechPreprocessor into the plain ``predict_proba(texts) -> ndarray``
    interface that LIME and SHAP expect.
    """

    def __init__(self, model, tokenizer, label_encoder, preprocessor, device=None):
        """
        Args:
            model: Trained model
            tokenizer: Model tokenizer
            label_encoder: Label encoder (provides .classes_)
            preprocessor: HateSpeechPreprocessor instance
            device: torch device (auto-detected if None)
        """
        self.model = model
        self.tokenizer = tokenizer
        self.class_names = label_encoder.classes_.tolist()
        self.preprocessor = preprocessor
        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device).eval()

    def preprocess_text(self, text: str) -> Tuple[str, Dict[str, int]]:
        """Preprocess text using the HateSpeechPreprocessor.

        Returns the (preprocessed_text, emoji_features) pair produced by
        the preprocessor.
        """
        return self.preprocessor.preprocess(text, verbose=False)

    def predict_proba(self, texts):
        """
        Predict class probabilities for one or more (already preprocessed) texts.

        FIX: LIME/SHAP require exactly one output row per input text, so
        empty / whitespace-only perturbations are no longer dropped from the
        batch (the previous implementation filtered them out, which
        desynchronised the returned rows from the input order). Empty inputs
        now receive a uniform probability row in their original position.

        Args:
            texts: Single text, list of texts, or numpy array of texts

        Returns:
            numpy array of probabilities, shape (len(texts), n_classes)
        """
        if isinstance(texts, str):
            texts = [texts]
        elif isinstance(texts, np.ndarray):
            texts = texts.tolist() if texts.ndim > 0 else [str(texts)]

        # Normalise every entry to a stripped string, preserving positions
        texts = [str(t).strip() for t in texts]

        n_classes = len(self.class_names)
        uniform_row = np.ones(n_classes, dtype=np.float64) / n_classes

        # Positions of texts that actually contain content
        non_empty = [i for i, t in enumerate(texts) if t]

        if not non_empty:
            # All-empty batch (or empty input): uniform probabilities,
            # at least one row so downstream indexing still works
            return np.tile(uniform_row, (max(len(texts), 1), 1))

        # Tokenize only the non-empty texts
        enc = self.tokenizer(
            [texts[i] for i in non_empty],
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        ).to(self.device)

        # Predict
        with torch.no_grad():
            probs = torch.softmax(self.model(**enc).logits, dim=-1).cpu().numpy()

        # Scatter predictions back to their original positions; empty
        # inputs keep the uniform row.
        out = np.tile(uniform_row, (len(texts), 1))
        for row, idx in zip(probs, non_empty):
            out[idx] = row
        return out

    def predict_with_analysis(self, text: str) -> Dict:
        """
        Predict with full analysis.

        Returns:
            Dictionary with the original text, preprocessed text, emoji
            features, predicted label, confidence, and the per-class
            probability map.
        """
        # Preprocess
        preprocessed, emoji_features = self.preprocess_text(text)

        # Predict
        probs = self.predict_proba(preprocessed)[0]
        pred_idx = int(np.argmax(probs))

        return {
            "original_text": text,
            "preprocessed_text": preprocessed,
            "emoji_features": emoji_features,
            "predicted_label": self.class_names[pred_idx],
            "confidence": float(probs[pred_idx]),
            "probabilities": {label: float(prob) for label, prob in zip(self.class_names, probs)}
        }
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# ============================================================================
|
| 149 |
+
# UTILITY FUNCTIONS
|
| 150 |
+
# ============================================================================
|
| 151 |
+
|
| 152 |
+
def apply_nepali_font(ax, nepali_font: Optional[FontProperties] = None,
                      texts: Optional[list] = None, is_tick_labels: bool = True):
    """
    Apply a Nepali font to Devanagari text while leaving emojis untouched.

    The Nepali font is only applied to labels that contain Devanagari and no
    emoji characters (the font lacks emoji glyphs).

    Args:
        ax: Matplotlib axes
        nepali_font: Nepali font properties (no-op when None)
        texts: Text objects to apply the font to (if not tick labels)
        is_tick_labels: Whether to apply to the y-axis tick labels
    """
    if nepali_font is None:
        return

    def _wants_nepali_font(content: str) -> bool:
        # Devanagari present and no emoji anywhere in the label
        if any(ch in emoji.EMOJI_DATA for ch in content):
            return False
        return bool(regex.search(r'\p{Devanagari}', content))

    if is_tick_labels or texts is None:
        for label in ax.get_yticklabels():
            if _wants_nepali_font(label.get_text()):
                label.set_fontproperties(nepali_font)
                label.set_fontsize(11)
    else:
        for label in texts:
            if _wants_nepali_font(label.get_text()):
                label.set_fontproperties(nepali_font)
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def create_display_text_with_emojis(original_text: str, preprocessed_text: str) -> Tuple[List[str], List[str]]:
    """
    Create aligned display tokens preserving emojis.

    Walks the original and preprocessed token streams in parallel with two
    cursors, producing one display token (with the emoji kept) and one model
    token (with the Nepali translation) per original token. Handles
    multi-word emoji translations like: 😡 → "ठूलो रिस" (2 words) by
    consuming the corresponding number of preprocessed tokens.

    Args:
        original_text: Original text with emojis
        preprocessed_text: Preprocessed text (emojis replaced with Nepali)

    Returns:
        Tuple of (display_tokens, model_tokens), equal-length aligned lists
    """
    from scripts.transformer_data_preprocessing import EMOJI_TO_NEPALI

    original_tokens = original_text.split()
    preprocessed_tokens = preprocessed_text.split()

    # Build emoji to word count mapping (how many words each emoji becomes)
    emoji_word_counts = {}
    for emoji_char, nepali_text in EMOJI_TO_NEPALI.items():
        word_count = len(nepali_text.split())
        emoji_word_counts[emoji_char] = word_count

    display_tokens = []
    model_tokens = []

    # Two independent cursors: orig_idx over original tokens, proc_idx over
    # preprocessed tokens (they advance at different rates around emojis).
    orig_idx = 0
    proc_idx = 0

    while orig_idx < len(original_tokens):
        orig_token = original_tokens[orig_idx]

        # Check if token contains emoji
        has_emoji = any(c in emoji.EMOJI_DATA for c in orig_token)

        if has_emoji:
            # Display: keep original emoji
            display_tokens.append(orig_token)

            # Model: use Nepali translation (may be multiple words!)
            # Count how many emojis in this token
            emojis_in_token = [c for c in orig_token if c in emoji.EMOJI_DATA]

            if emojis_in_token:
                # Calculate total words needed for all emojis in this token
                # (unknown emojis default to a single word)
                total_words_needed = sum(
                    emoji_word_counts.get(e, 1) for e in emojis_in_token
                )

                # Collect that many preprocessed tokens
                nepali_words = []
                for _ in range(total_words_needed):
                    if proc_idx < len(preprocessed_tokens):
                        nepali_words.append(preprocessed_tokens[proc_idx])
                        proc_idx += 1

                # Join them as the model token
                if nepali_words:
                    model_tokens.append(' '.join(nepali_words))
                else:
                    model_tokens.append(orig_token)
            else:
                # Shouldn't happen (has_emoji implies a non-empty list),
                # but fall back to consuming a single preprocessed token
                if proc_idx < len(preprocessed_tokens):
                    model_tokens.append(preprocessed_tokens[proc_idx])
                    proc_idx += 1
                else:
                    model_tokens.append(orig_token)
        else:
            # No emoji: use preprocessed for both streams (the preprocessed
            # form is what the model actually sees)
            if proc_idx < len(preprocessed_tokens):
                display_tokens.append(preprocessed_tokens[proc_idx])
                model_tokens.append(preprocessed_tokens[proc_idx])
                proc_idx += 1
            else:
                # Preprocessed stream exhausted; fall back to the original
                display_tokens.append(orig_token)
                model_tokens.append(orig_token)

        orig_idx += 1

    # Handle remaining preprocessed tokens (e.g. tokens the preprocessor
    # inserted without an original-token counterpart)
    while proc_idx < len(preprocessed_tokens):
        token = preprocessed_tokens[proc_idx]
        display_tokens.append(token)
        model_tokens.append(token)
        proc_idx += 1

    return display_tokens, model_tokens
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
# ============================================================================
|
| 278 |
+
# LIME EXPLAINER
|
| 279 |
+
# ============================================================================
|
| 280 |
+
|
| 281 |
+
class LIMEExplainer:
    """LIME explainer with emoji support.

    Runs LimeTextExplainer on the preprocessed text, then maps the token
    weights back onto display tokens that keep the original emojis, and
    merges attributions for multi-word emoji translations.
    """

    def __init__(self, model_wrapper: ModelExplainerWrapper, nepali_font: Optional[FontProperties] = None):
        """
        Args:
            model_wrapper: ModelExplainerWrapper instance
            nepali_font: Nepali font properties for visualization

        Raises:
            ImportError: If the optional `lime` package is not installed.
        """
        if not LIME_AVAILABLE:
            raise ImportError("LIME not installed. Install with: pip install lime")

        self.model_wrapper = model_wrapper
        self.nepali_font = nepali_font
        # Fixed random_state keeps LIME's perturbation sampling reproducible
        self.explainer = LimeTextExplainer(
            class_names=model_wrapper.class_names,
            random_state=42
        )

    def explain(self, original_text: str, preprocessed_text: str, num_samples: int = 200) -> Dict:
        """
        Generate LIME explanation.

        Args:
            original_text: Original text with emojis
            preprocessed_text: Preprocessed text for model
            num_samples: Number of perturbation samples for LIME

        Returns:
            Dictionary with keys 'word_scores', 'display_tokens',
            'model_tokens', and the raw 'lime_explanation' object
        """
        # Get LIME explanation (for the top predicted class by default)
        exp = self.explainer.explain_instance(
            preprocessed_text,
            self.model_wrapper.predict_proba,
            num_samples=num_samples
        )

        # Get token weights as {token: weight}
        token_weights = dict(exp.as_list())

        # Create aligned tokens (display keeps emojis, model keeps Nepali)
        display_tokens, model_tokens = create_display_text_with_emojis(
            original_text, preprocessed_text
        )

        # Map weights to display tokens.
        # NOTE: matching is by bidirectional substring containment, so a
        # LIME token may contribute its weight to more than one display
        # token when words overlap.
        word_scores = []
        for display_tok, model_tok in zip(display_tokens, model_tokens):
            score = 0.0
            for lime_token, weight in token_weights.items():
                if lime_token in model_tok or model_tok in lime_token:
                    score += weight
            word_scores.append((display_tok, score))

        # Merge multi-word emoji attributions back into a single emoji entry
        word_scores = self._merge_multi_word_emojis(word_scores)

        return {
            'word_scores': word_scores,
            'display_tokens': display_tokens,
            'model_tokens': model_tokens,
            'lime_explanation': exp
        }

    def _merge_multi_word_emojis(self, word_scores: List[Tuple[str, float]]) -> List[Tuple[str, float]]:
        """
        Merge attributions for multi-word emoji translations like: ठूलो रिस → 😡

        Consecutive words that exactly match a known multi-word Nepali
        translation are collapsed into the originating emoji, with their
        scores summed.

        Args:
            word_scores: List of (word, score) tuples

        Returns:
            Merged list with multi-word emojis combined
        """
        from scripts.transformer_data_preprocessing import EMOJI_TO_NEPALI

        # Build set of multi-word phrases (phrase text → emoji character)
        multi_word_phrases = {}
        for emoji_char, nepali_text in EMOJI_TO_NEPALI.items():
            if ' ' in nepali_text:
                multi_word_phrases[nepali_text] = emoji_char

        # Merge consecutive words that form multi-word emoji phrases
        merged_scores = []
        i = 0
        while i < len(word_scores):
            word, score = word_scores[i]

            # Check if this word + next word(s) form a multi-word emoji phrase
            merged = False
            for phrase, emoji_char in multi_word_phrases.items():
                phrase_words = phrase.split()
                if i + len(phrase_words) <= len(word_scores):
                    # Check if consecutive words match the phrase
                    candidate_words = [word_scores[i + j][0] for j in range(len(phrase_words))]
                    candidate_phrase = ' '.join(candidate_words)

                    if candidate_phrase == phrase:
                        # Found a multi-word emoji phrase - merge their scores
                        total_score = sum(word_scores[i + j][1] for j in range(len(phrase_words)))

                        merged_scores.append((emoji_char, total_score))
                        i += len(phrase_words)  # Skip all words in the phrase
                        merged = True
                        break

            if not merged:
                merged_scores.append((word, score))
                i += 1

        return merged_scores

    def visualize(self, word_scores: List[Tuple[str, float]], save_path: Optional[str] = None,
                  show: bool = True, figsize: Tuple[int, int] = None):
        """
        Visualize LIME explanation as a horizontal bar chart.

        Args:
            word_scores: List of (word, score) tuples
            save_path: Path to save figure (PNG, 300 dpi)
            show: Whether to display figure
            figsize: Figure size (auto-scaled to word count if None)

        Returns:
            matplotlib figure, or None when there is nothing to plot
        """
        if not word_scores:
            print("⚠️ No words to visualize")
            return None

        # Replace underscores with spaces for display
        word_scores_display = [(w.replace('_', ' '), score) for w, score in word_scores]

        features, weights = zip(*word_scores_display)
        y_pos = range(len(features))

        if figsize is None:
            figsize = (10, max(6, len(features) * 0.4))

        fig, ax = plt.subplots(figsize=figsize)
        # Green bars push toward the predicted class, red bars push away
        colors = ['red' if w < 0 else 'green' for w in weights]
        ax.barh(y_pos, weights, color=colors, alpha=0.6)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(features, fontsize=12)
        ax.invert_yaxis()
        ax.set_xlabel("Contribution to Prediction", fontsize=12)
        ax.set_title("LIME Feature Importance (Red=Against, Green=For)", fontsize=14)
        ax.axvline(x=0, color='black', linestyle='--', linewidth=0.8)

        # Apply Nepali font to Devanagari tick labels
        apply_nepali_font(ax, self.nepali_font)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            print(f"✓ LIME visualization saved to: {save_path}")

        if show:
            plt.show()
        else:
            plt.close(fig)

        return fig

    def explain_and_visualize(self, original_text: str, preprocessed_text: str,
                              save_path: Optional[str] = None, show: bool = True,
                              num_samples: int = 200):
        """
        Explain and visualize in one step.

        Args:
            original_text: Original text with emojis
            preprocessed_text: Preprocessed text for model
            save_path: Path to save figure
            show: Whether to display figure
            num_samples: Number of LIME samples

        Returns:
            Dictionary with keys 'explanation' and 'figure'
        """
        # Generate explanation
        explanation = self.explain(original_text, preprocessed_text, num_samples)

        # Visualize
        fig = self.visualize(explanation['word_scores'], save_path, show)

        return {
            'explanation': explanation,
            'figure': fig
        }
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
# ============================================================================
|
| 476 |
+
# SHAP EXPLAINER
|
| 477 |
+
# ============================================================================
|
| 478 |
+
|
| 479 |
+
class SHAPExplainer:
    """SHAP explainer with emoji support and fallback methods"""

    def __init__(self, model_wrapper: ModelExplainerWrapper, nepali_font: Optional[FontProperties] = None):
        """
        Args:
            model_wrapper: ModelExplainerWrapper instance
            nepali_font: Nepali font properties for visualization

        Raises:
            ImportError: If the optional `shap` dependency is not installed.
        """
        if not SHAP_AVAILABLE:
            raise ImportError("SHAP not installed. Install with: pip install shap")

        self.model_wrapper = model_wrapper
        self.nepali_font = nepali_font

    def explain(self, original_text: str, preprocessed_text: str, use_fallback: bool = True) -> Dict:
        """
        Generate SHAP explanation

        Args:
            original_text: Original text with emojis
            preprocessed_text: Preprocessed text for model
            use_fallback: Use fallback method if SHAP fails

        Returns:
            Dictionary with explanation data (word_scores, tokens, values,
            predicted class index, and which method actually ran:
            "shap" or the occlusion "gradient" fallback)
        """
        try:
            # Try SHAP with text masker
            def predict_masked(masked_texts):
                # SHAP may hand back a 1-D/2-D ndarray of token strings, a
                # bare string, or a list — normalize all to List[str].
                if isinstance(masked_texts, np.ndarray):
                    if masked_texts.ndim == 1:
                        texts = [' '.join(str(t) for t in masked_texts if str(t).strip())]
                    else:
                        texts = [' '.join(str(t) for t in row if str(t).strip()) for row in masked_texts]
                elif isinstance(masked_texts, str):
                    texts = [masked_texts]
                elif isinstance(masked_texts, list):
                    texts = masked_texts
                else:
                    texts = [str(masked_texts)]

                return self.model_wrapper.predict_proba(texts)

            explainer = Explainer(predict_masked, maskers.Text(preprocessed_text))
            sv = explainer([preprocessed_text])[0]

            shap_tokens = list(sv.data)
            values_array = np.array(sv.values)

            # Validate that we got meaningful results
            if len(shap_tokens) == 0 or values_array.size == 0:
                raise ValueError("SHAP returned empty results")

            method_used = "shap"

        except Exception as e:
            if not use_fallback:
                raise e

            # Use fallback silently (only show in debug mode)
            import logging
            logging.debug(f"SHAP failed: {e}, using gradient fallback")

            shap_tokens, values_array = self._gradient_based_attribution(preprocessed_text)
            method_used = "gradient"

        # Get predicted class
        pred_probs = self.model_wrapper.predict_proba([preprocessed_text])[0]
        class_idx = int(np.argmax(pred_probs))

        # Extract values for predicted class.
        # NOTE(review): SHAP value shape varies with explainer/masker version
        # (tokens,), (tokens, classes) or (1, tokens, classes) — handle all.
        if values_array.ndim == 1:
            token_values = values_array
        elif values_array.ndim == 2:
            token_values = values_array[:, class_idx]
        elif values_array.ndim == 3:
            token_values = values_array[0, :, class_idx]
        else:
            token_values = values_array.flatten()[:len(shap_tokens)]

        # Create aligned tokens (display tokens keep emojis, model tokens
        # are what the model actually consumed)
        display_tokens, model_tokens = create_display_text_with_emojis(
            original_text, preprocessed_text
        )

        # Map SHAP values to display tokens
        word_scores = self._align_shap_values(
            display_tokens, model_tokens, shap_tokens, token_values
        )

        # Merge multi-word emoji attributions
        word_scores = self._merge_multi_word_emojis(word_scores)

        return {
            'word_scores': word_scores,
            'display_tokens': display_tokens,
            'model_tokens': model_tokens,
            'shap_tokens': shap_tokens,
            'token_values': token_values,
            'class_idx': class_idx,
            'method_used': method_used
        }

    def _gradient_based_attribution(self, text: str) -> Tuple[List[str], np.ndarray]:
        """
        Fallback: Word-level attribution using occlusion

        Masks each word and measures prediction change.
        (Despite the name, no gradients are computed — this is pure
        leave-one-word-out occlusion.)
        """
        words = text.split()
        base_probs = self.model_wrapper.predict_proba([text])[0]
        base_pred_idx = int(np.argmax(base_probs))
        base_score = base_probs[base_pred_idx]

        attributions = []
        for i in range(len(words)):
            # Mask the word
            masked_words = words[:i] + words[i+1:]
            masked_text = ' '.join(masked_words)

            if not masked_text.strip():
                # Removing the only word leaves nothing to score; credit it
                # with the full base score.
                attributions.append(base_score)
                continue

            # Get prediction without this word
            masked_probs = self.model_wrapper.predict_proba([masked_text])[0]
            masked_score = masked_probs[base_pred_idx]

            # Attribution = score drop when word removed
            attribution = base_score - masked_score
            attributions.append(attribution)

        # Ensure we have at least one attribution
        if len(attributions) == 0:
            attributions = [0.0] * len(words)

        return words, np.array(attributions)

    def _merge_multi_word_emojis(self, word_scores: List[Tuple[str, float]]) -> List[Tuple[str, float]]:
        """
        Merge attributions for multi-word emoji translations like: ठूलो रिस → 😡

        Args:
            word_scores: List of (word, score) tuples

        Returns:
            Merged list with multi-word emojis combined (scores summed)
        """
        from scripts.transformer_data_preprocessing import EMOJI_TO_NEPALI

        # Build set of multi-word phrases (reverse lookup: phrase → emoji)
        multi_word_phrases = {}
        for emoji_char, nepali_text in EMOJI_TO_NEPALI.items():
            if ' ' in nepali_text:
                multi_word_phrases[nepali_text] = emoji_char

        # Merge consecutive words that form multi-word emoji phrases
        merged_scores = []
        i = 0
        while i < len(word_scores):
            word, score = word_scores[i]

            # Check if this word + next word(s) form a multi-word emoji phrase
            merged = False
            for phrase, emoji_char in multi_word_phrases.items():
                phrase_words = phrase.split()
                if i + len(phrase_words) <= len(word_scores):
                    # Check if consecutive words match the phrase
                    candidate_words = [word_scores[i + j][0] for j in range(len(phrase_words))]
                    candidate_phrase = ' '.join(candidate_words)

                    if candidate_phrase == phrase:
                        # Found a multi-word emoji phrase - merge their scores
                        total_score = sum(word_scores[i + j][1] for j in range(len(phrase_words)))

                        merged_scores.append((emoji_char, total_score))
                        i += len(phrase_words)  # Skip all words in the phrase
                        merged = True
                        break

            if not merged:
                merged_scores.append((word, score))
                i += 1

        return merged_scores

    def _align_shap_values(self, display_tokens: List[str], model_tokens: List[str],
                           shap_tokens: List[str], token_values: np.ndarray) -> List[Tuple[str, float]]:
        """Align SHAP values with display tokens.

        Uses loose substring matching between SHAP tokens and model tokens;
        when the display/model token counts disagree, falls back to spreading
        the mean attribution over every display token.
        """
        word_scores = []

        if len(display_tokens) == len(model_tokens):
            # Direct alignment
            for display_tok, model_tok in zip(display_tokens, model_tokens):
                score = 0.0
                for j, shap_tok in enumerate(shap_tokens):
                    # Substring match in either direction handles subword
                    # pieces produced by the SHAP text masker.
                    if j < len(token_values) and (shap_tok in model_tok or model_tok in shap_tok):
                        score += float(token_values[j])
                word_scores.append((display_tok, score))
        else:
            # Fallback: distribute evenly
            for display_tok in display_tokens:
                score = np.mean(token_values) if len(token_values) > 0 else 0.0
                word_scores.append((display_tok, score))

        return word_scores

    def visualize(self, word_scores: List[Tuple[str, float]], class_name: str,
                  save_path: Optional[str] = None, show: bool = True,
                  figsize: Optional[Tuple[int, int]] = None):
        """
        Visualize SHAP explanation with highlighted text

        Args:
            word_scores: List of (word, score) tuples
            class_name: Predicted class name
            save_path: Path to save figure
            show: Whether to display figure
            figsize: Figure size (auto if None)

        Returns:
            matplotlib figure, or None when word_scores is empty
        """
        if not word_scores:
            print("⚠️ No words to visualize")
            return None

        # Safe max calculation with fallback
        abs_vals = [abs(v) for _, v in word_scores]
        if not abs_vals or all(v == 0 for v in abs_vals):
            max_val = 1.0  # Default to 1.0 if all values are zero
        else:
            max_val = max(abs_vals) + 1e-6

        if figsize is None:
            figsize = (max(10, 0.5 * len(word_scores)), 3)

        fig, ax = plt.subplots(figsize=figsize)
        ax.axis("off")

        # Text is laid out manually in axes coordinates, wrapping at x≈0.92.
        x, y = 0.01, 0.5
        text_objs = []

        for word, val in word_scores:
            # Replace underscores with spaces for display
            display_word = word.replace('_', ' ')

            # Color intensity
            intensity = min(abs(val) / max_val, 1.0)

            # Red=negative, Green=positive
            if val < 0:
                color = (1.0, 1.0 - intensity * 0.7, 1.0 - intensity * 0.7)
            else:
                color = (1.0 - intensity * 0.7, 1.0, 1.0 - intensity * 0.7)

            txt = ax.text(
                x, y, f" {display_word} ",
                fontsize=14,
                bbox=dict(
                    facecolor=color,
                    edgecolor='gray',
                    alpha=0.8,
                    boxstyle="round,pad=0.4"
                )
            )
            text_objs.append(txt)

            # Update position (emojis take less space)
            char_width = 0.025 if any(c in emoji.EMOJI_DATA for c in display_word) else 0.04
            x += char_width * len(display_word) + 0.01

            if x > 0.92:
                x = 0.01
                y -= 0.35

        # Apply Nepali font
        apply_nepali_font(ax, self.nepali_font, texts=text_objs, is_tick_labels=False)

        ax.text(0.5, 0.95, f"SHAP Explanation (Predicted: {class_name})",
                ha='center', va='top', fontsize=14, fontweight='bold',
                transform=ax.transAxes)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            print(f"✓ SHAP visualization saved to: {save_path}")

        if show:
            plt.show()
        else:
            plt.close(fig)

        return fig

    def explain_and_visualize(self, original_text: str, preprocessed_text: str,
                              save_path: Optional[str] = None, show: bool = True,
                              use_fallback: bool = True):
        """
        Explain and visualize in one step

        Args:
            original_text: Original text with emojis
            preprocessed_text: Preprocessed text for model
            save_path: Path to save figure
            show: Whether to display figure
            use_fallback: Use fallback if SHAP fails

        Returns:
            Dictionary with explanation and figure
        """
        # Generate explanation
        explanation = self.explain(original_text, preprocessed_text, use_fallback)

        # Get class name
        class_name = self.model_wrapper.class_names[explanation['class_idx']]

        # Visualize
        fig = self.visualize(explanation['word_scores'], class_name, save_path, show)

        return {
            'explanation': explanation,
            'figure': fig
        }
|
| 805 |
+
|
| 806 |
+
|
| 807 |
+
# ============================================================================
|
| 808 |
+
# CONVENIENCE FUNCTIONS
|
| 809 |
+
# ============================================================================
|
| 810 |
+
|
| 811 |
+
def create_explainer_wrapper(model, tokenizer, label_encoder, preprocessor, device=None):
    """
    Build a ModelExplainerWrapper around a trained model.

    Args:
        model: Trained model
        tokenizer: Model tokenizer
        label_encoder: Label encoder
        preprocessor: HateSpeechPreprocessor instance
        device: torch device (auto-detected when None)

    Returns:
        ModelExplainerWrapper instance
    """
    wrapper = ModelExplainerWrapper(model, tokenizer, label_encoder, preprocessor, device)
    return wrapper
|
| 826 |
+
|
| 827 |
+
|
| 828 |
+
def explain_prediction(text: str, model_wrapper: ModelExplainerWrapper,
                       method: str = "both", nepali_font: Optional[FontProperties] = None,
                       save_dir: Optional[str] = None, show: bool = True) -> Dict:
    """
    Explain a prediction using LIME and/or SHAP

    Args:
        text: Input text
        model_wrapper: ModelExplainerWrapper instance
        method: "lime", "shap", or "both"
        nepali_font: Nepali font for visualization
        save_dir: Directory to save figures
        show: Whether to display figures

    Returns:
        Dictionary with keys 'analysis', 'lime', 'shap'. The 'lime'/'shap'
        entries stay None when that method is skipped or its library is
        unavailable.
    """
    import hashlib

    # FIX: the builtin hash() is salted per interpreter run (PYTHONHASHSEED),
    # so figure filenames were non-reproducible across runs. Use a stable
    # digest of the text instead, keeping the same 8-digit filename shape.
    text_id = int(hashlib.sha1(text.encode('utf-8')).hexdigest(), 16) % 10**8

    # Get analysis (original + preprocessed form of the input)
    analysis = model_wrapper.predict_with_analysis(text)
    original_text = analysis['original_text']
    preprocessed_text = analysis['preprocessed_text']

    results = {
        'analysis': analysis,
        'lime': None,
        'shap': None
    }

    # LIME
    if method in ["lime", "both"] and LIME_AVAILABLE:
        lime = LIMEExplainer(model_wrapper, nepali_font)
        save_path = os.path.join(save_dir, f"lime_{text_id}.png") if save_dir else None
        results['lime'] = lime.explain_and_visualize(
            original_text, preprocessed_text, save_path, show
        )

    # SHAP
    if method in ["shap", "both"] and SHAP_AVAILABLE:
        shap_exp = SHAPExplainer(model_wrapper, nepali_font)
        save_path = os.path.join(save_dir, f"shap_{text_id}.png") if save_dir else None
        results['shap'] = shap_exp.explain_and_visualize(
            original_text, preprocessed_text, save_path, show
        )

    return results
|
| 873 |
+
|
| 874 |
+
|
| 875 |
+
# ============================================================================
|
| 876 |
+
# AVAILABILITY CHECK
|
| 877 |
+
# ============================================================================
|
| 878 |
+
|
| 879 |
+
def check_availability() -> Dict[str, bool]:
    """Report which optional explainability backends can be used."""
    availability = {}
    availability['lime'] = LIME_AVAILABLE
    availability['shap'] = SHAP_AVAILABLE
    return availability
|
scripts/transformer_data_preprocessing.py
ADDED
|
@@ -0,0 +1,786 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Transformer Data Preprocessing Module
|
| 3 |
+
======================================
|
| 4 |
+
Enhanced preprocessing pipeline for Nepali hate speech classification.
|
| 5 |
+
|
| 6 |
+
This module provides:
|
| 7 |
+
- Script detection (Devanagari/Romanized/English/Mixed)
|
| 8 |
+
- Transliteration (Romanized → Devanagari)
|
| 9 |
+
- Translation (English → Nepali)
|
| 10 |
+
- Emoji semantic mapping with feature extraction
|
| 11 |
+
- Text normalization
|
| 12 |
+
|
| 13 |
+
Usage:
|
| 14 |
+
------
|
| 15 |
+
from scripts.transformer_data_preprocessing import HateSpeechPreprocessor
|
| 16 |
+
|
| 17 |
+
# Initialize preprocessor
|
| 18 |
+
preprocessor = HateSpeechPreprocessor(
|
| 19 |
+
model_type="xlmr",
|
| 20 |
+
translate_english=True
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
# Preprocess single text
|
| 24 |
+
processed_text, emoji_features = preprocessor.preprocess("Your text here")
|
| 25 |
+
|
| 26 |
+
# Preprocess batch
|
| 27 |
+
texts_list = ["text1", "text2", "text3"]
|
| 28 |
+
processed_texts, features_list = preprocessor.preprocess_batch(texts_list)
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
import re
|
| 32 |
+
import emoji
|
| 33 |
+
import regex
|
| 34 |
+
from typing import Any, Literal, Optional, Tuple, Dict, List
|
| 35 |
+
from deep_translator import GoogleTranslator
|
| 36 |
+
from functools import lru_cache
|
| 37 |
+
import logging
|
| 38 |
+
|
| 39 |
+
# Setup logging
|
| 40 |
+
logging.basicConfig(level=logging.WARNING)
|
| 41 |
+
logger = logging.getLogger(__name__)
|
| 42 |
+
|
| 43 |
+
# Try to import transliteration (optional)
|
| 44 |
+
try:
|
| 45 |
+
from indic_transliteration import sanscript
|
| 46 |
+
from indic_transliteration.sanscript import transliterate
|
| 47 |
+
TRANSLITERATION_AVAILABLE = True
|
| 48 |
+
except ImportError:
|
| 49 |
+
TRANSLITERATION_AVAILABLE = False
|
| 50 |
+
logger.warning("indic_transliteration not available. Transliteration disabled.")
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# ============================================================================
|
| 54 |
+
# COMPREHENSIVE EMOJI MAPPINGS
|
| 55 |
+
# ============================================================================
|
| 56 |
+
|
| 57 |
+
EMOJI_TO_NEPALI = {
    # Positive emotions
    '😂': 'हाँसो', '🤣': 'ठूलो_हाँसो', '😀': 'खुशी', '😁': 'खुशी', '😃': 'खुशी',
    '😄': 'खुशी', '😅': 'नर्भस_हाँसो', '😆': 'हाँसो', '😊': 'मुस्कान', '☺️': 'मुस्कान',
    '😉': 'आँखा_झिम्काउने', '🙂': 'मुस्कान', '🙃': 'उल्टो_मुस्कान', '😌': 'शान्त',
    '😍': 'माया', '🥰': 'माया', '😘': 'चुम्बन', '😗': 'चुम्बन', '😙': 'चुम्बन', '😚': 'चुम्बन',
    '🤗': 'अँगालो', '🤩': 'चकित', '🥳': 'उत्सव', '🤤': 'लालसा',

    # Mockery & Sarcasm
    '😏': 'व्यंग्य', '😜': 'जिब्रो_देखाउने', '😝': 'जिब्रो_देखाउने', '😛': 'जिब्रो',
    '🙄': 'आँखा_घुमाउने', '😤': 'निराश', '😑': 'अभिव्यक्तिहीन', '😐': 'तटस्थ',
    '😬': 'तनाव', '🤨': 'शंकास्पद', '🤫': 'चुपचाप', '🤭': 'हात_छोप्ने',
    '🤥': 'झूठ', '😶': 'मौन',

    # Anger & Hate
    '😠': 'रिस', '😡': 'ठूलो_रिस', '🤬': 'गाली', '😈': 'खराब', '👿': 'खराब',
    '💢': 'क्रोध', '🔪': 'हिंसा', '💣': 'हिंसा', '🗡️': 'तरवार', '⚔️': 'युद्ध',
    '💥': 'विस्फोट', '🔫': 'बन्दुक', '🧨': 'विस्फोटक', '☠️': 'मृत्यु', '💀': 'खोपडी',
    '👹': 'राक्षस', '👺': 'दानव', '🤡': 'जोकर', '🖤': 'कालो_मन',
    '😾': 'रिसाएको', '👊': 'मुक्का', '✊': 'मुक्का',

    # Offensive Gestures (skin-tone variants listed explicitly)
    '🖕': 'अपमान', '👎': 'नकारात्मक', '👎🏻': 'नकारात्मक', '👎🏼': 'नकारात्मक',
    '👎🏽': 'नकारात्मक', '👎🏾': 'नकारात्मक', '👎🏿': 'नकारात्मक',
    '🖕🏻': 'अपमान', '🖕🏼': 'अपमान', '🖕🏽': 'अपमान', '🖕🏾': 'अपमान', '🖕🏿': 'अपमान',

    # Sadness
    '😭': 'रुवाइ', '😢': 'रुवाइ', '😿': 'रुवाइ', '😔': 'उदास', '😞': 'उदास',
    '😒': 'उदास', '😓': 'चिन्तित', '😟': 'चिन्तित', '😕': 'अलमलिएको',
    '🙁': 'तल्लो_मुख', '☹️': 'दुःखी', '😩': 'थकित', '😫': 'थकित',
    '😖': 'भ्रमित', '😣': 'अडिग', '😥': 'निराश', '🥺': 'बिन्ती',

    # Fear & Shock
    '😨': 'डर', '😰': 'चिन्तित_पसिना', '😱': 'चिच्याउने', '😳': 'लजाउने',
    '🤯': 'मन_उडेको', '😵': 'चक्कर', '😲': 'चकित', '😯': 'छक्क',

    # Disgust
    '🤢': 'बान्ता', '🤮': 'बान्ता', '🤧': 'हाच्छ्यूँ', '😷': 'बिरामी',
    '🤒': 'ज्वरो', '🤕': 'घाइते', '🥴': 'मात्तिएको', '😪': 'निद्रा',

    # Positive Gestures
    '👍': 'सकारात्मक', '👍🏻': 'सकारात्मक', '👍🏼': 'सकारात्मक',
    '👍🏽': 'सकारात्मक', '👍🏾': 'सकारात्मक', '👍🏿': 'सकारात्मक',
    '👏': 'तालि', '🙌': 'उत्सव', '👌': 'ठीक_छ', '🤝': 'हात_मिलाउनु',
    '🙏': 'नमस्कार', '🤲': 'प्रार्थना', '💪': 'शक्ति', '✌️': 'शान्ति',

    # Hearts
    '❤️': 'माया', '🧡': 'माया', '💛': 'माया', '💚': 'माया', '💙': 'माया',
    '💜': 'माया', '🤍': 'सेतो_मन', '🤎': 'खैरो_मन', '❣️': 'माया',
    '💕': 'माया', '💞': 'माया', '💓': 'माया', '💗': 'माया',
    '💖': 'माया', '💘': 'माया', '💝': 'माया', '💔': 'टुटेको_मन',

    # Symbols
    # FIX: '💯' was mapped here to 'पूर्ण' AND again under
    # "Miscellaneous Common" to 'सय'. The first entry was silently
    # shadowed by the later one, so the dead entry has been removed
    # (runtime mapping is unchanged: '💯' → 'सय').
    '🔥': 'आगो', '💨': 'हावा', '💫': 'चमक',
    '⭐': 'तारा', '✨': 'चमक', '🌟': 'चम्किलो_तारा',
    '🚫': 'निषेध', '⛔': 'प्रवेश_निषेध', '❌': 'रद्द', '❎': 'गलत',

    # People
    '👫': 'जोडी', '👬': 'पुरुष_जोडी', '👭': 'महिला_जोडी', '👨\u200d👩\u200d👧\u200d👦': 'परिवार',
    '👶': 'बच्चा', '👦': 'केटा', '👧': 'केटी', '👨': 'पुरुष', '👩': 'महिला',
    '👴': 'बूढो', '👵': 'बूढी', '🧒': 'बालक', '👱': 'गोरो', '🧔': 'दाह्री',

    # Country
    '🇳🇵': 'नेपाल', '🇮🇳': 'भारत', '🇵🇰': 'पाकिस्तान', '🇧🇩': 'बंगलादेश',
    '🇨🇳': 'चीन', '🇺🇸': 'अमेरिका', '🏴': 'झण्डा',

    # Animals
    '🐕': 'कुकुर', '🐖': 'सुँगुर', '🐀': 'मुसा', '🐍': 'सर्प', '🦂': 'बिच्छी',
    '🐒': 'बाँदर', '🐵': 'बाँदर_अनुहार', '🦍': 'गोरिल्ला', '🐗': 'जङ्गली_सुँगुर',

    # Other
    '🤔': 'सोच', '🧐': 'अनुसन्धान', '😴': 'सुत्ने', '💩': 'मल',
    '👻': 'भूत', '🤖': 'रोबोट', '👽': 'विदेशी', '🎭': 'मुखौटा',

    # === EXPANDED COMMON EMOJIS ===

    # Celebrations & Party
    '🎉': 'उत्सव', '🎊': 'पार्टी', '🎈': 'बेलुन', '🎁': 'उपहार',
    '🎂': 'केक', '🍰': 'मिठाई', '🥂': 'चश्मा', '🍾': 'शराब',

    # Food & Drink (common in casual/hate contexts)
    '🍕': 'पिज्जा', '🍔': 'बर्गर', '🍗': 'चिकन', '🍖': 'मासु',
    '🍺': 'बियर', '🍻': 'पार्टी', '☕': 'चिया', '🍵': 'चिया',
    '🍜': 'नूडल', '🍛': 'करी', '🍲': 'खाना', '🥘': 'परिकार',

    # Sports & Activities
    '⚽': 'फुटबल', '🏏': 'क्रिकेट', '🏀': 'बास्केटबल', '🎮': 'खेल',
    '🏆': 'ट्रफी', '🥇': 'स्वर्ण', '🥈': 'रजत', '🥉': 'कांस्य',

    # Weather & Nature
    '☀️': 'घाम', '🌙': 'चन्द्रमा', '🌧️': 'पानी', '⛈️': 'आँधी',
    '❄️': 'हिउँ', '🌈': 'इन्द्रेणी', '⚡': 'बिजुली', '🌪️': 'बतास',

    # Technology & Modern
    '📱': 'मोबाइल', '💻': 'कम्प्युटर', '📷': 'क्यामेरा', '🎥': 'भिडियो',
    '🖥️': 'कम्प्युटर', '⌨️': 'किबोर्ड', '🖱️': 'माउस', '📡': 'एन्टेना',

    # Time & Clock
    '⏰': 'घडी', '⏳': 'समय', '⌛': 'बालुवा_घडी', '🕐': 'एक_बजे',

    # Objects
    '📚': 'किताब', '📖': 'खुल्ला_किताब', '✏️': 'पेन्सिल', '📝': 'लेख',
    '🎤': 'माइक', '🎧': 'हेडफोन', '📢': 'घोषणा', '📣': 'चिल्लाउने',

    # Miscellaneous Common
    '✅': 'ठीक', '☑️': 'जाँच', '💯': 'सय', '🆗': 'ठीक',
    '🆕': 'नयाँ', '🆓': 'मुक्त', '🔴': 'रातो', '🟢': 'हरियो',
}

# Emoji categories for feature extraction
HATE_RELATED_EMOJIS = {
    '😠', '😡', '🤬', '😈', '👿', '💢', '👊', '✊',
    '🔪', '💣', '🗡️', '⚔️', '💥', '🔫', '🧨', '☠️', '💀',
    '🖕', '🖕🏻', '🖕🏼', '🖕🏽', '🖕🏾', '🖕🏿',
    '👎', '👎🏻', '👎🏼', '👎🏽', '👎🏾', '👎🏿',
    '👹', '👺', '🤡', '🖤', '💔',
    '🐕', '🐖', '🐀', '🐍', '🦂', '🐒', '🐵', '🦍', '🐗',
    '💩', '😾',
}

MOCKERY_EMOJIS = {
    '😏', '😜', '😝', '😛', '🙄', '😤', '🙃',
    '😑', '😐', '😬', '🤨', '🤫', '🤭', '🤥',
    '🤡', '👻', '🎭',
}

POSITIVE_EMOJIS = {
    '😊', '😀', '😁', '😃', '😄', '☺️', '🙂', '😌', '🥰', '😍',
    '❤️', '🧡', '💛', '💚', '💙', '💜', '🤍', '🤎',
    '💕', '💞', '💓', '💗', '💖', '💘', '💝', '❣️',
    '👍', '👍🏻', '👍🏼', '👍🏽', '👍🏾', '👍🏿',
    '🙏', '👏', '🙌', '👌', '🤝', '✌️',
    '🥳', '🎉', '🎊', '⭐', '✨', '🌟',
}

SADNESS_EMOJIS = {
    '😭', '😢', '😿', '😔', '😞', '😒', '😓', '😟', '😕',
    '🙁', '☹️', '😩', '😫', '😖', '😣', '😥', '🥺',
}

FEAR_EMOJIS = {
    '😨', '😰', '😱', '😳', '🤯', '😵', '😲', '😯',
}

DISGUST_EMOJIS = {
    '🤢', '🤮', '🤧', '😷', '🤒', '🤕', '🥴',
}


# ============================================================================
# NORMALIZATION MAPPINGS
# ============================================================================

DIRGHIKARAN_MAP = {
    "\u200d": "",  # Zero-width joiner
    "\u200c": "",  # Zero-width non-joiner
    "।": ".",      # Devanagari danda
    "॥": ".",      # Double danda
}


# ============================================================================
# TYPE DEFINITIONS
# ============================================================================

ScriptType = Literal["devanagari", "romanized_nepali", "english", "mixed", "other"]
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
# ============================================================================
|
| 226 |
+
# EMOJI FEATURE EXTRACTION
|
| 227 |
+
# ============================================================================
|
| 228 |
+
|
| 229 |
+
def extract_emoji_features(text: str) -> Dict[str, int]:
    """
    Extract comprehensive emoji-based semantic features.

    Returns 18 features:
    - 6 binary flags (has_X_emoji)
    - 6 count features (X_emoji_count)
    - 6 derived features (total, ratio, mixed_sentiment, unknown tracking)
    """
    found = [ch for ch in text if ch in emoji.EMOJI_DATA]

    # Tally each semantic category with a single table instead of six
    # hand-written sum() lines; order matters for the output dict.
    category_table = (
        ('hate', HATE_RELATED_EMOJIS),
        ('mockery', MOCKERY_EMOJIS),
        ('positive', POSITIVE_EMOJIS),
        ('sadness', SADNESS_EMOJIS),
        ('fear', FEAR_EMOJIS),
        ('disgust', DISGUST_EMOJIS),
    )
    counts = {name: sum(1 for e in found if e in members)
              for name, members in category_table}

    # Emojis present in the text but absent from our translation mapping.
    unknown = [e for e in found if e not in EMOJI_TO_NEPALI]

    features = {}
    # Binary flags
    for name, _ in category_table:
        features[f'has_{name}_emoji'] = 1 if counts[name] > 0 else 0
    # Count features
    for name, _ in category_table:
        features[f'{name}_emoji_count'] = counts[name]
    features['total_emoji_count'] = len(found)
    # Derived features
    features['hate_to_positive_ratio'] = counts['hate'] / max(counts['positive'], 1)
    features['has_mixed_sentiment'] = 1 if (counts['hate'] > 0 and counts['positive'] > 0) else 0
    # Unknown emoji tracking
    features['unknown_emoji_count'] = len(unknown)
    features['has_unknown_emoji'] = 1 if unknown else 0
    features['known_emoji_ratio'] = (len(found) - len(unknown)) / max(len(found), 1)

    return features
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def remove_emojis_for_detection(text: str) -> str:
    """Strip all emojis from *text* so script detection sees letters only."""
    stripped = emoji.replace_emoji(text, replace="")
    return stripped
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
# ============================================================================
|
| 287 |
+
# SCRIPT DETECTION
|
| 288 |
+
# ============================================================================
|
| 289 |
+
|
| 290 |
+
def detect_script_type(text: str) -> Tuple[ScriptType, dict]:
    """
    Detect the dominant script type of *text*, ignoring emojis.

    Combines character-class ratios (Devanagari vs Latin letters) with
    keyword heuristics for Romanized Nepali and English.

    Args:
        text: Input text; emojis are stripped before counting letters.

    Returns:
        Tuple of (script_type, detection_details). The details dict carries
        counts, ratios, a heuristic confidence, and a short reason code.
    """
    if not text or not text.strip():
        return "other", {"confidence": 0.0, "reason": "empty_text"}

    # Remove emojis first so they don't skew letter counts
    text_no_emoji = remove_emojis_for_detection(text)

    if not text_no_emoji.strip():
        return "other", {"confidence": 0.5, "reason": "emoji_only"}

    letters = regex.findall(r"\p{L}", text_no_emoji)
    letter_count = len(letters)

    if letter_count == 0:
        return "other", {"confidence": 0.0, "reason": "no_letters"}

    devanagari_chars = regex.findall(r"\p{Devanagari}", text_no_emoji)
    dev_count = len(devanagari_chars)
    dev_ratio = dev_count / letter_count

    latin_chars = regex.findall(r"[a-zA-Z]", text_no_emoji)
    latin_count = len(latin_chars)
    latin_ratio = latin_count / letter_count

    # Romanized Nepali patterns.
    # NOTE(review): a few patterns appear twice (e.g. r'\bko\b', r'\bma\b',
    # r'\bxu\b'), so matching words are double-counted in
    # romanized_indicators. Kept as-is because the scoring thresholds below
    # were presumably tuned against this behavior — confirm before deduping.
    romanized_nepali_patterns = [
        # Common words
        r'\b[xX]u\b', r'\b[xX]um?\b', r'\bhajur\b', r'\bdai\b', r'\bbhai\b', r'\bdidi\b',
        r'\bbahini\b', r'\bsanghai\b', r'\bsunu\b', r'\bhera\b', r'\bsun\b',

        # Particles & Postpositions
        r'\bko\b', r'\bki\b', r'\bka\b', r'\bho\b', r'\btyo\b', r'\byo\b', r'\bta\b',
        r'\bma\b', r'\bma?i\b', r'\bla[ie]?\b', r'\bnai?\b', r'\bpani\b', r'\bni\b',

        # Verbs
        r'\bhun[ae]\b', r'\bhunchha\b', r'\bhunuhunchha\b', r'\bgar\w+\b', r'\bgarna\b',
        r'\bx[ao]\b', r'\bxa\b', r'\bxan\b', r'\bxaina\b', r'\bxu\b',
        r'\bchain\b', r'\bchaina\b', r'\bthiy[oe]\b', r'\bhola\b', r'\bhos\b',
        r'\bbhan\w*\b', r'\bbol\w*\b', r'\bher\w*\b',

        # Common adjectives/states
        r'\bkh[ou]s[hi]?\b', r'\bkhusi\b', r'\bkhushi\b', r'\bramro\b', r'\bnaramro\b',
        r'\bthulo\b', r'\bsano\b', r'\brasilo\b', r'\bmitho\b', r'\btikhi\b',
        r'\bdherei\b', r'\baliali\b', r'\bastai\b', r'\blastai\b',

        # Question words
        r'\bkina\b', r'\bkasari\b', r'\bkahile\b', r'\bkaha[n]?\b', r'\bke\b', r'\bko\b',

        # Pronouns
        r'\bma\b', r'\btimi\b', r'\btapai\b', r'\buha\b', r'\buni\b', r'\byini\b',
        r'\bmero\b', r'\btimro\b', r'\buhako\b', r'\buniko\b', r'\bhamro\b',

        # Common nouns
        r'\bmanxe\b', r'\bmanchhe\b', r'\bmanche\b', r'\bharu\b', r'\bdes[ha]?\b',
        r'\bgha?r\b', r'\bthau\b', r'\bsamay\b', r'\bbela\b',

        # Nepali-specific endings (transliterated)
        r'\w+[ae]ko\b', r'\w+[ae]ki\b', r'\w+dai\b', r'\w+lai\b',
        r'\w+ma\b', r'\w+xa\b', r'\w+hun[ae]\b', r'\w+thiyo\b',
    ]

    romanized_indicators = sum(1 for pattern in romanized_nepali_patterns
                               if re.search(pattern, text_no_emoji, re.IGNORECASE))

    # Calculate Romanized Nepali score
    romanized_score = 0.0
    if latin_ratio > 0.5 and dev_ratio < 0.3:
        if romanized_indicators > 0:
            romanized_score = min(0.5 + (romanized_indicators * 0.15), 0.95)
        else:
            # Fall back to typical Romanized Nepali word-ending shapes
            romanized_patterns = re.findall(r'\b\w*[aeiou](?:h)?\b', text_no_emoji.lower())
            if any(word.endswith(('xu', 'ro', 'no', 'lo', 'ko', 'ho'))
                   for word in romanized_patterns):
                romanized_score = 0.4
            else:
                romanized_score = 0.3

    # English indicator vocabulary.
    # FIX: was a plain list (with duplicate entries), giving an O(n) scan per
    # word below; a frozenset has identical membership semantics at O(1).
    english_indicators = frozenset({
        # Articles & Determiners
        'the', 'a', 'an', 'this', 'that', 'these', 'those', 'some', 'any', 'all', 'every',

        # Pronouns
        'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
        'my', 'your', 'his', 'its', 'our', 'their', 'mine', 'yours', 'ours', 'theirs',
        'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'themselves',

        # Common verbs (be, have, do)
        'is', 'am', 'are', 'was', 'were', 'be', 'been', 'being',
        'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'done',
        'will', 'would', 'shall', 'should', 'can', 'could', 'may', 'might', 'must',

        # Common verbs (action)
        'get', 'got', 'go', 'went', 'gone', 'make', 'made', 'take', 'took', 'taken',
        'come', 'came', 'see', 'saw', 'seen', 'know', 'knew', 'known', 'say', 'said',
        'tell', 'told', 'think', 'thought', 'give', 'gave', 'given', 'find', 'found',

        # Question words
        'what', 'which', 'who', 'whom', 'whose', 'when', 'where', 'why', 'how',

        # Prepositions
        'in', 'on', 'at', 'to', 'for', 'of', 'with', 'from', 'by', 'about', 'as',
        'into', 'through', 'over', 'under', 'after', 'before', 'between', 'among',

        # Conjunctions
        'and', 'or', 'but', 'so', 'yet', 'nor', 'because', 'if', 'while',
        'although', 'though', 'unless', 'since', 'until', 'whether',

        # Negations
        'not', 'no', 'never', 'none', 'nothing', 'nobody', 'nowhere', 'neither',

        # Common adjectives
        'good', 'bad', 'great', 'big', 'small', 'long', 'short', 'high', 'low',
        'old', 'new', 'young', 'early', 'late', 'right', 'wrong', 'true', 'false',
        'hot', 'cold', 'happy', 'sad', 'angry', 'nice', 'beautiful', 'ugly',

        # Sentiment words (hate speech relevant)
        'hate', 'love', 'like', 'dislike', 'stupid', 'dumb', 'idiot', 'fool',
        'kill', 'die', 'dead', 'death', 'fuck', 'shit', 'ass', 'damn', 'hell',
        'worst', 'terrible', 'horrible', 'awful', 'disgusting', 'pathetic',

        # Common nouns
        'man', 'woman', 'people', 'person', 'thing', 'time', 'day', 'year',
        'way', 'work', 'life', 'world', 'country', 'place', 'home', 'hand',

        # Very & Adverbs
        'very', 'really', 'quite', 'too', 'just', 'only', 'even', 'also',
        'well', 'much', 'more', 'most', 'less', 'least', 'still', 'already',
    })
    english_words = [w.lower() for w in re.findall(r'\b\w+\b', text_no_emoji)]
    english_count = sum(1 for w in english_words if w in english_indicators)
    english_ratio = english_count / len(english_words) if english_words else 0

    # Detection details returned to the caller for display/debugging
    details = {
        "devanagari_count": dev_count,
        "devanagari_ratio": dev_ratio,
        "latin_count": latin_count,
        "latin_ratio": latin_ratio,
        "romanized_indicators": romanized_indicators,
        "english_ratio": english_ratio,
        "letter_count": letter_count
    }

    # Decision logic — ordered from strongest to weakest signal
    if dev_ratio >= 0.8:
        return "devanagari", {**details, "confidence": dev_ratio, "reason": "dominant_devanagari"}

    elif dev_ratio >= 0.4:
        return "mixed", {**details, "confidence": 0.7, "reason": "mixed_with_devanagari"}

    elif romanized_score > 0.5 and dev_ratio < 0.2:
        return "romanized_nepali", {**details, "confidence": romanized_score, "reason": "romanized_nepali_detected"}

    elif english_ratio > 0.2 and romanized_score < 0.4:
        return "english", {**details, "confidence": min(english_ratio + 0.3, 0.9), "reason": "english_detected"}

    elif latin_ratio > 0.5 and romanized_score > 0.3:
        return "romanized_nepali", {**details, "confidence": romanized_score, "reason": "likely_romanized_nepali"}

    elif latin_ratio > 0.8:
        if english_ratio > 0.1:
            return "english", {**details, "confidence": 0.6, "reason": "likely_english"}
        else:
            return "romanized_nepali", {**details, "confidence": 0.5, "reason": "ambiguous_latin_script"}

    else:
        return "other", {**details, "confidence": 0.3, "reason": "insufficient_indicators"}
|
| 465 |
+
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
# ============================================================================
|
| 469 |
+
# TEXT PROCESSING FUNCTIONS
|
| 470 |
+
# ============================================================================
|
| 471 |
+
|
| 472 |
+
def clean_text_basic(text: str) -> str:
    """
    Basic text cleaning: strip URLs, @mentions, '#' marks, and quotes.

    Args:
        text: Raw input string.

    Returns:
        Cleaned string with collapsed whitespace.
    """
    # Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)
    # Remove @mentions
    text = re.sub(r"@\w+", "", text)
    # Remove hashtag symbol but keep the tag text
    text = re.sub(r"#(\w+)", r"\1", text)
    # Remove straight and smart quotes in one pass.
    # FIX: the original chained replace() calls were garbled (mojibake turned
    # the smart-quote literals into duplicated/invalid arguments); the escapes
    # below cover " ' \u2018 \u2019 \u201C \u201D unambiguously.
    text = re.sub(r"[\"'\u2018\u2019\u201C\u201D]", "", text)
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
def normalize_devanagari(text: str) -> str:
    """Apply Devanagari-specific normalization (dirghikaran substitutions)."""
    for source_char, target_char in DIRGHIKARAN_MAP.items():
        text = text.replace(source_char, target_char)
    return text
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
def replace_emojis_semantic(text: str, preserve_spacing: bool = True,
                            preserve_unknown: bool = True) -> str:
    """
    Replace known emojis with their Nepali text equivalents.

    Args:
        text: Input text with emojis
        preserve_spacing: Surround each Nepali replacement with spaces
        preserve_unknown: Keep emojis absent from EMOJI_TO_NEPALI (default: True);
            when False, unmapped emojis are replaced by a space

    Returns:
        Text with known emojis replaced (unknown ones preserved or stripped)
    """
    # Pick the replacement template once instead of branching per emoji
    template = " {} " if preserve_spacing else "{}"
    for emoji_char, nepali_text in EMOJI_TO_NEPALI.items():
        text = text.replace(emoji_char, template.format(nepali_text))

    if not preserve_unknown:
        # Legacy behavior: strip every emoji that has no mapping
        text = emoji.replace_emoji(text, replace=" ")
    # Otherwise unknown emojis are left intact for the model to consume

    return text
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
def transliterate_romanized_nepali(text: str) -> str:
    """Transliterate Romanized Nepali (ITRANS) to Devanagari; pass input through on failure."""
    if not TRANSLITERATION_AVAILABLE:
        return text

    try:
        converted = transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
    except Exception as e:
        logger.warning(f"Transliteration failed: {e}")
        return text
    # Fall back to the original text when the library returns an empty result
    return converted if converted else text
|
| 536 |
+
|
| 537 |
+
|
| 538 |
+
# ============================================================================
|
| 539 |
+
# CACHED TRANSLATOR
|
| 540 |
+
# ============================================================================
|
| 541 |
+
|
| 542 |
+
class CachedNepaliTranslator:
    """English-to-Nepali translator wrapping GoogleTranslator with a per-instance LRU cache."""

    def __init__(self, cache_size: int = 2000):
        self.translator = GoogleTranslator(source='en', target='ne')
        self.cache_size = cache_size
        # Memoize single-string translations; bound per instance on purpose
        self._translate_cached = lru_cache(maxsize=cache_size)(self._translate_single)

    def _translate_single(self, text: str) -> str:
        """Translate one string via the API; on failure, log and return the input unchanged."""
        if not text or not text.strip():
            return ""
        try:
            translated = self.translator.translate(text.strip())
        except Exception as e:
            logger.error(f"Translation failed: {str(e)}")
            return text
        return translated if translated else text

    def translate(self, text: str, fallback_to_original: bool = True) -> str:
        """Translate *text* with caching; optionally fall back to the input on error."""
        if not text or not text.strip():
            return ""
        try:
            return self._translate_cached(text.strip())
        except Exception as e:
            if not fallback_to_original:
                raise
            logger.warning(f"Translation failed, using original: {str(e)}")
            return text

    def get_cache_info(self) -> dict:
        """Get cache statistics"""
        info = self._translate_cached.cache_info()
        attempts = info.hits + info.misses
        return {
            'hits': info.hits,
            'misses': info.misses,
            'size': info.currsize,
            'max_size': info.maxsize,
            'hit_rate': info.hits / attempts if attempts > 0 else 0.0,
        }
|
| 582 |
+
|
| 583 |
+
|
| 584 |
+
def translate_latin_spans(text: str, translator: CachedNepaliTranslator) -> str:
    """Translate each run of Latin letters (3+ chars, may include whitespace) embedded in *text*."""
    def _substitute(match):
        latin_span = match.group(0)
        rendered = translator.translate(latin_span, fallback_to_original=True)
        return f" {rendered} "

    return re.sub(r"[A-Za-z][A-Za-z\s]{2,}", _substitute, text)
|
| 592 |
+
|
| 593 |
+
|
| 594 |
+
# ============================================================================
|
| 595 |
+
# MAIN PREPROCESSOR CLASS
|
| 596 |
+
# ============================================================================
|
| 597 |
+
|
| 598 |
+
class HateSpeechPreprocessor:
    """
    Main preprocessing pipeline for Nepali hate speech classification

    Pipeline:
        1. Extract emoji features (before any processing)
        2. Detect script type (ignoring emojis)
        3. Apply script-specific processing
        4. Replace emojis with Nepali text
        5. Normalize Devanagari
    """

    def __init__(
        self,
        model_type: Literal["xlmr", "mbert", "nepalibert"] = "xlmr",
        translate_english: bool = True,
        cache_size: int = 2000
    ):
        # Target model family (kept for downstream, model-specific handling)
        self.model_type = model_type
        self.translate_english = translate_english
        # Translator is only created when English translation is enabled
        self.translator = CachedNepaliTranslator(cache_size) if translate_english else None

    @staticmethod
    def _empty_emoji_features() -> Dict[str, int]:
        """
        Zero-valued feature dict whose keys match extract_emoji_features.

        FIX: the empty-input path previously omitted 'unknown_emoji_count',
        'has_unknown_emoji' and 'known_emoji_ratio', so empty and non-empty
        texts produced feature dicts with different schemas (which breaks
        tabular downstream use, e.g. building a DataFrame of features).
        """
        return {
            'has_hate_emoji': 0, 'has_mockery_emoji': 0, 'has_positive_emoji': 0,
            'has_sadness_emoji': 0, 'has_fear_emoji': 0, 'has_disgust_emoji': 0,
            'hate_emoji_count': 0, 'mockery_emoji_count': 0, 'positive_emoji_count': 0,
            'sadness_emoji_count': 0, 'fear_emoji_count': 0, 'disgust_emoji_count': 0,
            'total_emoji_count': 0, 'hate_to_positive_ratio': 0.0, 'has_mixed_sentiment': 0,
            'unknown_emoji_count': 0, 'has_unknown_emoji': 0, 'known_emoji_ratio': 0.0,
        }

    def preprocess(self, text: str, verbose: bool = False) -> Tuple[str, Dict[str, int]]:
        """
        Preprocess a single text

        Args:
            text: Input text
            verbose: Print processing steps

        Returns:
            Tuple of (preprocessed_text, emoji_features)
        """
        if not isinstance(text, str) or not text.strip():
            # Empty/non-string input: empty text plus a full zeroed feature dict
            return "", self._empty_emoji_features()

        original_text = text

        # Step 1: Extract emoji features (from the raw text, before cleaning)
        emoji_features = extract_emoji_features(original_text)

        # Step 2: Detect script type
        script_type, details = detect_script_type(text)

        if verbose:
            print(f"Script detected: {script_type} (confidence: {details.get('confidence', 0):.2%})")

        # Step 3: Basic cleaning (URLs, mentions, hashtags, quotes)
        text = clean_text_basic(text)

        # Step 4: Script-specific processing
        if script_type == "devanagari":
            processed = text
            if self.translate_english and self.translator:
                # Translate any embedded Latin spans into Nepali
                processed = translate_latin_spans(processed, self.translator)

        elif script_type == "romanized_nepali":
            processed = transliterate_romanized_nepali(text)

        elif script_type == "english":
            if self.translate_english and self.translator:
                processed = self.translator.translate(text, fallback_to_original=True)
            else:
                processed = text

        elif script_type == "mixed":
            # Transliterate first, then translate whatever Latin text remains
            processed = transliterate_romanized_nepali(text)
            if self.translate_english and self.translator:
                processed = translate_latin_spans(processed, self.translator)
        else:
            processed = text

        # Step 5: Replace emojis with Nepali text
        processed = replace_emojis_semantic(processed)

        # Step 6: Normalize Devanagari and collapse whitespace
        final = normalize_devanagari(processed)
        final = re.sub(r"\s+", " ", final).strip()

        if verbose:
            print(f"Original: {original_text}")
            print(f"Processed: {final}")
            print(f"Emoji features: {emoji_features}")

        return final, emoji_features

    def preprocess_batch(self, texts: List[str], verbose: bool = False, show_progress: bool = False) -> Tuple[List[str], List[Dict[str, int]]]:
        """
        Preprocess multiple texts

        Args:
            texts: List of input texts
            verbose: Print processing steps for each text
            show_progress: Show progress bar (requires tqdm)

        Returns:
            Tuple of (preprocessed_texts, emoji_features_list)
        """
        if show_progress:
            try:
                from tqdm import tqdm
                results = [self.preprocess(text, verbose=verbose) for text in tqdm(texts, desc="Preprocessing")]
            except ImportError:
                # tqdm not installed: process silently instead of failing
                results = [self.preprocess(text, verbose=verbose) for text in texts]
        else:
            results = [self.preprocess(text, verbose=verbose) for text in texts]

        texts_processed = [r[0] for r in results]
        features = [r[1] for r in results]
        return texts_processed, features

    def get_stats(self) -> dict:
        """Get preprocessor statistics (configuration plus translator cache info)"""
        stats = {
            'model_type': self.model_type,
            'translation_enabled': self.translate_english,
            'transliteration_available': TRANSLITERATION_AVAILABLE,
        }
        if self.translator:
            stats['cache_info'] = self.translator.get_cache_info()
        return stats
|
| 725 |
+
|
| 726 |
+
|
| 727 |
+
# ============================================================================
|
| 728 |
+
# CONVENIENCE FUNCTIONS FOR STREAMLIT
|
| 729 |
+
# ============================================================================
|
| 730 |
+
|
| 731 |
+
def preprocess_text(
    text: str,
    model_type: str = "xlmr",
    translate_english: bool = True,
    verbose: bool = False
) -> Tuple[str, Dict[str, int]]:
    """
    Quick preprocessing function for a single text (Streamlit-friendly).

    Note: builds a fresh HateSpeechPreprocessor on every call; reuse an
    instance directly when processing many texts.

    Args:
        text: Input text
        model_type: Model type (xlmr, mbert, nepalibert)
        translate_english: Whether to translate English
        verbose: Print processing steps

    Returns:
        Tuple of (preprocessed_text, emoji_features)
    """
    pipeline = HateSpeechPreprocessor(
        model_type=model_type,
        translate_english=translate_english,
    )
    return pipeline.preprocess(text, verbose=verbose)
|
| 754 |
+
|
| 755 |
+
|
| 756 |
+
def get_script_info(text: str) -> Dict[str, any]:
    """
    Get detailed script detection info (useful for Streamlit display).

    Returns:
        Dictionary with script type, confidence, and the raw detection details.
    """
    detected_type, meta = detect_script_type(text)
    return {
        'script_type': detected_type,
        'confidence': meta.get('confidence', 0),
        'details': meta,
    }
|
| 769 |
+
|
| 770 |
+
|
| 771 |
+
def get_emoji_info(text: str) -> Dict[str, Any]:
    """Summarize the emojis in *text*: totals, which are mapped to Nepali, and coverage."""
    found = [ch for ch in text if ch in emoji.EMOJI_DATA]
    mapped = set(EMOJI_TO_NEPALI.keys())
    known = [e for e in found if e in mapped]
    unknown = [e for e in found if e not in mapped]

    return {
        'emojis_found': found,
        'total_count': len(found),
        'known_emojis': known,
        'known_count': len(known),
        'unknown_emojis': unknown,
        'unknown_count': len(unknown),
        # Fraction of found emojis we can translate; 1.0 when there are none
        'coverage': len(known) / len(found) if found else 1.0,
    }
|
src/streamlit_app.py
CHANGED
|
@@ -1,40 +1,1426 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
|
|
|
|
| 6 |
"""
|
| 7 |
-
# Welcome to Streamlit!
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
In the meantime, below is an example of what you can do with just a few lines of code:
|
| 14 |
-
"""
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
indices = np.linspace(0, 1, num_points)
|
| 20 |
-
theta = 2 * np.pi * num_turns * indices
|
| 21 |
-
radius = indices
|
| 22 |
-
|
| 23 |
-
x = radius * np.cos(theta)
|
| 24 |
-
y = radius * np.sin(theta)
|
| 25 |
-
|
| 26 |
-
df = pd.DataFrame({
|
| 27 |
-
"x": x,
|
| 28 |
-
"y": y,
|
| 29 |
-
"idx": indices,
|
| 30 |
-
"rand": np.random.randn(num_points),
|
| 31 |
-
})
|
| 32 |
-
|
| 33 |
-
st.altair_chart(alt.Chart(df, height=700, width=700)
|
| 34 |
-
.mark_point(filled=True)
|
| 35 |
-
.encode(
|
| 36 |
-
x=alt.X("x", axis=None),
|
| 37 |
-
y=alt.Y("y", axis=None),
|
| 38 |
-
color=alt.Color("idx", legend=None, scale=alt.Scale()),
|
| 39 |
-
size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
|
| 40 |
-
))
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Nepali Hate Speech Detection - Streamlit Application
|
| 3 |
+
=====================================================
|
| 4 |
+
Complete application with preprocessing, prediction, and explainability (LIME/SHAP/Captum)
|
| 5 |
|
| 6 |
+
Run with: streamlit run main_app.py
|
| 7 |
"""
|
|
|
|
| 8 |
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
import streamlit as st
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import numpy as np
|
| 14 |
+
import torch
|
| 15 |
+
import plotly.graph_objects as go
|
| 16 |
+
import plotly.express as px
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
import json
|
| 19 |
+
import warnings
|
| 20 |
+
warnings.filterwarnings('ignore')
|
| 21 |
+
|
| 22 |
+
# Matplotlib for Nepali font support
|
| 23 |
+
import matplotlib.pyplot as plt
|
| 24 |
+
from matplotlib.font_manager import FontProperties, fontManager
|
| 25 |
+
|
| 26 |
+
# ============================================================================
# HF SPACES COMPATIBILITY — paths and environment
# ============================================================================

# Detect if running on HF Spaces (the platform sets SPACE_ID in the env).
IS_HF_SPACES = bool(os.environ.get('SPACE_ID'))

# Use /tmp for writable storage on HF Spaces (the app dir is read-only there),
# local 'data/' otherwise.
DATA_DIR = '/tmp/data' if IS_HF_SPACES else 'data'
os.makedirs(DATA_DIR, exist_ok=True)
# JSON file where per-prediction records are appended (see save_prediction_to_history).
HISTORY_FILE = os.path.join(DATA_DIR, 'prediction_history.json')

# ============================================================================
# SCRIPT PATH SETUP
# ============================================================================

# Anchor paths to this file so imports work regardless of the working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SCRIPTS_DIR = os.path.join(BASE_DIR, 'scripts')
if SCRIPTS_DIR not in sys.path:
    # Both entries are added so `scripts.foo` and bare `foo` imports resolve.
    sys.path.insert(0, BASE_DIR)
    sys.path.insert(0, SCRIPTS_DIR)

# ============================================================================
# CUSTOM MODULE IMPORTS
# ============================================================================

# Import project modules defensively: if any are missing, the app still starts
# and downstream features check CUSTOM_MODULES_AVAILABLE before using them.
try:
    from scripts.transformer_data_preprocessing import (
        HateSpeechPreprocessor,
        preprocess_text,
        get_script_info,
        get_emoji_info,
        EMOJI_TO_NEPALI
    )
    from scripts.explainability import (
        create_explainer_wrapper,
        LIMEExplainer,
        SHAPExplainer,
        check_availability as check_explainability
    )
    from scripts.captum_explainer import (
        CaptumExplainer,
        check_availability as check_captum_availability
    )
    CUSTOM_MODULES_AVAILABLE = True
except ImportError as e:
    st.error(f"⚠️ Custom modules not found: {e}")
    CUSTOM_MODULES_AVAILABLE = False
|
| 74 |
+
|
| 75 |
+
# ============================================================================
|
| 76 |
+
# PAGE CONFIGURATION
|
| 77 |
+
# ============================================================================
|
| 78 |
+
|
| 79 |
+
# Page metadata/layout. Streamlit requires set_page_config to be the first
# st.* command executed in the script.
st.set_page_config(
    page_title="Nepali Hate Content Detector",
    page_icon="🛡️",
    layout="wide",
    initial_sidebar_state="expanded"
)
|
| 85 |
+
|
| 86 |
+
# ============================================================================
|
| 87 |
+
# CUSTOM CSS
|
| 88 |
+
# ============================================================================
|
| 89 |
+
|
| 90 |
+
st.markdown("""
|
| 91 |
+
<style>
|
| 92 |
+
/* Main header */
|
| 93 |
+
.main-header {
|
| 94 |
+
font-size: 2.8rem;
|
| 95 |
+
font-weight: 700;
|
| 96 |
+
color: #1f77b4;
|
| 97 |
+
text-align: center;
|
| 98 |
+
margin-bottom: 0.5rem;
|
| 99 |
+
text-shadow: 2px 2px 4px rgba(0,0,0,0.1);
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
.sub-header {
|
| 103 |
+
text-align: center;
|
| 104 |
+
color: #666;
|
| 105 |
+
font-size: 1.1rem;
|
| 106 |
+
margin-bottom: 2rem;
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
/* Prediction boxes */
|
| 110 |
+
.prediction-box {
|
| 111 |
+
padding: 1.5rem;
|
| 112 |
+
border-radius: 15px;
|
| 113 |
+
margin: 1rem 0;
|
| 114 |
+
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
|
| 115 |
+
color: white;
|
| 116 |
+
font-weight: 600;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
.no-box { background: linear-gradient(135deg, #28a745 0%, #1e7e34 100%); }
|
| 120 |
+
.oo-box { background: linear-gradient(135deg, #ffc107 0%, #e0a800 100%); }
|
| 121 |
+
.or-box { background: linear-gradient(135deg, #dc3545 0%, #a71d2a 100%); }
|
| 122 |
+
.os-box { background: linear-gradient(135deg, #6f42c1 0%, #4a1f9e 100%); }
|
| 123 |
+
|
| 124 |
+
/* Info boxes */
|
| 125 |
+
.info-box {
|
| 126 |
+
padding: 1rem;
|
| 127 |
+
border-radius: 10px;
|
| 128 |
+
background: #f8f9fa;
|
| 129 |
+
border-left: 4px solid #007bff;
|
| 130 |
+
margin: 1rem 0;
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
/* Metrics */
|
| 134 |
+
.metric-card {
|
| 135 |
+
background: white;
|
| 136 |
+
padding: 1rem;
|
| 137 |
+
border-radius: 10px;
|
| 138 |
+
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
| 139 |
+
text-align: center;
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
/* Buttons */
|
| 143 |
+
.stButton>button {
|
| 144 |
+
border-radius: 8px;
|
| 145 |
+
font-weight: 600;
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
/* Expander */
|
| 149 |
+
.streamlit-expanderHeader {
|
| 150 |
+
font-weight: 600;
|
| 151 |
+
font-size: 1.1rem;
|
| 152 |
+
}
|
| 153 |
+
</style>
|
| 154 |
+
""", unsafe_allow_html=True)
|
| 155 |
+
|
| 156 |
+
# ============================================================================
|
| 157 |
+
# NEPALI FONT LOADING
|
| 158 |
+
# ============================================================================
|
| 159 |
+
|
| 160 |
+
@st.cache_resource
def load_nepali_font():
    """Locate and register a Devanagari-capable font for matplotlib.

    Candidate paths are tried in priority order; the first one that exists
    and loads cleanly is registered with matplotlib's font manager and
    returned as a ``FontProperties`` object for use in plot labels.

    Returns:
        FontProperties for the first usable font, or None when no candidate
        is found (a Streamlit warning is shown in that case).
    """
    font_paths = [
        # HF Spaces Docker image: the Dockerfile copies Kalimati.ttf here
        # and runs fc-cache — this path was missing from the original list.
        '/usr/local/share/fonts/nepali/Kalimati.ttf',
        # HF Spaces / Linux (installed via apt fonts-noto in Dockerfile)
        '/usr/share/fonts/truetype/noto/NotoSansDevanagari-Regular.ttf',
        '/usr/share/fonts/truetype/noto/NotoSans-Regular.ttf',
        # Local development — anchor to this file so the lookup does not
        # depend on the current working directory; keep the bare relative
        # path as a fallback for backward compatibility.
        os.path.join(BASE_DIR, 'fonts', 'Kalimati.ttf'),
        'fonts/Kalimati.ttf',
        # macOS
        '/System/Library/Fonts/Supplemental/DevanagariSangamMN.ttc',
        # Windows
        'C:\\Windows\\Fonts\\NirmalaUI.ttf',
    ]

    for font_path in font_paths:
        if os.path.exists(font_path):
            try:
                # Register with the global manager so the font also resolves
                # by family name, not just via the returned FontProperties.
                fontManager.addfont(font_path)
                return FontProperties(fname=font_path)
            except Exception:
                # Corrupt or unreadable font file — try the next candidate.
                continue

    st.warning("⚠️ Nepali font not found. Devanagari text may display as squares. "
               "Place Kalimati.ttf in 'fonts/' directory for proper display.")
    return None
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
# ============================================================================
|
| 190 |
+
# SESSION STATE INITIALIZATION
|
| 191 |
+
# ============================================================================
|
| 192 |
+
|
| 193 |
+
# Create every session-state key up front so later code can read/write them
# unconditionally (keys persist per browser session across reruns).
if 'last_prediction' not in st.session_state:
    # Result dict of the most recent single-text prediction.
    st.session_state.last_prediction = None
if 'last_text' not in st.session_state:
    # Raw input text of the most recent single-text prediction.
    st.session_state.last_text = ""
if 'batch_results' not in st.session_state:
    # Results of the last batch run — presumably a DataFrame set by the
    # batch tab; verify against the (unseen) caller.
    st.session_state.batch_results = None
if 'batch_mode' not in st.session_state:
    # How the last batch was supplied (CSV upload vs. text area) — TODO confirm.
    st.session_state.batch_mode = None
if 'csv_text_column' not in st.session_state:
    # Name of the text column chosen from an uploaded CSV.
    st.session_state.csv_text_column = None
if 'explainability_results' not in st.session_state:
    # Cached output of the last explanation run.
    st.session_state.explainability_results = None
if 'preprocessor' not in st.session_state:
    # HateSpeechPreprocessor instance, created lazily.
    st.session_state.preprocessor = None
if 'model_wrapper' not in st.session_state:
    # Explainer wrapper built lazily via create_explainer_wrapper (see
    # render_batch_explainability).
    st.session_state.model_wrapper = None
if 'nepali_font' not in st.session_state:
    # FontProperties from load_nepali_font(), or None when unavailable.
    st.session_state.nepali_font = None
if 'session_predictions' not in st.session_state:
    # Total number of predictions made in this browser session.
    st.session_state.session_predictions = 0
if 'session_class_counts' not in st.session_state:
    # Per-class tallies of predictions made in this browser session.
    st.session_state.session_class_counts = {'NO': 0, 'OO': 0, 'OR': 0, 'OS': 0}
|
| 215 |
+
|
| 216 |
+
# ============================================================================
|
| 217 |
+
# MODEL LOADING
|
| 218 |
+
# ============================================================================
|
| 219 |
+
|
| 220 |
+
@st.cache_resource(show_spinner="Loading model... this may take a minute on first run.")
def load_model_and_preprocessor():
    """Load the classification model, tokenizer, label encoder, and preprocessor.

    Resolution order:
      1. Local checkpoint — only when not on HF Spaces and the path exists.
      2. HuggingFace Hub — also the fallback when the local load fails.

    A default LabelEncoder fitted on ['NO', 'OO', 'OR', 'OS'] is used whenever
    a pickled encoder cannot be loaded from disk or the Hub.

    Returns:
        Tuple (model, tokenizer, label_encoder, preprocessor, device);
        preprocessor is None when the custom modules failed to import.
    """
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    import joblib

    hf_model_id = "UDHOV/xlm-roberta-large-nepali-hate-classification"
    local_model_path = 'models/saved_models/xlm_roberta_results/large_final'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Default label encoder used as a fallback when none can be loaded.
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(['NO', 'OO', 'OR', 'OS'])

    def _load_from_hub():
        """Load model/tokenizer from the Hub.

        Returns (model, tokenizer, encoder) where encoder is the unpickled
        label encoder or None when it could not be fetched (keeps default).
        Factored out because the original duplicated this block verbatim in
        both the fallback and the direct-Hub branches.
        """
        tok = AutoTokenizer.from_pretrained(hf_model_id)
        mdl = AutoModelForSequenceClassification.from_pretrained(hf_model_id)
        mdl.to(device).eval()

        encoder = None
        try:
            from huggingface_hub import hf_hub_download
            le_file = hf_hub_download(repo_id=hf_model_id, filename="label_encoder.pkl")
            encoder = joblib.load(le_file)
        except Exception:
            pass  # Use default label encoder

        st.success(f"✅ Model loaded from HuggingFace Hub on {device}")
        return mdl, tok, encoder

    # Try local model first (only relevant for local dev), then HF Hub.
    if not IS_HF_SPACES and os.path.exists(local_model_path):
        try:
            tokenizer = AutoTokenizer.from_pretrained(local_model_path)
            model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
            model.to(device).eval()

            le_path = os.path.join(local_model_path, 'label_encoder.pkl')
            if os.path.exists(le_path):
                le = joblib.load(le_path)

            st.success(f"✅ Model loaded from local path on {device}")
        except Exception as e:
            st.warning(f"⚠️ Local model failed: {e}. Falling back to HuggingFace Hub...")
            model, tokenizer, hub_le = _load_from_hub()
            if hub_le is not None:
                le = hub_le
    else:
        # HF Spaces or local path not found — load directly from the Hub.
        model, tokenizer, hub_le = _load_from_hub()
        if hub_le is not None:
            le = hub_le

    # Preprocessor handles script detection, transliteration, translation and
    # emoji mapping before tokenization.
    if CUSTOM_MODULES_AVAILABLE:
        preprocessor = HateSpeechPreprocessor(
            model_type="xlmr",
            translate_english=True,
            cache_size=2000
        )
    else:
        preprocessor = None

    return model, tokenizer, le, preprocessor, device
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
# ============================================================================
|
| 291 |
+
# PREDICTION FUNCTIONS
|
| 292 |
+
# ============================================================================
|
| 293 |
+
|
| 294 |
+
def predict_text(text, model, tokenizer, label_encoder, preprocessor, max_length=256):
    """Preprocess *text* and classify it with the fine-tuned model.

    Args:
        text: Raw input string.
        model: Sequence-classification model (used on its own device).
        tokenizer: HuggingFace tokenizer matching the model.
        label_encoder: Fitted encoder exposing ``classes_``.
        preprocessor: Optional HateSpeechPreprocessor; when None the raw
            text is tokenized as-is.
        max_length: Tokenizer truncation/padding length.

    Returns:
        Dict with prediction, confidence, per-class probabilities, the
        preprocessed text, and emoji features. When preprocessing yields an
        empty string, a zero-confidence 'NO' result with an 'error' key is
        returned instead of running the model.
    """
    device = next(model.parameters()).device

    # Normalization step (script detection, transliteration, emoji mapping).
    if preprocessor:
        preprocessed, emoji_features = preprocessor.preprocess(text, verbose=False)
    else:
        preprocessed, emoji_features = text, {}

    # Guard: nothing left to classify after preprocessing.
    if not preprocessed.strip():
        return {
            'prediction': 'NO',
            'confidence': 0.0,
            'probabilities': {cls: 0.0 for cls in label_encoder.classes_},
            'preprocessed_text': '',
            'emoji_features': emoji_features,
            'error': 'Empty text after preprocessing',
        }

    # Encode to fixed-length tensors on the model's device.
    encoded = tokenizer(
        preprocessed,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # Forward pass without gradients; softmax over the single batch row.
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
        class_probs = torch.softmax(logits, dim=-1)[0]

    scores = class_probs.cpu().numpy()
    best = np.argmax(scores)

    return {
        'prediction': label_encoder.classes_[best],
        'confidence': float(scores[best]),
        'probabilities': {
            cls: float(p) for cls, p in zip(label_encoder.classes_, scores)
        },
        'preprocessed_text': preprocessed,
        'emoji_features': emoji_features,
    }
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
# ============================================================================
|
| 350 |
+
# VISUALIZATION FUNCTIONS
|
| 351 |
+
# ============================================================================
|
| 352 |
+
|
| 353 |
+
def plot_probabilities(probabilities):
    """Render the per-class probability distribution as a Plotly bar chart.

    Args:
        probabilities: Mapping of class label -> probability in [0, 1].

    Returns:
        plotly.graph_objects.Figure with one colored bar per class.
    """
    # Fixed color per known label; unknown labels fall back to neutral grey.
    class_colors = {
        'NO': '#28a745',
        'OO': '#ffc107',
        'OR': '#dc3545',
        'OS': '#6f42c1'
    }

    labels = list(probabilities)
    values = [probabilities[lbl] for lbl in labels]

    bars = go.Bar(
        x=labels,
        y=values,
        marker_color=[class_colors.get(lbl, '#6c757d') for lbl in labels],
        text=[f'{v:.2%}' for v in values],
        textposition='outside',
        hovertemplate='%{x}<br>Probability: %{y:.4f}<extra></extra>'
    )

    fig = go.Figure(data=[bars])
    fig.update_layout(
        title="Class Probabilities",
        xaxis_title="Class",
        yaxis_title="Probability",
        yaxis_range=[0, 1.1],  # headroom so the outside text labels fit
        height=400,
        showlegend=False,
        template='plotly_white'
    )

    return fig
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def get_label_description(label):
    """Map a predicted class label to a short human-readable explanation.

    Args:
        label: One of 'NO', 'OO', 'OR', 'OS' (anything else is tolerated).

    Returns:
        Description string, or 'Unknown category' for unrecognized labels.
    """
    label_info = {
        'NO': '✅ Non-Offensive: The text does not contain hate speech or offensive content.',
        'OO': '⚠️ Other-Offensive: Contains general offensive language but not targeted hate.',
        'OR': '🚫 Offensive-Racist: Contains hate speech targeting race, ethnicity, or religion.',
        'OS': '🚫 Offensive-Sexist: Contains hate speech targeting gender or sexuality.'
    }
    try:
        return label_info[label]
    except KeyError:
        return 'Unknown category'
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
# ============================================================================
|
| 402 |
+
# HISTORY MANAGEMENT
|
| 403 |
+
# ============================================================================
|
| 404 |
+
|
| 405 |
+
def save_prediction_to_history(text, result, feedback=None):
    """Append one prediction record to the JSON history file.

    Args:
        text: Original input text.
        result: Prediction dict as returned by predict_text().
        feedback: Optional user feedback to store with the record.

    Returns:
        True when the record was written, False on write failure (the error
        is surfaced via st.error rather than raised).
    """
    entry = {
        'timestamp': datetime.now().isoformat(),
        'text': text,
        'prediction': result.get('prediction'),
        'confidence': result.get('confidence'),
        'probabilities': result.get('probabilities'),
        'preprocessed_text': result.get('preprocessed_text'),
        'emoji_features': result.get('emoji_features', {}),
        'feedback': feedback
    }

    # Load existing history; an unreadable file is treated as empty.
    history = []
    if os.path.exists(HISTORY_FILE):
        try:
            with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
                history = json.load(f)
        except Exception:
            history = []

    # Fix: valid JSON that is not a list (e.g. a dict from a corrupt or
    # foreign file) would previously crash on .append — start fresh instead.
    if not isinstance(history, list):
        history = []

    # Append and rewrite the whole file.
    history.append(entry)

    try:
        with open(HISTORY_FILE, 'w', encoding='utf-8') as f:
            json.dump(history, f, ensure_ascii=False, indent=2)
        return True
    except Exception as e:
        st.error(f"Failed to save history: {e}")
        return False
|
| 437 |
+
|
| 438 |
+
|
| 439 |
+
# ============================================================================
|
| 440 |
+
# BATCH EXPLAINABILITY HELPER
|
| 441 |
+
# ============================================================================
|
| 442 |
+
|
| 443 |
+
def render_batch_explainability(results_df, text_column, model, tokenizer, label_encoder,
                                preprocessor, nepali_font, explainability_available,
                                captum_available, mode_key="batch"):
    """Render an expander that explains one selected row of a batch result.

    Args:
        results_df: DataFrame of batch predictions; must contain *text_column*
            and a 'Prediction' column.
        text_column: Name of the column holding the original texts.
        model, tokenizer, label_encoder, preprocessor: Loaded inference stack
            (see load_model_and_preprocessor).
        nepali_font: FontProperties for Devanagari rendering, or None.
        explainability_available: Dict with boolean 'lime' and 'shap' keys.
        captum_available: Whether the Captum explainer can be used.
        mode_key: Prefix for Streamlit widget keys so multiple instances of
            this UI on one page do not collide.
    """
    # Bail out early when the project modules or all methods are missing.
    if not CUSTOM_MODULES_AVAILABLE:
        st.warning("⚠️ Explainability not available.")
        return

    if not (explainability_available['lime'] or explainability_available['shap'] or captum_available):
        st.warning("⚠️ No explainability methods available.")
        return

    with st.expander("💡 Explain Individual Results", expanded=False):
        st.markdown("**Select a text from the batch to explain:**")

        # Truncated previews keep the selectbox readable for long texts.
        text_options = [f"Row {idx}: {str(row[text_column])[:50]}..." for idx, row in results_df.iterrows()]
        selected_idx = st.selectbox(
            "Choose text:",
            range(len(text_options)),
            format_func=lambda x: text_options[x],
            key=f"{mode_key}_select"
        )

        # NOTE(review): selectbox returns a positional index, so .iloc is
        # used here even though the preview labels show the DataFrame index.
        selected_text = str(results_df.iloc[selected_idx][text_column])
        selected_pred = results_df.iloc[selected_idx]['Prediction']

        st.write(f"**Selected:** {selected_text}")
        st.write(f"**Prediction:** {selected_pred}")

        # Offer only the methods whose backing libraries imported cleanly.
        available_methods = []
        if explainability_available['lime']:
            available_methods.append("LIME")
        if explainability_available['shap']:
            available_methods.append("SHAP")
        if captum_available:
            available_methods.append("Captum (IG)")

        if not available_methods:
            st.warning("⚠️ No explainability methods available.")
            return

        explain_method = st.selectbox(
            "Explanation method:",
            available_methods,
            key=f"{mode_key}_method"
        )

        if st.button("🔍 Generate Explanation", key=f"{mode_key}_explain_btn"):
            with st.spinner("Generating explanation..."):
                try:
                    # The wrapper is expensive to build, so it is cached in
                    # session state and shared across reruns.
                    if st.session_state.model_wrapper is None:
                        st.session_state.model_wrapper = create_explainer_wrapper(
                            model, tokenizer, label_encoder, preprocessor
                        )

                    wrapper = st.session_state.model_wrapper
                    # Strip straight and curly quotes — presumably they break
                    # the explainers' tokenization; confirm against the
                    # explainability module.
                    clean_selected = selected_text.replace('"', '').replace("'", '').replace('\u201c', '').replace('\u201d', '')
                    # NOTE(review): preprocessed/emoji_features are unused —
                    # the analysis dict below is what feeds the explainers.
                    preprocessed, emoji_features = preprocessor.preprocess(clean_selected)
                    analysis = wrapper.predict_with_analysis(clean_selected)

                    if explain_method == "LIME":
                        lime_exp = LIMEExplainer(wrapper, nepali_font=nepali_font)
                        # num_samples=200 trades fidelity for interactive speed.
                        result = lime_exp.explain_and_visualize(
                            analysis['original_text'],
                            analysis['preprocessed_text'],
                            save_path=None,
                            show=False,
                            num_samples=200
                        )
                        st.subheader("LIME Explanation")
                        st.pyplot(result['figure'])
                        st.markdown("---")
                        st.markdown("**📊 Feature Importance Details:**")
                        word_scores = result['explanation']['word_scores']
                        if word_scores:
                            df = pd.DataFrame(word_scores, columns=['Word', 'Score'])
                            df = df.sort_values('Score', ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)
                        else:
                            st.warning("No word scores available")

                    elif explain_method == "SHAP":
                        shap_exp = SHAPExplainer(wrapper, nepali_font=nepali_font)
                        # use_fallback lets the explainer pick an alternative
                        # backend when exact SHAP values are unavailable.
                        result = shap_exp.explain_and_visualize(
                            analysis['original_text'],
                            analysis['preprocessed_text'],
                            save_path=None,
                            show=False,
                            use_fallback=True
                        )
                        st.subheader("SHAP Explanation")
                        st.pyplot(result['figure'])
                        st.markdown("---")
                        st.markdown("**📊 Attribution Details:**")
                        st.write(f"**Method used:** {result['explanation']['method_used']}")
                        word_scores = result['explanation']['word_scores']
                        if word_scores:
                            df = pd.DataFrame(word_scores, columns=['Word', 'Score'])
                            # Sort by magnitude — SHAP scores are signed.
                            df = df.sort_values('Score', key=lambda x: abs(x), ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)
                        else:
                            st.warning("No word scores available")

                    elif explain_method == "Captum (IG)":
                        # Captum works on the raw model, not the wrapper.
                        captum_exp = CaptumExplainer(
                            model, tokenizer, label_encoder, preprocessor,
                            emoji_to_nepali_map=EMOJI_TO_NEPALI
                        )
                        # target=None: explain the predicted class;
                        # n_steps=50: integration steps for IG.
                        result = captum_exp.explain_and_visualize(
                            analysis['original_text'],
                            target=None,
                            n_steps=50,
                            save_dir=None,
                            show=False,
                            nepali_font=nepali_font
                        )
                        st.subheader("Captum Integrated Gradients")
                        col1, col2 = st.columns(2)
                        with col1:
                            st.markdown("**Bar Chart**")
                            st.pyplot(result['bar_chart'])
                        with col2:
                            st.markdown("**Heatmap**")
                            st.pyplot(result['heatmap'])
                        st.markdown("---")
                        st.markdown("**📊 Attribution Details:**")
                        # Small delta indicates the IG approximation converged.
                        st.write(f"**Convergence Delta:** {result['explanation']['convergence_delta']:.6f}")
                        word_attrs = result['explanation']['word_attributions']
                        if word_attrs:
                            df = pd.DataFrame(word_attrs, columns=['Word', 'Abs Score', 'Signed Score'])
                            df = df.sort_values('Abs Score', ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)
                        else:
                            st.warning("No word attributions available")

                except Exception as e:
                    # Surface the full traceback in the UI for debugging.
                    st.error(f"❌ Explanation failed: {str(e)}")
                    st.markdown("**🐛 Error Details:**")
                    import traceback
                    st.code(traceback.format_exc())
|
| 583 |
+
|
| 584 |
+
|
| 585 |
+
# ============================================================================
|
| 586 |
+
# MAIN APPLICATION
|
| 587 |
+
# ============================================================================
|
| 588 |
+
|
| 589 |
+
def main():
    """Main application entry point.

    Renders the full Streamlit UI: loads the Nepali font and the
    XLM-RoBERTa model/preprocessor, builds the sidebar (about, features,
    font info, session/all-time statistics), then four tabs:
    single prediction, explainability (LIME/SHAP/Captum), batch analysis
    (text area or CSV upload), and prediction history.

    NOTE(review): relies on module-level globals that are defined outside
    this view (st, pd, px, os, json, datetime, HISTORY_FILE,
    CUSTOM_MODULES_AVAILABLE, load_nepali_font, load_model_and_preprocessor,
    predict_text, the explainer classes, etc.) — confirm against the file top.
    """

    # Load Nepali font once and cache it in session state so reruns reuse it.
    if st.session_state.nepali_font is None:
        st.session_state.nepali_font = load_nepali_font()

    nepali_font = st.session_state.nepali_font

    # Header
    st.markdown('<h1 class="main-header">🛡️ Nepali Hate Content Detector</h1>', unsafe_allow_html=True)
    st.markdown("""
    <div class="sub-header">
    AI-powered hate speech detection for Nepali text with advanced explainability
    <br>
    <strong>XLM-RoBERTa Large</strong> fine-tuned on Nepali social media data
    </div>
    """, unsafe_allow_html=True)

    # ========================================================================
    # SIDEBAR
    # ========================================================================

    with st.sidebar:
        st.header("ℹ️ About")
        st.markdown("""
        **Model**: XLM-RoBERTa Large
        **Task**: Multi-class hate speech detection
        **Language**: Nepali (Devanagari & Romanized)

        **Classes:**
        - **NO**: Non-offensive
        - **OO**: General offensive
        - **OR**: Racist/ethnic hate
        - **OS**: Sexist/gender hate
        """)

        st.markdown("---")

        st.header("🔧 Features")
        st.markdown("""
        ✅ **Preprocessing**
        - Script detection
        - Transliteration
        - Translation
        - Emoji mapping

        ✅ **Explainability**
        - LIME
        - SHAP
        - Captum (IG)

        ✅ **Batch Analysis**
        - CSV upload
        - Text area input
        """)

        st.markdown("---")

        st.header("🎨 Font Settings")
        with st.expander("Nepali Font Info", expanded=False):
            st.markdown(f"""
            **Status:** {'✅ Loaded' if nepali_font else '❌ Not loaded'}

            **Fix squares in Devanagari:**
            1. Download Kalimati.ttf
            2. Create `fonts/` directory
            3. Place font file there
            4. Restart app
            """)

        st.markdown("---")

        st.header("📊 Statistics")

        # Session Statistics — counters live only for this browser session.
        st.subheader("🔄 Current Session")
        if st.session_state.session_predictions > 0:
            st.metric("Predictions", st.session_state.session_predictions)
            session_counts = st.session_state.session_class_counts
            if any(count > 0 for count in session_counts.values()):
                st.write("**Session Distribution:**")
                for label in ['NO', 'OO', 'OR', 'OS']:
                    count = session_counts.get(label, 0)
                    if count > 0:
                        pct = (count / st.session_state.session_predictions) * 100
                        st.write(f"• {label}: {count} ({pct:.0f}%)")
        else:
            st.info("No predictions in this session yet.")

        st.markdown("---")

        # History Statistics — persisted to HISTORY_FILE on disk (JSON list).
        st.subheader("📚 All Time")
        if os.path.exists(HISTORY_FILE):
            try:
                with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
                    history = json.load(f)
                if history:
                    st.metric("Total Saved", len(history))
                    pred_counts = pd.Series([h['prediction'] for h in history]).value_counts()
                    st.write("**Distribution:**")
                    for label, count in pred_counts.items():
                        st.write(f"• {label}: {count}")
                else:
                    st.info("No saved predictions yet.")
            except Exception as e:
                # Corrupt/unreadable history should not break the sidebar.
                st.warning("⚠️ History file error")
                with st.expander("Error details"):
                    st.code(str(e))
        else:
            st.info("📝 No history file\n\nEnable 'Save to history' in Tab 1 to track predictions.")

        st.markdown("---")
        st.markdown("""
        <div style='text-align: center; font-size: 0.9rem; color: #666;'>
        <a href='https://huggingface.co/UDHOV/xlm-roberta-large-nepali-hate-classification' target='_blank'>
        Model on HuggingFace 🤗
        </a>
        </div>
        """, unsafe_allow_html=True)

    # ========================================================================
    # LOAD MODEL
    # ========================================================================

    with st.spinner("Loading model..."):
        model, tokenizer, label_encoder, preprocessor, device = load_model_and_preprocessor()

    if model is None:
        st.error("❌ Failed to load model!")
        st.stop()

    # Check explainability availability; degrade gracefully when the custom
    # scripts package could not be imported.
    explainability_available = check_explainability() if CUSTOM_MODULES_AVAILABLE else {'lime': False, 'shap': False}
    captum_available = check_captum_availability() if CUSTOM_MODULES_AVAILABLE else False

    # ========================================================================
    # TABS
    # ========================================================================

    tabs = st.tabs([
        "🔍 Single Prediction",
        "💡 Explainability",
        "📝 Batch Analysis",
        "📈 History"
    ])

    # ========================================================================
    # TAB 1: SINGLE PREDICTION
    # ========================================================================

    with tabs[0]:
        st.subheader("🔍 Single Text Analysis")

        col1, col2 = st.columns([2, 1])

        with col1:
            text_input = st.text_area(
                "Enter Nepali Text",
                height=200,
                placeholder="यहाँ आफ्नो पाठ लेख्नुहोस्...\nOr enter romanized Nepali: ma khusi xu\nOr English: This is a test",
                help="Enter text in Devanagari, Romanized Nepali, or English."
            )

            col_a, col_b = st.columns(2)
            with col_a:
                analyze_button = st.button("🔍 Analyze Text", type="primary", use_container_width=True)
            with col_b:
                save_to_history = st.checkbox("Save to history", value=True)

        with col2:
            st.markdown("##### 💡 Quick Info")
            st.info("""
            **Supported:**
            - Devanagari: नेपाली
            - Romanized: ma nepali xu
            - English: I am Nepali
            - Mixed scripts
            - Emojis: 😀😡🙏

            **Auto-processing:**
            - Script detection
            - Transliteration
            - Translation
            - Emoji → Nepali words
            - URL/mention removal
            """)

        if analyze_button and text_input.strip():
            with st.spinner("🔄 Analyzing text..."):
                result = predict_text(
                    text_input, model, tokenizer,
                    label_encoder, preprocessor
                )

            # Keep the latest prediction around so the Explainability tab can
            # pre-fill its text area with the same input.
            st.session_state.last_prediction = result
            st.session_state.last_text = text_input

            if 'prediction' in result:
                st.session_state.session_predictions += 1
                pred_label = result['prediction']
                if pred_label in st.session_state.session_class_counts:
                    st.session_state.session_class_counts[pred_label] += 1

            if save_to_history:
                save_prediction_to_history(text_input, result)

            if 'error' in result:
                st.warning(f"⚠️ {result['error']}")
                st.stop()

            st.markdown("---")
            st.subheader("📊 Analysis Results")

            pred_label = result['prediction']
            confidence = result['confidence']

            # Map class label → CSS box style; unknown labels fall back to 'no-box'.
            box_class = {
                'NO': 'no-box',
                'OO': 'oo-box',
                'OR': 'or-box',
                'OS': 'os-box'
            }.get(pred_label, 'no-box')

            st.markdown(f"""
            <div class='prediction-box {box_class}'>
            <h2 style='margin:0;'>Prediction: {pred_label}</h2>
            <p style='font-size:1.3rem; margin:0.5rem 0;'>
            Confidence: <strong>{confidence:.2%}</strong>
            </p>
            <p style='margin:0; font-size:1rem;'>{get_label_description(pred_label)}</p>
            </div>
            """, unsafe_allow_html=True)

            st.plotly_chart(plot_probabilities(result['probabilities']), use_container_width=True)

            with st.expander("🔍 Preprocessing Details", expanded=False):
                col1, col2, col3 = st.columns(3)

                with col1:
                    st.markdown("**Original Text:**")
                    st.code(text_input, language=None)

                with col2:
                    st.markdown("**Preprocessed:**")
                    st.code(result['preprocessed_text'], language=None)

                with col3:
                    if CUSTOM_MODULES_AVAILABLE and preprocessor:
                        script_info = get_script_info(text_input)
                        st.markdown("**Script Detected:**")
                        st.write(f"• Type: {script_info['script_type']}")
                        # Clamp to 100% for display in case the detector
                        # reports a confidence slightly above 1.0.
                        confidence_pct = min(script_info['confidence'] * 100, 100.0)
                        st.write(f"• Confidence: {confidence_pct:.1f}%")

            # Show emoji breakdown only when at least one emoji was found.
            if result.get('emoji_features', {}).get('total_emoji_count', 0) > 0:
                with st.expander("😊 Emoji Analysis", expanded=False):
                    features = result['emoji_features']

                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Total Emojis", features['total_emoji_count'])
                        st.metric("Hate Emojis", features['hate_emoji_count'])
                    with col2:
                        st.metric("Positive Emojis", features['positive_emoji_count'])
                        st.metric("Mockery Emojis", features['mockery_emoji_count'])
                    with col3:
                        st.metric("Sadness Emojis", features['sadness_emoji_count'])
                        st.metric("Fear Emojis", features['fear_emoji_count'])

                    if CUSTOM_MODULES_AVAILABLE:
                        emoji_info = get_emoji_info(text_input)
                        if emoji_info['emojis_found']:
                            st.markdown("**Emojis Found:**")
                            st.write(" ".join(emoji_info['emojis_found']))

            with st.expander("📊 Detailed Probabilities", expanded=False):
                prob_df = pd.DataFrame({
                    'Class': list(result['probabilities'].keys()),
                    'Probability': list(result['probabilities'].values())
                })
                prob_df['Probability'] = prob_df['Probability'].apply(lambda x: f"{x:.4f}")
                st.dataframe(prob_df, hide_index=True, use_container_width=True)

    # ========================================================================
    # TAB 2: EXPLAINABILITY
    # ========================================================================

    with tabs[1]:
        st.subheader("💡 Model Explainability")

        if not CUSTOM_MODULES_AVAILABLE:
            st.error("❌ Explainability modules not available. Please check scripts directory.")
            st.stop()

        st.info(f"""
        **Available Methods:**
        - LIME: {'✅' if explainability_available['lime'] else '❌ (install: pip install lime)'}
        - SHAP: {'✅' if explainability_available['shap'] else '❌ (install: pip install shap)'}
        - Captum: {'✅' if captum_available else '❌ (install: pip install captum)'}
        """)

        explain_text = st.text_area(
            "Enter text to explain",
            height=150,
            value=st.session_state.last_text if st.session_state.last_text else "",
            placeholder="Enter Nepali text..."
        )

        # Build the method list from whatever libraries actually imported.
        available_methods = []
        if explainability_available['lime']:
            available_methods.append("LIME")
        if explainability_available['shap']:
            available_methods.append("SHAP")
        if captum_available:
            available_methods.append("Captum (IG)")

        if not available_methods:
            st.warning("⚠️ No explainability methods available. Please install required packages.")
            st.code("pip install lime shap captum", language="bash")
            st.stop()

        method = st.selectbox("Select explanation method", available_methods)

        # Per-method configuration. NOTE: each config variable (num_samples,
        # use_fallback, n_steps) is only defined when its method is selected;
        # the branches below rely on the same `method` value so this is safe.
        with st.expander("⚙️ Configuration", expanded=False):
            if method == "LIME":
                num_samples = st.slider("Number of samples", 100, 500, 200, 50)
            elif method == "SHAP":
                use_fallback = st.checkbox("Use fallback if SHAP fails", value=True)
            elif method == "Captum (IG)":
                n_steps = st.slider("Integration steps", 10, 100, 50, 10)

        explain_button = st.button("🔍 Generate Explanation", type="primary", use_container_width=True)

        if explain_button and explain_text.strip():
            with st.spinner("Generating explanation..."):
                # The explainer wrapper is expensive to build; cache it in
                # session state across reruns.
                if st.session_state.model_wrapper is None:
                    st.session_state.model_wrapper = create_explainer_wrapper(
                        model, tokenizer, label_encoder, preprocessor
                    )

                wrapper = st.session_state.model_wrapper
                preprocessed, emoji_features = preprocessor.preprocess(explain_text)
                analysis = wrapper.predict_with_analysis(explain_text)

                st.success(f"**Prediction:** {analysis['predicted_label']} ({analysis['confidence']:.2%})")

                col1, col2 = st.columns(2)
                with col1:
                    st.write("**Original:**", explain_text)
                with col2:
                    st.write("**Preprocessed:**", preprocessed)

                st.markdown("---")

                try:
                    if method == "LIME":
                        lime_exp = LIMEExplainer(wrapper, nepali_font=nepali_font)
                        result = lime_exp.explain_and_visualize(
                            analysis['original_text'],
                            analysis['preprocessed_text'],
                            save_path=None,
                            show=False,
                            num_samples=num_samples
                        )
                        st.subheader("LIME Explanation")
                        st.pyplot(result['figure'])
                        with st.expander("📊 Feature Importance Details"):
                            word_scores = result['explanation']['word_scores']
                            df = pd.DataFrame(word_scores, columns=['Word', 'Score'])
                            df = df.sort_values('Score', ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)

                    elif method == "SHAP":
                        shap_exp = SHAPExplainer(wrapper, nepali_font=nepali_font)
                        result = shap_exp.explain_and_visualize(
                            analysis['original_text'],
                            analysis['preprocessed_text'],
                            save_path=None,
                            show=False,
                            use_fallback=use_fallback
                        )
                        st.subheader("SHAP Explanation")
                        st.pyplot(result['figure'])
                        with st.expander("📊 Attribution Details"):
                            st.write(f"**Method used:** {result['explanation']['method_used']}")
                            word_scores = result['explanation']['word_scores']
                            df = pd.DataFrame(word_scores, columns=['Word', 'Score'])
                            # Sort by magnitude so strong negative scores rank too.
                            df = df.sort_values('Score', key=lambda x: abs(x), ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)

                    elif method == "Captum (IG)":
                        captum_exp = CaptumExplainer(
                            model, tokenizer, label_encoder, preprocessor,
                            emoji_to_nepali_map=EMOJI_TO_NEPALI
                        )
                        result = captum_exp.explain_and_visualize(
                            analysis['original_text'],
                            target=None,
                            n_steps=n_steps,
                            save_dir=None,
                            show=False,
                            nepali_font=nepali_font
                        )
                        st.subheader("Captum Integrated Gradients")
                        col1, col2 = st.columns(2)
                        with col1:
                            st.markdown("**Bar Chart**")
                            st.pyplot(result['bar_chart'])
                        with col2:
                            st.markdown("**Heatmap**")
                            st.pyplot(result['heatmap'])
                        with st.expander("📊 Attribution Details"):
                            st.write(f"**Convergence Delta:** {result['explanation']['convergence_delta']:.6f}")
                            word_attrs = result['explanation']['word_attributions']
                            df = pd.DataFrame(word_attrs, columns=['Word', 'Abs Score', 'Signed Score'])
                            df = df.sort_values('Abs Score', ascending=False)
                            st.dataframe(df, hide_index=True, use_container_width=True)

                except Exception as e:
                    st.error(f"❌ Explanation failed: {str(e)}")
                    with st.expander("🐛 Error Details"):
                        st.exception(e)

    # ========================================================================
    # TAB 3: BATCH ANALYSIS
    # ========================================================================

    with tabs[2]:
        st.subheader("📝 Batch Analysis")

        st.markdown("### 📥 Download Example Files")
        col1, col2 = st.columns(2)

        with col1:
            example_csv_data = {
                'text': [
                    'यो राम्रो छ',
                    'तिमी मुर्ख हौ',
                    'मुस्लिम हरु सबै खराब छन्',
                    'केटीहरु घरमा बस्नु पर्छ',
                    'नमस्ते, कस्तो छ?'
                ]
            }
            example_csv = pd.DataFrame(example_csv_data).to_csv(index=False)
            st.download_button(
                label="📄 Download Example CSV",
                data=example_csv,
                file_name="example_batch.csv",
                mime="text/csv",
                use_container_width=True
            )

        with col2:
            example_text = "यो राम्रो छ\nतिमी मुर्ख हौ\nमुस्लिम हरु सबै खराब छन्\nकेटीहरु घरमा बस्नु पर्छ\nनमस्ते, कस्तो छ?"
            st.download_button(
                label="📝 Download Example Text",
                data=example_text,
                file_name="example_batch.txt",
                mime="text/plain",
                use_container_width=True
            )

        st.markdown("---")

        input_method = st.radio("Input method:", ["Text Area", "CSV Upload"])

        # ---- TEXT AREA ----
        if input_method == "Text Area":
            st.info("💡 Enter one text per line")

            batch_text = st.text_area(
                "Enter texts (one per line)",
                height=250,
                placeholder="यो राम्रो छ\nतिमी मुर्ख हौ\n..."
            )

            if st.button("🚀 Analyze Batch", type="primary"):
                if batch_text.strip():
                    texts = [line.strip() for line in batch_text.split('\n') if line.strip()]

                    with st.spinner(f"Analyzing {len(texts)} texts..."):
                        results = []
                        progress_bar = st.progress(0)

                        for idx, text in enumerate(texts):
                            try:
                                result = predict_text(
                                    text, model, tokenizer,
                                    label_encoder, preprocessor
                                )
                                results.append({
                                    'Text': text[:60] + '...' if len(text) > 60 else text,
                                    'Full_Text': text,
                                    'Prediction': result['prediction'],
                                    'Confidence': result['confidence'],
                                    'Preprocessed': result['preprocessed_text']
                                })
                            except Exception as e:
                                # A failed row is recorded, not fatal — the
                                # rest of the batch continues.
                                results.append({
                                    'Text': text[:60],
                                    'Full_Text': text,
                                    'Prediction': 'Error',
                                    'Confidence': 0.0,
                                    'Preprocessed': str(e)
                                })
                            progress_bar.progress((idx + 1) / len(texts))

                        # Stash results in session state so they survive the
                        # st.rerun() below and render outside the button block.
                        st.session_state.batch_results = pd.DataFrame(results)
                        st.session_state.batch_mode = 'text_area'

                        for result in results:
                            if result['Prediction'] != 'Error':
                                st.session_state.session_predictions += 1
                                pred_label = result['Prediction']
                                if pred_label in st.session_state.session_class_counts:
                                    st.session_state.session_class_counts[pred_label] += 1

                        st.rerun()
                else:
                    st.warning("Please enter some texts.")

            # Display results outside button block
            if (st.session_state.batch_results is not None and
                    st.session_state.get('batch_mode') == 'text_area'):

                results_df = st.session_state.batch_results

                st.success(f"✅ Analyzed {len(results_df)} texts!")

                display_df = results_df[['Text', 'Prediction', 'Confidence']].copy()
                display_df['Confidence'] = display_df['Confidence'].apply(lambda x: f"{x:.2%}")
                st.dataframe(display_df, use_container_width=True, hide_index=True, height=400)

                st.markdown("---")
                st.subheader("📊 Summary Statistics")

                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("Total Texts", len(results_df))
                    st.metric("Avg Confidence", f"{results_df['Confidence'].mean():.2%}")
                with col2:
                    summary = results_df['Prediction'].value_counts()
                    fig = px.pie(
                        values=summary.values,
                        names=summary.index,
                        title="Prediction Distribution",
                        color_discrete_sequence=px.colors.qualitative.Set2
                    )
                    st.plotly_chart(fig, use_container_width=True)
                with col3:
                    st.markdown("**Class Breakdown:**")
                    for label, count in summary.items():
                        pct = count / len(results_df) * 100
                        st.write(f"• {label}: {count} ({pct:.1f}%)")

                st.markdown("---")
                # Download carries the untruncated text column.
                download_df = results_df[['Full_Text', 'Prediction', 'Confidence', 'Preprocessed']].copy()
                download_df.columns = ['Text', 'Prediction', 'Confidence', 'Preprocessed']
                csv = download_df.to_csv(index=False)

                col_download, col_explain = st.columns(2)
                with col_download:
                    st.download_button(
                        label="📥 Download Results CSV",
                        data=csv,
                        file_name=f"batch_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                        mime="text/csv",
                        use_container_width=True,
                        key="download_batch_text"
                    )
                with col_explain:
                    if st.button("💡 Explain Selected", use_container_width=True, key="hint_batch_text"):
                        st.info("👇 Select a text below to explain")

                render_batch_explainability(
                    results_df=results_df,
                    text_column='Full_Text',
                    model=model,
                    tokenizer=tokenizer,
                    label_encoder=label_encoder,
                    preprocessor=preprocessor,
                    nepali_font=nepali_font,
                    explainability_available=explainability_available,
                    captum_available=captum_available,
                    mode_key="text_area"
                )

        # ---- CSV UPLOAD ----
        else:
            st.info("💡 Upload CSV with a 'text' column")

            uploaded_file = st.file_uploader("Choose CSV file", type=['csv'])

            if uploaded_file:
                try:
                    df = pd.read_csv(uploaded_file)
                    st.write("📄 **File Preview:**")
                    st.dataframe(df.head(10), use_container_width=True)

                    text_column = st.selectbox("Select text column:", df.columns)

                    if st.button("🚀 Analyze CSV", type="primary"):
                        texts = df[text_column].astype(str).tolist()

                        with st.spinner(f"Analyzing {len(texts)} texts..."):
                            predictions = []
                            confidences = []
                            preprocessed_list = []
                            progress_bar = st.progress(0)

                            for idx, text in enumerate(texts):
                                try:
                                    result = predict_text(
                                        str(text), model, tokenizer,
                                        label_encoder, preprocessor
                                    )
                                    predictions.append(result['prediction'])
                                    confidences.append(result['confidence'])
                                    preprocessed_list.append(result['preprocessed_text'])
                                except Exception as e:
                                    predictions.append('Error')
                                    confidences.append(0.0)
                                    preprocessed_list.append(str(e))
                                progress_bar.progress((idx + 1) / len(texts))

                            # Annotate the uploaded frame in place with results.
                            df['Prediction'] = predictions
                            df['Confidence'] = confidences
                            df['Preprocessed'] = preprocessed_list

                            st.session_state.batch_results = df
                            st.session_state.batch_mode = 'csv'
                            st.session_state.csv_text_column = text_column

                            for pred in predictions:
                                if pred != 'Error':
                                    st.session_state.session_predictions += 1
                                    if pred in st.session_state.session_class_counts:
                                        st.session_state.session_class_counts[pred] += 1

                            st.rerun()

                    # Display results outside button block
                    if (st.session_state.batch_results is not None and
                            st.session_state.get('batch_mode') == 'csv'):

                        df_results = st.session_state.batch_results
                        text_col = st.session_state.get('csv_text_column', text_column)

                        st.success("✅ Analysis complete!")
                        st.dataframe(df_results, use_container_width=True, height=400)

                        st.markdown("---")
                        st.subheader("📊 Summary")

                        col1, col2 = st.columns(2)
                        with col1:
                            summary = df_results['Prediction'].value_counts()
                            fig = px.bar(
                                x=summary.index,
                                y=summary.values,
                                title="Prediction Distribution",
                                labels={'x': 'Class', 'y': 'Count'},
                                color=summary.index,
                                color_discrete_map={
                                    'NO': '#28a745',
                                    'OO': '#ffc107',
                                    'OR': '#dc3545',
                                    'OS': '#6f42c1'
                                }
                            )
                            st.plotly_chart(fig, use_container_width=True)
                        with col2:
                            st.metric("Total Texts", len(df_results))
                            st.metric("Avg Confidence", f"{df_results['Confidence'].mean():.2%}")
                            st.markdown("**Class Distribution:**")
                            for label, count in summary.items():
                                st.write(f"• {label}: {count}")

                        st.markdown("---")
                        csv_data = df_results.to_csv(index=False)

                        col_download, col_explain = st.columns(2)
                        with col_download:
                            st.download_button(
                                label="📥 Download Results CSV",
                                data=csv_data,
                                file_name=f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                                mime="text/csv",
                                use_container_width=True,
                                key="download_csv_results"
                            )
                        with col_explain:
                            if st.button("💡 Explain Selected", use_container_width=True, key="csv_explain_hint"):
                                st.info("👇 Use expander below to explain")

                        render_batch_explainability(
                            results_df=df_results,
                            text_column=text_col,
                            model=model,
                            tokenizer=tokenizer,
                            label_encoder=label_encoder,
                            preprocessor=preprocessor,
                            nepali_font=nepali_font,
                            explainability_available=explainability_available,
                            captum_available=captum_available,
                            mode_key="csv"
                        )

                except Exception as e:
                    st.error(f"❌ Error processing file: {str(e)}")
                    with st.expander("🐛 Error Details"):
                        st.exception(e)

    # ========================================================================
    # TAB 4: HISTORY
    # ========================================================================

    with tabs[3]:
        st.subheader("📈 Prediction History")

        col1, col2 = st.columns([3, 1])
        with col1:
            st.write("View and analyze your prediction history")
        with col2:
            if st.button("🔄 Refresh", use_container_width=True):
                st.rerun()

        if os.path.exists(HISTORY_FILE):
            try:
                with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
                    history = json.load(f)

                if history:
                    history_df = pd.DataFrame(history)
                    history_df['timestamp'] = pd.to_datetime(history_df['timestamp'])

                    st.markdown("### 📊 Overview")
                    col1, col2, col3, col4 = st.columns(4)
                    with col1:
                        st.metric("Total Predictions", len(history_df))
                    with col2:
                        st.metric("Avg Confidence", f"{history_df['confidence'].mean():.2%}")
                    with col3:
                        # Older history entries may predate emoji_features;
                        # sum defensively over dict entries only.
                        if 'emoji_features' in history_df.columns:
                            total_emojis = sum(
                                e.get('total_emoji_count', 0)
                                for e in history_df['emoji_features']
                                if isinstance(e, dict)
                            )
                            st.metric("Total Emojis", total_emojis)
                        else:
                            st.metric("Total Emojis", "N/A")
                    with col4:
                        most_common = history_df['prediction'].mode()[0]
                        st.metric("Most Common", most_common)

                    st.markdown("---")
                    st.markdown("### 📈 Trends")

                    col1, col2 = st.columns(2)
                    with col1:
                        daily_counts = history_df.groupby(
                            history_df['timestamp'].dt.date
                        ).size().reset_index(name='count')
                        fig = px.line(
                            daily_counts,
                            x='timestamp',
                            y='count',
                            title="Predictions Over Time",
                            labels={'timestamp': 'Date', 'count': 'Count'}
                        )
                        st.plotly_chart(fig, use_container_width=True)
                    with col2:
                        class_dist = history_df['prediction'].value_counts()
                        fig = px.pie(
                            values=class_dist.values,
                            names=class_dist.index,
                            title="Class Distribution",
                            color=class_dist.index,
                            color_discrete_map={
                                'NO': '#28a745',
                                'OO': '#ffc107',
                                'OR': '#dc3545',
                                'OS': '#6f42c1'
                            }
                        )
                        st.plotly_chart(fig, use_container_width=True)

                    st.markdown("---")
                    st.markdown("### 📋 Recent Predictions")

                    num_to_show = st.slider("Number to show", 5, 50, 20, 5)
                    recent = history_df.tail(num_to_show).sort_values('timestamp', ascending=False)
                    display = recent[['timestamp', 'text', 'prediction', 'confidence']].copy()
                    display['confidence'] = display['confidence'].apply(lambda x: f"{x:.2%}")
                    display['text'] = display['text'].apply(lambda x: x[:80] + '...' if len(x) > 80 else x)
                    display['timestamp'] = display['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')
                    st.dataframe(display, use_container_width=True, hide_index=True, height=400)

                    st.markdown("---")
                    col1, col2 = st.columns(2)
                    with col1:
                        csv = history_df.to_csv(index=False)
                        st.download_button(
                            label="📥 Download Full History",
                            data=csv,
                            file_name=f"history_{datetime.now().strftime('%Y%m%d')}.csv",
                            mime="text/csv",
                            use_container_width=True
                        )
                    with col2:
                        if st.button("🗑️ Clear History", type="secondary", use_container_width=True):
                            if os.path.exists(HISTORY_FILE):
                                os.remove(HISTORY_FILE)
                            st.success("✅ History cleared!")
                            st.rerun()
                else:
                    st.info("📝 No predictions in history yet.")

            except Exception as e:
                st.error(f"❌ Error loading history: {str(e)}")
                with st.expander("🐛 Error Details"):
                    st.exception(e)
        else:
            st.info("📝 No history file found yet.")
            st.markdown("""
            ### How to Build History:
            1. Go to **Single Prediction** tab
            2. Enable "Save to history" checkbox
            3. Analyze some text
            4. Your predictions will appear here!
            """)
|
| 1423 |
|
|
|
|
|
|
|
| 1424 |
|
| 1425 |
+
# Script entry point: run the Streamlit app only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|