#!/usr/bin/env python3
import gradio as gr
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import plotly.graph_objects as go
import numpy as np
import os


class HateSpeechDetector:
    def __init__(self, model_path: str = "sadjava/multilingual-hate-speech-xlm-roberta"):
        """Initialize the hate speech detector with a trained model."""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"🔧 Using device: {self.device}")

        # Load model and tokenizer
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
            self.model.to(self.device)
            self.model.eval()
            print(f"✅ Model loaded successfully from {model_path}")
        except Exception as e:
            print(f"❌ Error loading model: {e}")
            # Fall back to a default model if the custom model fails
            print("🔄 Falling back to default multilingual model...")
            self.tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
            self.model = AutoModelForSequenceClassification.from_pretrained("unitary/toxic-bert")
            self.model.to(self.device)
            self.model.eval()

        # Define hate speech categories
        self.categories = [
            "Race", "Sexual Orientation", "Gender", "Physical Appearance",
            "Religion", "Class", "Disability", "Appropriate"
        ]

    def predict_with_context(self, text: str) -> tuple:
        """Predict hate speech category with contextual analysis."""
        if not text.strip():
            return "Please enter some text", 0.0, {}, ""

        try:
            # Tokenize input
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512,
                return_attention_mask=True
            )

            # Move to device
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Get predictions with attention
            with torch.no_grad():
                outputs = self.model(**inputs, output_attentions=True)
                logits = outputs.logits
                attentions = outputs.attentions

            # Calculate probabilities
            probabilities = F.softmax(logits, dim=-1)

            # Handle different model outputs
            if probabilities.shape[-1] == len(self.categories):
                predicted_class = torch.argmax(probabilities, dim=-1).item()
                predicted_category = self.categories[predicted_class]
            else:
                # Fallback for binary classification models
                predicted_class = torch.argmax(probabilities, dim=-1).item()
                predicted_category = "Inappropriate" if predicted_class == 1 else "Appropriate"

                # Create synthetic per-category probabilities for visualization
                prob_inappropriate = float(probabilities[0][1]) if probabilities.shape[-1] > 1 else 0.5
                fake_probs = torch.zeros(len(self.categories))
                fake_probs[-1] = 1 - prob_inappropriate  # Appropriate
                # Distribute the inappropriate mass evenly across the seven hate categories
                for i in range(7):
                    fake_probs[i] = prob_inappropriate / 7
                probabilities = fake_probs.unsqueeze(0)

            confidence = float(torch.max(probabilities[0]))

            # Create confidence chart
            confidence_chart = self.create_confidence_chart(probabilities[0])

            # Create word highlighting
            highlighted_html = self.create_word_highlighting(text, inputs, attentions)

            return predicted_category, confidence, confidence_chart, highlighted_html

        except Exception as e:
            print(f"Error in prediction: {e}")
            return f"Error: {str(e)}", 0.0, {}, ""

    def create_confidence_chart(self, probabilities):
        """Create confidence visualization."""
        scores = [float(prob) for prob in probabilities]
        colors = ['#ff6b6b' if cat != 'Appropriate' else '#51cf66' for cat in self.categories]

        fig = go.Figure(data=[
            go.Bar(
                x=self.categories,
                y=scores,
                marker_color=colors,
                text=[f'{score:.1%}' for score in scores],
                textposition='auto',
            )
        ])
        fig.update_layout(
            title="Confidence Scores by Category",
            xaxis_title="Categories",
            yaxis_title="Confidence",
            yaxis_range=[0, 1],
            height=400,
            xaxis_tickangle=-45
        )

        return fig

    def create_word_highlighting(self, text, inputs, attentions):
        """Create word-level importance highlighting."""
        try:
            # Average all heads of the last attention layer
            last_layer_attention = attentions[-1][0]  # [num_heads, seq_len, seq_len]
            avg_attention = torch.mean(last_layer_attention, dim=0)  # [seq_len, seq_len]

            # Importance of a token = total attention directed TO it
            token_importance = torch.sum(avg_attention, dim=0).cpu().numpy()

            tokens = self.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

            # Remove special tokens (e.g. <s> and </s>)
            content_tokens = tokens[1:-1] if len(tokens) > 2 else tokens
            content_importance = token_importance[1:-1] if len(token_importance) > 2 else token_importance

            # Normalize importance scores to [0, 1]
            if len(content_importance) > 1:
                importance_norm = (content_importance - content_importance.min()) / (
                    content_importance.max() - content_importance.min() + 1e-8
                )
                # Square root spreads low scores apart for stronger visual contrast
                importance_norm = np.power(importance_norm, 0.5)
            else:
                importance_norm = np.array([0.5])

            # Map tokens back to words
            words = text.split()
            word_scores = []

            # Simple word-token mapping: re-tokenize each word and average its token scores
            token_idx = 0
            for word in words:
                word_importance_scores = []
                word_tokens = self.tokenizer.tokenize(word)
                for _ in word_tokens:
                    if token_idx < len(importance_norm):
                        word_importance_scores.append(importance_norm[token_idx])
                        token_idx += 1
                if word_importance_scores:
                    word_score = np.mean(word_importance_scores)
                else:
                    word_score = 0.2
                word_scores.append(word_score)

            # Create HTML with highlighting
            html_parts = []
            for word, score in zip(words, word_scores):
                if score > 0.7:
                    color = "rgba(220, 53, 69, 0.8)"    # Red
                elif score > 0.5:
                    color = "rgba(255, 193, 7, 0.8)"    # Orange
                elif score > 0.3:
                    color = "rgba(255, 235, 59, 0.6)"   # Yellow
                else:
                    color = "rgba(248, 249, 250, 0.3)"  # Light gray

                html_parts.append(
                    f'<span style="background-color: {color}; '
                    f'padding: 2px 4px; border-radius: 3px;">{word}</span>'
                )

            return ' '.join(html_parts)

        except Exception as e:
            print(f"Error in word highlighting: {e}")
            return text
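
# --- Usage sketch (not part of the original file) ---
# The script imports gradio but the interface wiring is missing from the
# excerpt above, so the block below is a minimal sketch of how the detector
# could be exposed through a Gradio Blocks UI. `build_demo`, the component
# labels, and the layout are assumptions for illustration, not the original
# author's code.

def build_demo():
    detector = HateSpeechDetector()

    def analyze(text):
        category, confidence, chart, highlighted = detector.predict_with_context(text)
        return category, f"{confidence:.1%}", chart, highlighted

    with gr.Blocks(title="Multilingual Hate Speech Detector") as demo:
        gr.Markdown("## Multilingual Hate Speech Detector")
        text_input = gr.Textbox(label="Text to analyze", lines=3)
        analyze_btn = gr.Button("Analyze")
        category_out = gr.Textbox(label="Predicted category")
        confidence_out = gr.Textbox(label="Confidence")
        chart_out = gr.Plot(label="Confidence by category")
        highlight_out = gr.HTML(label="Word importance")

        analyze_btn.click(
            analyze,
            inputs=text_input,
            outputs=[category_out, confidence_out, chart_out, highlight_out],
        )
    return demo


if __name__ == "__main__":
    build_demo().launch()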