# Author: Nalin Gupta — commit a493939: "Disable Phi-3 temporarily - use RoBERTa only for now"
"""
Hugging Face Spaces Demo App for Explainable Troll Defender V7
Uses Gradio for the interface
"""
import json
import re
import time

import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
# Configuration
ROBERTA_MODEL = "na1in/roberta-v7-troll-defender" # Your custom V7 model
PHI_BASE_MODEL = "microsoft/Phi-3-mini-4k-instruct"
PHI_ADAPTER = "na1in/phi3-troll-explainer" # Your custom LoRA adapters
# Prefer GPU when available; models and input tensors are moved to this device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Global models — populated by load_models() at startup, None until then.
# phi_model/phi_tokenizer stay None while the Phi-3 explainer is disabled.
roberta_model = None
roberta_tokenizer = None
phi_model = None
phi_tokenizer = None
class RobustJSONParser:
    """Best-effort extraction of a rationale dict from malformed LLM output.

    LLMs frequently emit JSON wrapped in prose, with broken quoting, or not
    at all.  ``extract_json`` tries progressively more lenient strategies
    and always returns a dict, falling back to a generic rationale.
    """

    # One loosely-quoted `key : "value"` pattern per expected field,
    # compiled once at class creation instead of rebuilt on every call.
    _FIELD_PATTERNS = {
        field: re.compile(
            r'["\']?' + field + r'["\']?\s*[:\s]\s*["\']([^"\']+)["\']',
            re.IGNORECASE,
        )
        for field in ("target", "category", "reasoning")
    }

    # Returned (as a fresh copy) when no strategy recovers any field.
    _FALLBACK = {
        "reasoning": "Toxic content detected.",
        "target": "Unknown",
        "category": "Detected",
    }

    @staticmethod
    def extract_json(text: str) -> dict:
        """Try multiple strategies to extract JSON from LLM output.

        Strategies, in order:
          1. Parse the whole string as JSON.
          2. Parse the substring between the first '{' and the last '}'.
          3. Regex out individual target/category/reasoning fields.

        Returns a dict in every case (fallback rationale if all fail).
        """
        # Strategy 1: Direct parse.
        try:
            parsed = json.loads(text)
            # Guard: JSON may legally be a list/str/number — callers expect
            # a dict with .get(), so only accept a dict here.
            if isinstance(parsed, dict):
                return parsed
        except json.JSONDecodeError:  # narrowed from a bare except
            pass

        # Strategy 2: Find JSON boundaries, stripping surrounding prose.
        start = text.find("{")
        end = text.rfind("}") + 1
        if start != -1 and end > start:
            try:
                parsed = json.loads(text[start:end])
                if isinstance(parsed, dict):
                    return parsed
            except json.JSONDecodeError:
                pass

        # Strategy 3: Regex extraction of individual fields.
        result = {
            field: match.group(1).strip()
            for field, pattern in RobustJSONParser._FIELD_PATTERNS.items()
            if (match := pattern.search(text))
        }
        return result if result else dict(RobustJSONParser._FALLBACK)
def load_models():
    """Load models on startup.

    Populates the module-level model/tokenizer globals.  RoBERTa falls back
    to a public base checkpoint when the custom V7 model cannot be fetched
    from the Hub; the Phi-3 explainer is intentionally left disabled (it has
    compatibility issues on HF Spaces).
    """
    global roberta_model, roberta_tokenizer, phi_model, phi_tokenizer

    print("Loading RoBERTa V7 (Gatekeeper)...")
    try:
        # Try loading from HF Hub first
        roberta_tokenizer = AutoTokenizer.from_pretrained(ROBERTA_MODEL)
        roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_MODEL)
    except Exception as e:  # narrowed from a bare except; log why we fell back
        # Fallback to base model if custom model not uploaded yet
        print("Custom model not found, using base model...")
        print(f"(load error: {e})")
        roberta_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-hate-latest")
        roberta_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-hate-latest")
    roberta_model.to(DEVICE)
    roberta_model.eval()
    print("βœ… RoBERTa loaded")

    # TEMPORARILY DISABLED: Phi-3 has compatibility issues on HF Spaces
    # Uncomment below when transformers compatibility is resolved
    print("⚠️ Phi-3 Explainer temporarily disabled (compatibility issue)")
    print("πŸ’‘ Demo will show RoBERTa classification only")
    phi_model = None
    phi_tokenizer = None
    # print("Loading Phi-3 (Explainer)...")
    # try:
    #     phi_tokenizer = AutoTokenizer.from_pretrained(PHI_BASE_MODEL)
    #     phi_base = AutoModelForCausalLM.from_pretrained(
    #         PHI_BASE_MODEL,
    #         torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    #         device_map=DEVICE,
    #         trust_remote_code=True
    #     )
    #     # Try loading adapter if available
    #     try:
    #         phi_model = PeftModel.from_pretrained(phi_base, PHI_ADAPTER)
    #         print("βœ… Phi-3 with LoRA adapter loaded")
    #     except Exception:
    #         phi_model = phi_base
    #         print("βœ… Phi-3 base model loaded (adapter not found)")
    # except Exception as e:
    #     print(f"⚠️ Could not load Phi-3: {e}")
    #     phi_model = None
    #     phi_tokenizer = None
def predict_toxicity(text):
    """Classify *text* and, when the explainer is loaded, explain the verdict.

    Stage 1 runs the RoBERTa gatekeeper; Stage 2 (Phi-3) only runs for
    toxic labels and only when the explainer models are loaded.

    Returns a 3-tuple of strings:
        (classification summary markdown, explanation markdown, raw rationale JSON).
    """
    # Guard: load_models() may have failed or not run yet.
    if roberta_model is None or roberta_tokenizer is None:
        return "❌ Models not loaded", "", ""

    start_time = time.time()

    # Stage 1: RoBERTa Classification
    inputs = roberta_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
    with torch.no_grad():
        outputs = roberta_model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(probs, dim=-1).item()
    confidence = probs[0][predicted_class].item()

    # Map prediction (0=normal, 1=offensive, 2=hate)
    label_map = {0: "normal", 1: "offensive", 2: "hatespeech"}
    label = label_map.get(predicted_class, "normal")

    # Stage 2: Generate explanation if toxic
    explanation = ""
    rationale_json = ""
    if label != "normal" and phi_model and phi_tokenizer:
        try:
            prompt = f"""<|user|>
Analyze this text for toxicity. Return JSON with: target, category, reasoning.
Text: "{text}"
Output:<|end|>
<|assistant|>"""
            # Separate names from the Stage-1 tensors to avoid reuse confusion.
            phi_inputs = phi_tokenizer(prompt, return_tensors="pt").to(DEVICE)
            with torch.no_grad():
                generated = phi_model.generate(
                    input_ids=phi_inputs.input_ids,
                    attention_mask=phi_inputs.attention_mask,
                    max_new_tokens=100,  # Reduced from 150 for speed
                    temperature=0.7,
                    do_sample=False,  # Greedy decoding for speed
                    pad_token_id=phi_tokenizer.eos_token_id,
                    eos_token_id=phi_tokenizer.eos_token_id,
                    use_cache=False
                )
            response = phi_tokenizer.decode(generated[0], skip_special_tokens=True)
            # Everything after the last "Output:" is the model's rationale.
            rationale_json = response.split("Output:")[-1].strip()

            # Parse and format
            parsed = RobustJSONParser.extract_json(rationale_json)
            target = parsed.get("target", "Unspecified")
            category = parsed.get("category", "Toxic")
            reasoning = parsed.get("reasoning", "Toxic content detected.")
            explanation = f"🚨 **Target**: {target}\nπŸ“‚ **Category**: {category}\nπŸ’­ **Reasoning**: {reasoning}"
        except Exception as e:
            # Explanation is best-effort: classification result still returned.
            print(f"⚠️ Phi-3 explanation failed: {e}")
            explanation = f"🚨 **{label.upper()} content detected**\n⚠️ Detailed explanation unavailable (LLM disabled for compatibility)"
            rationale_json = f"Error: {str(e)}"
    elif label == "normal":
        explanation = "βœ… **No toxic content detected**"
        rationale_json = "Safe content."
    else:
        # Toxic but the explainer is disabled/unloaded.
        explanation = f"🚨 **{label.upper()} content detected**\n⚠️ Detailed explanation unavailable (LLM temporarily disabled)"
        rationale_json = "Explainer model disabled for compatibility."

    latency = time.time() - start_time

    # Format output
    status_emoji = "βœ…" if label == "normal" else "🚨"
    result = f"{status_emoji} **Label**: {label.upper()}\n⏱️ **Latency**: {latency:.2f}s\nπŸ“Š **Confidence**: {confidence:.2%}"
    return result, explanation, rationale_json
# Demo Examples
# Each inner list is one pre-filled input row for gr.Examples below.
# NOTE(review): the entries appear chosen to contrast near-identical phrasings
# (e.g. "bitch" as insult vs. literal female dog; negated vs. direct hate) so
# the context-awareness claims can be tried interactively — confirm intent.
examples = [
    ["Hitler was wrong about the jews"],
    ["Women are too pure for politics"],
    ["My female dog is a bitch"],
    ["You are a stupid bitch"],
    ["Muslims represent terrorism"],
    ["Muslims are often wrongly accused of terrorism"],
    ["I will kill for a pizza right now"],
    ["Go back to your country"],
]
# Create Gradio Interface
# Layout: header markdown, then a two-column row (inputs | outputs),
# the click wiring, and a footer with metrics/limitations.
with gr.Blocks(title="Explainable Troll Defender V7", theme=gr.themes.Soft()) as demo:
    # Header / project description shown above the controls.
    gr.Markdown("""
# πŸ›‘οΈ Explainable Troll Defender V7
**State-of-the-Art Hate Speech Detection with Context Awareness**
⚠️ **Notice**: Currently showing RoBERTa classification only. The Phi-3 explainer is temporarily disabled due to compatibility issues on HuggingFace Spaces. You'll still see accurate toxic/safe predictions!
This system uses:
- πŸ” **RoBERTa V7** (Gatekeeper): Detects toxicity with context awareness
- ~~πŸ’¬ **Phi-3 + LoRA** (Explainer): Generates human-readable rationales~~ (Temporarily disabled)
### Key Features:
- βœ… 100% accuracy on adversarial edge cases
- βœ… Understands double negatives ("Hitler was wrong" = safe)
- βœ… Detects benevolent sexism
- βœ… Context-dependent word analysis
[πŸ“š GitHub](https://github.com/na1in/Explainable-Troll-Defender) | [πŸ“– Paper](https://github.com/na1in/Explainable-Troll-Defender/blob/main/CHANGELOG.md)
""")
    with gr.Row():
        with gr.Column():
            # Left column: free-text input, analyze button, clickable examples.
            input_text = gr.Textbox(
                label="Enter text to analyze",
                placeholder="Type or select an example...",
                lines=3
            )
            analyze_btn = gr.Button("πŸ” Analyze", variant="primary", size="lg")
            gr.Markdown("### πŸ“‹ Try These Examples:")
            gr.Examples(
                examples=examples,
                inputs=input_text,
                label="Click to load"
            )
        with gr.Column():
            # Right column: the three outputs of predict_toxicity, in order.
            result_output = gr.Markdown(label="Classification Result")
            explanation_output = gr.Markdown(label="Detailed Explanation")
            json_output = gr.Textbox(label="Raw JSON (for developers)", lines=3)
    # Wire the button to the prediction function; output order must match
    # predict_toxicity's (result, explanation, rationale_json) return tuple.
    analyze_btn.click(
        fn=predict_toxicity,
        inputs=input_text,
        outputs=[result_output, explanation_output, json_output]
    )
    # Footer: reported metrics, architecture summary, known limitations.
    gr.Markdown("""
---
### 🎯 Performance Metrics
- **Edge Case Accuracy**: 100% (14/14 test cases)
- **General Accuracy**: 82.35%
- **Toxic Recall**: 95%
### 🧠 Model Architecture
This is an Expert Gating Pipeline where RoBERTa handles binary classification (logic)
and Phi-3 generates explanations (language). Neither model alone could achieve this performance.
### ⚠️ Limitations
- Spelling variants (k1ll, h@te): ~30% accuracy
- Latency on toxic posts: 3-15 seconds
- May over-flag reclaimed slurs without full context
Built by [Nalin](https://github.com/na1in) | [Report Issues](https://github.com/na1in/Explainable-Troll-Defender/issues)
""")
# Load models on startup
# Called at import time (not inside the main guard) so the models are also
# loaded when a hosting platform imports this module rather than running it.
load_models()
# Launch
if __name__ == "__main__":
    demo.launch()