# Author: Nalin Gupta — commit a493939: "Disable Phi-3 temporarily - use RoBERTa only for now"
"""
Hugging Face Spaces Demo App for Explainable Troll Defender V7
Uses Gradio for the interface
"""
import json
import re
import time

import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
# Configuration
ROBERTA_MODEL = "na1in/roberta-v7-troll-defender" # Your custom V7 model
PHI_BASE_MODEL = "microsoft/Phi-3-mini-4k-instruct"
PHI_ADAPTER = "na1in/phi3-troll-explainer" # Your custom LoRA adapters
# Prefer GPU when available; models and input tensors are moved to this device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Global models — populated by load_models() at startup, None until then.
# phi_model/phi_tokenizer stay None while the Phi-3 explainer is disabled.
roberta_model = None
roberta_tokenizer = None
phi_model = None
phi_tokenizer = None
class RobustJSONParser:
    """Best-effort extraction of a rationale dict from malformed LLM output.

    LLMs frequently emit JSON wrapped in prose, with broken quoting, or not
    at all.  ``extract_json`` tries progressively more lenient strategies
    and always returns a dict, falling back to a generic rationale.
    """

    # One loosely-quoted `key : "value"` pattern per expected field,
    # compiled once at class creation instead of rebuilt on every call.
    _FIELD_PATTERNS = {
        field: re.compile(
            r'["\']?' + field + r'["\']?\s*[:\s]\s*["\']([^"\']+)["\']',
            re.IGNORECASE,
        )
        for field in ("target", "category", "reasoning")
    }

    # Returned (as a fresh copy) when no strategy recovers any field.
    _FALLBACK = {
        "reasoning": "Toxic content detected.",
        "target": "Unknown",
        "category": "Detected",
    }

    @staticmethod
    def extract_json(text: str) -> dict:
        """Try multiple strategies to extract JSON from LLM output.

        Strategies, in order:
          1. Parse the whole string as JSON.
          2. Parse the substring between the first '{' and the last '}'.
          3. Regex out individual target/category/reasoning fields.

        Returns a dict in every case (fallback rationale if all fail).
        """
        # Strategy 1: Direct parse.
        try:
            parsed = json.loads(text)
            # Guard: JSON may legally be a list/str/number — callers expect
            # a dict with .get(), so only accept a dict here.
            if isinstance(parsed, dict):
                return parsed
        except json.JSONDecodeError:  # narrowed from a bare except
            pass

        # Strategy 2: Find JSON boundaries, stripping surrounding prose.
        start = text.find("{")
        end = text.rfind("}") + 1
        if start != -1 and end > start:
            try:
                parsed = json.loads(text[start:end])
                if isinstance(parsed, dict):
                    return parsed
            except json.JSONDecodeError:
                pass

        # Strategy 3: Regex extraction of individual fields.
        result = {
            field: match.group(1).strip()
            for field, pattern in RobustJSONParser._FIELD_PATTERNS.items()
            if (match := pattern.search(text))
        }
        return result if result else dict(RobustJSONParser._FALLBACK)
def load_models():
    """Load models on startup.

    Populates the module-level model/tokenizer globals.  RoBERTa falls back
    to a public base checkpoint when the custom V7 model cannot be fetched
    from the Hub; the Phi-3 explainer is intentionally left disabled (it has
    compatibility issues on HF Spaces).
    """
    global roberta_model, roberta_tokenizer, phi_model, phi_tokenizer

    print("Loading RoBERTa V7 (Gatekeeper)...")
    try:
        # Try loading from HF Hub first
        roberta_tokenizer = AutoTokenizer.from_pretrained(ROBERTA_MODEL)
        roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_MODEL)
    except Exception as e:  # narrowed from a bare except; log why we fell back
        # Fallback to base model if custom model not uploaded yet
        print("Custom model not found, using base model...")
        print(f"(load error: {e})")
        roberta_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-hate-latest")
        roberta_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-hate-latest")
    roberta_model.to(DEVICE)
    roberta_model.eval()
    print("βœ… RoBERTa loaded")

    # TEMPORARILY DISABLED: Phi-3 has compatibility issues on HF Spaces
    # Uncomment below when transformers compatibility is resolved
    print("⚠️ Phi-3 Explainer temporarily disabled (compatibility issue)")
    print("πŸ’‘ Demo will show RoBERTa classification only")
    phi_model = None
    phi_tokenizer = None
    # print("Loading Phi-3 (Explainer)...")
    # try:
    #     phi_tokenizer = AutoTokenizer.from_pretrained(PHI_BASE_MODEL)
    #     phi_base = AutoModelForCausalLM.from_pretrained(
    #         PHI_BASE_MODEL,
    #         torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    #         device_map=DEVICE,
    #         trust_remote_code=True
    #     )
    #     # Try loading adapter if available
    #     try:
    #         phi_model = PeftModel.from_pretrained(phi_base, PHI_ADAPTER)
    #         print("βœ… Phi-3 with LoRA adapter loaded")
    #     except Exception:
    #         phi_model = phi_base
    #         print("βœ… Phi-3 base model loaded (adapter not found)")
    # except Exception as e:
    #     print(f"⚠️ Could not load Phi-3: {e}")
    #     phi_model = None
    #     phi_tokenizer = None
def predict_toxicity(text):
    """Classify *text* and, when the explainer is loaded, explain the verdict.

    Stage 1 runs the RoBERTa gatekeeper; Stage 2 (Phi-3) only runs for
    toxic labels and only when the explainer models are loaded.

    Returns a 3-tuple of strings:
        (classification summary markdown, explanation markdown, raw rationale JSON).
    """
    # Guard: load_models() may have failed or not run yet.
    if roberta_model is None or roberta_tokenizer is None:
        return "❌ Models not loaded", "", ""

    start_time = time.time()

    # Stage 1: RoBERTa Classification
    inputs = roberta_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
    with torch.no_grad():
        outputs = roberta_model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(probs, dim=-1).item()
    confidence = probs[0][predicted_class].item()

    # Map prediction (0=normal, 1=offensive, 2=hate)
    label_map = {0: "normal", 1: "offensive", 2: "hatespeech"}
    label = label_map.get(predicted_class, "normal")

    # Stage 2: Generate explanation if toxic
    explanation = ""
    rationale_json = ""
    if label != "normal" and phi_model and phi_tokenizer:
        try:
            prompt = f"""<|user|>
Analyze this text for toxicity. Return JSON with: target, category, reasoning.
Text: "{text}"
Output:<|end|>
<|assistant|>"""
            # Separate names from the Stage-1 tensors to avoid reuse confusion.
            phi_inputs = phi_tokenizer(prompt, return_tensors="pt").to(DEVICE)
            with torch.no_grad():
                generated = phi_model.generate(
                    input_ids=phi_inputs.input_ids,
                    attention_mask=phi_inputs.attention_mask,
                    max_new_tokens=100,  # Reduced from 150 for speed
                    temperature=0.7,
                    do_sample=False,  # Greedy decoding for speed
                    pad_token_id=phi_tokenizer.eos_token_id,
                    eos_token_id=phi_tokenizer.eos_token_id,
                    use_cache=False
                )
            response = phi_tokenizer.decode(generated[0], skip_special_tokens=True)
            # Everything after the last "Output:" is the model's rationale.
            rationale_json = response.split("Output:")[-1].strip()

            # Parse and format
            parsed = RobustJSONParser.extract_json(rationale_json)
            target = parsed.get("target", "Unspecified")
            category = parsed.get("category", "Toxic")
            reasoning = parsed.get("reasoning", "Toxic content detected.")
            explanation = f"🚨 **Target**: {target}\nπŸ“‚ **Category**: {category}\nπŸ’­ **Reasoning**: {reasoning}"
        except Exception as e:
            # Explanation is best-effort: classification result still returned.
            print(f"⚠️ Phi-3 explanation failed: {e}")
            explanation = f"🚨 **{label.upper()} content detected**\n⚠️ Detailed explanation unavailable (LLM disabled for compatibility)"
            rationale_json = f"Error: {str(e)}"
    elif label == "normal":
        explanation = "βœ… **No toxic content detected**"
        rationale_json = "Safe content."
    else:
        # Toxic but the explainer is disabled/unloaded.
        explanation = f"🚨 **{label.upper()} content detected**\n⚠️ Detailed explanation unavailable (LLM temporarily disabled)"
        rationale_json = "Explainer model disabled for compatibility."

    latency = time.time() - start_time

    # Format output
    status_emoji = "βœ…" if label == "normal" else "🚨"
    result = f"{status_emoji} **Label**: {label.upper()}\n⏱️ **Latency**: {latency:.2f}s\nπŸ“Š **Confidence**: {confidence:.2%}"
    return result, explanation, rationale_json
# Demo Examples
# Each inner list is one pre-filled input row for gr.Examples below.
# NOTE(review): the entries appear chosen to contrast near-identical phrasings
# (e.g. "bitch" as insult vs. literal female dog; negated vs. direct hate) so
# the context-awareness claims can be tried interactively — confirm intent.
examples = [
    ["Hitler was wrong about the jews"],
    ["Women are too pure for politics"],
    ["My female dog is a bitch"],
    ["You are a stupid bitch"],
    ["Muslims represent terrorism"],
    ["Muslims are often wrongly accused of terrorism"],
    ["I will kill for a pizza right now"],
    ["Go back to your country"],
]
# Create Gradio Interface
# Layout: header markdown, then a two-column row (inputs | outputs),
# the click wiring, and a footer with metrics/limitations.
with gr.Blocks(title="Explainable Troll Defender V7", theme=gr.themes.Soft()) as demo:
    # Header / project description shown above the controls.
    gr.Markdown("""
# πŸ›‘οΈ Explainable Troll Defender V7
**State-of-the-Art Hate Speech Detection with Context Awareness**
⚠️ **Notice**: Currently showing RoBERTa classification only. The Phi-3 explainer is temporarily disabled due to compatibility issues on HuggingFace Spaces. You'll still see accurate toxic/safe predictions!
This system uses:
- πŸ” **RoBERTa V7** (Gatekeeper): Detects toxicity with context awareness
- ~~πŸ’¬ **Phi-3 + LoRA** (Explainer): Generates human-readable rationales~~ (Temporarily disabled)
### Key Features:
- βœ… 100% accuracy on adversarial edge cases
- βœ… Understands double negatives ("Hitler was wrong" = safe)
- βœ… Detects benevolent sexism
- βœ… Context-dependent word analysis
[πŸ“š GitHub](https://github.com/na1in/Explainable-Troll-Defender) | [πŸ“– Paper](https://github.com/na1in/Explainable-Troll-Defender/blob/main/CHANGELOG.md)
""")
    with gr.Row():
        with gr.Column():
            # Left column: free-text input, analyze button, clickable examples.
            input_text = gr.Textbox(
                label="Enter text to analyze",
                placeholder="Type or select an example...",
                lines=3
            )
            analyze_btn = gr.Button("πŸ” Analyze", variant="primary", size="lg")
            gr.Markdown("### πŸ“‹ Try These Examples:")
            gr.Examples(
                examples=examples,
                inputs=input_text,
                label="Click to load"
            )
        with gr.Column():
            # Right column: the three outputs of predict_toxicity, in order.
            result_output = gr.Markdown(label="Classification Result")
            explanation_output = gr.Markdown(label="Detailed Explanation")
            json_output = gr.Textbox(label="Raw JSON (for developers)", lines=3)
    # Wire the button to the prediction function; output order must match
    # predict_toxicity's (result, explanation, rationale_json) return tuple.
    analyze_btn.click(
        fn=predict_toxicity,
        inputs=input_text,
        outputs=[result_output, explanation_output, json_output]
    )
    # Footer: reported metrics, architecture summary, known limitations.
    gr.Markdown("""
---
### 🎯 Performance Metrics
- **Edge Case Accuracy**: 100% (14/14 test cases)
- **General Accuracy**: 82.35%
- **Toxic Recall**: 95%
### 🧠 Model Architecture
This is an Expert Gating Pipeline where RoBERTa handles binary classification (logic)
and Phi-3 generates explanations (language). Neither model alone could achieve this performance.
### ⚠️ Limitations
- Spelling variants (k1ll, h@te): ~30% accuracy
- Latency on toxic posts: 3-15 seconds
- May over-flag reclaimed slurs without full context
Built by [Nalin](https://github.com/na1in) | [Report Issues](https://github.com/na1in/Explainable-Troll-Defender/issues)
""")
# Load models on startup
# Called at import time (not inside the main guard) so the models are also
# loaded when a hosting platform imports this module rather than running it.
load_models()
# Launch
if __name__ == "__main__":
    demo.launch()