# prof-demo/src/core/ai_classifier.py
from typing import Optional
from .classifier import ToxicityLevel
from ..utils.config import config
class AIClassifier:
"""AI-powered toxicity classifier using Hugging Face models."""
def __init__(self):
self.model = None
self.tokenizer = None
self._initialized = False
def _initialize(self):
"""Lazy load the model to avoid startup delays."""
if self._initialized:
return
try:
from transformers import pipeline
# Use a toxicity detection model
# This model works without authentication
self.model = pipeline(
"text-classification",
model="unitary/toxic-bert",
top_k=None,
token=config.HUGGINGFACE_TOKEN
)
self._initialized = True
print("✓ AI Classifier initialized with toxic-bert model")
except Exception as e:
print(f"⚠ Could not initialize AI model: {e}")
print(" Falling back to rule-based classification")
self._initialized = False
def classify(self, text: str) -> tuple[ToxicityLevel, dict]:
"""
Classify text using AI model.
Returns:
Tuple of (ToxicityLevel, confidence_scores)
"""
self._initialize()
if not self._initialized or self.model is None:
# Fallback to basic classification
return ToxicityLevel.SAFE, {}
try:
results = self.model(text)[0]
# toxic-bert returns labels like 'toxic', 'severe_toxic', 'obscene', etc.
scores = {item['label']: item['score'] for item in results}
# Determine toxicity level based on scores
if scores.get('severe_toxic', 0) > 0.5:
return ToxicityLevel.THREAT, scores
elif scores.get('obscene', 0) > 0.5:
return ToxicityLevel.EXPLICIT, scores
elif scores.get('insult', 0) > 0.4:
return ToxicityLevel.SLUR, scores
elif scores.get('toxic', 0) > 0.3:
return ToxicityLevel.MILD, scores
else:
return ToxicityLevel.SAFE, scores
except Exception as e:
print(f"Error during AI classification: {e}")
return ToxicityLevel.SAFE, {}