File size: 2,464 Bytes
deff797
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from typing import Optional
from .classifier import ToxicityLevel
from ..utils.config import config

class AIClassifier:
    """AI-powered toxicity classifier using Hugging Face models.

    Wraps the ``unitary/toxic-bert`` text-classification pipeline and maps
    its per-label scores onto the project's ``ToxicityLevel`` enum.  Model
    loading is deferred until the first ``classify()`` call so importing
    this module stays cheap.
    """

    def __init__(self):
        self.model = None       # transformers pipeline; populated by _initialize()
        self.tokenizer = None   # NOTE(review): never populated here — confirm it is used elsewhere
        self._initialized = False
        # Remember a failed load so we don't re-attempt the expensive
        # import/model download on every single classify() call.
        self._init_failed = False

    def _initialize(self):
        """Lazy load the model to avoid startup delays.

        On failure (``transformers`` not installed, model download error,
        bad token) the failure is recorded and the classifier falls back
        permanently to rule-based classification instead of retrying.
        """
        if self._initialized or self._init_failed:
            return

        try:
            from transformers import pipeline

            # Use a toxicity detection model
            # This model works without authentication
            self.model = pipeline(
                "text-classification",
                model="unitary/toxic-bert",
                top_k=None,  # return scores for ALL labels, not just the top one
                token=config.HUGGINGFACE_TOKEN
            )
            self._initialized = True
            print("✓ AI Classifier initialized with toxic-bert model")
        except Exception as e:
            print(f"⚠ Could not initialize AI model: {e}")
            print("  Falling back to rule-based classification")
            # Mark as failed (rather than resetting _initialized) so the
            # next classify() call short-circuits instead of retrying.
            self._init_failed = True

    def classify(self, text: str) -> tuple[ToxicityLevel, dict]:
        """
        Classify text using AI model.

        Args:
            text: Raw text to score.

        Returns:
            Tuple of (ToxicityLevel, confidence_scores); confidence_scores
            maps toxic-bert label names (e.g. 'toxic', 'insult') to floats.
            Degrades to (ToxicityLevel.SAFE, {}) when the model is
            unavailable or raises at inference time.
        """
        self._initialize()

        if not self._initialized or self.model is None:
            # Model unavailable — caller is expected to use the
            # rule-based classifier instead.
            return ToxicityLevel.SAFE, {}

        try:
            # With top_k=None the pipeline returns a list of label dicts per
            # input; [0] selects the result for our single text.
            results = self.model(text)[0]

            # toxic-bert returns labels like 'toxic', 'severe_toxic', 'obscene', etc.
            scores = {item['label']: item['score'] for item in results}

            # Map label scores onto severity, most severe category first.
            # Thresholds are heuristic — TODO(review): validate against eval data.
            if scores.get('severe_toxic', 0) > 0.5:
                return ToxicityLevel.THREAT, scores
            elif scores.get('obscene', 0) > 0.5:
                return ToxicityLevel.EXPLICIT, scores
            elif scores.get('insult', 0) > 0.4:
                return ToxicityLevel.SLUR, scores
            elif scores.get('toxic', 0) > 0.3:
                return ToxicityLevel.MILD, scores
            else:
                return ToxicityLevel.SAFE, scores

        except Exception as e:
            # Any inference-time failure degrades to SAFE rather than crashing
            # the caller; the error is surfaced for debugging.
            print(f"Error during AI classification: {e}")
            return ToxicityLevel.SAFE, {}