File size: 3,608 Bytes
1ac7b0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3
"""
Hugging Face compatible inference for content moderation
"""
import pickle
from huggingface_hub import hf_hub_download
from enum import Enum

class AgeMode(Enum):
    """Filtering mode selected from the user's age.

    The member values are the strings reported under the ``"mode"`` key of
    the dict returned by ``DualModeFilter.check``.
    """

    # Strict mode: chosen when the supplied age is below 13.
    UNDER_13 = "under_13"

    # Laxer mode: chosen when the supplied age is 13 or above.
    TEEN_PLUS = "teen_plus"

class ContentLabel(Enum):
    """Categories the moderation model can assign to a piece of text.

    The integer values are the class ids that ``DualModeFilter.predict``
    converts into members of this enum, so they must stay in sync with the
    ids the loaded pipeline emits.
    """

    # Never listed in either block list.
    SAFE = 0

    # Blocked in both age modes.
    HARASSMENT = 1

    # Blocked for under-13 users only; permitted for 13+.
    SWEARING_REACTION = 2

    # Blocked in both age modes.
    SWEARING_AGGRESSIVE = 3

    # Blocked in both age modes.
    HATE_SPEECH = 4

    # Blocked in both age modes.
    SPAM = 5

class DualModeFilter:
    """
    Dual-mode content filter for Hugging Face

    Loads a pickled classification pipeline from the Hugging Face Hub and
    applies one of two block lists depending on the user's age: a strict
    list for users under 13 and a laxer list for users aged 13 and over,
    for whom reaction-style swearing is permitted.

    Usage:
        filter = DualModeFilter()  # or DualModeFilter("<user>/<repo>")
        result = filter.check("text here", age=15)
    """

    def __init__(self, repo_id="darwinkernelpanic/moderat", token=None):
        """
        Download the model file from the Hub and load it.

        Args:
            repo_id: Hub repository that hosts "moderation_model.pkl".
            token: Optional Hub access token, forwarded to hf_hub_download.
        """
        # Download model from HF
        model_path = hf_hub_download(
            repo_id=repo_id,
            filename="moderation_model.pkl",
            token=token
        )

        # Load model
        # SECURITY: unpickling can execute arbitrary code embedded in the
        # file, so only point repo_id at repositories you trust.
        with open(model_path, 'rb') as f:
            self.pipeline = pickle.load(f)

        # Block lists hold ContentLabel values. Under-13 blocks everything
        # except SAFE; 13+ additionally lets SWEARING_REACTION through.
        self.under_13_blocked = [
            label.value
            for label in ContentLabel
            if label is not ContentLabel.SAFE
        ]
        self.teen_plus_blocked = [
            value
            for value in self.under_13_blocked
            if value != ContentLabel.SWEARING_REACTION.value
        ]
        self.label_names = [label.name for label in ContentLabel]

    def predict(self, text):
        """
        Predict label for text

        Args:
            text: Text to classify

        Returns:
            Tuple of (ContentLabel, confidence), where confidence is the
            largest class probability reported by the pipeline.

        Raises:
            ValueError: If the pipeline predicts a value that is not a
                ContentLabel value.
        """
        prediction = self.pipeline.predict([text])[0]
        probs = self.pipeline.predict_proba([text])[0]
        # Cast to a built-in float so callers get a plain Python number
        # even when the pipeline returns array scalars.
        confidence = float(max(probs))
        return ContentLabel(prediction), confidence

    def check(self, text, age):
        """
        Check content against age-appropriate filters

        Args:
            text: Text to check
            age: User age (determines strict vs laxed mode)

        Returns:
            dict with 'allowed', 'label', 'confidence', 'mode', 'reason'
        """
        label, confidence = self.predict(text)
        teen_plus = age >= 13
        mode = AgeMode.TEEN_PLUS if teen_plus else AgeMode.UNDER_13

        # Low confidence check
        # NOTE: this path lets the text through regardless of the predicted
        # label and reports it as "UNCERTAIN".
        if confidence < 0.5:
            return {
                "allowed": True,
                "label": "UNCERTAIN",
                "confidence": confidence,
                "mode": mode.value,
                "reason": "Low confidence - manual review recommended"
            }

        # Check if blocked for this age
        blocked = self.teen_plus_blocked if teen_plus else self.under_13_blocked
        allowed = label.value not in blocked

        if label == ContentLabel.SWEARING_REACTION and teen_plus:
            # Reaction swearing is always permitted for 13+. This must be
            # tested before the generic outcome: the label is absent from
            # teen_plus_blocked, so a check nested under "not allowed" would
            # never run and the reason would wrongly read "Safe".
            allowed = True
            reason = "Swearing permitted as reaction (13+)"
        elif allowed:
            reason = "Safe"
        else:
            reason = f"{label.name} detected"

        return {
            "allowed": allowed,
            "label": label.name,
            "confidence": confidence,
            "mode": mode.value,
            "reason": reason
        }

# Example usage
if __name__ == "__main__":
    # Small demo: run a handful of sample messages through the filter for
    # an under-13 and a 13+ user and print each verdict.
    print("Testing Dual-Mode Content Filter")
    print("=" * 50)

    # Constructing the filter downloads the model from HF.
    moderator = DualModeFilter()

    # (message, user age) pairs.
    samples = (
        ("that was a great game", 10),
        ("that was a great game", 15),
        ("shit that sucks", 10),
        ("shit that sucks", 15),
        ("you're a piece of shit", 15),
        ("kill yourself", 15),
    )

    for message, user_age in samples:
        verdict = moderator.check(message, user_age)
        if verdict["allowed"]:
            banner = "✅ ALLOWED"
        else:
            banner = "❌ BLOCKED"
        print(f"\nAge {user_age}: '{message}'")
        print(f"  {banner} - {verdict['reason']}")
        print(f"  Confidence: {verdict['confidence']:.2f}")