|
|
|
|
|
""" |
|
|
Hugging Face compatible inference for content moderation |
|
|
""" |
|
|
import pickle |
|
|
from huggingface_hub import hf_hub_download |
|
|
from enum import Enum |
|
|
|
|
|
# The two moderation rule sets, keyed by user age bracket.
AgeMode = Enum("AgeMode", {"UNDER_13": "under_13", "TEEN_PLUS": "teen_plus"})
|
|
|
|
|
# Classifier output labels; integer values match the trained model's classes.
ContentLabel = Enum(
    "ContentLabel",
    [
        ("SAFE", 0),
        ("HARASSMENT", 1),
        ("SWEARING_REACTION", 2),
        ("SWEARING_AGGRESSIVE", 3),
        ("HATE_SPEECH", 4),
        ("SPAM", 5),
    ],
)
|
|
|
|
|
class DualModeFilter:
    """
    Dual-mode content filter for Hugging Face.

    Loads a pickled classification pipeline from the Hugging Face Hub and
    applies one of two age-dependent rule sets when checking text.

    Usage:
        filter = DualModeFilter("darwinkernelpanic/moderat")
        result = filter.check("text here", age=15)
    """

    def __init__(self, repo_id="darwinkernelpanic/moderat", token=None):
        """
        Download and load the moderation model from the Hub.

        Args:
            repo_id: Hub repository containing "moderation_model.pkl".
            token: Optional Hub access token for private repositories.
        """
        model_path = hf_hub_download(
            repo_id=repo_id,
            filename="moderation_model.pkl",
            token=token,
        )

        # SECURITY NOTE: pickle.load executes arbitrary code embedded in the
        # file -- only load models from repositories you trust.
        with open(model_path, 'rb') as f:
            self.pipeline = pickle.load(f)

        # ContentLabel.value ints blocked per mode. Under-13 blocks everything
        # non-safe; teen-plus additionally permits SWEARING_REACTION (2).
        self.under_13_blocked = [1, 2, 3, 4, 5]
        self.teen_plus_blocked = [1, 3, 4, 5]
        self.label_names = [l.name for l in ContentLabel]

    def predict(self, text):
        """Return (ContentLabel, confidence) for a single text string."""
        prediction = self.pipeline.predict([text])[0]
        probs = self.pipeline.predict_proba([text])[0]
        # Confidence is the highest class probability.
        confidence = max(probs)
        return ContentLabel(prediction), confidence

    def check(self, text, age):
        """
        Check content against age-appropriate filters.

        Args:
            text: Text to check
            age: User age (determines strict vs relaxed mode)

        Returns:
            dict with 'allowed', 'label', 'confidence', 'mode', 'reason'
        """
        label, confidence = self.predict(text)
        mode = AgeMode.TEEN_PLUS if age >= 13 else AgeMode.UNDER_13

        # Low-confidence predictions pass through but are flagged for review.
        if confidence < 0.5:
            return {
                "allowed": True,
                "label": "UNCERTAIN",
                "confidence": confidence,
                "mode": mode.value,
                "reason": "Low confidence - manual review recommended"
            }

        # Reuse the mode computed above instead of re-testing the age.
        if mode is AgeMode.TEEN_PLUS:
            blocked = self.teen_plus_blocked
        else:
            blocked = self.under_13_blocked
        allowed = label.value not in blocked

        reason = "Safe"
        if allowed:
            # BUG FIX: the original nested this branch under "not allowed",
            # where it could never trigger -- SWEARING_REACTION is absent from
            # teen_plus_blocked, so a 13+ user's reaction swearing was already
            # allowed and the explanatory reason was unreachable dead code.
            if label == ContentLabel.SWEARING_REACTION and mode is AgeMode.TEEN_PLUS:
                reason = "Swearing permitted as reaction (13+)"
        else:
            reason = f"{label.name} detected"

        return {
            "allowed": allowed,
            "label": label.name,
            "confidence": confidence,
            "mode": mode.value,
            "reason": reason
        }
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
print("Testing Dual-Mode Content Filter") |
|
|
print("="*50) |
|
|
|
|
|
|
|
|
filter_sys = DualModeFilter() |
|
|
|
|
|
tests = [ |
|
|
("that was a great game", 10), |
|
|
("that was a great game", 15), |
|
|
("shit that sucks", 10), |
|
|
("shit that sucks", 15), |
|
|
("you're a piece of shit", 15), |
|
|
("kill yourself", 15), |
|
|
] |
|
|
|
|
|
for text, age in tests: |
|
|
result = filter_sys.check(text, age) |
|
|
status = "✅ ALLOWED" if result["allowed"] else "❌ BLOCKED" |
|
|
print(f"\nAge {age}: '{text}'") |
|
|
print(f" {status} - {result['reason']}") |
|
|
print(f" Confidence: {result['confidence']:.2f}") |
|
|
|