# Scraped from a Hugging Face Spaces file view (Space status: Sleeping).
# File size: 1,643 bytes; revision 5c5b473.
import numpy as np
from app.models.toxicity_model import predict_toxicity
class ModerationEnv:
    """Sequential content-moderation environment over a labelled dataset.

    The environment walks through ``data`` one item at a time. Each item is
    a ``(text, true_label)`` pair. The observation for an item is a vector of
    four scores obtained from ``predict_toxicity(text)``; the agent answers
    with a moderation action and receives a reward from :meth:`get_reward`.
    """

    # Position in this tuple is the integer action code accepted by step().
    ACTIONS = ("allow", "flag", "remove")

    def __init__(self, data):
        """Store the dataset and start at its first item.

        Args:
            data: Indexable, sized collection of ``(text, true_label)`` pairs.
        """
        self.data = data
        self.index = 0

    def reset(self):
        """Rewind to the first item and return its state vector."""
        self.index = 0
        return self._get_state()

    def step(self, action):
        """Score ``action`` against the current item and advance by one.

        Args:
            action: Moderation decision for the current item, either an
                integer code (0=allow, 1=flag, 2=remove) or the action name.

        Returns:
            A ``(next_state, reward, done)`` tuple. ``next_state`` is ``None``
            once the last item has been consumed.

        Raises:
            IndexError: For list-like ``data``, if called after the episode
                has finished (or on an empty dataset).
        """
        _, true_label = self.data[self.index]
        reward = self.get_reward(action, true_label)

        self.index += 1
        done = self.index >= len(self.data)
        # No observation exists past the end of the dataset.
        next_state = None if done else self._get_state()
        return next_state, reward, done

    def _get_state(self):
        """Convert the current item's text into a state vector.

        Returns:
            A 1-D NumPy array holding the "toxicity", "insult", "threat" and
            "obscene" scores, in that order. A score missing from the model
            output defaults to 0.0.
        """
        text, _ = self.data[self.index]
        # NOTE(review): predict_toxicity is assumed to return a dict-like
        # mapping of score name -> float; confirm against the model module.
        ai_scores = predict_toxicity(text)
        return np.array([
            ai_scores.get("toxicity", 0.0),
            ai_scores.get("insult", 0.0),
            ai_scores.get("threat", 0.0),
            ai_scores.get("obscene", 0.0),
        ])

    def get_reward(self, action, true_label):
        """Return the reward for taking ``action`` on an item.

        Args:
            action: 0=allow, 1=flag, 2=remove. The action name itself
                ("allow", "flag", "remove") is accepted as well.
            true_label: Ground-truth label: "safe" (equivalently "allow"),
                "flag" or "remove".

        Returns:
            3 for a correct decision, 1 for flagging an item that should
            have been allowed or removed, -4 for allowing an item that
            should have been removed, -3 for removing a safe item, and -1
            for every other combination (including unrecognised actions).
        """
        # Decode integer action codes into names. Without this step an int
        # is compared against strings and no branch below can ever match.
        predicted = action
        if not isinstance(action, str):
            try:
                code = int(action)
            except (TypeError, ValueError):
                code = -1
            if 0 <= code < len(self.ACTIONS):
                predicted = self.ACTIONS[code]

        # The dataset labels benign items "safe"; the matching action is
        # "allow". Treat both spellings as the same label.
        if true_label == "safe":
            true_label = "allow"

        # Perfect decision.
        if predicted == true_label:
            return 3

        # Slight mistake: flagging is the cautious middle ground.
        if predicted == "flag" and true_label in ("allow", "remove"):
            return 1

        # Dangerous mistakes.
        if predicted == "allow" and true_label == "remove":
            return -4
        if predicted == "remove" and true_label == "allow":
            return -3

        return -1