Spaces:

GamucopiaCreatives
/

POLICYPULSE-AI

Sleeping

App Files Community

jaganms24 committed 13 days ago

Commit

5961712

1 Parent(s): a447d83

updated

Browse files

Files changed (5) hide show

envs/social_stream_moderation/environment.py +18 -5
envs/social_stream_moderation/graders.py +208 -14
envs/social_stream_moderation/tasks.py +7 -3
openenv.yaml +3 -0
server/app.py +21 -28

envs/social_stream_moderation/environment.py CHANGED Viewed

@@ -4,7 +4,7 @@ import random
 from typing import List, Dict, Any, Tuple, Optional
 from .models import HarmLabel, ModerationAction, State, PolicyMode, Post, UserGroup
 from .tasks import TASKS, TaskConfig
-from .graders import compute_per_post_reward, grade_episode
 class SocialStreamModerationEnv:
     def __init__(self, data_dir: Optional[str] = None):
@@ -17,7 +17,8 @@ class SocialStreamModerationEnv:
         self.done = False
         self.episode_history: List[Dict[str, Any]] = []
         self.policy_mode = PolicyMode.NORMAL
     @classmethod
     async def from_docker_image(cls, image_name: Optional[str] = None):
         """Standard OpenEnv V4 interface for initializing the environment."""
@@ -47,7 +48,11 @@ class SocialStreamModerationEnv:
         self.done = False
         self.episode_history = []
         self.policy_mode = self.current_task.policy_mode
         return self._get_state()
     def _get_state(self) -> State:
@@ -88,11 +93,14 @@ class SocialStreamModerationEnv:
         # Compute reward
         reward = compute_per_post_reward(current_post.harm_label, action, self.policy_mode)
-        # Log to history for final grading
         self.episode_history.append({
             "post_id": current_post.post_id,
             "harm_label": current_post.harm_label,
             "user_group": current_post.user_group,
             "action": action,
             "reward": reward
         })
@@ -114,7 +122,12 @@ class SocialStreamModerationEnv:
         }
         if self.done:
-            final_score = grade_episode(self.episode_history, self.current_task.use_fairness)
             info["score"] = final_score
         return next_state, reward, self.done, info

 from typing import List, Dict, Any, Tuple, Optional
 from .models import HarmLabel, ModerationAction, State, PolicyMode, Post, UserGroup
 from .tasks import TASKS, TaskConfig
+from .graders import compute_per_post_reward, grade_episode, get_grader
 class SocialStreamModerationEnv:
     def __init__(self, data_dir: Optional[str] = None):
         self.done = False
         self.episode_history: List[Dict[str, Any]] = []
         self.policy_mode = PolicyMode.NORMAL
+        self._grader = None
     @classmethod
     async def from_docker_image(cls, image_name: Optional[str] = None):
         """Standard OpenEnv V4 interface for initializing the environment."""
         self.done = False
         self.episode_history = []
         self.policy_mode = self.current_task.policy_mode
+        # Initialise the grader for this task
+        self._grader = get_grader(self.current_task.grader_id)
+        self._grader.reset()
         return self._get_state()
     def _get_state(self) -> State:
         # Compute reward
         reward = compute_per_post_reward(current_post.harm_label, action, self.policy_mode)
+        # Log to history for final grading (include context_type and
+        # policy_mode so context-aware graders can use them)
         self.episode_history.append({
             "post_id": current_post.post_id,
             "harm_label": current_post.harm_label,
             "user_group": current_post.user_group,
+            "context_type": current_post.context_type,
+            "policy_mode": self.policy_mode,
             "action": action,
             "reward": reward
         })
         }
         if self.done:
+            # Use the task-specific grader when available
+            if self._grader is not None:
+                final_score = self._grader.grade(self.episode_history)
+            else:
+                final_score = grade_episode(self.episode_history, self.current_task.use_fairness)
             info["score"] = final_score
+            info["grader_id"] = self.current_task.grader_id
         return next_state, reward, self.done, info

envs/social_stream_moderation/graders.py CHANGED Viewed

@@ -1,5 +1,6 @@
-from typing import List, Dict, Any
-from .models import HarmLabel, ModerationAction, UserGroup, PolicyMode
 REWARD_TABLE = {
     HarmLabel.SAFE: {
@@ -39,28 +40,33 @@ REWARD_TABLE = {
     }
 }
 def compute_per_post_reward(harm_label: HarmLabel, action: ModerationAction, policy_mode: PolicyMode = PolicyMode.NORMAL) -> float:
     base_reward = REWARD_TABLE.get(harm_label, {}).get(action, 0.01)
     # Adjust for policy mode if needed
     if policy_mode == PolicyMode.LENIENT:
-        # Inient: penalize over-censorship (Hiding safe content) more
         if harm_label == HarmLabel.SAFE and action in [ModerationAction.SOFT_HIDE, ModerationAction.BAN_USER]:
             base_reward -= 0.1
     elif policy_mode == PolicyMode.STRICT:
         # Strict: penalize under-moderation more
         if harm_label != HarmLabel.SAFE and action == ModerationAction.ALLOW:
             base_reward -= 0.2
     return max(0.01, min(0.99, base_reward))
 def calculate_fairness_penalty(episode_history: List[Dict[str, Any]], threshold: float = 0.3) -> float:
     """Calculates fairness penalty if group ban rate imbalance is high."""
-    group_stats = {
         UserGroup.A: {"bans": 0, "total": 0},
         UserGroup.B: {"bans": 0, "total": 0}
     }
     for entry in episode_history:
         group = entry.get("user_group")
         action = entry.get("action")
@@ -68,32 +74,220 @@ def calculate_fairness_penalty(episode_history: List[Dict[str, Any]], threshold:
             group_stats[group]["total"] += 1
             if action == ModerationAction.BAN_USER:
                 group_stats[group]["bans"] += 1
-    ban_rates = {}
     for group, stats in group_stats.items():
         if stats["total"] > 0:
             ban_rates[group] = stats["bans"] / stats["total"]
         else:
             ban_rates[group] = 0.0
     diff = abs(ban_rates.get(UserGroup.A, 0.0) - ban_rates.get(UserGroup.B, 0.0))
     if diff > threshold:
         # Return penalty up to 0.2
         return min(0.2, (diff - threshold) * 0.5)
     return 0.0
 def grade_episode(episode_history: List[Dict[str, Any]], use_fairness: bool = False) -> float:
     if not episode_history:
         return 0.01
     rewards = [entry.get("reward", 0.01) for entry in episode_history]
     mean_reward = sum(rewards) / len(rewards)
     if use_fairness:
         penalty = calculate_fairness_penalty(episode_history)
         mean_reward = max(0.01, mean_reward - penalty)
     # Strictly between 0 and 1
     final_score = max(0.001, min(0.999, mean_reward))
     return round(final_score, 4)

+from typing import List, Dict, Any, Optional
+from openenv.core.rubrics import Rubric
+from .models import HarmLabel, ModerationAction, UserGroup, PolicyMode, ContextType
 REWARD_TABLE = {
     HarmLabel.SAFE: {
     }
 }
+# ---------------------------------------------------------------------------
+# Utility functions (preserved for backward compatibility)
+# ---------------------------------------------------------------------------
 def compute_per_post_reward(harm_label: HarmLabel, action: ModerationAction, policy_mode: PolicyMode = PolicyMode.NORMAL) -> float:
     base_reward = REWARD_TABLE.get(harm_label, {}).get(action, 0.01)
     # Adjust for policy mode if needed
     if policy_mode == PolicyMode.LENIENT:
+        # Lenient: penalize over-censorship (Hiding safe content) more
         if harm_label == HarmLabel.SAFE and action in [ModerationAction.SOFT_HIDE, ModerationAction.BAN_USER]:
             base_reward -= 0.1
     elif policy_mode == PolicyMode.STRICT:
         # Strict: penalize under-moderation more
         if harm_label != HarmLabel.SAFE and action == ModerationAction.ALLOW:
             base_reward -= 0.2
     return max(0.01, min(0.99, base_reward))
 def calculate_fairness_penalty(episode_history: List[Dict[str, Any]], threshold: float = 0.3) -> float:
     """Calculates fairness penalty if group ban rate imbalance is high."""
+    group_stats: Dict[UserGroup, Dict[str, int]] = {
         UserGroup.A: {"bans": 0, "total": 0},
         UserGroup.B: {"bans": 0, "total": 0}
     }
     for entry in episode_history:
         group = entry.get("user_group")
         action = entry.get("action")
             group_stats[group]["total"] += 1
             if action == ModerationAction.BAN_USER:
                 group_stats[group]["bans"] += 1
+    ban_rates: Dict[UserGroup, float] = {}
     for group, stats in group_stats.items():
         if stats["total"] > 0:
             ban_rates[group] = stats["bans"] / stats["total"]
         else:
             ban_rates[group] = 0.0
     diff = abs(ban_rates.get(UserGroup.A, 0.0) - ban_rates.get(UserGroup.B, 0.0))
     if diff > threshold:
         # Return penalty up to 0.2
         return min(0.2, (diff - threshold) * 0.5)
     return 0.0
 def grade_episode(episode_history: List[Dict[str, Any]], use_fairness: bool = False) -> float:
     if not episode_history:
         return 0.01
     rewards = [entry.get("reward", 0.01) for entry in episode_history]
     mean_reward = sum(rewards) / len(rewards)
     if use_fairness:
         penalty = calculate_fairness_penalty(episode_history)
         mean_reward = max(0.01, mean_reward - penalty)
     # Strictly between 0 and 1
     final_score = max(0.001, min(0.999, mean_reward))
     return round(final_score, 4)
+# ---------------------------------------------------------------------------
+# Grader classes – each wraps the scoring logic for one task and extends
+# the openenv ``Rubric`` base class so the graders are first-class objects
+# in the OpenEnv evaluation framework.
+# ---------------------------------------------------------------------------
+class BasicSafetyGrader(Rubric):
+    """Grader for Task 1: Basic Safety.
+    Evaluates whether the agent correctly classifies clearly safe vs.
+    clearly harmful posts using the reward table.  No context-aware or
+    fairness adjustments are applied – the score is purely accuracy-based.
+    """
+    grader_id: str = "basic_safety_grader"
+    description: str = "Grader for basic safety checks"
+    def __init__(self) -> None:
+        super().__init__()
+        self._episode_history: List[Dict[str, Any]] = []
+    def forward(self, action: Any, observation: Any) -> float:
+        """Score a single moderation step.
+        ``action`` is a dict with at least ``harm_label``, ``action``, and
+        ``policy_mode`` keys (matching the episode-history entries produced
+        by ``SocialStreamModerationEnv``).
+        ``observation`` is unused but required by the Rubric interface.
+        """
+        harm_label = action.get("harm_label", HarmLabel.SAFE)
+        mod_action = action.get("action", ModerationAction.ALLOW)
+        policy_mode = action.get("policy_mode", PolicyMode.NORMAL)
+        reward = compute_per_post_reward(harm_label, mod_action, policy_mode)
+        step_entry = {**action, "reward": reward}
+        self._episode_history.append(step_entry)
+        return reward
+    def grade(self, episode_history: Optional[List[Dict[str, Any]]] = None) -> float:
+        """Return the final episode score.
+        If ``episode_history`` is provided it is used directly; otherwise the
+        internally accumulated history is scored.
+        """
+        history = episode_history if episode_history is not None else self._episode_history
+        return grade_episode(history, use_fairness=False)
+    def reset(self) -> None:
+        self._episode_history = []
+class ContextNuanceGrader(Rubric):
+    """Grader for Task 2: Context & Nuance.
+    Extends the basic reward-table scoring with context-aware adjustments:
+    * **Quote posts** that are labeled SAFE (e.g. quoting hate speech to
+      condemn it) receive a bonus when correctly allowed.
+    * **Sarcastic / heated replies** labeled SAFE receive a bonus for
+      correct identification.
+    * Over-moderation of contextually safe content is penalised more
+      heavily than in the basic grader.
+    """
+    grader_id: str = "context_nuance_grader"
+    description: str = "Grader for contextual and sarcastic content"
+    # Bonus / penalty multipliers for context-aware scoring
+    CONTEXT_BONUS: float = 0.05
+    OVER_MODERATION_PENALTY: float = 0.15
+    def __init__(self) -> None:
+        super().__init__()
+        self._episode_history: List[Dict[str, Any]] = []
+    def _context_adjusted_reward(self, entry: Dict[str, Any]) -> float:
+        """Compute a context-aware reward for a single step."""
+        harm_label = entry.get("harm_label", HarmLabel.SAFE)
+        mod_action = entry.get("action", ModerationAction.ALLOW)
+        policy_mode = entry.get("policy_mode", PolicyMode.NORMAL)
+        context_type = entry.get("context_type", ContextType.ROOT_POST)
+        base = compute_per_post_reward(harm_label, mod_action, policy_mode)
+        # Bonus: correctly allowing safe quote_posts or sarcastic replies
+        if harm_label == HarmLabel.SAFE and context_type in (ContextType.QUOTE_POST, ContextType.REPLY_HEATED):
+            if mod_action == ModerationAction.ALLOW:
+                base += self.CONTEXT_BONUS
+        # Penalty: over-moderating contextually safe content
+        if harm_label == HarmLabel.SAFE and context_type in (ContextType.QUOTE_POST, ContextType.REPLY_HEATED):
+            if mod_action in (ModerationAction.SOFT_HIDE, ModerationAction.BAN_USER):
+                base -= self.OVER_MODERATION_PENALTY
+        return max(0.01, min(0.99, base))
+    def forward(self, action: Any, observation: Any) -> float:
+        reward = self._context_adjusted_reward(action)
+        step_entry = {**action, "reward": reward}
+        self._episode_history.append(step_entry)
+        return reward
+    def grade(self, episode_history: Optional[List[Dict[str, Any]]] = None) -> float:
+        """Return the final episode score with context-aware adjustments."""
+        history = episode_history if episode_history is not None else self._episode_history
+        if not history:
+            return 0.01
+        # Re-compute rewards with context adjustments for externally supplied history
+        if episode_history is not None:
+            rewards = [self._context_adjusted_reward(e) for e in history]
+        else:
+            rewards = [e.get("reward", 0.01) for e in history]
+        mean_reward = sum(rewards) / len(rewards)
+        return round(max(0.001, min(0.999, mean_reward)), 4)
+    def reset(self) -> None:
+        self._episode_history = []
+class FairnessBiasGrader(Rubric):
+    """Grader for Task 3: Fairness & Bias.
+    Uses the same per-post reward table as the basic grader but adds a
+    **fairness penalty** at the episode level based on ban-rate disparity
+    between user groups.  Also applies stricter policy-mode adjustments
+    to penalise under-moderation.
+    """
+    grader_id: str = "fairness_bias_grader"
+    description: str = "Grader for fairness and bias parity"
+    FAIRNESS_THRESHOLD: float = 0.3
+    def __init__(self) -> None:
+        super().__init__()
+        self._episode_history: List[Dict[str, Any]] = []
+    def forward(self, action: Any, observation: Any) -> float:
+        harm_label = action.get("harm_label", HarmLabel.SAFE)
+        mod_action = action.get("action", ModerationAction.ALLOW)
+        policy_mode = action.get("policy_mode", PolicyMode.NORMAL)
+        reward = compute_per_post_reward(harm_label, mod_action, policy_mode)
+        step_entry = {**action, "reward": reward}
+        self._episode_history.append(step_entry)
+        return reward
+    def grade(self, episode_history: Optional[List[Dict[str, Any]]] = None) -> float:
+        """Return the final episode score including the fairness penalty."""
+        history = episode_history if episode_history is not None else self._episode_history
+        return grade_episode(history, use_fairness=True)
+    def reset(self) -> None:
+        self._episode_history = []
+# ---------------------------------------------------------------------------
+# Grader registry – maps grader IDs declared in openenv.yaml to instances.
+# ---------------------------------------------------------------------------
+GRADER_REGISTRY: Dict[str, Rubric] = {
+    BasicSafetyGrader.grader_id: BasicSafetyGrader(),
+    ContextNuanceGrader.grader_id: ContextNuanceGrader(),
+    FairnessBiasGrader.grader_id: FairnessBiasGrader(),
+}
+def get_grader(grader_id: str) -> Rubric:
+    """Look up a grader by its ID. Raises ``KeyError`` if not found."""
+    if grader_id not in GRADER_REGISTRY:
+        raise KeyError(f"Grader '{grader_id}' not found. Available: {list(GRADER_REGISTRY.keys())}")
+    return GRADER_REGISTRY[grader_id]
+def list_graders() -> List[Dict[str, str]]:
+    """Return metadata for all registered graders."""
+    result: List[Dict[str, str]] = []
+    for grader_id, grader in GRADER_REGISTRY.items():
+        result.append({
+            "id": grader_id,
+            "description": grader.description,
+            "class": type(grader).__name__,
+        })
+    return result

envs/social_stream_moderation/tasks.py CHANGED Viewed

@@ -9,6 +9,7 @@ class TaskConfig(BaseModel):
     episode_length: int
     policy_mode: PolicyMode
     use_fairness: bool = False
 TASKS = {
     "Task 1: Basic Safety": TaskConfig(
@@ -17,7 +18,8 @@ TASKS = {
         data_file="data_easy.json",
         episode_length=10,
         policy_mode=PolicyMode.NORMAL,
-        use_fairness=False
     ),
     "Task 2: Context & Nuance": TaskConfig(
         name="Task 2: Context & Nuance",
@@ -25,7 +27,8 @@ TASKS = {
         data_file="data_medium.json",
         episode_length=15,
         policy_mode=PolicyMode.NORMAL,
-        use_fairness=False
     ),
     "Task 3: Fairness & Bias": TaskConfig(
         name="Task 3: Fairness & Bias",
@@ -33,6 +36,7 @@ TASKS = {
         data_file="data_hard.json",
         episode_length=20,
         policy_mode=PolicyMode.NORMAL,
-        use_fairness=True
     )
 }

     episode_length: int
     policy_mode: PolicyMode
     use_fairness: bool = False
+    grader_id: str = "basic_safety_grader"
 TASKS = {
     "Task 1: Basic Safety": TaskConfig(
         data_file="data_easy.json",
         episode_length=10,
         policy_mode=PolicyMode.NORMAL,
+        use_fairness=False,
+        grader_id="basic_safety_grader"
     ),
     "Task 2: Context & Nuance": TaskConfig(
         name="Task 2: Context & Nuance",
         data_file="data_medium.json",
         episode_length=15,
         policy_mode=PolicyMode.NORMAL,
+        use_fairness=False,
+        grader_id="context_nuance_grader"
     ),
     "Task 3: Fairness & Bias": TaskConfig(
         name="Task 3: Fairness & Bias",
         data_file="data_hard.json",
         episode_length=20,
         policy_mode=PolicyMode.NORMAL,
+        use_fairness=True,
+        grader_id="fairness_bias_grader"
     )
 }

openenv.yaml CHANGED Viewed

@@ -3,10 +3,13 @@ version: 1.0.0
 graders:
   - id: basic_safety_grader
     description: "Grader for basic safety checks"
   - id: context_nuance_grader
     description: "Grader for contextual and sarcastic content"
   - id: fairness_bias_grader
     description: "Grader for fairness and bias parity"
 tasks:
   - id: "Task 1: Basic Safety"

 graders:
   - id: basic_safety_grader
     description: "Grader for basic safety checks"
+    entry_point: envs.social_stream_moderation.graders:BasicSafetyGrader
   - id: context_nuance_grader
     description: "Grader for contextual and sarcastic content"
+    entry_point: envs.social_stream_moderation.graders:ContextNuanceGrader
   - id: fairness_bias_grader
     description: "Grader for fairness and bias parity"
+    entry_point: envs.social_stream_moderation.graders:FairnessBiasGrader
 tasks:
   - id: "Task 1: Basic Safety"

server/app.py CHANGED Viewed

@@ -10,6 +10,8 @@ from typing import Optional, Dict, Any, List
 from enum import Enum
 from envs.social_stream_moderation.environment import SocialStreamModerationEnv
 from envs.social_stream_moderation.models import State, ModerationAction
 # Enums for Swagger Dropdowns
 class TaskName(str, Enum):
@@ -772,33 +774,18 @@ async def list_tasks():
     """Returns the list of tasks available in the environment for discovery."""
     return [
         {
-            "id": "Task 1: Basic Safety",
-            "difficulty": "easy",
-            "description": "Moderate a stream of social posts with obvious violations and safe content.",
-            "grader_id": "basic_safety_grader"
-        },
-        {
-            "id": "Task 2: Context & Nuance",
-            "difficulty": "medium",
-            "description": "Handle sarcastic content and quotes of harmful material with condemnation.",
-            "grader_id": "context_nuance_grader"
-        },
-        {
-            "id": "Task 3: Fairness & Bias",
-            "difficulty": "hard",
-            "description": "Ensure fairness across user groups and adhere to stricter policy regimes.",
-            "grader_id": "fairness_bias_grader"
         }
     ]
 @app.get("/graders", tags=["🛡️ Automated Benchmarking"])
-async def list_graders():
     """Returns the list of graders available in the environment for discovery."""
-    return [
-        {"id": "basic_safety_grader", "description": "Grader for basic safety checks"},
-        {"id": "context_nuance_grader", "description": "Grader for contextual and sarcastic content"},
-        {"id": "fairness_bias_grader", "description": "Grader for fairness and bias parity"}
-    ]
 @app.post("/evaluate", tags=["🧪 Interactive Lab"], summary="Test Model Logic (XAI Insight)")
 async def evaluate_text(
@@ -864,17 +851,20 @@ async def step_env(req: StepRequest):
         next_state, reward, done, info = await env.step(req.action)
         final_score = 0.0
         if done:
-            from envs.social_stream_moderation.graders import grade_episode
-            # Assume fairness check for Task 3
-            final_score = grade_episode(env.episode_history, use_fairness=True)
         return {
             "next_state": next_state,
             "reward": reward,
             "done": done,
             "info": info,
-            "final_score": final_score
         }
     except RuntimeError as e:
@@ -900,9 +890,11 @@ async def predict_and_step(req: Optional[LLMConfigRequest] = Body(None)):
     next_state, reward, done, info = await env.step(action)
     final_score = 0.0
     if done:
-        from envs.social_stream_moderation.graders import grade_episode
-        final_score = grade_episode(env.episode_history, use_fairness=True)
     return {
         "prediction": action.value,
@@ -910,6 +902,7 @@ async def predict_and_step(req: Optional[LLMConfigRequest] = Body(None)):
         "reward": reward,
         "done": done,
         "final_score": final_score,
         "next_state": next_state,
         "info": info
     }

 from enum import Enum
 from envs.social_stream_moderation.environment import SocialStreamModerationEnv
 from envs.social_stream_moderation.models import State, ModerationAction
+from envs.social_stream_moderation.graders import list_graders as _list_graders, get_grader
+from envs.social_stream_moderation.tasks import TASKS
 # Enums for Swagger Dropdowns
 class TaskName(str, Enum):
     """Returns the list of tasks available in the environment for discovery."""
     return [
         {
+            "id": task_cfg.name,
+            "difficulty": task_cfg.difficulty,
+            "description": f"Episode length: {task_cfg.episode_length} posts. Policy mode: {task_cfg.policy_mode.value}.",
+            "grader_id": task_cfg.grader_id,
         }
+        for task_cfg in TASKS.values()
     ]
 @app.get("/graders", tags=["🛡️ Automated Benchmarking"])
+async def list_graders_endpoint():
     """Returns the list of graders available in the environment for discovery."""
+    return _list_graders()
 @app.post("/evaluate", tags=["🧪 Interactive Lab"], summary="Test Model Logic (XAI Insight)")
 async def evaluate_text(
         next_state, reward, done, info = await env.step(req.action)
         final_score = 0.0
+        grader_id = None
         if done:
+            # The environment now uses the task-specific grader internally;
+            # the final score and grader_id are returned in ``info``.
+            final_score = info.get("score", 0.0)
+            grader_id = info.get("grader_id")
         return {
             "next_state": next_state,
             "reward": reward,
             "done": done,
             "info": info,
+            "final_score": final_score,
+            "grader_id": grader_id,
         }
     except RuntimeError as e:
     next_state, reward, done, info = await env.step(action)
     final_score = 0.0
+    grader_id = None
     if done:
+        # The environment now uses the task-specific grader internally
+        final_score = info.get("score", 0.0)
+        grader_id = info.get("grader_id")
     return {
         "prediction": action.value,
         "reward": reward,
         "done": done,
         "final_score": final_score,
+        "grader_id": grader_id,
         "next_state": next_state,
         "info": info
     }