SujanMidatani committed
Commit 9e245c9 · verified · Parent: 1cde9d1

Upload folder using huggingface_hub

Files changed (11)
  1. __init__.py +15 -15
  2. client.py +39 -39
  3. models.py +18 -18
  4. openenv.yaml +74 -48
  5. server/__init__.py +2 -2
  6. server/app.py +148 -148
  7. server/env.py +172 -172
  8. server/models.py +67 -67
  9. server/rag/__init__.py +2 -2
  10. server/rag/retriever.py +97 -97
  11. server/server_routes.py +1 -1
__init__.py CHANGED
@@ -1,16 +1,16 @@
(removed and re-added lines are identical; the file content is shown once)

from client import ModerationEnv, ModerationEnvAction, ModerationEnvObservation, ModerationEnvState
from models import Action, ActionType, Content, Observation, PolicyChunk, State, StepType

__all__ = [
    "ModerationEnv",
    "ModerationEnvAction",
    "ModerationEnvObservation",
    "ModerationEnvState",
    "Action",
    "ActionType",
    "Content",
    "Observation",
    "PolicyChunk",
    "State",
    "StepType",
]
client.py CHANGED
@@ -1,40 +1,40 @@
(removed and re-added lines are identical; the file content is shown once)

from __future__ import annotations

from typing import Any

from openenv.core import EnvClient
from openenv.core.client_types import StepResult

try:
    from .models import Action, Observation, State
except ImportError:
    from models import Action, Observation, State


class ModerationEnv(EnvClient[Action, Observation, State]):

    def _step_payload(self, action: Action) -> dict[str, Any]:
        return action.model_dump(mode="json")

    def _parse_result(self, payload: dict[str, Any]) -> StepResult[Observation]:
        observation_payload = payload.get("observation", {})
        return StepResult(
            observation=Observation(**observation_payload),
            reward=payload.get("reward"),
            done=bool(payload.get("done", False)),
        )

    def _parse_state(self, payload: dict[str, Any]) -> State:
        return State(**payload)


ModerationEnvAction = Action
ModerationEnvObservation = Observation
ModerationEnvState = State

__all__ = [
    "ModerationEnv",
    "ModerationEnvAction",
    "ModerationEnvObservation",
    "ModerationEnvState",
]
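
For orientation, a minimal usage sketch for the client above. Only the _step_payload/_parse_result/_parse_state hooks are defined in this diff; the constructor and the public reset()/step() methods live on the EnvClient base class in openenv.core, so the base_url keyword and the return shapes assumed below should be verified against the installed openenv package.

# Hypothetical usage sketch — EnvClient's constructor signature and its public
# reset()/step() methods are assumed, not shown in this diff.
from client import ModerationEnv, ModerationEnvAction

env = ModerationEnv(base_url="http://localhost:8000")  # assumed constructor kwarg

result = env.reset()                # expected to return a StepResult
print(result.observation.message)   # e.g. "Episode started. Case: ..."

# action_type must be one of allow|flag|remove|escalate (see models.ActionType)
result = env.step(ModerationEnvAction(action_type="flag", reason="possible violation"))
print(result.reward, result.done)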
models.py CHANGED
@@ -1,19 +1,19 @@
(removed and re-added lines are identical; the file content is shown once)

from server.models import (
    Action,
    ActionType,
    Content,
    Observation,
    PolicyChunk,
    State,
    StepType,
)

__all__ = [
    "Action",
    "ActionType",
    "Content",
    "Observation",
    "PolicyChunk",
    "State",
    "StepType",
]
openenv.yaml CHANGED
@@ -1,62 +1,88 @@
-spec_version: "1"
-
-name: "openenv-multimodal-moderation"
-type: "environment"
-runtime: "docker"
-app: "server.app:app"
-port: 8000
-
+name: openenv-multimodal-moderation
+version: "1.0.0"
 description: >
   OpenEnv-compliant multimodal content moderation environment with staged
   analysis, policy retrieval, moderation, reviewer validation, and dense reward.
-
+framework: meta-pytorch-openenv
+license: Apache-2.0
+entrypoint: server:app
+episode_flow:
+  - analyze
+  - retrieve_policy
+  - decide
+  - review
+  - finalize
+models:
+  action:
+    fields:
+      action_type: allow|flag|remove|escalate
+      reason: string
+  observation:
+    fields:
+      content: text + image metadata
+      policy: retrieved policy chunks
+      step_type: analyze|retrieve_policy|decide|review|finalize
+      step_count: integer
+  state:
+    fields:
+      episode_id: string
+      step_count: integer
+      done: boolean
+policy_retrieval:
+  source: server/rag/policies.json
+  strategy: keyword_overlap
+  top_k: 3
 tasks:
-  - id: "violence-remove"
-    name: "violence-remove"
-    description: "Agent must identify violent content and issue a remove action."
-    difficulty: "hard"  # Explicit violence detection + removal is most complex
+  - name: violence-remove
+    description: Agent must identify violent content and issue a remove action
     grader:
-      type: "reward_threshold"
+      type: reward_threshold
       threshold: 0.5
-      endpoint: "/episode_summary"
-      reward_field: "total_reward"
-
-  - id: "nudity-flag"
-    name: "nudity-flag"
-    description: "Agent must identify nudity/sexual content and issue a flag action."
-    difficulty: "medium"  # Visual content analysis required
+      endpoint: /episode_summary
+      reward_field: total_reward
+
+  - name: nudity-flag
+    description: Agent must identify nudity/sexual content and issue a flag action
     grader:
-      type: "reward_threshold"
+      type: reward_threshold
       threshold: 0.5
-      endpoint: "/episode_summary"
-      reward_field: "total_reward"
-
-  - id: "harassment-flag"
-    name: "harassment-flag"
-    description: "Agent must identify harassment and issue a flag action."
-    difficulty: "medium"  # Context/pattern recognition needed
+      endpoint: /episode_summary
+      reward_field: total_reward
+
+  - name: harassment-flag
+    description: Agent must identify harassment and issue a flag action
     grader:
-      type: "reward_threshold"
+      type: reward_threshold
       threshold: 0.5
-      endpoint: "/episode_summary"
-      reward_field: "total_reward"
-
-  - id: "misinfo-escalate"
-    name: "misinfo-escalate"
-    description: "Agent must identify misinformation and escalate for review."
-    difficulty: "hard"  # Requires fact-checking nuance + escalation judgment
+      endpoint: /episode_summary
+      reward_field: total_reward
+
+  - name: misinfo-escalate
+    description: Agent must identify misinformation and escalate for review
     grader:
-      type: "reward_threshold"
+      type: reward_threshold
       threshold: 0.5
-      endpoint: "/episode_summary"
-      reward_field: "total_reward"
-
-  - id: "benign-allow"
-    name: "benign-allow"
-    description: "Agent must correctly allow benign/safe content."
-    difficulty: "easy"  # Baseline safe content recognition
+      endpoint: /episode_summary
+      reward_field: total_reward
+
+  - name: benign-allow
+    description: Agent must correctly allow benign/safe content
     grader:
-      type: "reward_threshold"
+      type: reward_threshold
       threshold: 0.5
-      endpoint: "/episode_summary"
-      reward_field: "total_reward"
+      endpoint: /episode_summary
+      reward_field: total_reward
+rewards:
+  analysis_step: 0.2
+  retrieval_step: 0.2
+  correct_decision: 1.0
+  reviewer_agreement: 0.2
+  unsafe_penalty: -0.6
+server:
+  reset: POST /reset
+  step: POST /step
+  state: GET /state
+  state_full: GET /state_full
+  episode_summary: GET /episode_summary
+  schema: GET /schema
+  docs: GET /docs
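
The server block above lists the HTTP surface. A sketch of one full episode over that surface follows; the request bodies match the ResetRequest/StepRequest schemas in server/app.py, while the case_id value is an assumption that the case ids in server/logic.py (not part of this diff) mirror the task names.

# Sketch of one episode against a locally running server, using only the
# routes listed under `server:` above. The case_id is assumed to exist in
# server/logic.py's CASE_IDS.
import requests

BASE = "http://localhost:8000"

requests.post(f"{BASE}/reset", json={"options": {"case_id": "violence-remove"}})

# episode_flow has five stages; each /step call advances exactly one stage,
# and action_type must always be one of allow|flag|remove|escalate.
for _ in range(5):
    step = requests.post(
        f"{BASE}/step",
        json={"action": {"action_type": "remove", "reason": "direct threat of violence"}},
    ).json()
    print(step["observation"]["step_type"], step["reward"], step["done"])

summary = requests.get(f"{BASE}/episode_summary").json()
print(summary["total_reward"], summary["reward_breakdown"])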
server/__init__.py CHANGED
@@ -1,3 +1,3 @@
(removed and re-added lines are identical; the file content is shown once)

from .app import app

__all__ = ["app"]
server/app.py CHANGED
@@ -1,148 +1,148 @@
(removed and re-added lines are identical; the file content is shown once)

from __future__ import annotations

import traceback
from typing import Optional

from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel

try:
    from .models import Action, Observation, State
    from .env import ModerationEnvironment
    from .logic import CASE_IDS
except ImportError:
    from models import Action, Observation, State
    from env import ModerationEnvironment
    from logic import CASE_IDS


# ---------------------------------------------------------------------------
# Single persistent environment — shared across ALL HTTP requests
# ---------------------------------------------------------------------------
_env = ModerationEnvironment()

app = FastAPI(
    title="OpenEnv Multimodal Moderation",
    description="Multimodal content moderation RL environment",
    version="1.0.0",
)


# ---------------------------------------------------------------------------
# Request schemas
# ---------------------------------------------------------------------------

class ResetOptions(BaseModel):
    case_id: Optional[str] = None
    seed: Optional[int] = None
    episode_id: Optional[str] = None


class ResetRequest(BaseModel):
    options: Optional[ResetOptions] = None


class StepRequest(BaseModel):
    action: Action


# ---------------------------------------------------------------------------
# Core OpenEnv endpoints
# ---------------------------------------------------------------------------

@app.post("/reset")
async def reset(req: Optional[ResetRequest] = None) -> JSONResponse:
    try:
        opts = (req.options if req and req.options else None) or ResetOptions()
        obs: Observation = _env.reset(
            seed=opts.seed,
            episode_id=opts.episode_id,
            case_id=opts.case_id or "",
        )
        return JSONResponse({
            "observation": obs.model_dump(mode="json"),
            "reward": 0.0,
            "done": False,
        })
    except Exception as e:
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/step")
async def step(req: StepRequest) -> JSONResponse:
    try:
        obs: Observation = _env.step(req.action)
        return JSONResponse({
            "observation": obs.model_dump(mode="json"),
            "reward": obs.reward,
            "done": obs.done,
        })
    except RuntimeError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/state")
async def get_state() -> JSONResponse:
    return JSONResponse(_env.state.model_dump(mode="json"))


@app.get("/schema")
async def schema() -> JSONResponse:
    return JSONResponse({
        "action": Action.model_json_schema(),
        "observation": Observation.model_json_schema(),
        "state": State.model_json_schema(),
    })


# ---------------------------------------------------------------------------
# /episode_summary — read by the reward_threshold graders in openenv.yaml
# ---------------------------------------------------------------------------

@app.get("/episode_summary")
async def episode_summary() -> JSONResponse:
    state = _env.state
    breakdown = dict(state.reward_breakdown or {})
    total_reward = round(sum(breakdown.values()), 4)
    return JSONResponse({
        "episode_id": state.episode_id,
        "step_count": state.step_count,
        "done": state.done,
        "total_reward": total_reward,
        "reward_breakdown": breakdown,
        "final_action": state.final_action,
        "reviewer_note": state.reviewer_note,
    })


# ---------------------------------------------------------------------------
# Helper endpoints
# ---------------------------------------------------------------------------

@app.get("/cases")
async def list_cases() -> JSONResponse:
    return JSONResponse({"cases": CASE_IDS})


@app.get("/state_full")
async def state_full() -> JSONResponse:
    return JSONResponse(_env.state.model_dump(mode="json"))


@app.get("/health")
async def health() -> JSONResponse:
    return JSONResponse({"status": "ok"})


def main(host: str = "0.0.0.0", port: int = 8000) -> None:
    import uvicorn

    uvicorn.run(app, host=host, port=port)


if __name__ == "__main__":
    main()
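
Because the module holds a single in-process ModerationEnvironment, the app can be smoke-tested without Docker via FastAPI's TestClient. A sketch follows; it assumes server/logic.py and server/rag are importable (logic.py is not included in this diff) and that httpx is installed, which TestClient requires.

# In-process smoke test sketch; requires server.logic (not in this diff).
from fastapi.testclient import TestClient

from server.app import app

client = TestClient(app)

assert client.get("/health").json() == {"status": "ok"}

reset = client.post("/reset", json={"options": {"seed": 0}}).json()
print(reset["observation"]["step_type"])   # first stage is "analyze"

step = client.post(
    "/step", json={"action": {"action_type": "flag", "reason": "needs human review"}}
).json()
print(step["reward"], step["done"])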
server/env.py CHANGED
@@ -1,173 +1,173 @@
(removed and re-added lines are identical; the file content is shown once)

from __future__ import annotations

import uuid
from typing import Any, Dict, Optional

from openenv.core.env_server.interfaces import Environment

try:
    from .models import Action, ActionType, Content, Observation, PolicyChunk, State, StepType
    from .logic import (
        CASE_IDS,
        get_case,
        get_expected_action,
        compute_step_reward,
    )
    from .rag.retriever import retrieve_policy_chunks
except ImportError:
    from models import Action, ActionType, Content, Observation, PolicyChunk, State, StepType
    from logic import (
        CASE_IDS,
        get_case,
        get_expected_action,
        compute_step_reward,
    )
    from rag.retriever import retrieve_policy_chunks


# Episode step flow — each step() call advances to the next stage
EPISODE_FLOW = ["analyze", "retrieve_policy", "decide", "review", "finalize"]


class ModerationEnvironment(Environment):
    """OpenEnv environment for multimodal content moderation."""

    def __init__(self) -> None:
        super().__init__()
        self._state = State()
        self._case: Optional[Dict[str, Any]] = None
        self._current_step_index: int = 0

    # ------------------------------------------------------------------
    # OpenEnv interface
    # ------------------------------------------------------------------

    def reset(self, seed: Optional[int] = None, episode_id: Optional[str] = None, **kwargs) -> Observation:
        eid = episode_id or str(uuid.uuid4())

        # Determine which case to use
        # Allow caller to pass case_id via kwargs (used by inference.py)
        case_id = kwargs.get("case_id")
        if case_id and case_id in CASE_IDS:
            chosen_id = case_id
        elif seed is not None:
            chosen_id = CASE_IDS[seed % len(CASE_IDS)]
        else:
            import random
            chosen_id = random.choice(CASE_IDS)

        self._case = get_case(chosen_id)
        self._current_step_index = 0

        self._state = State(
            episode_id=eid,
            step_count=0,
            done=False,
            selected_case_id=chosen_id,
            reward_breakdown={
                "analysis_step": 0.0,
                "retrieval_step": 0.0,
                "correct_decision": 0.0,
                "reviewer_agreement": 0.0,
                "unsafe_penalty": 0.0,
            },
            final_action=None,
            reviewer_note=None,
            action_history=[],
            retrieved_policy_chunks=[],
        )

        content = Content(**self._case["content"])
        return Observation(
            content=content,
            policy=[],
            step_type=StepType.analyze,
            step_count=0,
            message=f"Episode started. Case: {chosen_id}. Begin with analysis.",
            reward=0.0,
            done=False,
        )

    def step(self, action: Action, **kwargs) -> Observation:
        if self._case is None:
            raise RuntimeError("Call reset() before step()")

        if self._state.done:
            return Observation(
                step_type=StepType.finalize,
                step_count=self._state.step_count,
                message="Episode already finished.",
                reward=0.0,
                done=True,
            )

        step_name = EPISODE_FLOW[self._current_step_index]
        reward = compute_step_reward(step_name, action.action_type.value, self._case)

        # Record reward into breakdown
        breakdown = self._state.reward_breakdown
        if step_name == "analyze":
            breakdown["analysis_step"] += reward
        elif step_name == "retrieve_policy":
            breakdown["retrieval_step"] += reward
        elif step_name == "decide":
            if reward > 0:
                breakdown["correct_decision"] += reward
            else:
                breakdown["unsafe_penalty"] += reward
        elif step_name == "review":
            breakdown["reviewer_agreement"] += reward

        # Record action history
        self._state.action_history.append({
            "step": step_name,
            "action_type": action.action_type.value,
            "reason": action.reason,
            "reward": reward,
        })

        self._state.step_count += 1
        self._current_step_index += 1

        # Build observation for next step
        policy_chunks: list[PolicyChunk] = []
        message = ""
        next_step_type = StepType.finalize

        if step_name == "retrieve_policy":
            # Actually retrieve now that we're done with retrieve_policy
            raw_chunks = retrieve_policy_chunks(self._case["content"].get("text", ""), top_k=3)
            policy_chunks = [PolicyChunk(**c) for c in raw_chunks]
            self._state.retrieved_policy_chunks = policy_chunks
            message = "Policy retrieved. Now make your moderation decision."
        elif step_name == "analyze":
            message = "Analysis complete. Retrieve relevant policy next."
        elif step_name == "decide":
            self._state.final_action = action.action_type.value
            message = "Decision recorded. Awaiting reviewer validation."
        elif step_name == "review":
            self._state.reviewer_note = action.reason or "Reviewer note recorded."
            message = "Review complete. Finalizing episode."
        elif step_name == "finalize":
            message = "Episode finalized."

        done = self._current_step_index >= len(EPISODE_FLOW)
        self._state.done = done

        # Determine next step type for observation
        if not done and self._current_step_index < len(EPISODE_FLOW):
            next_step_type = StepType(EPISODE_FLOW[self._current_step_index])

        return Observation(
            content=Content(**self._case["content"]),
            policy=policy_chunks or self._state.retrieved_policy_chunks,
            step_type=next_step_type,
            step_count=self._state.step_count,
            message=message,
            reward=reward,
            done=done,
        )

    @property
    def state(self) -> State:
        return self._state
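
The environment can also be driven directly in-process, without the FastAPI layer. The sketch below relies on server/logic.py and the RAG retriever being importable (only the latter appears in this diff); the fixed flag action is just a placeholder policy.

# Driving ModerationEnvironment directly. Case selection and step rewards come
# from server/logic.py, which is not part of this diff.
from server.env import ModerationEnvironment
from server.models import Action, ActionType

env = ModerationEnvironment()
obs = env.reset(seed=0)
print(obs.step_type, obs.message)

# Five stages: analyze -> retrieve_policy -> decide -> review -> finalize.
while not obs.done:
    obs = env.step(Action(action_type=ActionType.flag, reason="possible policy violation"))
    print(obs.step_type, obs.reward, obs.done)

print(env.state.reward_breakdown, env.state.final_action)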
server/models.py CHANGED
@@ -1,68 +1,68 @@
(removed and re-added lines are identical; the file content is shown once)

from __future__ import annotations

from enum import Enum
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field


class ActionType(str, Enum):
    allow = "allow"
    flag = "flag"
    remove = "remove"
    escalate = "escalate"


class StepType(str, Enum):
    analyze = "analyze"
    retrieve_policy = "retrieve_policy"
    decide = "decide"
    review = "review"
    finalize = "finalize"


class Content(BaseModel):
    text: str = ""
    image_url: Optional[str] = None
    image_description: Optional[str] = None


class PolicyChunk(BaseModel):
    policy_id: str = ""
    text: str = ""
    score: float = 0.0


class Action(BaseModel):
    action_type: ActionType
    reason: str = ""


class Observation(BaseModel):
    content: Optional[Content] = None
    policy: List[PolicyChunk] = Field(default_factory=list)
    step_type: StepType = StepType.analyze
    step_count: int = 0
    message: str = ""
    reward: float = 0.0
    done: bool = False


class State(BaseModel):
    episode_id: str = ""
    step_count: int = 0
    done: bool = False
    selected_case_id: Optional[str] = None
    reward_breakdown: Dict[str, float] = Field(
        default_factory=lambda: {
            "analysis_step": 0.0,
            "retrieval_step": 0.0,
            "correct_decision": 0.0,
            "reviewer_agreement": 0.0,
            "unsafe_penalty": 0.0,
        }
    )
    final_action: Optional[str] = None
    reviewer_note: Optional[str] = None
    action_history: List[Dict[str, Any]] = Field(default_factory=list)
    retrieved_policy_chunks: List[PolicyChunk] = Field(default_factory=list)
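
A short round-trip sketch for these models, using the pydantic v2 API (model_dump / model_validate) that app.py and client.py already rely on; the sample values are illustrative only.

# Illustrative round trip through the pydantic models above.
from server.models import Action, ActionType, Observation

action = Action(action_type=ActionType.escalate, reason="unverifiable medical claim")
print(action.model_dump(mode="json"))  # {'action_type': 'escalate', 'reason': ...}

obs = Observation.model_validate({
    "policy": [{"policy_id": "misinfo-01", "text": "Potential misinformation ...", "score": 0.4}],
    "step_type": "decide",
    "message": "Policy retrieved. Now make your moderation decision.",
})
print(obs.policy[0].policy_id, obs.step_type, obs.done)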
server/rag/__init__.py CHANGED
@@ -1,3 +1,3 @@
(removed and re-added lines are identical; the file content is shown once)

from .retriever import retrieve_policy_chunks

__all__ = ["retrieve_policy_chunks"]
server/rag/retriever.py CHANGED
@@ -1,98 +1,98 @@
(removed and re-added lines are identical; the file content is shown once)

from __future__ import annotations

import json
import os
from typing import Any, Dict, List


_POLICIES_PATH = os.path.join(os.path.dirname(__file__), "policies.json")

_policies_cache: List[Dict[str, Any]] = []


def _load_policies() -> List[Dict[str, Any]]:
    global _policies_cache
    if _policies_cache:
        return _policies_cache
    if os.path.exists(_POLICIES_PATH):
        with open(_POLICIES_PATH, "r", encoding="utf-8") as f:
            _policies_cache = json.load(f)
    else:
        # Fallback built-in policies when policies.json is absent
        _policies_cache = [
            {
                "chunk_id": "violence-01",
                "title": "Violence and Credible Threats",
                "text": "Content that includes direct threats, instructions to kill, or praise for murder must be removed immediately. Escalate only when context is ambiguous or safety risk cannot be resolved with the available evidence."
            },
            {
                "chunk_id": "violence-02",
                "title": "Violence Context Review",
                "text": "Fictional, quoted, or documentary references to violence may remain allowed when the speaker is clearly not endorsing harm. When intent is uncertain, flag or escalate instead of allowing outright."
            },
            {
                "chunk_id": "nudity-01",
                "title": "Adult Nudity",
                "text": "Mentions or metadata indicating nudity should be flagged for human review when the material may be adult sexual content. Remove only if the content is explicit or paired with exploitative cues."
            },
            {
                "chunk_id": "nudity-02",
                "title": "Suggestive Imagery Metadata",
                "text": "Image tags such as exposed body parts, provocative posing, or synthetic nude prompts increase risk and should bias the system toward flagging rather than allowing."
            },
            {
                "chunk_id": "self-harm-01",
                "title": "Self-Harm and Crisis",
                "text": "Content asking for methods of self-harm or celebrating imminent self-harm should be escalated or removed depending on immediacy. Support-seeking or recovery discussion should generally remain allowed."
            },
            {
                "chunk_id": "harassment-01",
                "title": "Harassment and Abuse",
                "text": "Targeted abuse, slurs, and repeated demeaning language should be flagged or removed based on severity. Benign disagreement or criticism alone is not enough for removal."
            },
            {
                "chunk_id": "spam-01",
                "title": "Spam and Low-Quality Promotion",
                "text": "Commercial spam, scam bait, or repetitive promotional text should usually be flagged. Remove only when the content is clearly malicious or part of coordinated abuse."
            },
            {
                "chunk_id": "misinfo-01",
                "title": "Misinformation Handling",
                "text": "Potential misinformation without direct physical harm usually requires flagging or escalation, not automatic removal. High-risk medical or civic deception should lean toward escalation."
            },
            {
                "chunk_id": "graphic-01",
                "title": "Graphic Media",
                "text": "Graphic injury descriptions or metadata indicating gore should be removed or escalated when public safety and age exposure concerns are high."
            },
            {
                "chunk_id": "default-01",
                "title": "Default Safe Handling",
                "text": "When evidence is weak and no policy trigger is present, prefer allowing the content. Use escalation only when confidence is low or policy signals conflict."
            }
        ]
    return _policies_cache


def _keyword_score(text: str, policy_text: str) -> float:
    """Simple overlap score: fraction of content words found in policy text."""
    content_words = set(text.lower().split())
    policy_words = set(policy_text.lower().split())
    if not content_words:
        return 0.0
    return len(content_words & policy_words) / len(content_words)


def retrieve_policy_chunks(query_text: str, top_k: int = 3) -> List[Dict[str, Any]]:
    """Return top_k policy chunks most relevant to query_text."""
    policies = _load_policies()
    scored = [
        {
            "policy_id": p["chunk_id"],
            "text": p["text"],
            "score": _keyword_score(query_text, p["text"]),
        }
        for p in policies
    ]
    scored.sort(key=lambda x: x["score"], reverse=True)
    return scored[:top_k]
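
Because _load_policies() falls back to the built-in list when policies.json is absent, the retriever can be exercised standalone:

# Standalone call to the keyword-overlap retriever defined above.
from server.rag.retriever import retrieve_policy_chunks

chunks = retrieve_policy_chunks(
    "post contains a direct threat and instructions to kill", top_k=3
)
for c in chunks:
    print(round(c["score"], 3), c["policy_id"], c["text"][:60])

Note that the score is plain word overlap, so short queries built from common words can produce ties; swapping in an embedding-based retriever would be a natural extension.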
server/server_routes.py CHANGED
@@ -13,7 +13,7 @@ def register_routes(app, env) -> None:
     async def episode_summary() -> JSONResponse:
         state = env.state
         breakdown = state.reward_breakdown or {}
-        total_reward = round(sum(breakdown.values()), 4)
+        total_reward = max(0.01, min(0.99, float(sum(breakdown.values()))))
         return JSONResponse({
             "episode_id": state.episode_id,
             "step_count": state.step_count,