Spaces:

luciferai-devil
/

devil-policyevolverenv

Sleeping

File size: 8,416 Bytes

8cd3fa7
8520614
8cd3fa7
 
 
 
 
6b7794e
 
 
a541c0b
8cd3fa7
 
 
a541c0b
 
8cd3fa7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8520614
6aa8acb
8cd3fa7
 
 
 
 
 
 
 
 
 
 
 
6aa8acb
8cd3fa7
 
 
 
 
 
 
 
8520614
8cd3fa7
 
 
6aa8acb
 
 
 
 
 
 
 
 
 
 
 
8cd3fa7
 
 
 
6aa8acb
 
 
8cd3fa7
 
 
 
 
 
8520614
 
 
 
6aa8acb
8520614
8cd3fa7
 
 
 
 
 
 
 
 
 
 
a9f749a
 
 
 
8cd3fa7
 
 
 
 
 
8520614
 
 
8cd3fa7
 
6aa8acb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95a7dc0
6aa8acb
8cd3fa7
 
8520614
d78cfdc
8cd3fa7
 
 
 
70f8688
 
6aa8acb
70f8688
 
 
 
 
6aa8acb
 
 
70f8688
 
6aa8acb
 
 
 
 
70f8688
 
 
 
 
 
 
 
 
 
 
 
 
 
6aa8acb
8cd3fa7
 
 
 
 
 
 
 
 
6aa8acb
 
 
8cd3fa7
 
 
 
 
 
 
 
6aa8acb
d78cfdc
6aa8acb
8cd3fa7
4553b37
 
 
 
 
8cd3fa7

# server/environment.py
# HF Force Rebuild: 2026-03-30T17:16:00Z
from __future__ import annotations
import uuid
import random
from typing import Optional, Any, Dict
from openenv.core.env_server import Environment
import logging

logger = logging.getLogger(__name__)
from models import (
    Action, Observation, State,
    ProposeClarificationAction, ProposeNewRuleAction, EvolveProcessAction,
)
from server.grader import grade
from server.tasks import TASK_REGISTRY


class PolicyEvolverEnvironment(Environment[Action, Observation, State]):
    """
    Real-world environment: AI agent learns to evolve governance policies
    through meta-reasoning over real-world data.

    The class behaves as a process-wide singleton: ``__new__`` always returns
    the same instance and ``__init__`` only initialises it the first time, so
    ``_persistent_best_score`` is carried across episodes.
    """
    _instance = None

    # Named tunables (previously inline literals in reset()/step()).
    _MAX_STEPS = 5                # steps allowed per episode
    _SHOWN_CORPUS_LIMIT = 10      # corpus items exposed per observation
    _REPETITION_PENALTY = 0.30    # subtracted when an action is repeated
    _ROTATION_STRIDE = 3          # window shift of non-relevant items per step
    _DONE_THRESHOLD = 0.90        # reward at which the episode ends early

    def __new__(cls, *args, **kwargs):
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            # Flag read by __init__ so repeated construction is a no-op.
            cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        """Initialise the shared instance once; later calls return early."""
        if getattr(self, "_initialized", False):
            return
        super().__init__()
        self._state = State()
        self._current_task = None
        self._episode_corpus = []
        self._persistent_best_score = 0.0
        self._seen_action_hashes = set()
        self._initialized = True

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> Observation:
        """Start a new episode and return its initial observation.

        Args:
            seed: When given and no ``task_id`` is supplied, makes the random
                task selection reproducible. It does not touch the global
                ``random`` state.
            episode_id: Identifier for the episode; a fresh UUID4 string is
                generated when omitted or empty.
            **kwargs: ``task_id`` selects a task from ``TASK_REGISTRY``;
                other keys are ignored.

        Returns:
            An ``Observation`` with ``reward=0.0``, ``done=False`` and at most
            ``_SHOWN_CORPUS_LIMIT`` corpus items.

        Raises:
            KeyError: If ``task_id`` is not present in ``TASK_REGISTRY``.
        """
        task_id = kwargs.get("task_id")
        if task_id is None:
            # Use a dedicated generator so a seed gives a deterministic pick.
            chooser = random.Random(seed) if seed is not None else random
            task_id = chooser.choice(list(TASK_REGISTRY))

        self._seen_action_hashes = set()
        task = TASK_REGISTRY[task_id]
        self._current_task = task
        self._state = State(
            episode_id=episode_id or str(uuid.uuid4()),
            task_id=task_id,
            step_count=0,
            max_steps=self._MAX_STEPS,
            current_score=0.0,
            best_score=self._persistent_best_score,
            actions_taken=[],
        )

        # Deep-copy so per-episode mutations (system_action updates in step())
        # never leak back into the task registry.
        import copy
        self._episode_corpus = copy.deepcopy(task.get("data_corpus", []))
        # Normalise every incident so it carries "content" and "system_action".
        for item in self._episode_corpus:
            if "content" not in item:
                # Fall back through the alternative text keys, then the flags.
                item["content"] = (
                    item.pop("text", None)
                    or item.pop("desc", None)
                    or str(item.get("flags", ""))
                )
            if "system_action" not in item:
                item["system_action"] = "pending"

        shown_corpus = self._episode_corpus[:self._SHOWN_CORPUS_LIMIT]

        return Observation(
            task_id=task_id,
            episode_id=self._state.episode_id,
            step_count=0,
            corpus_size=len(self._episode_corpus),
            corpus_shown=len(shown_corpus),
            data_corpus=shown_corpus,
            current_policies=task["current_policies"],
            policy_outcomes=task.get("policy_outcomes"),
            system_metrics=task.get("system_metrics", {}),
            identified_issues=task.get("identified_issues", []),
            reward=0.0,
            done=False,
            info={
                "task_description": task["description"], 
                "difficulty": task["difficulty"],
                "best_score": self._persistent_best_score,
                "steps_remaining": self._state.max_steps,
            },
        )

    def step(
        self,
        action: Action,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> Observation:
        """Grade one agent action and return the resulting observation.

        Args:
            action: Either a plain dict or a model object. A ``root``
                attribute is unwrapped first, then ``model_dump()`` is used
                when available.
            timeout_s: Accepted for interface compatibility; not used here.
            **kwargs: Ignored.

        Returns:
            An ``Observation`` carrying the clamped reward, the ``done`` flag
            and a re-prioritised slice of the episode corpus. If the step
            limit was already reached, the environment is reset on the same
            task and the reset observation is returned instead.

        Raises:
            RuntimeError: If called before ``reset()``.
        """
        if self._current_task is None:
            raise RuntimeError("Call reset() before step()")

        if self._state.step_count >= self._state.max_steps:
            logger.warning(
                "[EXPLOIT] Step-count limit exceeded for episode %s",
                self._state.episode_id,
            )
            # Force a fresh episode on the same task when the caller keeps
            # stepping past the limit.
            return self.reset(task_id=self._state.task_id)

        self._state.step_count += 1

        # action can be a dict from the API or a Pydantic model
        if isinstance(action, dict):
            action_dict = action
        else:
            # Handle Pydantic RootModel used for discriminated unions
            if hasattr(action, "root"):
                action = action.root
            action_dict = action.model_dump() if hasattr(action, "model_dump") else dict(action)

        # Repetition penalty: fingerprint the action via canonical JSON.
        import json as _json
        try:
            action_hash = hash(_json.dumps(action_dict, sort_keys=True, default=str))
        except Exception:
            # Best effort: fall back to the repr-style string for payloads
            # that cannot be serialised.
            action_hash = hash(str(action_dict))

        if action_hash in self._seen_action_hashes:
            repetition_penalty = self._REPETITION_PENALTY
        else:
            repetition_penalty = 0.0
            self._seen_action_hashes.add(action_hash)

        previous_score = self._state.current_score
        raw_reward = grade(action_dict, self._state.task_id, previous_score=previous_score)
        # Clamp so the reward always lies in [0.001, 0.999].
        reward = max(0.001, min(0.999, raw_reward - repetition_penalty))

        self._state.current_score = reward
        self._state.best_score = max(self._state.best_score, reward)
        self._persistent_best_score = max(self._persistent_best_score, reward)
        # NOTE(review): assumes State supplies a default rewards_history list;
        # reset() does not pass one explicitly.
        self._state.rewards_history.append(reward)

        action_type = action_dict.get("action_type", "unknown")
        self._state.actions_taken.append(action_type)

        # Reactive Corpus: prioritise items relevant to the agent's action
        # domain so the world visibly reacts to agent choices.
        target_term = action_dict.get("ambiguous_term") or action_dict.get("rule_domain") or ""
        t_term = str(target_term).lower()

        # Partition: relevant items first, then remaining
        relevant = []
        remaining = []
        for item in self._episode_corpus:
            c_type = str(item.get("type", "")).lower()
            c_text = str(item.get("content", "")).lower()

            # Update system_action based on reward (stateful corpus).
            # NOTE(review): when t_term is empty, `"" in c_text` is True, so
            # every item is updated regardless of action type. Kept as-is;
            # confirm whether that is intended.
            if t_term in c_text or t_term in c_type or action_type == "evolve_policy":
                if reward >= 0.7:
                    item["system_action"] = "policy_applied"
                elif 0.3 <= reward < 0.7:
                    item["system_action"] = "flagged"

            # Sort into buckets; an empty term never counts as relevant.
            if t_term and (t_term in c_text or t_term in c_type):
                relevant.append(item)
            else:
                remaining.append(item)

        # Rotate the remaining window by step count so the agent sees fresh
        # data each step. The offset wraps around the list length; without
        # the modulo, an offset >= len(remaining) would silently yield the
        # unrotated list.
        step_offset = (self._state.step_count - 1) * self._ROTATION_STRIDE
        if remaining:
            step_offset %= len(remaining)
        rotated_remaining = remaining[step_offset:] + remaining[:step_offset]

        # Build shown corpus: relevant items first, then rotated remaining.
        prioritized_corpus = relevant + rotated_remaining
        shown_corpus = prioritized_corpus[:self._SHOWN_CORPUS_LIMIT]

        done = (
            reward >= self._DONE_THRESHOLD or
            self._state.step_count >= self._state.max_steps
        )

        if reward >= 0.85:
            strategic_rating = "Senior Architect"
        elif reward >= 0.65:
            strategic_rating = "Staff Specialist"
        else:
            strategic_rating = "Junior Associate"

        return Observation(
            task_id=self._state.task_id,
            episode_id=self._state.episode_id,
            step_count=self._state.step_count,
            corpus_size=len(self._episode_corpus),
            corpus_shown=len(shown_corpus),
            data_corpus=shown_corpus,
            current_policies=self._current_task["current_policies"],
            policy_outcomes=self._current_task.get("policy_outcomes"),
            system_metrics=self._current_task.get("system_metrics", {}),
            identified_issues=self._current_task.get("identified_issues", []),
            reward=reward,
            done=done,
            info={
                "best_score": self._state.best_score,
                "last_reward": reward,
                # Copies, so this observation does not alias the mutable
                # history lists kept on the state object.
                "rewards_history": list(self._state.rewards_history),
                "action_history": list(self._state.actions_taken),
                "steps_remaining": self._state.max_steps - self._state.step_count,
                "staff_feedback": {
                    "strategic_rating": strategic_rating,
                    "focus": "Signal detected" if reward >= 0.5 else "Burying the lede or distracted by noise",
                    "recommendation": "Maintain high signal-to-noise ratio and lead with the fix." if reward < 0.8 else "Excellent prioritization.",
                },
            },
        )

    @property
    def state(self) -> State:
        """Current episode state object."""
        return self._state