---
# Descriptive metadata for this environment manifest.
name: 'PolicyEvolverEnv'
description: 'Policy Design and Evolution Sandbox — agents refine their strategy to evolve real-world governance frameworks through meta-reasoning'
# Kept quoted so the version is always loaded as a string.
version: '1.0.0'
author: 'PolicyEvolution Team'
# Topic labels attached to the environment.
tags:
  - 'policy'
  - 'governance'
  - 'meta-reasoning'
  - 'content-moderation'
  - 'AI-safety'

# Environment definition: implementing class, configuration variables,
# and the observation / action / reward contract.
environment:
  # NOTE(review): presumably a dotted module path and class name that the
  # host runtime resolves — confirm against the loader.
  module: 'server.environment'
  class: 'PolicyEvolverEnvironment'
  # Declared variables. HF_TOKEN is the only one marked required and has no
  # default; the other two fall back to the defaults listed here.
  variables:
    HF_TOKEN:
      description: 'API key for LLM inference provider (Groq recommended)'
      required: true
    API_BASE_URL:
      description: 'OpenAI-compatible endpoint. Default: Groq'
      default: 'https://api.groq.com/openai/v1'
    MODEL_NAME:
      description: 'Model identifier for the inference provider'
      default: 'llama-3.1-8b-instant'

  # Observations are declared only as a generic object; no field list here.
  observation_schema:
    type: 'object'
    description: 'Policy context, data corpus, and system state'

  # Actions are told apart by the action_type field; each variant names the
  # schema that applies to that action_type value.
  action_schema:
    type: 'object'
    description: 'Discriminated union on action_type field'
    discriminator: 'action_type'
    variants:
      - action_type: 'propose_clarification'
        schema: 'ProposeClarificationAction'
      - action_type: 'propose_new_rule'
        schema: 'ProposeNewRuleAction'
      # NOTE(review): the schema is named "EvolveProcessAction" although the
      # action_type is "evolve_policy"; the two variants above mirror their
      # action_type in the schema name. Confirm this is the intended name.
      # NOTE(review): "queue_overload" is listed as an alias of
      # revenue_velocity in the description below — verify that mapping.
      - action_type: 'evolve_policy'
        schema: 'EvolveProcessAction'
        description: 'Hard task metric keys: fraud_rate (aliases: fraud_detection, fraud), revenue_velocity (aliases: queue_overload, revenue), seller_trust (aliases: seller_confidence, trust).'

  # Lower and upper bound of the reward, in that order.
  reward_range: [0.0, 1.0]

# Execution limits and resource sizing for a run of the environment.
runtime:
  # Step budget.
  max_steps: 5
  # Time budget, in seconds (1200 s = 20 min).
  timeout_seconds: 1200
  # Resource sizing: virtual CPU count and memory in gigabytes.
  vcpu: 2
  memory_gb: 8

# Task catalogue, listed from easiest to hardest. The expected_min_score
# threshold drops as difficulty rises (0.70 -> 0.55 -> 0.40).
tasks:
  - id: 'task_easy'
    difficulty: 'easy'
    # Wording fix: "guidelines" is plural, so the article "a" was removed.
    description: 'Identify and clarify ambiguous policy terms in social media community guidelines'
    expected_min_score: 0.70

  - id: 'task_medium'
    difficulty: 'medium'
    description: 'Detect policy gaps in corporate HR policies and propose new rules for emerging scenarios'
    expected_min_score: 0.55

  - id: 'task_hard'
    difficulty: 'hard'
    description: 'Holistically evolve an e-commerce Trust & Safety framework with trade-off reasoning'
    expected_min_score: 0.40

# Grader binding: the module and function named here produce the score.
grading:
  module: 'server.grader'
  function: 'grade'
  # Same bounds as environment.reward_range.
  return_range: [0.0, 1.0]

# HTTP surface of the environment server, split into required and optional
# routes. Each entry pairs a path with its HTTP method.
endpoints:
  required:
    - path: '/reset'
      method: 'POST'
    - path: '/step'
      method: 'POST'
    - path: '/state'
      method: 'GET'
    - path: '/tasks'
      method: 'GET'
    - path: '/grader'
      method: 'POST'
    - path: '/baseline'
      method: 'GET'
  # Listed separately from the required routes above.
  optional:
    - path: '/health'
      method: 'GET'