# openenv.yaml — consumed by `openenv validate`
# Schema source: https://github.com/meta-pytorch/OpenEnv (v1.0).
# Deploy spec: docs/modules/deploy_env_space.md §4.3.
# Kept quoted so loaders read it as the string "1.0" rather than a float.
schema_version: '1.0'

env:
  # Identity and catalogue metadata for the environment.
  id: driftcall
  # Quoted so the version stays a string.
  version: '0.1.0'
  display_name: 'DriftCall — Indic Voice Concierge under Schema Drift'
  # Folded scalar: the wrapped lines below load as a single paragraph.
  description: >
    OpenEnv-compliant RL environment where a voice-first agent completes Indic
    consumer concierge tasks while vendor APIs undergo mid-episode schema,
    policy, T&C, pricing, and auth drift. Five independent reward components;
    deterministic seeded drift; Hindi/Tamil/Kannada/Hinglish briefs via
    Kokoro TTS + faster-whisper ASR.
  license: apache-2.0
  tags: [openenv, rl, voice, indic, schema-drift, grpo]

  # HTTP entrypoint: one base URL plus a path for each listed operation.
  entrypoint:
    type: http
    base_url: 'https://driftcall-driftcall-env.hf.space'
    # Paths are given separately from base_url.
    endpoints:
      reset: '/reset'
      step: '/step'
      state: '/state'
      close: '/close'
      health: '/healthz'
    auth:
      type: bearer
      # NOTE(review): presumably the name of the environment variable that
      # holds the bearer token — confirm against the deploy spec.
      secret_env: DRIFTCALL_ENV_TOKEN

  # Action model reference.
  # NOTE(review): looks like a `module.path:Attribute` locator — confirm.
  action_space:
    ref: 'cells.step_04_models:DriftCallAction'

  # Observation model reference.
  # NOTE(review): looks like a `module.path:Attribute` locator — confirm.
  observation_space:
    ref: 'cells.step_04_models:DriftCallObservation'

  episode:
    max_turns: 16
    # One spec per reset parameter; all four are declared optional
    # (required: false).
    reset_config:
      seed: {type: int, required: false}
      # NOTE(review): range bounds are presumably inclusive — confirm.
      curriculum_stage: {type: int, range: [1, 3], required: false}
      language_weights: {type: object, required: false}
      audio_boundary_enabled: {type: bool, required: false}

  reward:
    shape: scalar
    range: [-1.0, 1.0]
    # Reward code is in `cells/step_08_rewards.py`. At episode termination the
    # five independent components are computed and mixed into a quality score;
    # that score is calibrated (Brier penalty + uncertain floor) and then
    # clamped.
    # Entrypoint of the implementation:
    impl: 'cells.step_08_rewards:compute_rewards'
    pipeline:
      # weighted mix of R1..R5
      - 'cells.step_08_rewards:combine_quality'
      # confidence calibration
      - 'cells.step_08_rewards:brier_penalty'
      # 0.50 floor when uncertain
      - 'cells.step_08_rewards:apply_uncertain_floor'
      # final scalar in [-1, 1]
      - 'cells.step_08_rewards:final_reward'
    # Five weighted components; the listed weights sum to 1.00.
    components:
      # R1 — task completion
      - id: R1
        name: task_completion
        weight: 0.40
        impl: 'cells.step_08_rewards:task_completion'
        description: >
          Goal achieved (correct booking, payment success, vendor confirmation).
      # R2 — drift detection
      - id: R2
        name: drift_detection
        weight: 0.20
        impl: 'cells.step_08_rewards:drift_detection'
        description: >
          Agent detects mid-episode schema/policy/auth drift and adapts.
      # R3 — constraint adherence
      - id: R3
        name: constraint_adherence
        weight: 0.20
        impl: 'cells.step_08_rewards:constraint_adherence'
        description: >
          Honours user constraints (budget, time window, dietary, lang).
      # R4 — format compliance
      - id: R4
        name: format_compliance
        weight: 0.10
        impl: 'cells.step_08_rewards:format_compliance'
        description: >
          Tool args parse cleanly against the (possibly drifted) schema.
      # R5 — anti-hack penalty
      - id: R5
        name: anti_hack_penalty
        weight: 0.10
        impl: 'cells.step_08_rewards:anti_hack_penalty'
        description: >
          Penalty for known reward-hacking patterns flagged in probe set.
    # Path to the reward documentation.
    docs: 'docs/modules/rewards.md'