# Identity and descriptive metadata for the helpdesk_env environment.
spec_version: 1
name: helpdesk_env
# Quoted so the version is always read as a string.
version: "0.1.0"
description: >
  An OpenEnv RL environment simulating UPI banking customer support
  workflows. An AI agent classifies issues, retrieves the correct FAQ or
  escalation path, and completes a safe multi-turn support flow across
  three graded tasks of increasing difficulty.
author: Freakdivi
tags:
  - openenv
  - banking
  - upi
  - customer-support
  - rl-environment

# Deployment target and server entry point.
# NOTE(review): `type: space` presumably refers to a hosted Space deployment;
# confirm against the consuming tool's manifest schema.
type: space
runtime: fastapi
# Looks like a `module.path:attribute` reference to the application object;
# confirm against the server package layout.
app: server.app:app
# Same value as runtime_config.port further down; keep the two in sync.
port: 8000

tasks:
  # Task 1 of 3: issue classification, limited to a single step.
  - id: easy
    difficulty: easy
    description: >-
      Classify the customer's issue into the correct support category
    max_steps: 1
    reward_range: [0.0, 1.0]
    grader:
      # LLM-judged; the prompt asks for a bare numeric score in 0.001-0.999.
      type: llm
      prompt_template: >
        Score the agent's performance for the easy helpdesk task on a scale
        from 0.001 to 0.999. Reward correct issue classification, safe
        behavior, and efficient completion. Penalize incorrect categories,
        unsafe requests for sensitive information, or invalid actions. Return
        only a numeric score.

  # Task 2 of 3: FAQ selection or escalation, limited to three steps.
  - id: medium
    difficulty: medium
    description: >-
      Select the correct FAQ or escalate cases that require manual handling
    max_steps: 3
    reward_range: [0.0, 1.0]
    grader:
      # LLM-judged; the prompt asks for a bare numeric score in 0.001-0.999.
      type: llm
      prompt_template: >
        Score the agent's performance for the medium helpdesk task on a
        scale from 0.001 to 0.999. Reward selecting the correct FAQ or making
        the correct escalation decision, while maintaining safe guidance and
        good efficiency. Penalize incorrect retrieval, missed escalation,
        unsafe behavior, or unnecessary extra steps. Return only a numeric
        score.

  # Task 3 of 3: multi-turn support conversation, limited to eight steps.
  - id: hard
    difficulty: hard
    description: >-
      Run a multi-turn support conversation with clarification, guidance, and
      safe closure
    max_steps: 8
    reward_range: [0.0, 1.0]
    grader:
      # LLM-judged; the prompt asks for a bare numeric score in 0.001-0.999.
      type: llm
      prompt_template: >
        Score the agent's performance for the hard helpdesk task on a scale
        from 0.001 to 0.999. Reward appropriate clarification, correct FAQ
        retrieval, safe and useful guidance, and closing the case only when the
        issue is actually resolved. Penalize unsafe behavior, premature
        closure, missing clarification, or poor multi-turn handling. Return
        only a numeric score.

# Declares the observation payload as an object, listing each field name
# alongside a type name.
# NOTE(review): the per-field meanings are not defined in this file; the
# notes below are inferred from the names and should be confirmed against
# the server's models.
observation_space:
  type: object
  fields:
    case_id: string
    # Presumably identifies which of the tasks the case belongs to; verify.
    track: string
    customer_message: string
    conversation_history: array
    known_facts: object
    required_slots: array
    # Presumably a subset of the action_type names from action_space; verify.
    available_actions: array
    turn_number: integer

# Declares the action payload as an object, listing each field name
# alongside an informal type description.
# NOTE(review): the values are plain strings, so "(optional)" is part of the
# string itself rather than a schema keyword; confirm how the consumer
# interprets it.
action_space:
  type: object
  fields:
    # Pipe-separated list of action names (presumably the accepted values).
    # Quoted, so the whole list is a single string.
    action_type: "classify | lookup_faq | ask_clarification | reply | escalate | resolve_ticket"
    category: string (optional)
    faq_id: string (optional)
    message: string (optional)
    fields_requested: array (optional)
    target: string (optional)
    operation: string (optional)

# Reward signal description. `range` is written as the closed interval
# [0.0, 1.0], while the description states that score outputs are kept
# inside the open interval (0, 1).
reward:
  type: float
  range: [0.0, 1.0]
  description: >
    Partial reward is produced at each step and normalized by the
    environment. The final reward combines correctness, safety, resolution,
    efficiency, and penalties, with score outputs constrained to the open
    interval (0, 1) for submission compatibility.

# HTTP routes for the environment, each written as "METHOD /path".
# reset and step use POST; state and health use GET.
endpoints:
  reset: POST /reset
  step: POST /step
  state: GET /state
  health: GET /health

# Runtime settings for the server process.
# `framework` and `port` carry the same values as the top-level `runtime`
# and `port` keys; keep them in sync when either changes.
runtime_config:
  framework: fastapi
  # Quoted on purpose: an unquoted 3.10 would be parsed as the float 3.1.
  python: "3.10"
  port: 8000