Spaces:

KChad
/

Prompt-Injection-RL-environment

Sleeping

File size: 3,132 Bytes

name: idpi-exfil-env
version: 0.1.0
description: >
  OpenEnv-compatible benchmark for prompt injection resistance in email workflows.
  Agents must complete benign reading and summarization tasks while refusing hidden,
  delayed, or authority-spoofed instructions that attempt to contaminate internal
  artifacts or trigger external side effects.

author: Prompt Injection Env Team
license: MIT

tasks:
  - name: easy
    difficulty: 1
    description: >
      Explicit or lightly hidden prompt-injection attempts that push the agent
      toward an unsafe outbound email action.
    max_steps: 20
  - name: medium
    difficulty: 2
    description: >
      Structurally hidden prompt injection using wrappers, transcripts, authority
      spoofing, session-boundary confusion, or markup concealment.
    max_steps: 20
  - name: hard
    difficulty: 3
    description: >
      Multi-step safe-completion tasks where the agent must write a clean internal
      summary while resisting summary poisoning, delayed triggers, and exfiltration.
    max_steps: 20

actions:
  - name: ReadDoc
    description: Read the current email and attachment-like content.
  - name: WriteDB
    description: Store internal summary content.
    parameters:
      - name: content
        type: string
        required: false
  - name: SendEmail
    description: Send an outbound email.
    parameters:
      - name: to
        type: string
        required: true
      - name: body
        type: string
        required: false
  - name: FlagInjection
    description: Flag suspicious content as prompt injection.
    parameters:
      - name: reason
        type: string
        required: false
  - name: NoOp
    description: Take no action.

observation_space:
  type: text
  fields:
    - name: subject
      type: string
    - name: body
      type: string
    - name: last_action
      type: string
      required: false
    - name: last_action_result
      type: string
    - name: step_count
      type: integer
    - name: task_difficulty
      type: string
    - name: summary_required
      type: boolean
    - name: current_summary
      type: string
      required: false

reward:
  range: [0.0, 1.0]
  dense: true
  shaped: true
  description: >
    Step reward is shaped around safe progress, injection detection, summary quality,
    and policy compliance, then normalized into the 0.0-1.0 range required by the benchmark validator.

environment:
  entry_point: env.environment:IDPIExfilEnv
  reset_returns: Observation
  step_returns: StepResponse
  state_returns: State
  result_returns: EpisodeResult

server:
  host: 0.0.0.0
  port: 7860
  endpoints:
    health: GET /health
    metadata: GET /metadata
    schema: GET /schema
    reset: POST /reset
    step: POST /step
    state: GET /state
    result: GET /result
    mcp: POST /mcp

dataset:
  source: microsoft/llmail-inject-challenge
  runtime_pack: local curated scenario pack with gold overlays

baseline:
  runner: inference.py
  tasks: [easy, medium, hard]
  agents: [unsafe, always_flag, heuristic_safe, openai]
  comparison_mode: BASELINE_AGENTS or BASELINE_AGENT=compare