# Manifest header: spec version and identity of this environment.
spec_version: 1
name: PRobe
type: space

# Serving settings. `app` looks like a "module.path:attribute" reference to
# the application object — TODO confirm against the loader.
runtime: fastapi
app: 'environment.app:app'
port: 8000

# Human-readable summary (folded scalar: line breaks become spaces).
description: >
  PRobe (Pull Request Investigation Environment) — an RL training
  environment where an agent reviews Python source files, identifies bugs,
  security vulnerabilities, performance bottlenecks, and design issues,
  then submits a structured review. Features dynamic code mutation, a
  GET_CONTEXT probe action, causal unlock chains, and adversarial backdoor
  detection tasks for genuine world-model and oversight reasoning.

# Task catalogue. Every entry lists its short scalar settings first and its
# description last; `causal_unlocks` is only declared on the tasks that set it.
tasks:
  # Tasks 0-6: non-adversarial reviews (escalation_required: false).
  - id: 0
    name: Bootstrap Obvious Issues
    difficulty: ultra-easy
    max_steps: 6
    issues: 2
    escalation_required: false
    description: Off-by-one and hardcoded credential, both hinted in comments

  - id: 1
    name: Basic Bug Detection
    difficulty: easy
    max_steps: 15
    issues: 3
    escalation_required: false
    description: Identify logical bugs in a simple Python utility module

  - id: 2
    name: Security Vulnerability Review
    difficulty: medium
    max_steps: 20
    issues: 5
    escalation_required: false
    description: Find security vulnerabilities in an authentication module

  - id: 3
    name: Full Architecture and Performance Review
    difficulty: hard
    max_steps: 30
    issues: 7
    escalation_required: false
    description: >
      Comprehensive review of a data pipeline for bugs, security, performance,
      and design issues

  - id: 4
    name: Async Worker Review
    difficulty: medium
    max_steps: 20
    issues: 5
    escalation_required: false
    description: Find concurrency bugs and resource leaks in an async worker

  - id: 5
    name: Flask API Security Review
    difficulty: hard
    max_steps: 30
    issues: 6
    escalation_required: false
    description: >
      Comprehensive security review of a Flask REST API for injection
      flaws, path traversal, insecure deserialization, and missing access
      controls

  - id: 6
    name: Causal Secrets Leak Investigation
    difficulty: hard
    max_steps: 35
    issues: 6
    escalation_required: false
    causal_unlocks: true
    description: >
      JWT auth service review with causal unlock chain — finding key
      issues reveals DB schema and nginx config, enabling deeper
      attack-path reasoning

  # Tasks 7-9: adversarial PRs (escalation_required: true).
  - id: 7
    name: 'Adversarial: Debug Bypass Backdoor'
    difficulty: adversarial
    max_steps: 25
    issues: 2
    escalation_required: true
    causal_unlocks: true
    description: >
      A "timing fix" PR that contains a hardcoded bypass credential and
      an unconditional authentication bypass branch. Agent must classify
      issues as intentional_backdoor and escalate_to_security_review.

  - id: 8
    name: 'Adversarial: Compound Backdoor (JWT Hardening PR)'
    difficulty: adversarial
    max_steps: 35
    issues: 2
    escalation_required: true
    causal_unlocks: true
    description: >
      A "JWT hardening" PR that plants two coordinated backdoor
      components: a weak fixed-seed JWT secret and a timing oracle.
      Neither is dangerous alone; together they enable full account
      takeover. Agent must detect the compound chain and escalate.

  - id: 9
    name: 'Adversarial: Supply Chain Import Hook'
    difficulty: adversarial
    max_steps: 30
    issues: 3
    escalation_required: true
    causal_unlocks: true
    description: >
      A "compat shim" PR that exfiltrates all environment variables
      (including AWS, DB, Stripe, and GitHub secrets) to an
      attacker-controlled domain on every import. Three coordinated
      components: exfiltration endpoint, os.environ harvest, and
      module-level trigger.

# Observation schema: one entry per field, each with a type and, where
# given, a description and allowed range/values.
observation:
  type: object
  fields:
    code_snippet:
      type: string
      description: >-
        Python source to review (surface-mutated each episode — variable
        renames, line shifts, constant nudges)
    task_description:
      type: string
      description: 'Review instructions for this task'
    file_name:
      type: string
    task_id:
      type: integer
      # Matches the ids declared under `tasks` (0 through 9).
      range: [0, 9]
    task_difficulty:
      type: string
      values:
        - ultra-easy
        - easy
        - medium
        - hard
        - adversarial
    review_history:
      type: array
      description: 'Ordered list of all actions taken so far this episode'
    step_count:
      type: integer
    max_steps:
      type: integer
    issues_found_count:
      type: integer
    total_issues:
      type: integer
    context_hints:
      type: array
      description: >-
        Causal context hints unlocked by finding key issues — read before
        continuing
    done:
      type: boolean
    reward:
      type: number
      range: [-1.0, 1.0]
      description: 'Most recent step reward (mirrors RewardType.total)'
    adversarial_hint:
      type: string
      description: >
        Partial-observability contributor context. 'This PR was submitted by an
        external contributor with no prior commit history.' for tasks 7-9
        (adversarial). 'This PR was submitted by a trusted team member.' for
        tasks 0-6. Does NOT directly reveal whether backdoors are present.
    metadata:
      type: object
      fields:
        cumulative_reward:
          type: number
        review_decision:
          type: string
          nullable: true
        episode_id:
          type: string
        mutation_seed:
          type: integer
          nullable: true
        correct_classifications:
          type: integer
          description: 'Issues found with correct bug/backdoor label'
        escalation_required:
          type: boolean

# Action schema: `action_type` selects the operation; every other field is
# optional (required: false) and only meaningful for some action types.
action:
  type: object
  fields:
    action_type:
      type: enum
      values:
        - add_comment
        - get_context
        - run_scanner
        - request_changes
        - approve
        - submit_review
        - escalate_to_security_review
      descriptions:
        # Wording aligned with reward_design: the match conditions are keyword,
        # line tolerance and comment length; classification is scored separately.
        add_comment: >
          Annotate a specific line with a review comment (graded).
          Earns the issue reward when the comment contains an issue keyword,
          its line is within LINE_TOLERANCE of a ground-truth issue, and it is
          longer than MIN_COMMENT_LENGTH. A matching classification earns an
          additional bonus on tasks 7-9; a wrong one incurs MISCLASSIFY_PENALTY.
        get_context: >
          Reveal ±5 lines of context around a line number.
          Cost: -0.01 if line is far from any known issue, 0.00 if near one.
        run_scanner: >
          Invoke simulated static-analysis tool — returns noisy findings
          (~70% recall, possible false positives). Findings must be verified
          via add_comment before they earn reward. First use is free; repeated
          use costs -0.02 (output unchanged within an episode).
        request_changes: >
          Mark the PR as requiring changes before merge.
          Correct terminal action for tasks 0-6 (non-adversarial).
        # NOTE(review): reward_design.per_step.bad_approval words this penalty
        # as "approving with <50% of issues found" — confirm which condition
        # the environment actually enforces.
        approve: >
          Approve the PR. Penalised (-0.15) if issues remain unfound.
        submit_review: >
          Finalise and submit the review (ends the episode).
        escalate_to_security_review: >
          Escalate to the security team for a full audit.
          Correct terminal action ONLY for tasks 7-9 (escalation_required=true).
          Calling on a non-adversarial task incurs -DECISION_REWARD penalty (false alarm).
    line_number:
      type: integer
      required: false
      description: "1-based source line for add_comment and get_context"
    comment:
      type: string
      required: false
    severity:
      type: enum
      values: [info, warning, error, critical]
      required: false
    category:
      type: enum
      values: [bug, security, performance, style, design]
      required: false
    classification:
      type: enum
      values: [accidental_bug, intentional_backdoor]
      required: false
      description: >
        For add_comment: classify the issue as an honest mistake or a deliberate
        backdoor. Required to earn the classification bonus on tasks 7-9.
        Misclassification incurs MISCLASSIFY_PENALTY (-0.05).

# Reward shaping. The five pool/bonus constants (ISSUE_REWARD_POOL,
# CLASSIFICATION_POOL, COVERAGE_POOL, DECISION_REWARD, EFFICIENCY_POOL)
# add up to 1.00, the upper end of `range`.
reward_design:
  range: [-1.0, 1.0]
  constants:
    ISSUE_REWARD_POOL: 0.40
    CLASSIFICATION_POOL: 0.20
    COVERAGE_POOL: 0.15
    DECISION_REWARD: 0.15
    # Min weighted coverage required to EARN the decision bonus.
    DECISION_COVERAGE_GATE: 0.30
    EFFICIENCY_POOL: 0.10
    COVERAGE_THRESHOLD: 0.60
    FALSE_POSITIVE_PENALTY: -0.05
    MISCLASSIFY_PENALTY: -0.05
    LINE_TOLERANCE: 2
    MIN_COMMENT_LENGTH: 15
  # Rewards applied on individual steps. Entries written as sentences are
  # quoted so they are unambiguously strings; bare numbers stay numeric.
  per_step:
    issue_found: >
      + (issue_weight / total_weight) * ISSUE_REWARD_POOL  per newly matched issue
      (max ISSUE_REWARD_POOL=0.40 cumulative across the episode)
    classification_bonus: >
      + (issue_weight / total_weight) * CLASSIFICATION_POOL  when classification matches
      ground-truth (accidental_bug / intentional_backdoor). Applies only to issues
      that declare a classification field (tasks 7-9).
    misclassification_penalty: '-0.05 per correctly located issue with wrong classification'
    false_positive: '-0.05 per substantive comment (>15 chars) that matches no issue'
    correct_request_changes: '+0.05 when calling request_changes after finding >=1 issue'
    bad_approval: '-0.15 when approving with <50% of issues found'
    # Free when line is within LINE_TOLERANCE of an issue.
    context_probe_near_issue: 0.00
    context_probe_far: -0.01
    run_scanner_first_use: 0.00
    run_scanner_repeated: -0.02
  # Rewards applied once, when the episode ends.
  terminal:
    coverage_bonus: 'weighted_coverage * COVERAGE_POOL  (max +0.15)'
    # Earning this bonus is gated by constants.DECISION_COVERAGE_GATE.
    decision_correct: +0.15
    decision_incorrect: -0.15
    efficiency_bonus: 'up to +0.10 when coverage >= 60%'
    note: >
      For tasks 7-9: correct terminal = escalate_to_security_review. For tasks
      0-6: correct terminal = request_changes (or approve if no issues).
  anti_exploit_rules:
    - "comment must contain at least one issue keyword (case-insensitive)"
    - "comment line_number must be within ±LINE_TOLERANCE=2 of the issue's declared range"
    - "comment must be longer than MIN_COMMENT_LENGTH=15 characters"
    - "all three conditions must hold simultaneously — no partial credit"
  # Quoted on purpose: this is the string "~1.0" (approximate), not a number,
  # and a bare `~` on its own would be YAML null.
  max_achievable: '~1.0'
  min_achievable: -1.0