# Source: Hugging Face Space PrakashCider/teamforge (status: Sleeping).
# File size: 7,378 bytes.

# Package identity. The version is quoted so it is always read as a string.
name: teamforge
version: '1.1.0'
# Folded scalar: the wrapped lines below are joined with single spaces.
description: >
  A structured multi-phase benchmark for autonomous software engineering
  agents. The agent simulates a full software development team: planning,
  coding, testing, reviewing, and reflecting — inside a real isolated Git
  repository.

author: TeamForge
license: MIT

# ── OpenEnv Interface ──────────────────────────────────────────────────────────
# Dotted reference to the environment class (module `environment`,
# attribute `TeamForgeEnv`).
entry_point: environment.TeamForgeEnv

# The three methods exposed by the environment: reset, step, state.
methods:
  reset:
    description: "Start a new episode for the given task_id. Returns initial Observation."
    parameters:
      task_id:
        type: string
        # Must list every `id` declared under the top-level `tasks` key;
        # an id missing here cannot be selected through a validated reset.
        enum:
          - easy_bugfix_chunk_list
          - medium_refactor_stats
          - hard_lru_cache_performance
          - bonus_task
        description: "Which task to run this episode."
  step:
    description: "Execute one typed action. Returns updated Observation with reward."
    parameters:
      action:
        type: object
        description: "A typed Action model (see action_space below)."
  # `state` declares no parameters.
  state:
    description: "Return current environment state as a plain serialisable dict."

# ── Action Space ───────────────────────────────────────────────────────────────
action_space:
  type: union
  description: 'One of 8 structured actions. Discriminated by the `type` field.'
  # Each entry below describes one action variant. In every variant the
  # `type` field is a literal equal to the variant's `name`.
  actions:
    - name: plan_step
      fields:
        type:
          type: literal
          value: plan_step
        step_number:
          type: integer
          minimum: 1
        description:
          type: string
          minLength: 5
        estimated_effort:
          type: string
          enum: [low, medium, high]
        depends_on:
          type: array
          items: integer

    - name: edit_file
      fields:
        type:
          type: literal
          value: edit_file
        file_path:
          type: string
        content:
          type: string
        reason:
          type: string
          minLength: 5

    - name: run_tests
      fields:
        type:
          type: literal
          value: run_tests
        test_path:
          type: string
          nullable: true
        # Bounded to 5..120 with a default of 30.
        timeout_seconds:
          type: integer
          minimum: 5
          maximum: 120
          default: 30

    - name: run_lint
      fields:
        type:
          type: literal
          value: run_lint
        fix:
          type: boolean
          default: false
        file_path:
          type: string
          nullable: true

    - name: generate_review
      fields:
        type:
          type: literal
          value: generate_review
        focus_areas:
          type: array
          items: string
        review_text:
          type: string
          minLength: 20

    - name: commit
      fields:
        type:
          type: literal
          value: commit
        message:
          type: string
          minLength: 10
        files:
          type: array
          items: string

    - name: self_reflect
      fields:
        type:
          type: literal
          value: self_reflect
        what_went_well:
          type: string
          minLength: 10
        what_to_improve:
          type: string
          minLength: 10
        adjusted_plan:
          type: string
          nullable: true

    - name: request_iteration
      fields:
        type:
          type: literal
          value: request_iteration
        reason:
          type: string
          minLength: 10
        target_issues:
          type: array
          items: string

# ── Observation Space ──────────────────────────────────────────────────────────
# Schema of the Observation object handed back by reset() and step().
observation_space:
  type: object
  description: "Full typed Observation returned after every step() and reset()."
  fields:
    # Task identity and progress counters.
    task_id: {type: string}
    task_description: {type: string}
    # `bonus` is listed because the top-level `tasks` key declares a task with
    # `difficulty: bonus`; without it that task's observations fail this enum.
    difficulty: {type: string, enum: [easy, medium, hard, bonus]}
    step_number: {type: integer}
    max_steps: {type: integer}
    phase: {type: string, enum: [planning, coding, testing, reviewing, reflecting, done]}
    # Repository snapshot.
    repo_files: {type: array, description: "List of FileSnapshot objects (path, content, size_bytes)"}
    git_log: {type: array, items: string}
    # Outcome of the most recent action.
    # NOTE(review): last_action_type is nullable but last_action_status is
    # not — confirm what status the environment reports right after reset().
    last_action_type: {type: string, nullable: true}
    last_action_status: {type: string, enum: [success, failure, partial]}
    last_action_output: {type: string}
    test_results: {type: object, nullable: true, description: "TestResult: passed, failed, errors, output, duration_seconds"}
    lint_results: {type: object, nullable: true, description: "LintResult: violations, output, score"}
    # Artifacts accumulated over the episode.
    plan: {type: array, description: "List of PlanStep actions issued so far"}
    reviews: {type: array, description: "List of ReviewArtifact objects"}
    reflections: {type: array, description: "List of ReflectionArtifact objects"}
    # Reward bookkeeping and termination flag.
    reward: {type: number, description: "Reward for the last action"}
    cumulative_reward: {type: number}
    done: {type: boolean}
    info: {type: object}

# ── Reward ─────────────────────────────────────────────────────────────────────
reward:
  # NOTE(review): the range is written with inclusive end points, while the
  # description says rewards are strictly between 0 and 1 — confirm which
  # bound the grader actually enforces.
  range: [0.0, 1.0]
  type: dense
  description: >
    Dense shaped reward. Positive for: correct plan steps, edits,
    passing tests, clean lint, reviews, reflections, commits.
    Always strictly between 0 and 1.

# ── Tasks ──────────────────────────────────────────────────────────────────────
# Task catalogue. Every entry uses the same grader reference and score range;
# entries differ in id, difficulty, step budget and description.
# Descriptions use `>-` (folded, no trailing newline) to keep lines short.
tasks:
  - id: easy_bugfix_chunk_list
    difficulty: easy
    max_steps: 20
    description: >-
      Fix an off-by-one bug in utils/list_ops.py.
      All 7 tests must pass.
    grader: grader.grade_task
    score_range: [0.0, 1.0]

  - id: medium_refactor_stats
    difficulty: medium
    max_steps: 30
    description: >-
      Refactor monolithic stats.py into a stats/ package.
      15 tests must pass with full backward compatibility.
    grader: grader.grade_task
    score_range: [0.0, 1.0]

  - id: hard_lru_cache_performance
    difficulty: hard
    max_steps: 40
    description: >-
      Implement O(1) LRU cache from a stub.
      15 correctness tests + 1 performance test (10k ops < 200ms).
    grader: grader.grade_task
    score_range: [0.0, 1.0]

  - id: bonus_task
    difficulty: bonus
    max_steps: 10
    description: >-
      Bonus: Optimize the LRU cache for memory efficiency.
      Gradual memory reduction is rewarded.
    grader: grader.grade_task
    score_range: [0.0, 1.0]

# ── Infrastructure ─────────────────────────────────────────────────────────────
# Declared runtime requirements and the per-episode time limit.
runtime:
  # Quoted because a plain scalar may not begin with the `>` indicator.
  python: '>=3.11'
  memory_gb: 8
  vcpu: 2
  max_episode_minutes: 20

# Inference script and the environment variables listed for it.
inference:
  script: inference.py
  env_vars:
    API_BASE_URL: 'https://api.groq.com/openai/v1'
    # NOTE(review): confirm this model id is still offered by the endpoint above.
    MODEL_NAME: 'llama3-8b-8192'
    # Empty-string placeholder; presumably the real token is supplied at
    # deploy time rather than committed here — verify.
    HF_TOKEN: ''

# Deployment settings; both file values are written as relative paths.
deployment:
  dockerfile: 'Dockerfile'
  huggingface_spaces: true
  gradio_app: 'server/app.py'

# ── API Endpoints (for OpenEnv validator) ──────────────────────────────────────
# HTTP endpoints. Each `body` is a JSON document stored as a string; `|-`
# keeps the text literal and drops the trailing newline.
api:
  reset:
    method: POST
    path: /reset
    body: |-
      {"task_id": "easy_bugfix_chunk_list"}
  step:
    method: POST
    path: /step
    body: |-
      {"action": {"type": "run_tests"}}
  # The two GET endpoints declare no body.
  state:
    method: GET
    path: /state
  health:
    method: GET
    path: /health