File size: 1,090 Bytes
761f203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc990fa
761f203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# Manifest header: identifies the environment and how its server is exposed.
# Field semantics are defined by the consuming tool; confirm against its schema.
spec_version: 1
name: 'flaky_sleuth'
type: 'space'
runtime: 'fastapi'
# NOTE(review): looks like a "module.path:attribute" app import string —
# confirm with the runtime that loads it.
app: 'server.app:app'
port: 8000

# Version of this environment definition; quoted so every parser reads it as a
# string rather than attempting numeric inference.
version: '0.1.0'
# Folded block scalar: the line breaks below collapse to single spaces and one
# trailing newline is kept, so re-wrapping the text does not change the value.
description: >
  An RL environment where an LLM agent investigates flaky tests in Python
  repositories. The agent uses tool-like actions to read files, search code,
  and run tests, then submits a terminal verdict for classification,
  root-cause detection, or fix proposal.

# Environment contract: action/observation type names, reward bounds, the
# per-episode step cap, and the baseline script file.
action_type: 'FlakySleuthAction'
observation_type: 'FlakySleuthObservation'
# NOTE(review): the parenthesised form is a YAML *string*, not a sequence of
# two floats. It is quoted here only to make that explicit; confirm that the
# consumer expects this textual form before converting it to [0.001, 0.999].
reward_range: '(0.001, 0.999)'
episode_max_steps: 20
baseline_script: 'inference.py'

# Task catalogue: three entries, each with an id, a display name, a difficulty
# label (easy / medium / hard), and a one-line description.
tasks:
  # Task 1: binary flaky-vs-stable decision.
  - id: 'task1_classify'
    name: 'Flaky vs Stable Classification'
    difficulty: 'easy'
    description: 'Classify the target test as flaky or stable.'
  # Task 2: pick the root-cause category label.
  - id: 'task2_root_cause'
    name: 'Root Cause Category Identification'
    difficulty: 'medium'
    description: 'Predict flaky-test root-cause category (OD, NOD, TD, TZD, NIO, ID, etc.).'
  # Task 3: produce a patch in unified-diff form.
  - id: 'task3_fix_proposal'
    name: 'Fix Proposal'
    difficulty: 'hard'
    description: 'Propose a concrete fix as unified diff for a known flaky test.'

# Resource and time budget declared for this environment.
# NOTE(review): units are implied only by the key names (GB, minutes) —
# confirm against the consumer's schema.
infra:
  vcpu: 2
  memory_gb: 8
  max_inference_minutes: 20