---
# Manifest for the data-cleaning reinforcement learning environment.
# Declares the environment's identity, its tasks, the HTTP endpoints it
# exposes, the observation/action/reward descriptions, and the resources
# it asks for.

# --- Identity ---------------------------------------------------------------
name: data-cleaning-env
# Quoted so the version stays a string rather than being type-inferred.
version: "1.0.0"
# NOTE(review): line breaks inside this literal block were reconstructed at
# sentence boundaries; confirm against the intended wording/layout.
description: |
  A reinforcement learning environment for data cleaning tasks.
  An AI agent receives messy datasets and must apply the correct
  cleaning operations.
  Supports 3 tasks of increasing difficulty: null removal (easy),
  date standardization (medium), and outlier detection (hard).
author: "Soham Sandeep Kamathi"
email: "2023.soham.kamathi@ves.ac.in"

# --- Tasks ------------------------------------------------------------------
# One entry per task, listed from easy to hard. Every task declares the same
# score bounds (0.0 to 1.0).
tasks:
  - id: 1
    name: remove_nulls
    difficulty: easy
    description: "Remove rows with null values from the dataset"
    min_score: 0.0
    max_score: 1.0
  - id: 2
    name: fix_dates
    difficulty: medium
    description: "Standardise inconsistent date formats to YYYY-MM-DD"
    min_score: 0.0
    max_score: 1.0
  - id: 3
    name: remove_outliers
    difficulty: hard
    description: "Detect and remove statistical outliers using IQR method"
    min_score: 0.0
    max_score: 1.0

# --- API --------------------------------------------------------------------
# Endpoints exposed by the environment: two POST routes that drive an
# episode and two GET routes that report metadata.
api:
  endpoints:
    - path: /reset
      method: POST
      description: "Start a new episode, returns DatasetObservation"
    - path: /step
      method: POST
      description: "Take a cleaning action, returns observation + reward"
    - path: /state
      method: GET
      description: "Get current episode metadata"
    - path: /tasks
      method: GET
      description: "List all available tasks"

# --- Spaces and reward ------------------------------------------------------
observation_space:
  type: object
  description: "DatasetObservation with preview, null_count, date_errors, outlier_count"

action_space:
  type: object
  description: "CleaningAction with action_type and optional column"

# The reward range matches the per-task min_score/max_score bounds above.
reward:
  type: continuous
  range: [0.0, 1.0]
  description: "Graded score based on cleaning quality"

# --- Infrastructure ---------------------------------------------------------
# Resource requirements for running the environment.
# NOTE(review): runtime_limit_minutes presumably caps a whole run rather than
# a single episode; confirm with the consumer of this file.
infrastructure:
  min_cpu: 2
  min_memory_gb: 4
  requires_gpu: false
  runtime_limit_minutes: 20