File size: 1,740 Bytes
d7cc083
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Manifest metadata: identifies the environment and its maintainer.
name: data-cleaning-env
# Quoted, so the version is parsed as a string.
version: "1.0.0"
# Literal block scalar (`|`): the line breaks below are kept in the parsed value.
# The three tasks summarised here are declared in full under `tasks`.
description: |
  A reinforcement learning environment for data cleaning tasks.
  An AI agent receives messy datasets and must apply the correct
  cleaning operations. Supports 3 tasks of increasing difficulty:
  null removal (easy), date standardization (medium), and
  outlier detection (hard).

# Maintainer contact details.
author: "Soham Sandeep Kamathi"
email: "2023.soham.kamathi@ves.ac.in"

# Task catalogue. Each entry carries a numeric id, a short name, a difficulty
# label, a human-readable description, and a min/max score pair.
# All three tasks use the same 0.0-1.0 score bounds, matching `reward.range`
# in this file.
# NOTE(review): presumably the server selects a task by `id` or `name` —
# confirm against the consuming code.
tasks:
  # Easy: drop rows that contain null values.
  - id: 1
    name: remove_nulls
    difficulty: easy
    description: "Remove rows with null values from the dataset"
    min_score: 0.0
    max_score: 1.0

  # Medium: normalise mixed date formats to YYYY-MM-DD.
  - id: 2
    name: fix_dates
    difficulty: medium
    description: "Standardise inconsistent date formats to YYYY-MM-DD"
    min_score: 0.0
    max_score: 1.0

  # Hard: remove outliers, using the IQR method per the description.
  - id: 3
    name: remove_outliers
    difficulty: hard
    description: "Detect and remove statistical outliers using IQR method"
    min_score: 0.0
    max_score: 1.0

# HTTP interface exposed by the environment: four endpoints, each declared
# with a path, an HTTP method, and a description.
# NOTE(review): request/response payload schemas are not specified here —
# confirm them against the server implementation.
api:
  endpoints:
    # Episode control: /reset begins an episode, /step applies one action.
    - path: /reset
      method: POST
      description: "Start a new episode, returns DatasetObservation"
    - path: /step
      method: POST
      description: "Take a cleaning action, returns observation + reward"
    # Read-only endpoints (GET).
    - path: /state
      method: GET
      description: "Get current episode metadata"
    - path: /tasks
      method: GET
      description: "List all available tasks"

# What the agent observes: an object described as a DatasetObservation.
# The description lists the fields preview, null_count, date_errors and
# outlier_count.
# NOTE(review): field types are not declared here — confirm against the
# DatasetObservation definition.
observation_space:
  type: object
  description: "DatasetObservation with preview, null_count, date_errors, outlier_count"

# What the agent submits: an object described as a CleaningAction, with an
# action_type and an optional column.
# NOTE(review): the set of valid action_type values is not listed here —
# confirm against the CleaningAction definition.
action_space:
  type: object
  description: "CleaningAction with action_type and optional column"

# Reward signal: a continuous value in the declared range 0.0 to 1.0.
# These are the same bounds as every task's min_score/max_score in this file.
reward:
  type: continuous
  range: [0.0, 1.0]
  description: "Graded score based on cleaning quality"

# Resource requirements for running the environment.
infrastructure:
  # NOTE(review): presumably a count of CPU cores/vCPUs — confirm the unit.
  min_cpu: 2
  # Minimum memory, in gigabytes per the key name.
  min_memory_gb: 4
  # No GPU is required.
  requires_gpu: false
  # Runtime limit, in minutes per the key name.
  # NOTE(review): unclear whether this applies per episode or per whole run — confirm.
  runtime_limit_minutes: 20