File size: 2,943 Bytes
91e7690
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa25459
91e7690
 
 
94595e2
 
91e7690
 
aa25459
91e7690
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# Package metadata for this environment manifest.
name: data-quality-env
# Quoted so the version stays a string rather than being typed by the parser.
version: '2.0.0'
# Folded scalar: the wrapped lines below are joined with single spaces.
description: >
  RL environment where an AI agent acts as a data quality auditor. Multi-table,
  adversarial injection, budget-constrained exploration, confidence-calibrated
  Brier grading, and post-audit fix verification loop.
# Intentionally an empty string (not null).
author: ''
license: MIT
# Labels attached to the environment.
tags:
  - openenv
  - data-quality
  - sql
  - rl-environment
  - multi-table
  - adversarial

# Task catalogue: four audit scenarios ordered from easy to expert.
# Every task allows the same number of steps (12); the expected baseline
# score drops as the difficulty rises.
tasks:
  # Task 1 - single customers table.
  - id: 1
    name: null_and_duplicate_detection
    difficulty: easy
    max_steps: 12
    description: >-
      Find real nulls, disguised nulls (stored as 'N/A'/'NULL'), exact
      duplicates, and near-duplicates in a customers table.
    expected_baseline_score: 0.82

  # Task 2 - single orders table.
  - id: 2
    name: schema_violation_repair
    difficulty: medium
    max_steps: 12
    description: >-
      Detect type violations, format violations, range violations, and
      unparseable values in an orders table.
    expected_baseline_score: 0.61

  # Task 3 - two transaction snapshots compared against each other.
  - id: 3
    name: silent_data_drift_detection
    difficulty: hard
    max_steps: 12
    description: >-
      Compare two transaction snapshots. Detect mean shifts, new category
      values, and referential drift — nothing is labelled wrong.
    expected_baseline_score: 0.34

  # Task 4 - three joined tables (customers, orders, line_items).
  - id: 4
    name: multi_table_relational_audit
    difficulty: expert
    max_steps: 12
    description: >-
      Audit 3 joined tables (customers, orders, line_items). Find orphaned
      FKs, temporal violations, and aggregate mismatches using JOIN queries.
    expected_baseline_score: 0.19

# Actions accepted by the environment. Each entry names an action,
# describes it, and lists the fields it carries with their types.
action_space:
  type: json
  actions:
    # Audit phase: read-only exploration.
    - name: query
      description: 'Execute a SELECT query. Costs 1 query credit. Blocked: DROP/DELETE/UPDATE/CREATE.'
      fields:
        sql: string
    # Ends the audit: the report is graded and the fix phase is unlocked.
    - name: submit_report
      description: 'Submit the structured AuditReport. Triggers grading. Unlocks fix phase.'
      fields:
        report: AuditReport
    # Post-audit corrective SQL.
    - name: fix_sql
      description: 'Post-audit: submit corrective UPDATE SQL. Earns fix bonus up to +0.25.'
      fields:
        sql: string

# Fields of the observation, each mapped to a type description.
# Descriptions containing brackets, '|' or '->' are quoted so they stay
# plain strings.
observation_space:
  fields:
    # Task identity.
    task_id: int
    task_description: string
    # Schema and size of the available tables.
    tables: 'dict[table_name -> dict[col -> dtype]]'
    row_counts: 'dict[table_name -> int]'
    # Progress and remaining budget.
    step: int
    max_steps: int
    query_credits_remaining: int
    phase: 'audit | fix'
    # Outcome of the most recent action; each may be null.
    last_query_result: 'list[dict] | null  (max 50 rows)'
    last_action_error: 'string | null'
    last_fix_score: 'float | null'

# Declared lower and upper bound of the reward.
# NOTE(review): the components below can add up to more than 1.25
# (audit_score 1.0 + budget_bonus 0.10 + fix_bonus 0.25 = 1.35, before any
# per-query rewards) - confirm whether rewards are clamped or whether this
# range should be widened.
reward_range:
  - 0.0
  - 1.25

# Human-readable breakdown of how the reward is composed.
reward_design:
  audit_score: '0.0–1.0, Brier-adjusted per finding confidence'
  valid_query_no_signal: '+0.01 for syntactically valid exploratory SQL that returns no obvious issue signal'
  valid_query_finds_issue: '+0.1 for valid SQL that surfaces NULLs, duplicates, or other clear audit evidence'
  budget_bonus: 'up to +0.10 for early report submission'
  fix_bonus: 'up to +0.25 for correct fix_sql repairs'
  # Numeric, unlike the descriptive strings above.
  invalid_sql_penalty: 0.0

# HTTP endpoints, written as "<METHOD> <path> <payload sketch>".
# The spacing inside each string is kept exactly as originally written.
api:
  reset: 'POST /reset  {task_id: int, seed: int}'
  step: 'POST /step   {action: Action}'
  state: 'GET  /state'
  health: 'GET /health'