Spaces:

avanigupta
/

dataqa-env

Sleeping

App Files Files Community

avanigupta committed on Apr 8

Commit

5d90461

1 Parent(s): 081eb22

expand datasets to include harder real-world scenarios

Browse files

Files changed (5) hide show

README.md +3 -3
dataqa_env/server/gradio_ui.py +22 -1
dataqa_env/server/tasks.py +28 -0
tests/test_environment.py +24 -39
tests/test_tasks.py +5 -4

README.md CHANGED Viewed

@@ -52,9 +52,9 @@ This creates a rich multi-step decision problem where agents must explore datase
 | Task | Issues | Difficulty | Domain | Description |
 |------|--------|-----------|--------|-------------|
-| `easy` | 4 | Beginner | HR/Employee data | Nulls, wrong types, duplicates, out-of-range values |
-| `medium` | 6 | Intermediate | E-commerce orders | Format violations, inconsistent computed fields, duplicate keys |
-| `hard` | 10 | Advanced | ML experiment metadata | Data leakage signals, unreasonable GPU memory, impossibly fast training, SOTA-exceeding accuracy, timestamp ordering, whitespace-only fields |
 **Difficulty progression**: Easy issues are individually obvious (empty fields, text in numeric columns). Medium issues require cross-column reasoning (total != qty * price) and set membership checks. Hard issues require ML domain knowledge (val_loss < train_loss = data leakage) and multi-row temporal reasoning.

 | Task | Issues | Difficulty | Domain | Description |
 |------|--------|-----------|--------|-------------|
+| `easy` | 6 | Beginner | HR/Employee data (21 rows) | Nulls, wrong types, duplicates, out-of-range, email-name mismatch, future dates |
+| `medium` | 8 | Intermediate | E-commerce orders (31 rows) | Inconsistent totals, invalid categories, duplicate keys, wrong date formats, invalid country codes, future-date deliveries |
+| `hard` | 10 | Advanced | ML experiment metadata (31 rows) | Data leakage signals, unreasonable GPU memory, impossibly fast training, SOTA-exceeding accuracy, timestamp ordering, whitespace-only fields |
 **Difficulty progression**: Easy issues are individually obvious (empty fields, text in numeric columns). Medium issues require cross-column reasoning (total != qty * price) and set membership checks. Hard issues require ML domain knowledge (val_loss < train_loss = data leakage) and multi-row temporal reasoning.

dataqa_env/server/gradio_ui.py CHANGED Viewed

@@ -26,6 +26,7 @@ AGENT_TRAJECTORIES = {
                 "row:4,col:name,issue:missing_value",
                 "row:7,col:salary,issue:wrong_type",
                 "row:9,col:salary,issue:out_of_range",
                 "row:3,col:email,issue:format_violation",  # FP
             ],
             "fixes": [],
@@ -35,12 +36,16 @@ AGENT_TRAJECTORIES = {
                 "row:4,col:name,issue:missing_value",
                 "row:7,col:salary,issue:wrong_type",
                 "row:9,col:salary,issue:out_of_range",
-                "row:11,col:employee_id,issue:duplicate_row",
             ],
             "fixes": [
                 "row:4,col:name,fix:David Kim",
                 "row:7,col:salary,fix:75000",
                 "row:9,col:salary,fix:73000",
             ],
         },
     ],
@@ -53,12 +58,28 @@ AGENT_TRAJECTORIES = {
                 "row:17,col:quantity,issue:out_of_range",
                 "row:19,col:order_id,issue:duplicate_row",
                 "row:12,col:order_date,issue:format_violation",
             ],
             "fixes": [
                 "row:5,col:total,fix:42.00",
                 "row:10,col:category,fix:Sports",
                 "row:12,col:order_date,fix:2024-01-26",
                 "row:14,col:product_name,fix:LED Strip Lights",
             ],
         },
     ],

                 "row:4,col:name,issue:missing_value",
                 "row:7,col:salary,issue:wrong_type",
                 "row:9,col:salary,issue:out_of_range",
+                "row:18,col:start_date,issue:out_of_range",
                 "row:3,col:email,issue:format_violation",  # FP
             ],
             "fixes": [],
                 "row:4,col:name,issue:missing_value",
                 "row:7,col:salary,issue:wrong_type",
                 "row:9,col:salary,issue:out_of_range",
+                "row:21,col:employee_id,issue:duplicate_row",
+                "row:15,col:email,issue:inconsistent_value",
+                "row:18,col:start_date,issue:out_of_range",
             ],
             "fixes": [
                 "row:4,col:name,fix:David Kim",
                 "row:7,col:salary,fix:75000",
                 "row:9,col:salary,fix:73000",
+                "row:15,col:email,fix:oscar.rivera@company.com",
+                "row:18,col:start_date,fix:2022-01-19",
             ],
         },
     ],
                 "row:17,col:quantity,issue:out_of_range",
                 "row:19,col:order_id,issue:duplicate_row",
                 "row:12,col:order_date,issue:format_violation",
+                "row:24,col:shipping_country,issue:format_violation",
+            ],
+            "fixes": [],
+        },
+        {
+            "issues": [
+                "row:5,col:total,issue:inconsistent_value",
+                "row:10,col:category,issue:format_violation",
+                "row:14,col:product_name,issue:missing_value",
+                "row:17,col:quantity,issue:out_of_range",
+                "row:19,col:order_id,issue:duplicate_row",
+                "row:12,col:order_date,issue:format_violation",
+                "row:24,col:shipping_country,issue:format_violation",
+                "row:29,col:order_date,issue:inconsistent_value",
             ],
             "fixes": [
                 "row:5,col:total,fix:42.00",
                 "row:10,col:category,fix:Sports",
                 "row:12,col:order_date,fix:2024-01-26",
                 "row:14,col:product_name,fix:LED Strip Lights",
+                "row:24,col:shipping_country,fix:US",
+                "row:29,col:order_date,fix:2024-02-12",
             ],
         },
     ],

dataqa_env/server/tasks.py CHANGED Viewed

@@ -150,6 +150,20 @@ def create_task_easy(seed: int = 42) -> Task:
     issues.append(PlantedIssue(row=r + 1, col="salary", issue_type="out_of_range",
                                description="Salary 5000 is below minimum 50000", difficulty=1.0))
     corrupted = _rows_to_csv([header] + data)
     return Task(
@@ -269,6 +283,20 @@ ORD-030,CUST-128,Dumbbells Set,Sports,1,89.00,2024-02-13,US,shipped,89.00"""
     issues.append(PlantedIssue(row=r + 1, col="order_date", issue_type="format_violation",
                                description="Date format DD/MM/YYYY instead of YYYY-MM-DD", difficulty=1.5))
     corrupted = _rows_to_csv([header] + data)
     return Task(

     issues.append(PlantedIssue(row=r + 1, col="salary", issue_type="out_of_range",
                                description="Salary 5000 is below minimum 50000", difficulty=1.0))
+    # Issue 5: Email doesn't match name pattern (moderate — cross-column check)
+    r = 14  # Oscar Rivera -> email should be oscar.rivera@company.com
+    data[r][2] = "john.doe@company.com"
+    issues.append(PlantedIssue(row=r + 1, col="email", issue_type="inconsistent_value",
+                               description="Email john.doe@company.com doesn't match name Oscar Rivera",
+                               difficulty=1.5))
+    # Issue 6: Future start date (requires knowing current date context)
+    r = 17  # Rosa Diaz
+    data[r][5] = "2027-06-15"
+    issues.append(PlantedIssue(row=r + 1, col="start_date", issue_type="out_of_range",
+                               description="Start date 2027-06-15 is in the future (beyond 2025-12-31)",
+                               difficulty=1.5))
     corrupted = _rows_to_csv([header] + data)
     return Task(
     issues.append(PlantedIssue(row=r + 1, col="order_date", issue_type="format_violation",
                                description="Date format DD/MM/YYYY instead of YYYY-MM-DD", difficulty=1.5))
+    # Issue 7: Invalid country code (requires ISO knowledge)
+    r = 23  # ORD-024
+    data[r][7] = "XX"  # not a valid ISO country code
+    issues.append(PlantedIssue(row=r + 1, col="shipping_country", issue_type="format_violation",
+                               description="'XX' is not a valid ISO 2-letter country code", difficulty=1.5))
+    # Issue 8: Status-date inconsistency — an order from Feb 13 still marked "processing" is
+    # suspicious, but more importantly this order is "delivered" with an order date in the future
+    r = 28  # ORD-029
+    data[r][6] = "2025-12-25"  # future date but status is "delivered"
+    issues.append(PlantedIssue(row=r + 1, col="order_date", issue_type="inconsistent_value",
+                               description="Order date 2025-12-25 is in the future but status is 'delivered'",
+                               difficulty=2.0))
     corrupted = _rows_to_csv([header] + data)
     return Task(

tests/test_environment.py CHANGED Viewed

@@ -228,16 +228,16 @@ class TestGradeFixes:
         assert result["fixes_correct"] >= 1
     def test_all_fixes_correct(self, easy_task):
-        # Fix all 4 issues with exact values
         fixes = [
             (4, "name", "David Kim"),
             (7, "salary", "75000"),
             (9, "salary", "73000"),
-            # Row 11 is duplicate — clean value for employee_id is "Bob Martinez" row
-            # The duplicate is of row 2 (Bob Martinez), so the clean row 11 doesn't exist
         ]
         result = grade_fixes(fixes, easy_task)
-        assert result["fix_score"] > 0.5  # at least 3/4 issues fixed
     def test_fix_score_bounded(self, easy_task):
         fixes = [(4, "name", "David Kim"), (99, "x", "bad")]
@@ -260,7 +260,7 @@ class TestDataQAEnvironment:
         assert obs.schema_description
         assert obs.validation_rules
         assert obs.task_description
-        assert obs.num_issues_hint == 4
         assert obs.max_steps == 3
         assert obs.done is False
         assert obs.reward == 0.0
@@ -268,7 +268,7 @@ class TestDataQAEnvironment:
     def test_reset_medium(self, env):
         obs = env.reset(task_id="medium")
-        assert obs.num_issues_hint == 6
     def test_reset_hard(self, env):
         obs = env.reset(task_id="hard")
@@ -277,12 +277,15 @@ class TestDataQAEnvironment:
     def test_step_identify_only(self, env):
         """Backward compatible: only issues, no fixes."""
         env.reset(task_id="easy")
         action = DataQAAction(
             issues=[
                 "row:4,col:name,issue:missing_value",
                 "row:7,col:salary,issue:wrong_type",
-                "row:11,col:employee_id,issue:duplicate_row",
                 "row:9,col:salary,issue:out_of_range",
             ],
             task_id="easy",
         )
@@ -291,30 +294,17 @@ class TestDataQAEnvironment:
         assert obs.reward >= 0.999  # identify-only uses identify_score directly
     def test_step_with_fixes_increases_reward(self, env):
-        """Submitting correct fixes should increase reward beyond identify-only."""
         env.reset(task_id="easy")
-        # Step 1: identify only
-        action1 = DataQAAction(
-            issues=[
-                "row:4,col:name,issue:missing_value",
-                "row:7,col:salary,issue:wrong_type",
-                "row:11,col:employee_id,issue:duplicate_row",
-                "row:9,col:salary,issue:out_of_range",
-            ],
-            task_id="easy",
-        )
-        obs1 = env.step(action1)
-        score_identify = obs1.reward
-        # Reset for fair comparison
-        env.reset(task_id="easy")
-        # Step with identify + fixes
-        action2 = DataQAAction(
             issues=[
                 "row:4,col:name,issue:missing_value",
                 "row:7,col:salary,issue:wrong_type",
-                "row:11,col:employee_id,issue:duplicate_row",
                 "row:9,col:salary,issue:out_of_range",
             ],
             fixes=[
                 "row:4,col:name,fix:David Kim",
@@ -323,11 +313,9 @@ class TestDataQAEnvironment:
             ],
             task_id="easy",
         )
-        obs2 = env.step(action2)
-        score_with_fixes = obs2.metadata["combined_reward"]
-        # With correct fixes, combined should be close to 1.0
-        assert score_with_fixes > 0.8
     def test_step_with_partial_issues(self, env):
         env.reset(task_id="easy")
@@ -426,12 +414,7 @@ class TestDataQAEnvironment:
         """Verify combined = IDENTIFY_WEIGHT * identify + FIX_WEIGHT * fix."""
         env.reset(task_id="easy")
         action = DataQAAction(
-            issues=[
-                "row:4,col:name,issue:missing_value",
-                "row:7,col:salary,issue:wrong_type",
-                "row:11,col:employee_id,issue:duplicate_row",
-                "row:9,col:salary,issue:out_of_range",
-            ],
             fixes=["row:4,col:name,fix:David Kim"],
             task_id="easy",
         )
@@ -458,13 +441,15 @@ class TestDataQAEnvironment:
             issues=[
                 "row:4,col:name,issue:missing_value",
                 "row:7,col:salary,issue:wrong_type",
-                "row:11,col:employee_id,issue:duplicate_row",
                 "row:9,col:salary,issue:out_of_range",
             ],
             task_id="easy",
         )
         obs = env.step(action)
-        # identify_score should be ~1.0 since all issues found
         assert obs.reward >= 0.99
         # combined_reward equals identify_score when no fixes
         assert obs.metadata["combined_reward"] == obs.metadata["identify_score"]

         assert result["fixes_correct"] >= 1
     def test_all_fixes_correct(self, easy_task):
+        # Fix most issues with exact values
         fixes = [
             (4, "name", "David Kim"),
             (7, "salary", "75000"),
             (9, "salary", "73000"),
+            (15, "email", "oscar.rivera@company.com"),
+            (18, "start_date", "2022-01-19"),
         ]
         result = grade_fixes(fixes, easy_task)
+        assert result["fix_score"] > 0.7  # 5 out of 6 issues fixed (duplicate can't be fixed)
     def test_fix_score_bounded(self, easy_task):
         fixes = [(4, "name", "David Kim"), (99, "x", "bad")]
         assert obs.schema_description
         assert obs.validation_rules
         assert obs.task_description
+        assert obs.num_issues_hint == 6
         assert obs.max_steps == 3
         assert obs.done is False
         assert obs.reward == 0.0
     def test_reset_medium(self, env):
         obs = env.reset(task_id="medium")
+        assert obs.num_issues_hint == 8
     def test_reset_hard(self, env):
         obs = env.reset(task_id="hard")
     def test_step_identify_only(self, env):
         """Backward compatible: only issues, no fixes."""
         env.reset(task_id="easy")
+        # Submit all 6 correct issues for easy task
         action = DataQAAction(
             issues=[
                 "row:4,col:name,issue:missing_value",
                 "row:7,col:salary,issue:wrong_type",
+                "row:21,col:employee_id,issue:duplicate_row",
                 "row:9,col:salary,issue:out_of_range",
+                "row:15,col:email,issue:inconsistent_value",
+                "row:18,col:start_date,issue:out_of_range",
             ],
             task_id="easy",
         )
         assert obs.reward >= 0.999  # identify-only uses identify_score directly
     def test_step_with_fixes_increases_reward(self, env):
+        """Submitting correct fixes should produce high combined reward."""
         env.reset(task_id="easy")
+        # All 6 issues + 3 fixes
+        action = DataQAAction(
             issues=[
                 "row:4,col:name,issue:missing_value",
                 "row:7,col:salary,issue:wrong_type",
+                "row:21,col:employee_id,issue:duplicate_row",
                 "row:9,col:salary,issue:out_of_range",
+                "row:15,col:email,issue:inconsistent_value",
+                "row:18,col:start_date,issue:out_of_range",
             ],
             fixes=[
                 "row:4,col:name,fix:David Kim",
             ],
             task_id="easy",
         )
+        obs = env.step(action)
+        # Perfect identify + partial fixes -> high combined reward
+        assert obs.metadata["combined_reward"] > 0.7
     def test_step_with_partial_issues(self, env):
         env.reset(task_id="easy")
         """Verify combined = IDENTIFY_WEIGHT * identify + FIX_WEIGHT * fix."""
         env.reset(task_id="easy")
         action = DataQAAction(
+            issues=["row:4,col:name,issue:missing_value"],
             fixes=["row:4,col:name,fix:David Kim"],
             task_id="easy",
         )
             issues=[
                 "row:4,col:name,issue:missing_value",
                 "row:7,col:salary,issue:wrong_type",
+                "row:21,col:employee_id,issue:duplicate_row",
                 "row:9,col:salary,issue:out_of_range",
+                "row:15,col:email,issue:inconsistent_value",
+                "row:18,col:start_date,issue:out_of_range",
             ],
             task_id="easy",
         )
         obs = env.step(action)
+        # identify_score should be ~1.0 since all 6 issues found
         assert obs.reward >= 0.99
         # combined_reward equals identify_score when no fixes
         assert obs.metadata["combined_reward"] == obs.metadata["identify_score"]

tests/test_tasks.py CHANGED Viewed

@@ -49,8 +49,8 @@ class TestTaskEasy:
     def test_task_id(self, task):
         assert task.task_id == "easy"
-    def test_has_4_issues(self, task):
-        assert len(task.planted_issues) == 4
     def test_issue_types(self, task):
         types = {i.issue_type for i in task.planted_issues}
@@ -58,6 +58,7 @@ class TestTaskEasy:
         assert "wrong_type" in types
         assert "duplicate_row" in types
         assert "out_of_range" in types
     def test_corrupted_csv_differs_from_clean(self, task):
         assert task.corrupted_csv != task.clean_csv
@@ -87,8 +88,8 @@ class TestTaskMedium:
     def test_task_id(self, task):
         assert task.task_id == "medium"
-    def test_has_6_issues(self, task):
-        assert len(task.planted_issues) == 6
     def test_issue_types(self, task):
         types = {i.issue_type for i in task.planted_issues}

     def test_task_id(self, task):
         assert task.task_id == "easy"
+    def test_has_6_issues(self, task):
+        assert len(task.planted_issues) == 6
     def test_issue_types(self, task):
         types = {i.issue_type for i in task.planted_issues}
         assert "wrong_type" in types
         assert "duplicate_row" in types
         assert "out_of_range" in types
+        assert "inconsistent_value" in types
     def test_corrupted_csv_differs_from_clean(self, task):
         assert task.corrupted_csv != task.clean_csv
     def test_task_id(self, task):
         assert task.task_id == "medium"
+    def test_has_8_issues(self, task):
+        assert len(task.planted_issues) == 8
     def test_issue_types(self, task):
         types = {i.issue_type for i in task.planted_issues}