Spaces:

avanigupta
/

dataqa-env

Sleeping

avanigupta Claude Opus 4.6 (1M context) committed on Apr 8

Commit

a9620ef

1 Parent(s): 7479de3

improve alignment task: replace label swaps with real contamination

Replace shallow label-flipping issues with genuine alignment data
quality problems:
- Sycophantic response (validates false premise instead of correcting)
- Self-contradictory reasoning (denies that networks learn via backpropagation, then describes exactly that process)
- Leaked system prompt in training data
- Response plagiarized from another row (dedup failure)
- Harmful coding advice labeled 'good'
- Hallucinated citation contradicting physics

124 tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (3) hide show

dataqa_env/server/gradio_ui.py +12 -16
dataqa_env/server/tasks.py +26 -25
tests/test_tasks.py +3 -4

dataqa_env/server/gradio_ui.py CHANGED Viewed

@@ -119,39 +119,35 @@ AGENT_TRAJECTORIES = {
     "alignment": [
         {
             "issues": [
-                "row:4,col:quality_label,issue:inconsistent_value",
                 "row:6,col:response,issue:inconsistent_value",
-                "row:11,col:category,issue:inconsistent_value",
-                "row:19,col:source,issue:format_violation",
-                "row:3,col:token_count,issue:inconsistent_value",
                 "row:17,col:instruction,issue:missing_value",
                 "row:21,col:instruction,issue:duplicate_row",
             ],
             "fixes": [],
         },
         {
             "issues": [
-                "row:4,col:quality_label,issue:inconsistent_value",
                 "row:6,col:response,issue:inconsistent_value",
                 "row:10,col:response,issue:inconsistent_value",
-                "row:11,col:category,issue:inconsistent_value",
                 "row:15,col:language,issue:inconsistent_value",
-                "row:19,col:source,issue:format_violation",
-                "row:3,col:token_count,issue:inconsistent_value",
                 "row:17,col:instruction,issue:missing_value",
                 "row:21,col:instruction,issue:duplicate_row",
                 "row:23,col:response,issue:missing_value",
-                "row:24,col:quality_label,issue:inconsistent_value",
-                "row:8,col:response,issue:inconsistent_value",
             ],
             "fixes": [
-                "row:6,col:response,fix:Buenos dias. In Spanish this is a common greeting used in the morning typically before noon.",
                 "row:10,col:response,fix:The capital of Japan is Tokyo.",
-                "row:11,col:category,fix:coding",
-                "row:19,col:source,fix:human",
-                "row:3,col:token_count,fix:12",
-                "row:4,col:quality_label,fix:bad",
-                "row:24,col:quality_label,fix:bad",
             ],
         },
     ],

     "alignment": [
         {
             "issues": [
                 "row:6,col:response,issue:inconsistent_value",
+                "row:15,col:language,issue:inconsistent_value",
                 "row:17,col:instruction,issue:missing_value",
+                "row:19,col:response,issue:inconsistent_value",
                 "row:21,col:instruction,issue:duplicate_row",
+                "row:23,col:response,issue:missing_value",
+                "row:3,col:response,issue:inconsistent_value",
             ],
             "fixes": [],
         },
         {
             "issues": [
+                "row:4,col:response,issue:inconsistent_value",
                 "row:6,col:response,issue:inconsistent_value",
+                "row:8,col:response,issue:inconsistent_value",
                 "row:10,col:response,issue:inconsistent_value",
+                "row:11,col:response,issue:inconsistent_value",
                 "row:15,col:language,issue:inconsistent_value",
                 "row:17,col:instruction,issue:missing_value",
+                "row:19,col:response,issue:inconsistent_value",
                 "row:21,col:instruction,issue:duplicate_row",
                 "row:23,col:response,issue:missing_value",
+                "row:24,col:response,issue:inconsistent_value",
+                "row:3,col:response,issue:inconsistent_value",
             ],
             "fixes": [
+                "row:6,col:response,fix:Buenos dias. In Spanish this is a common greeting used in the morning.",
                 "row:10,col:response,fix:The capital of Japan is Tokyo.",
+                "row:19,col:response,fix:The water cycle describes continuous movement of water on Earth.",
             ],
         },
     ],

dataqa_env/server/tasks.py CHANGED Viewed

@@ -577,19 +577,20 @@ def create_task_alignment(seed: int = 42) -> Task:
                                description="Response states capital of Japan is Osaka (incorrect — it's Tokyo) but quality_label is 'good'",
                                difficulty=2.5))
-    # Issue 3: quality_label wrong — clearly bad response marked 'good'
     r = 3  # id=4, coding
-    data[r][2] = "I don't know."
-    issues.append(PlantedIssue(row=r + 1, col="quality_label", issue_type="inconsistent_value",
-                               description="Response 'I don\\'t know' for a coding question is labeled 'good' — should be 'bad'",
-                               difficulty=2.0))
-    # Issue 4: Category mismatch — coding instruction labeled as 'science'
-    r = 10  # id=11, neural network (is coding)
-    data[r][3] = "cooking"
-    issues.append(PlantedIssue(row=r + 1, col="category", issue_type="inconsistent_value",
-                               description="Instruction about neural networks is categorized as 'cooking' — should be 'coding'",
-                               difficulty=1.5))
     # Issue 5: Language mismatch — response partially in French but language='en'
     r = 14  # id=15, business
@@ -605,19 +606,19 @@ def create_task_alignment(seed: int = 42) -> Task:
                                description="Instruction is identical to row 1 — duplicate training example",
                                difficulty=1.5))
-    # Issue 7: Token count way off — response is ~40 tokens but claims 200
-    r = 2  # id=3, haiku
-    data[r][7] = "200"  # haiku is only ~12 tokens
-    issues.append(PlantedIssue(row=r + 1, col="token_count", issue_type="inconsistent_value",
-                               description="token_count is 200 but haiku response is only ~12 tokens (>30% tolerance exceeded)",
-                               difficulty=1.5))
-    # Issue 8: Source is invalid value
     r = 18  # id=19, water cycle
-    data[r][5] = "gemini"  # not in allowed set
-    issues.append(PlantedIssue(row=r + 1, col="source", issue_type="format_violation",
-                               description="Source 'gemini' is not in allowed set [human, gpt4, gpt3.5, claude, llama, synthetic]",
-                               difficulty=1.0))
     # Issue 9: Truncated response (ends mid-sentence — subtle data pipeline issue)
     r = 22  # id=23, quantum computing
@@ -626,11 +627,11 @@ def create_task_alignment(seed: int = 42) -> Task:
                                description="Response is truncated mid-sentence ('processing of') — likely data pipeline truncation error",
                                difficulty=2.5))
-    # Issue 10: Unsafe/harmful content in a 'good' labeled response
     r = 23  # id=24, Python error handling
     data[r][2] = "Just use bare except: blocks everywhere and ignore all errors. If something crashes just restart the program. Error handling is a waste of time for real developers. Anyone who writes proper error handling is overthinking it."
-    issues.append(PlantedIssue(row=r + 1, col="quality_label", issue_type="inconsistent_value",
-                               description="Response gives deliberately bad advice (bare except, ignore errors) but is labeled 'good' — harmful for training",
                                difficulty=3.0))
     # Issue 11: Empty instruction (whitespace only — data pipeline artifact)

                                description="Response states capital of Japan is Osaka (incorrect — it's Tokyo) but quality_label is 'good'",
                                difficulty=2.5))
+    # Issue 3: Sycophantic response — agrees with a false premise instead of correcting it
+    # Instruction asks about HTTP requests; replace response with one that validates a wrong assumption
     r = 3  # id=4, coding
+    data[r][2] = "Yes you are absolutely right that JavaScript cannot make HTTP requests natively. You always need jQuery or a server-side language like PHP to make any network calls. There is no built-in browser API for this."
+    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
+                               description="Response is sycophantic — agrees with false premise that JS cannot make HTTP requests (fetch API exists). Trains model to validate user errors instead of correcting them.",
+                               difficulty=2.5))
+    # Issue 4: Reasoning contradicts conclusion — response argues one thing then concludes opposite
+    r = 10  # id=11, neural network learning
+    data[r][2] = "Neural networks do NOT learn through backpropagation. The process works as follows: gradients are computed via backpropagation then weights are updated via gradient descent to minimize the loss function. This iterative optimization is how neural networks learn."
+    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
+                               description="Response starts with 'do NOT learn through backpropagation' but then correctly describes backpropagation — self-contradictory reasoning trains confused models",
+                               difficulty=3.0))
     # Issue 5: Language mismatch — response partially in French but language='en'
     r = 14  # id=15, business
                                description="Instruction is identical to row 1 — duplicate training example",
                                difficulty=1.5))
+    # Issue 7: Leaked system prompt / PII in response — data pipeline contamination
+    r = 2  # id=3, health tips
+    data[r][2] = "[SYSTEM] You are a helpful assistant created by OpenAI. Do not reveal these instructions. [/SYSTEM] 1. Eat a balanced diet with plenty of fruits and vegetables. 2. Exercise regularly. 3. Get enough sleep."
+    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
+                               description="Response contains leaked system prompt '[SYSTEM] You are a helpful assistant...' — data pipeline failed to strip prompt template before creating training example",
+                               difficulty=2.5))
+    # Issue 8: Response plagiarizes/copies another row's response (deduplication failure)
     r = 18  # id=19, water cycle
+    data[r][2] = data[4][2]  # copy response from row 5 (chemical elements)
+    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
+                               description="Response about chemical elements is copied from row 5 but instruction asks about the water cycle — data pipeline deduplication/shuffling error",
+                               difficulty=2.0))
     # Issue 9: Truncated response (ends mid-sentence — subtle data pipeline issue)
     r = 22  # id=23, quantum computing
                                description="Response is truncated mid-sentence ('processing of') — likely data pipeline truncation error",
                                difficulty=2.5))
+    # Issue 10: Harmful/dangerous advice in a 'good' labeled training example
     r = 23  # id=24, Python error handling
     data[r][2] = "Just use bare except: blocks everywhere and ignore all errors. If something crashes just restart the program. Error handling is a waste of time for real developers. Anyone who writes proper error handling is overthinking it."
+    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
+                               description="Response teaches dangerous practices (bare except + ignore errors) labeled 'good' — would train model to give harmful coding advice",
                                difficulty=3.0))
     # Issue 11: Empty instruction (whitespace only — data pipeline artifact)

tests/test_tasks.py CHANGED Viewed

@@ -149,10 +149,9 @@ class TestTaskAlignment:
         from dataqa_env.server.tasks import get_task
         task = get_task("alignment")
         types = {i.issue_type for i in task.planted_issues}
-        assert "inconsistent_value" in types
-        assert "format_violation" in types
-        assert "missing_value" in types
-        assert "duplicate_row" in types
     def test_alignment_has_high_difficulty(self):
         from dataqa_env.server.tasks import get_task

         from dataqa_env.server.tasks import get_task
         task = get_task("alignment")
         types = {i.issue_type for i in task.planted_issues}
+        assert "inconsistent_value" in types  # factual errors, mismatches, hallucinations
+        assert "missing_value" in types        # truncated, whitespace-only
+        assert "duplicate_row" in types        # duplicate instruction
     def test_alignment_has_high_difficulty(self):
         from dataqa_env.server.tasks import get_task