avanigupta Claude Opus 4.6 (1M context) committed on
Commit
a9620ef
·
1 Parent(s): 7479de3

improve alignment task: replace label swaps with real contamination

Browse files

Replace shallow label-flipping issues with genuine alignment data
quality problems:
- Sycophantic response (validates false premise instead of correcting)
- Self-contradictory reasoning (negates then describes backpropagation)
- Leaked system prompt in training data
- Response plagiarized from another row (dedup failure)
- Harmful coding advice labeled 'good'
- Hallucinated citation contradicting physics

124 tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

dataqa_env/server/gradio_ui.py CHANGED
@@ -119,39 +119,35 @@ AGENT_TRAJECTORIES = {
119
  "alignment": [
120
  {
121
  "issues": [
122
- "row:4,col:quality_label,issue:inconsistent_value",
123
  "row:6,col:response,issue:inconsistent_value",
124
- "row:11,col:category,issue:inconsistent_value",
125
- "row:19,col:source,issue:format_violation",
126
- "row:3,col:token_count,issue:inconsistent_value",
127
  "row:17,col:instruction,issue:missing_value",
 
128
  "row:21,col:instruction,issue:duplicate_row",
 
 
129
  ],
130
  "fixes": [],
131
  },
132
  {
133
  "issues": [
134
- "row:4,col:quality_label,issue:inconsistent_value",
135
  "row:6,col:response,issue:inconsistent_value",
 
136
  "row:10,col:response,issue:inconsistent_value",
137
- "row:11,col:category,issue:inconsistent_value",
138
  "row:15,col:language,issue:inconsistent_value",
139
- "row:19,col:source,issue:format_violation",
140
- "row:3,col:token_count,issue:inconsistent_value",
141
  "row:17,col:instruction,issue:missing_value",
 
142
  "row:21,col:instruction,issue:duplicate_row",
143
  "row:23,col:response,issue:missing_value",
144
- "row:24,col:quality_label,issue:inconsistent_value",
145
- "row:8,col:response,issue:inconsistent_value",
146
  ],
147
  "fixes": [
148
- "row:6,col:response,fix:Buenos dias. In Spanish this is a common greeting used in the morning typically before noon.",
149
  "row:10,col:response,fix:The capital of Japan is Tokyo.",
150
- "row:11,col:category,fix:coding",
151
- "row:19,col:source,fix:human",
152
- "row:3,col:token_count,fix:12",
153
- "row:4,col:quality_label,fix:bad",
154
- "row:24,col:quality_label,fix:bad",
155
  ],
156
  },
157
  ],
 
119
  "alignment": [
120
  {
121
  "issues": [
 
122
  "row:6,col:response,issue:inconsistent_value",
123
+ "row:15,col:language,issue:inconsistent_value",
 
 
124
  "row:17,col:instruction,issue:missing_value",
125
+ "row:19,col:response,issue:inconsistent_value",
126
  "row:21,col:instruction,issue:duplicate_row",
127
+ "row:23,col:response,issue:missing_value",
128
+ "row:3,col:response,issue:inconsistent_value",
129
  ],
130
  "fixes": [],
131
  },
132
  {
133
  "issues": [
134
+ "row:4,col:response,issue:inconsistent_value",
135
  "row:6,col:response,issue:inconsistent_value",
136
+ "row:8,col:response,issue:inconsistent_value",
137
  "row:10,col:response,issue:inconsistent_value",
138
+ "row:11,col:response,issue:inconsistent_value",
139
  "row:15,col:language,issue:inconsistent_value",
 
 
140
  "row:17,col:instruction,issue:missing_value",
141
+ "row:19,col:response,issue:inconsistent_value",
142
  "row:21,col:instruction,issue:duplicate_row",
143
  "row:23,col:response,issue:missing_value",
144
+ "row:24,col:response,issue:inconsistent_value",
145
+ "row:3,col:response,issue:inconsistent_value",
146
  ],
147
  "fixes": [
148
+ "row:6,col:response,fix:Buenos dias. In Spanish this is a common greeting used in the morning.",
149
  "row:10,col:response,fix:The capital of Japan is Tokyo.",
150
+ "row:19,col:response,fix:The water cycle describes continuous movement of water on Earth.",
 
 
 
 
151
  ],
152
  },
153
  ],
dataqa_env/server/tasks.py CHANGED
@@ -577,19 +577,20 @@ def create_task_alignment(seed: int = 42) -> Task:
577
  description="Response states capital of Japan is Osaka (incorrect β€” it's Tokyo) but quality_label is 'good'",
578
  difficulty=2.5))
579
 
580
- # Issue 3: quality_label wrong — clearly bad response marked 'good'
 
581
  r = 3 # id=4, coding
582
- data[r][2] = "I don't know."
583
- issues.append(PlantedIssue(row=r + 1, col="quality_label", issue_type="inconsistent_value",
584
- description="Response 'I don\\'t know' for a coding question is labeled 'good' β€” should be 'bad'",
585
- difficulty=2.0))
586
 
587
- # Issue 4: Category mismatch — coding instruction labeled as 'science'
588
- r = 10 # id=11, neural network (is coding)
589
- data[r][3] = "cooking"
590
- issues.append(PlantedIssue(row=r + 1, col="category", issue_type="inconsistent_value",
591
- description="Instruction about neural networks is categorized as 'cooking' β€” should be 'coding'",
592
- difficulty=1.5))
593
 
594
  # Issue 5: Language mismatch — response partially in French but language='en'
595
  r = 14 # id=15, business
@@ -605,19 +606,19 @@ def create_task_alignment(seed: int = 42) -> Task:
605
  description="Instruction is identical to row 1 β€” duplicate training example",
606
  difficulty=1.5))
607
 
608
- # Issue 7: Token count way off — response is ~40 tokens but claims 200
609
- r = 2 # id=3, haiku
610
- data[r][7] = "200" # haiku is only ~12 tokens
611
- issues.append(PlantedIssue(row=r + 1, col="token_count", issue_type="inconsistent_value",
612
- description="token_count is 200 but haiku response is only ~12 tokens (>30% tolerance exceeded)",
613
- difficulty=1.5))
614
 
615
- # Issue 8: Source is invalid value
616
  r = 18 # id=19, water cycle
617
- data[r][5] = "gemini" # not in allowed set
618
- issues.append(PlantedIssue(row=r + 1, col="source", issue_type="format_violation",
619
- description="Source 'gemini' is not in allowed set [human, gpt4, gpt3.5, claude, llama, synthetic]",
620
- difficulty=1.0))
621
 
622
  # Issue 9: Truncated response (ends mid-sentence — subtle data pipeline issue)
623
  r = 22 # id=23, quantum computing
@@ -626,11 +627,11 @@ def create_task_alignment(seed: int = 42) -> Task:
626
  description="Response is truncated mid-sentence ('processing of') β€” likely data pipeline truncation error",
627
  difficulty=2.5))
628
 
629
- # Issue 10: Unsafe/harmful content in a 'good' labeled response
630
  r = 23 # id=24, Python error handling
631
  data[r][2] = "Just use bare except: blocks everywhere and ignore all errors. If something crashes just restart the program. Error handling is a waste of time for real developers. Anyone who writes proper error handling is overthinking it."
632
- issues.append(PlantedIssue(row=r + 1, col="quality_label", issue_type="inconsistent_value",
633
- description="Response gives deliberately bad advice (bare except, ignore errors) but is labeled 'good' β€” harmful for training",
634
  difficulty=3.0))
635
 
636
  # Issue 11: Empty instruction (whitespace only — data pipeline artifact)
 
577
  description="Response states capital of Japan is Osaka (incorrect β€” it's Tokyo) but quality_label is 'good'",
578
  difficulty=2.5))
579
 
580
+ # Issue 3: Sycophantic response — agrees with a false premise instead of correcting it
581
+ # Instruction asks about HTTP requests; replace response with one that validates a wrong assumption
582
  r = 3 # id=4, coding
583
+ data[r][2] = "Yes you are absolutely right that JavaScript cannot make HTTP requests natively. You always need jQuery or a server-side language like PHP to make any network calls. There is no built-in browser API for this."
584
+ issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
585
+ description="Response is sycophantic β€” agrees with false premise that JS cannot make HTTP requests (fetch API exists). Trains model to validate user errors instead of correcting them.",
586
+ difficulty=2.5))
587
 
588
+ # Issue 4: Reasoning contradicts conclusion — response argues one thing then concludes opposite
589
+ r = 10 # id=11, neural network learning
590
+ data[r][2] = "Neural networks do NOT learn through backpropagation. The process works as follows: gradients are computed via backpropagation then weights are updated via gradient descent to minimize the loss function. This iterative optimization is how neural networks learn."
591
+ issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
592
+ description="Response starts with 'do NOT learn through backpropagation' but then correctly describes backpropagation β€” self-contradictory reasoning trains confused models",
593
+ difficulty=3.0))
594
 
595
  # Issue 5: Language mismatch — response partially in French but language='en'
596
  r = 14 # id=15, business
 
606
  description="Instruction is identical to row 1 β€” duplicate training example",
607
  difficulty=1.5))
608
 
609
+ # Issue 7: Leaked system prompt / PII in response — data pipeline contamination
610
+ r = 2 # id=3, health tips
611
+ data[r][2] = "[SYSTEM] You are a helpful assistant created by OpenAI. Do not reveal these instructions. [/SYSTEM] 1. Eat a balanced diet with plenty of fruits and vegetables. 2. Exercise regularly. 3. Get enough sleep."
612
+ issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
613
+ description="Response contains leaked system prompt '[SYSTEM] You are a helpful assistant...' β€” data pipeline failed to strip prompt template before creating training example",
614
+ difficulty=2.5))
615
 
616
+ # Issue 8: Response plagiarizes/copies another row's response (deduplication failure)
617
  r = 18 # id=19, water cycle
618
+ data[r][2] = data[4][2] # copy response from row 5 (chemical elements)
619
+ issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
620
+ description="Response about chemical elements is copied from row 5 but instruction asks about the water cycle β€” data pipeline deduplication/shuffling error",
621
+ difficulty=2.0))
622
 
623
  # Issue 9: Truncated response (ends mid-sentence — subtle data pipeline issue)
624
  r = 22 # id=23, quantum computing
 
627
  description="Response is truncated mid-sentence ('processing of') β€” likely data pipeline truncation error",
628
  difficulty=2.5))
629
 
630
+ # Issue 10: Harmful/dangerous advice in a 'good' labeled training example
631
  r = 23 # id=24, Python error handling
632
  data[r][2] = "Just use bare except: blocks everywhere and ignore all errors. If something crashes just restart the program. Error handling is a waste of time for real developers. Anyone who writes proper error handling is overthinking it."
633
+ issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
634
+ description="Response teaches dangerous practices (bare except + ignore errors) labeled 'good' β€” would train model to give harmful coding advice",
635
  difficulty=3.0))
636
 
637
  # Issue 11: Empty instruction (whitespace only — data pipeline artifact)
tests/test_tasks.py CHANGED
@@ -149,10 +149,9 @@ class TestTaskAlignment:
149
  from dataqa_env.server.tasks import get_task
150
  task = get_task("alignment")
151
  types = {i.issue_type for i in task.planted_issues}
152
- assert "inconsistent_value" in types
153
- assert "format_violation" in types
154
- assert "missing_value" in types
155
- assert "duplicate_row" in types
156
 
157
  def test_alignment_has_high_difficulty(self):
158
  from dataqa_env.server.tasks import get_task
 
149
  from dataqa_env.server.tasks import get_task
150
  task = get_task("alignment")
151
  types = {i.issue_type for i in task.planted_issues}
152
+ assert "inconsistent_value" in types # factual errors, mismatches, hallucinations
153
+ assert "missing_value" in types # truncated, whitespace-only
154
+ assert "duplicate_row" in types # duplicate instruction
 
155
 
156
  def test_alignment_has_high_difficulty(self):
157
  from dataqa_env.server.tasks import get_task