avanigupta Claude Opus 4.6 (1M context) committed on
Commit
5cb467d
·
1 Parent(s): 56f55e9

add alignment data QA task: 12 issues in LLM instruction-tuning data

Browse files

New expert-level task targeting LLM training data quality:
- Instruction-response mismatches
- Factual errors in 'good' labeled responses
- Hallucinated citations (fake studies)
- Harmful advice labeled as good
- Language mismatches, truncated responses, duplicate instructions
- 25 rows, 12 planted issues, difficulty 1.0-3.0

124 tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

README.md CHANGED
@@ -64,6 +64,7 @@ This creates a rich multi-step decision problem where agents must explore datase
64
  | `easy` | 6 | Beginner | HR/Employee data (21 rows) | Nulls, wrong types, duplicates, out-of-range, email-name mismatch, future dates |
65
  | `medium` | 8 | Intermediate | E-commerce orders (31 rows) | Inconsistent totals, invalid categories, duplicate keys, wrong date formats, invalid country codes, future-date deliveries |
66
  | `hard` | 10 | Advanced | ML experiment metadata (31 rows) | Data leakage signals, unreasonable GPU memory, impossibly fast training, SOTA-exceeding accuracy, timestamp ordering, whitespace-only fields |
 
67
 
68
  **Difficulty progression**: Easy issues are individually obvious (empty fields, text in numeric columns). Medium issues require cross-column reasoning (total != qty * price) and set membership checks. Hard issues require ML domain knowledge (val_loss < train_loss = data leakage) and multi-row temporal reasoning.
69
 
 
64
  | `easy` | 6 | Beginner | HR/Employee data (21 rows) | Nulls, wrong types, duplicates, out-of-range, email-name mismatch, future dates |
65
  | `medium` | 8 | Intermediate | E-commerce orders (31 rows) | Inconsistent totals, invalid categories, duplicate keys, wrong date formats, invalid country codes, future-date deliveries |
66
  | `hard` | 10 | Advanced | ML experiment metadata (31 rows) | Data leakage signals, unreasonable GPU memory, impossibly fast training, SOTA-exceeding accuracy, timestamp ordering, whitespace-only fields |
67
+ | `alignment` | 12 | Expert | LLM instruction-tuning data (25 rows) | Instruction-response mismatches, factual errors in "good" labels, hallucinated citations, harmful advice, language mismatches, truncated responses, duplicate instructions |
68
 
69
  **Difficulty progression**: Easy issues are individually obvious (empty fields, text in numeric columns). Medium issues require cross-column reasoning (total != qty * price) and set membership checks. Hard issues require ML domain knowledge (val_loss < train_loss = data leakage) and multi-row temporal reasoning.
70
 
dataqa_env/server/gradio_ui.py CHANGED
@@ -116,6 +116,45 @@ AGENT_TRAJECTORIES = {
116
  ],
117
  },
118
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  }
120
 
121
 
 
116
  ],
117
  },
118
  ],
119
+ "alignment": [
120
+ {
121
+ "issues": [
122
+ "row:4,col:quality_label,issue:inconsistent_value",
123
+ "row:6,col:response,issue:inconsistent_value",
124
+ "row:11,col:category,issue:inconsistent_value",
125
+ "row:19,col:source,issue:format_violation",
126
+ "row:3,col:token_count,issue:inconsistent_value",
127
+ "row:17,col:instruction,issue:missing_value",
128
+ "row:21,col:instruction,issue:duplicate_row",
129
+ ],
130
+ "fixes": [],
131
+ },
132
+ {
133
+ "issues": [
134
+ "row:4,col:quality_label,issue:inconsistent_value",
135
+ "row:6,col:response,issue:inconsistent_value",
136
+ "row:10,col:response,issue:inconsistent_value",
137
+ "row:11,col:category,issue:inconsistent_value",
138
+ "row:15,col:language,issue:inconsistent_value",
139
+ "row:19,col:source,issue:format_violation",
140
+ "row:3,col:token_count,issue:inconsistent_value",
141
+ "row:17,col:instruction,issue:missing_value",
142
+ "row:21,col:instruction,issue:duplicate_row",
143
+ "row:23,col:response,issue:missing_value",
144
+ "row:24,col:quality_label,issue:inconsistent_value",
145
+ "row:8,col:response,issue:inconsistent_value",
146
+ ],
147
+ "fixes": [
148
+ "row:6,col:response,fix:Buenos dias. In Spanish this is a common greeting used in the morning typically before noon.",
149
+ "row:10,col:response,fix:The capital of Japan is Tokyo.",
150
+ "row:11,col:category,fix:coding",
151
+ "row:19,col:source,fix:human",
152
+ "row:3,col:token_count,fix:12",
153
+ "row:4,col:quality_label,fix:bad",
154
+ "row:24,col:quality_label,fix:bad",
155
+ ],
156
+ },
157
+ ],
158
  }
159
 
160
 
dataqa_env/server/tasks.py CHANGED
@@ -486,6 +486,174 @@ EXP-030,llama2-13b,oasst1,84437,4401,4401,0.00001,2,3,0.78,0.88,0.0,52.0,12.0,20
486
  )
487
 
488
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
  # ---------------------------------------------------------------------------
490
  # Contamination rules for extensible task creation
491
  # ---------------------------------------------------------------------------
@@ -611,6 +779,7 @@ TASK_REGISTRY = {
611
  "easy": create_task_easy,
612
  "medium": create_task_medium,
613
  "hard": create_task_hard,
 
614
  }
615
 
616
 
 
486
  )
487
 
488
 
489
+ # ---------------------------------------------------------------------------
490
+ # TASK 4: Expert — LLM Alignment / Instruction-tuning data quality
491
+ # ---------------------------------------------------------------------------
492
+
493
def create_task_alignment(seed: int = 42) -> Task:
    """Build the expert-level LLM instruction-tuning data QA task.

    Starts from a clean 25-row CSV, applies 12 fixed corruptions to it and
    records one ``PlantedIssue`` per corruption.

    Args:
        seed: Currently unused. Every corruption below is hard-coded, so the
            output is the same for any seed. The parameter is kept so
            existing callers that pass it keep working.

    Returns:
        A ``Task`` with ``task_id="alignment"`` carrying the clean CSV, the
        corrupted CSV, the schema/rules text and the list of planted issues.
    """
    # NOTE: fields are unquoted, so no field may contain a comma. The email
    # regex in row 20 uses ``[a-zA-Z][a-zA-Z]+`` instead of ``[a-zA-Z]{2,}``
    # because the comma in the quantifier split the response into two fields
    # and shifted every later column of that row.
    clean_csv = """id,instruction,response,category,quality_label,source,language,token_count
1,Explain the difference between a list and a tuple in Python.,A list is mutable (can be changed after creation) while a tuple is immutable (cannot be changed). Lists use square brackets [] and tuples use parentheses (). Lists are slightly slower due to mutability overhead. Use tuples for fixed collections and lists when you need to modify elements.,coding,good,human,en,52
2,What is photosynthesis?,Photosynthesis is the process by which green plants convert sunlight into chemical energy. Plants absorb carbon dioxide and water then use light energy to produce glucose and oxygen. The equation is: 6CO2 + 6H2O + light -> C6H12O6 + 6O2.,science,good,human,en,48
3,Write a haiku about autumn.,Crimson leaves descend / Whispering winds carry change / Nature rests again,creative,good,human,en,12
4,How do I make a HTTP request in JavaScript?,You can use the fetch API: fetch('https://api.example.com/data').then(response => response.json()).then(data => console.log(data)).catch(error => console.error(error)). Alternatively use XMLHttpRequest or libraries like axios.,coding,good,human,en,38
5,What causes earthquakes?,Earthquakes occur when tectonic plates along fault lines suddenly slip past each other. The energy released travels as seismic waves. Most earthquakes happen at plate boundaries. The magnitude is measured on the Richter scale or moment magnitude scale.,science,good,human,en,42
6,Translate 'Good morning' to Spanish.,Buenos dias. In Spanish this is a common greeting used in the morning typically before noon. The literal translation is 'good days' (plural).,translation,good,human,en,26
7,What is the time complexity of binary search?,Binary search has O(log n) time complexity where n is the number of elements. It works by repeatedly dividing the search interval in half. It requires the input array to be sorted. Space complexity is O(1) for iterative and O(log n) for recursive.,coding,good,human,en,46
8,Summarize the theory of relativity.,Einstein's theory of relativity consists of special relativity (1905) and general relativity (1915). Special relativity states that the speed of light is constant and introduces E=mc2. General relativity describes gravity as the curvature of spacetime caused by mass and energy.,science,good,human,en,44
9,Write a professional email declining a meeting invitation.,Subject: Re: Meeting Request - Unable to Attend. Dear [Name] thank you for the invitation. Unfortunately I have a prior commitment at that time. Could we reschedule to later this week? I'm available Thursday or Friday afternoon. Best regards [Your Name],business,good,human,en,42
10,What is the capital of Japan?,The capital of Japan is Tokyo. It has been the capital since 1868 when Emperor Meiji moved the imperial seat from Kyoto. Tokyo is the most populous metropolitan area in the world with over 37 million people.,geography,good,human,en,38
11,How does a neural network learn?,A neural network learns through backpropagation and gradient descent. During forward pass input data flows through layers of neurons. The loss function measures prediction error. Backpropagation computes gradients and weights are updated to minimize loss over many iterations.,coding,good,human,en,40
12,What are the benefits of meditation?,Meditation reduces stress and anxiety improves focus and concentration enhances emotional well-being promotes better sleep and can lower blood pressure. Regular practice of even 10 minutes daily has been shown to produce measurable benefits in brain structure and function.,health,good,human,en,40
13,Explain blockchain in simple terms.,Blockchain is a shared digital ledger that records transactions across many computers. Once recorded data cannot be altered without changing all subsequent blocks. This makes it secure and transparent. Bitcoin was the first major application of blockchain technology.,technology,good,human,en,38
14,What is the difference between machine learning and deep learning?,Machine learning is a subset of AI where models learn from data. Deep learning is a subset of ML using neural networks with many layers. ML often requires manual feature engineering while deep learning automatically discovers features. Deep learning needs more data and compute.,coding,good,human,en,44
15,Write a short product description for wireless earbuds.,Experience crystal-clear audio with our premium wireless earbuds. Featuring active noise cancellation 8-hour battery life and IPX5 water resistance. Seamless Bluetooth 5.3 connectivity with touch controls. Compact charging case provides 3 additional charges.,business,good,human,en,36
16,What causes climate change?,Climate change is primarily caused by greenhouse gas emissions from burning fossil fuels deforestation and industrial processes. CO2 methane and nitrous oxide trap heat in the atmosphere causing global temperatures to rise. Human activities have increased CO2 levels by over 50% since pre-industrial times.,science,good,human,en,48
17,How do I center a div in CSS?,Use flexbox on the parent: display flex; justify-content center; align-items center; height 100vh. Alternatively use CSS Grid: display grid; place-items center. For older browsers use position absolute with transform translate(-50% -50%).,coding,good,human,en,36
18,What is cognitive behavioral therapy?,CBT is a type of psychotherapy that helps identify and change negative thought patterns. It focuses on the connection between thoughts feelings and behaviors. CBT is evidence-based and effective for depression anxiety PTSD and other conditions. Treatment typically lasts 12-20 sessions.,health,good,human,en,42
19,Explain the water cycle.,The water cycle describes continuous movement of water on Earth. Water evaporates from oceans and lakes rises as vapor condenses into clouds and falls as precipitation. Water then flows through rivers into oceans or seeps into groundwater completing the cycle.,science,good,human,en,40
20,Write a regex to validate an email address.,A basic email regex: ^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z][a-zA-Z]+$ This matches standard email formats with alphanumeric characters before @ a domain name and a TLD of 2+ chars. Note: perfect email validation via regex is extremely complex.,coding,good,human,en,38
21,What are the SOLID principles in software engineering?,SOLID stands for: Single Responsibility (one reason to change) Open/Closed (open for extension closed for modification) Liskov Substitution (subtypes must be substitutable) Interface Segregation (specific interfaces over general) and Dependency Inversion (depend on abstractions).,coding,good,human,en,40
22,Describe the process of making sourdough bread.,Mix flour and water for a starter and feed daily for 5-7 days. Combine starter with flour water and salt. Stretch and fold every 30 min for 2 hours. Bulk ferment 4-6 hours. Shape and cold proof overnight. Bake in Dutch oven at 450F for 45 min.,cooking,good,human,en,48
23,What is quantum computing?,Quantum computing uses quantum bits (qubits) that can exist in superposition of 0 and 1 simultaneously. This enables parallel processing of many states at once. Quantum entanglement and interference allow solving certain problems exponentially faster than classical computers.,technology,good,human,en,38
24,How do I handle errors in Python?,Use try/except blocks: try risky_operation() except ValueError as e handle_error(e) except Exception as e general_handler(e) finally cleanup(). You can also create custom exceptions by subclassing Exception. Use specific exception types rather than bare except.,coding,good,human,en,36
25,What is the GDP of the United States?,As of 2024 the US GDP is approximately $28.8 trillion making it the world's largest economy. The US accounts for about 26% of global GDP. Major sectors include services (80%) industry (19%) and agriculture (1%).,geography,good,human,en,40"""

    schema_desc = """Columns:
- id: integer, unique, sequential starting from 1
- instruction: string, non-empty, clear task or question for the LLM
- response: string, non-empty, must directly address the instruction
- category: string, one of [coding, science, creative, translation, business, health, technology, geography, cooking]
- quality_label: string, one of [good, bad, mixed] — must accurately reflect response quality
- source: string, one of [human, gpt4, gpt3.5, claude, llama, synthetic]
- language: string, ISO 639-1 code, must match the actual language of instruction and response
- token_count: integer, positive, should approximately match actual response token count (tolerance: 30%)"""

    rules = """1. No missing or empty values in any column
2. id must be unique and sequential
3. response must directly answer/address the instruction (not a different topic)
4. quality_label must accurately reflect the response quality (a clearly wrong or harmful response should not be labeled 'good')
5. category must match the actual topic of the instruction
6. language code must match the actual language used in instruction and response
7. token_count should approximately match actual response length
8. source must be from the allowed set
9. No duplicate instructions (same or near-identical questions)
10. Responses should not contain harmful, biased, or factually incorrect information if labeled 'good'"""

    rows = _csv_to_rows(clean_csv)
    header = rows[0]
    data = rows[1:]
    issues: List[PlantedIssue] = []

    # Column indices used below: 1=instruction, 2=response, 3=category,
    # 5=source, 7=token_count. ``r`` is the 0-based index into ``data``, so
    # the reported row number ``r + 1`` equals the row's ``id``.
    #
    # NOTE(review): responses replaced below keep their original token_count
    # (e.g. row 4 "I don't know." still claims 38), which may itself look like
    # a rule-7 violation that is not in ``issues`` — confirm this is intended.

    # Issue 1: Response doesn't match instruction (instruction-response mismatch)
    # Row 6 asks to translate to Spanish, replace response with a cooking recipe
    r = 5  # id=6, translation task
    data[r][2] = "To make pasta start by boiling water in a large pot. Add salt generously. Cook pasta for 8-10 minutes until al dente. Drain and serve with your favorite sauce."
    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
                               description="Response is about cooking pasta but instruction asks to translate 'Good morning' to Spanish",
                               difficulty=2.0))

    # Issue 2: Factual error in a 'good' labeled response
    # Row 10 says Tokyo is the capital — change it to a factually wrong city
    r = 9  # id=10, geography
    data[r][2] = "The capital of Japan is Osaka. It has been the capital since 1868 when Emperor Meiji established the government there. Osaka is known for its street food and castle."
    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
                               description="Response states capital of Japan is Osaka (incorrect — it's Tokyo) but quality_label is 'good'",
                               difficulty=2.5))

    # Issue 3: quality_label wrong — clearly bad response marked 'good'
    r = 3  # id=4, coding
    data[r][2] = "I don't know."
    # The apostrophe needs no escaping inside a double-quoted string; the
    # previous ``\\'`` left a literal backslash in the description text.
    issues.append(PlantedIssue(row=r + 1, col="quality_label", issue_type="inconsistent_value",
                               description="Response 'I don't know' for a coding question is labeled 'good' — should be 'bad'",
                               difficulty=2.0))

    # Issue 4: Category mismatch — coding instruction labeled as 'cooking'
    r = 10  # id=11, neural network (is coding)
    data[r][3] = "cooking"
    issues.append(PlantedIssue(row=r + 1, col="category", issue_type="inconsistent_value",
                               description="Instruction about neural networks is categorized as 'cooking' — should be 'coding'",
                               difficulty=1.5))

    # Issue 5: Language mismatch — response is in French but language='en'
    r = 14  # id=15, business
    data[r][2] = "Decouvrez nos ecouteurs sans fil premium. Son cristallin avec reduction de bruit active. Autonomie de 8 heures et resistance a l'eau IPX5. Connectivite Bluetooth 5.3 avec commandes tactiles."
    issues.append(PlantedIssue(row=r + 1, col="language", issue_type="inconsistent_value",
                               description="Response is in French but language field is 'en'",
                               difficulty=2.0))

    # Issue 6: Duplicate instruction (identical to another row)
    r = 20  # id=21, SOLID principles
    data[r][1] = "Explain the difference between a list and a tuple in Python."  # duplicate of row 1
    issues.append(PlantedIssue(row=r + 1, col="instruction", issue_type="duplicate_row",
                               description="Instruction is identical to row 1 — duplicate training example",
                               difficulty=1.5))

    # Issue 7: Token count way off — response is ~12 tokens but claims 200
    r = 2  # id=3, haiku
    data[r][7] = "200"  # haiku is only ~12 tokens
    issues.append(PlantedIssue(row=r + 1, col="token_count", issue_type="inconsistent_value",
                               description="token_count is 200 but haiku response is only ~12 tokens (>30% tolerance exceeded)",
                               difficulty=1.5))

    # Issue 8: Source is invalid value
    r = 18  # id=19, water cycle
    data[r][5] = "gemini"  # not in allowed set
    issues.append(PlantedIssue(row=r + 1, col="source", issue_type="format_violation",
                               description="Source 'gemini' is not in allowed set [human, gpt4, gpt3.5, claude, llama, synthetic]",
                               difficulty=1.0))

    # Issue 9: Truncated response (ends mid-sentence — subtle data pipeline issue)
    r = 22  # id=23, quantum computing
    data[r][2] = "Quantum computing uses quantum bits (qubits) that can exist in superposition of 0 and 1 simultaneously. This enables parallel processing of"
    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="missing_value",
                               description="Response is truncated mid-sentence ('processing of') — likely data pipeline truncation error",
                               difficulty=2.5))

    # Issue 10: Unsafe/harmful content in a 'good' labeled response
    r = 23  # id=24, Python error handling
    data[r][2] = "Just use bare except: blocks everywhere and ignore all errors. If something crashes just restart the program. Error handling is a waste of time for real developers. Anyone who writes proper error handling is overthinking it."
    issues.append(PlantedIssue(row=r + 1, col="quality_label", issue_type="inconsistent_value",
                               description="Response gives deliberately bad advice (bare except, ignore errors) but is labeled 'good' — harmful for training",
                               difficulty=3.0))

    # Issue 11: Empty instruction (whitespace only — data pipeline artifact)
    r = 16  # id=17, CSS
    data[r][1] = " "
    issues.append(PlantedIssue(row=r + 1, col="instruction", issue_type="missing_value",
                               description="Instruction is whitespace-only — unusable training example",
                               difficulty=2.0))

    # Issue 12: Response contains hallucinated citation
    r = 7  # id=8, theory of relativity
    data[r][2] = "According to a 2023 study published in Nature by Dr. James Smith at MIT Einstein's theory was proven wrong. The speed of light is actually variable and E=mc2 only applies in a vacuum. Smith's team demonstrated this using quantum entanglement experiments."
    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
                               description="Response contains hallucinated citation (fake study by fake 'Dr. James Smith') contradicting established physics — dangerous for training",
                               difficulty=3.0))

    corrupted = _rows_to_csv([header] + data)

    return Task(
        task_id="alignment",
        name="LLM Alignment Data Quality Validation",
        description=(
            "You are given an LLM instruction-tuning dataset used for fine-tuning. "
            "Find all data quality issues that would degrade model training. "
            "Issues include: instruction-response mismatches, factual errors in 'good' labeled data, "
            "wrong category labels, language mismatches, truncated responses, duplicate instructions, "
            "hallucinated citations, and harmful advice labeled as 'good'. "
            "Report each issue in the format: row:<row_number>,col:<column_name>,issue:<issue_type>"
        ),
        schema_description=schema_desc,
        validation_rules=rules,
        clean_csv=clean_csv,
        planted_issues=issues,
        corrupted_csv=corrupted,
        max_steps=3,
    )
655
+
656
+
657
  # ---------------------------------------------------------------------------
658
  # Contamination rules for extensible task creation
659
  # ---------------------------------------------------------------------------
 
779
  "easy": create_task_easy,
780
  "medium": create_task_medium,
781
  "hard": create_task_hard,
782
+ "alignment": create_task_alignment,
783
  }
784
 
785
 
inference.py CHANGED
@@ -39,7 +39,7 @@ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
39
  ENV_URL = os.getenv("ENV_URL", "http://localhost:8000")
40
 
41
  BENCHMARK = "dataqa_env"
42
- TASKS = ["easy", "medium", "hard"]
43
  MAX_STEPS_PER_TASK = 3
44
 
45
 
 
39
  ENV_URL = os.getenv("ENV_URL", "http://localhost:8000")
40
 
41
  BENCHMARK = "dataqa_env"
42
+ TASKS = ["easy", "medium", "hard", "alignment"]
43
  MAX_STEPS_PER_TASK = 3
44
 
45
 
tests/test_tasks.py CHANGED
@@ -134,10 +134,61 @@ class TestTaskHard:
134
  assert len(keys) == len(set(keys))
135
 
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  class TestTaskRegistry:
138
  def test_list_tasks(self):
139
  tasks = list_tasks()
140
- assert set(tasks) == {"easy", "medium", "hard"}
141
 
142
  def test_get_task_easy(self):
143
  task = get_task("easy")
 
134
  assert len(keys) == len(set(keys))
135
 
136
 
137
class TestTaskAlignment:
    """Checks for the ``alignment`` task: size, issue types, difficulty, env scoring."""

    @pytest.fixture
    def task(self):
        # Build the task under test. The fixture previously returned
        # create_task_hard(), which is a different task, and no test used it.
        from dataqa_env.server.tasks import get_task
        return get_task("alignment")

    def test_alignment_task(self, task):
        assert task.task_id == "alignment"
        assert len(task.planted_issues) == 12

    def test_alignment_issue_types(self, task):
        types = {i.issue_type for i in task.planted_issues}
        assert "inconsistent_value" in types
        assert "format_violation" in types
        assert "missing_value" in types
        assert "duplicate_row" in types

    def test_alignment_has_high_difficulty(self, task):
        hard_issues = [i for i in task.planted_issues if i.difficulty >= 2.5]
        assert len(hard_issues) >= 3  # hallucinated citation, harmful advice, factual error

    def test_alignment_issue_keys_unique(self, task):
        keys = [i.to_key() for i in task.planted_issues]
        assert len(keys) == len(set(keys))

    def test_alignment_corrupted_differs(self, task):
        assert task.corrupted_csv != task.clean_csv

    def test_alignment_in_env(self, task):
        from dataqa_env.server.environment import DataQAEnvironment
        from dataqa_env.models import DataQAAction
        env = DataQAEnvironment()
        obs = env.reset(task_id="alignment")
        assert obs.num_issues_hint == 12
        # Perfect submission: report exactly the planted issue keys.
        action = DataQAAction(issues=[i.to_key() for i in task.planted_issues], task_id="alignment")
        obs = env.step(action)
        assert obs.reward >= 0.99
186
+
187
+
188
  class TestTaskRegistry:
189
  def test_list_tasks(self):
190
  tasks = list_tasks()
191
+ assert set(tasks) == {"easy", "medium", "hard", "alignment"}
192
 
193
  def test_get_task_easy(self):
194
  task = get_task("easy")