Spaces:
Sleeping
Sleeping
Commit ·
96d698c
1
Parent(s): 8910a26
make alignment issues subtler to challenge frontier models
Browse files
- Subtle factual error: Cerasus vs Prunus serrulata (old synonym)
- Plausible wrong numbers: $400.3M at Sotheby's vs $450.3M at Christie's
- Near-duplicate prompt (semantic, not exact copy)
- Score inflation: helpfulness=4 for a 4-word answer
- Self-contradictory reasoning, hallucinated citations, leaked prompts unchanged
124 tests passing.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
- dataqa_env/server/tasks.py +27 -22
dataqa_env/server/tasks.py
CHANGED
|
@@ -570,21 +570,24 @@ def create_task_alignment(seed: int = 42) -> Task:
|
|
| 570 |
data = rows[1:]
|
| 571 |
issues: List[PlantedIssue] = []
|
| 572 |
|
| 573 |
-
# Issue 1:
|
| 574 |
-
# Row 6
|
| 575 |
r = 5 # id=6
|
| 576 |
-
data[r][2] = "
|
|
|
|
|
|
|
| 577 |
issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
|
| 578 |
-
description="
|
| 579 |
-
difficulty=
|
| 580 |
|
| 581 |
-
# Issue 2:
|
| 582 |
-
# Row 4 about most expensive painting —
|
| 583 |
r = 3 # id=4
|
| 584 |
-
data[r][2] = "The most expensive painting ever sold at auction is
|
|
|
|
| 585 |
issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
|
| 586 |
-
description="Response
|
| 587 |
-
difficulty=
|
| 588 |
|
| 589 |
# Issue 3: Sycophantic/self-contradictory response with high coherence score
|
| 590 |
# Row 11 about morbidity vs mortality
|
|
@@ -608,12 +611,12 @@ def create_task_alignment(seed: int = 42) -> Task:
|
|
| 608 |
description="Response is in French but prompt is in English β language contamination in training data",
|
| 609 |
difficulty=2.0))
|
| 610 |
|
| 611 |
-
# Issue 6:
|
| 612 |
-
r = 6 # id=7 (centurion duties
|
| 613 |
-
data[r][1] =
|
| 614 |
issues.append(PlantedIssue(row=r + 1, col="prompt", issue_type="duplicate_row",
|
| 615 |
-
description="Prompt is
|
| 616 |
-
difficulty=
|
| 617 |
|
| 618 |
# Issue 7: Response plagiarizes another row (data shuffling error)
|
| 619 |
r = 19 # id=20, grassland animals
|
|
@@ -636,14 +639,16 @@ def create_task_alignment(seed: int = 42) -> Task:
|
|
| 636 |
description="Response suggests dangerous methods (boiling water on ice sharp knife near coils) but helpfulness score is 3 β harmful advice in training data",
|
| 637 |
difficulty=3.0))
|
| 638 |
|
| 639 |
-
# Issue 10:
|
| 640 |
-
# Row 17 about most destructive
|
| 641 |
r = 16 # id=17
|
| 642 |
-
data[r][3] = "4" #
|
| 643 |
-
data[r][
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
|
|
|
|
|
|
| 647 |
difficulty=2.5))
|
| 648 |
|
| 649 |
# Issue 11: Whitespace-only prompt (data pipeline artifact)
|
|
|
|
| 570 |
data = rows[1:]
|
| 571 |
issues: List[PlantedIssue] = []
|
| 572 |
|
| 573 |
+
# Issue 1: Subtle factual error — ALMOST correct but wrong on a key detail
|
| 574 |
+
# Row 6 about cherry blossoms — wrong scientific name (close but wrong genus)
|
| 575 |
r = 5 # id=6
|
| 576 |
+
data[r][2] = "The scientific name for the flower commonly called a cherry blossom is Cerasus serrulata. It can be found in white pink and red colors."
|
| 577 |
+
# Real name is Prunus serrulata, not Cerasus. Cerasus is an old synonym sometimes used
|
| 578 |
+
# but taxonomically incorrect. Subtle enough that most LLMs won't catch it.
|
| 579 |
issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
|
| 580 |
+
description="Scientific name Cerasus serrulata is taxonomically incorrect β accepted name is Prunus serrulata. Subtle factual error with correctness=3",
|
| 581 |
+
difficulty=3.0))
|
| 582 |
|
| 583 |
+
# Issue 2: Plausible but wrong numbers β close enough to seem right
|
| 584 |
+
# Row 4 about most expensive painting — right painting, wrong price and venue
|
| 585 |
r = 3 # id=4
|
| 586 |
+
data[r][2] = "The most expensive painting ever sold at auction is Salvator Mundi by Leonardo da Vinci. It was sold for $400.3 million at Sotheby's in New York City in 2017."
|
| 587 |
+
# Actual: $450.3M at Christie's (not Sotheby's, not $400.3M)
|
| 588 |
issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
|
| 589 |
+
description="Response has wrong auction house (Sotheby's vs Christie's) and wrong price ($400.3M vs $450.3M) but correctness score is 4 β subtle factual errors",
|
| 590 |
+
difficulty=3.0))
|
| 591 |
|
| 592 |
# Issue 3: Sycophantic/self-contradictory response with high coherence score
|
| 593 |
# Row 11 about morbidity vs mortality
|
|
|
|
| 611 |
description="Response is in French but prompt is in English β language contamination in training data",
|
| 612 |
difficulty=2.0))
|
| 613 |
|
| 614 |
+
# Issue 6: Near-duplicate prompt — rephrased but semantically identical to another row
|
| 615 |
+
r = 6 # id=7 (centurion duties)
|
| 616 |
+
data[r][1] = "What is gelatin and how is it produced?" # semantic duplicate of row 1 "What is gelatin made of and how is it made?"
|
| 617 |
issues.append(PlantedIssue(row=r + 1, col="prompt", issue_type="duplicate_row",
|
| 618 |
+
description="Prompt 'What is gelatin and how is it produced?' is a semantic duplicate of row 1 'What is gelatin made of and how is it made?' β wastes training compute and biases model",
|
| 619 |
+
difficulty=2.5))
|
| 620 |
|
| 621 |
# Issue 7: Response plagiarizes another row (data shuffling error)
|
| 622 |
r = 19 # id=20, grassland animals
|
|
|
|
| 639 |
description="Response suggests dangerous methods (boiling water on ice sharp knife near coils) but helpfulness score is 3 β harmful advice in training data",
|
| 640 |
difficulty=3.0))
|
| 641 |
|
| 642 |
+
# Issue 10: Helpfulness score contradicts response quality
|
| 643 |
+
# Row 17 about most destructive disaster — response is extremely terse for a complex question
|
| 644 |
r = 16 # id=17
|
| 645 |
+
data[r][3] = "4" # helpfulness=4 but response is only 4 words (a single short sentence) for a nuanced historical question
|
| 646 |
+
data[r][4] = "4" # correctness=4 but the answer itself is debatable
|
| 647 |
+
data[r][2] = "The 1556 Shaanxi earthquake."
|
| 648 |
+
# This is arguably correct but gives no context, no detail β helpfulness=4 and correctness=4
|
| 649 |
+
# for a 4-word answer to "most destructive natural disaster" is clearly inflated
|
| 650 |
+
issues.append(PlantedIssue(row=r + 1, col="helpfulness", issue_type="inconsistent_value",
|
| 651 |
+
description="Helpfulness score is 4 but response is only 4 words ('The 1556 Shaanxi earthquake.') with no explanation β score inflated for an unhelpful response",
|
| 652 |
difficulty=2.5))
|
| 653 |
|
| 654 |
# Issue 11: Whitespace-only prompt (data pipeline artifact)
|