v2.0: Combined with cgrt-consensus-5model data (8,050 disagreements, 1,556 contested)
Browse files- .gitattributes +3 -0
- README.md +121 -133
- create_combined_dataset.py +209 -0
- data/combined_summary.json +17 -0
- data/combined_test.jsonl +3 -0
- data/goodhart_contested.jsonl +3 -0
- data/goodhart_disagreements.jsonl +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/combined_test.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
data/goodhart_contested.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
data/goodhart_disagreements.jsonl filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -13,8 +13,10 @@ tags:
|
|
| 13 |
- llm-evaluation
|
| 14 |
- goodhart
|
| 15 |
- execution-vs-understanding
|
|
|
|
|
|
|
| 16 |
size_categories:
|
| 17 |
-
- n<
|
| 18 |
---
|
| 19 |
|
| 20 |
# Goodhart Gap Benchmark
|
|
@@ -25,182 +27,174 @@ size_categories:
|
|
| 25 |
|
| 26 |
The Goodhart Gap Benchmark tests whether language models can correctly *execute* multi-step reasoning tasks that they can correctly *explain*. Named after Goodhart's Law ("When a measure becomes a target, it ceases to be a good measure"), this benchmark reveals a critical failure mode: models that understand procedures but fail to execute them.
|
| 27 |
|
| 28 |
-
##
|
| 29 |
-
|
| 30 |
-
In our testing of 15+ models:
|
| 31 |
-
- **gpt-4o**: 57% pass rate (fails on financial, scheduling, units)
|
| 32 |
-
- **gpt-4o-mini**: 36% pass rate
|
| 33 |
-
- **Claude 3.5 Haiku**: 93% pass rate
|
| 34 |
-
- **Llama 3.1 70B**: Fails the canonical discount calculation despite correct explanation
|
| 35 |
-
|
| 36 |
-
## The Canonical Example
|
| 37 |
-
|
| 38 |
-
**Problem**: "If a shirt costs $25 and is on 20% sale, and you have a $5 coupon, what do you pay?"
|
| 39 |
|
| 40 |
-
|
| 41 |
|
| 42 |
-
|
|
|
|
| 43 |
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
-
|
| 47 |
|
|
|
|
| 48 |
| Metric | Value |
|
| 49 |
|--------|-------|
|
| 50 |
| Total problems | 101 |
|
| 51 |
| Domains | 12 |
|
| 52 |
-
|
|
| 53 |
-
| Steps per problem | 2-6 |
|
| 54 |
|
| 55 |
-
|
| 56 |
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|--------|-------|-------------|
|
| 61 |
-
| math_discount | 15 | Discounts, coupons, taxes, markups |
|
| 62 |
-
| time | 13 | Duration arithmetic, travel times |
|
| 63 |
-
| financial | 10 | Interest, taxes, commissions |
|
| 64 |
-
| logic | 8 | Ordering, deduction, set operations |
|
| 65 |
-
| recipe | 7 | Scaling, unit conversion |
|
| 66 |
-
| scheduling | 7 | Task dependencies, work rates |
|
| 67 |
-
| units | 7 | Unit conversion with operations |
|
| 68 |
-
|
| 69 |
-
**Non-Numerical Domains (34 problems)**
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|--------|-------|-------------|
|
| 73 |
-
| spatial | 7 | Direction tracking, grid navigation, relative positions |
|
| 74 |
-
| procedural | 6 | State machines, undo/redo, procedure following |
|
| 75 |
-
| text | 7 | String manipulation, encoding, word operations |
|
| 76 |
-
| sequence | 7 | Pattern recognition (letters, symbols, words) |
|
| 77 |
-
| causal | 7 | Cause-effect chains, counterfactuals, necessary/sufficient |
|
| 78 |
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|------------|-------|-------------|
|
| 83 |
-
| Easy | 28 | 2 steps, straightforward |
|
| 84 |
-
| Medium | 32 | 2-3 steps, some complexity |
|
| 85 |
-
| Hard | 7 | 3-4 steps, multiple operations |
|
| 86 |
|
| 87 |
## Data Format
|
| 88 |
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
|
|
|
| 91 |
```json
|
| 92 |
{
|
| 93 |
"id": "math_discount_01",
|
| 94 |
"domain": "math_discount",
|
| 95 |
-
"problem": "A product costs $25 and is on 20% sale
|
| 96 |
"correct_answer": "15",
|
| 97 |
"explanation": "25 × 0.8 = 20.0, then 20.0 - 5 = 15.0",
|
| 98 |
-
"
|
| 99 |
"difficulty": "easy",
|
| 100 |
"steps": 2
|
| 101 |
}
|
| 102 |
```
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
| 107 |
-
|
| 108 |
-
|
|
| 109 |
-
|
|
| 110 |
-
|
|
| 111 |
-
|
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
## Usage
|
| 118 |
|
| 119 |
### Quick Evaluation
|
| 120 |
-
|
| 121 |
```bash
|
| 122 |
-
#
|
| 123 |
-
|
| 124 |
|
| 125 |
-
# Evaluate
|
| 126 |
-
python evaluate.py --provider openai --model gpt-4o
|
| 127 |
-
|
| 128 |
-
# Evaluate Claude model
|
| 129 |
-
python evaluate.py --provider anthropic --model claude-3-5-haiku-latest -v
|
| 130 |
-
|
| 131 |
-
# Evaluate local Ollama model
|
| 132 |
-
python evaluate.py --provider ollama --model llama3.1:8b -v
|
| 133 |
```
|
| 134 |
|
| 135 |
### Python API
|
| 136 |
-
|
| 137 |
```python
|
| 138 |
import json
|
| 139 |
|
| 140 |
-
# Load
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
# Validate response against expected
|
| 151 |
```
|
| 152 |
|
| 153 |
### With HuggingFace Datasets
|
| 154 |
-
|
| 155 |
```python
|
| 156 |
from datasets import load_dataset
|
| 157 |
|
| 158 |
-
dataset = load_dataset("
|
| 159 |
-
|
| 160 |
-
for example in dataset['test']:
|
| 161 |
-
print(example['problem'])
|
| 162 |
-
print(f"Expected: {example['correct_answer']}")
|
| 163 |
```
|
| 164 |
|
| 165 |
-
## Evaluation Criteria
|
| 166 |
-
|
| 167 |
-
A response is considered correct if:
|
| 168 |
-
1. **Numeric answers**: The expected number appears in the response (with tolerance for rounding)
|
| 169 |
-
2. **Time answers**: The expected time appears in any reasonable format (e.g., "4:45 PM", "4:45pm", "16:45")
|
| 170 |
-
3. **Yes/no answers**: The response clearly indicates yes, no, or "cannot determine"
|
| 171 |
-
4. **Ordering answers**: Items appear in the correct sequence
|
| 172 |
-
|
| 173 |
## Leaderboard
|
| 174 |
|
| 175 |
-
| Model | Provider | Pass Rate |
|
| 176 |
-
|
| 177 |
-
| Claude 3.5 Haiku | Anthropic | 93% |
|
| 178 |
-
| Claude Sonnet 4 | Anthropic | 79% |
|
| 179 |
-
| gpt-4o | OpenAI | 57% |
|
| 180 |
-
| gpt-4o-mini | OpenAI | 36% |
|
| 181 |
-
| Qwen 2.5 72B | Alibaba | TBD | - |
|
| 182 |
-
| Llama 3.1 70B | Meta | TBD | - |
|
| 183 |
-
|
| 184 |
-
*Submit your results via PR to add to the leaderboard*
|
| 185 |
|
| 186 |
## Why This Matters
|
| 187 |
|
| 188 |
### For AI Safety
|
| 189 |
-
Models
|
| 190 |
-
-
|
| 191 |
-
-
|
| 192 |
-
- A gap between capability benchmarks and deployment readiness
|
| 193 |
-
|
| 194 |
-
### For Model Selection
|
| 195 |
-
Not all models are equal for multi-step reasoning:
|
| 196 |
-
- Model family matters more than size
|
| 197 |
-
- Distilled models often lose this capability
|
| 198 |
-
- Test execution, not just explanation
|
| 199 |
|
| 200 |
### For Training
|
| 201 |
-
|
| 202 |
-
-
|
| 203 |
-
-
|
| 204 |
|
| 205 |
## Citation
|
| 206 |
|
|
@@ -209,27 +203,21 @@ The gap appears to be a training problem:
|
|
| 209 |
title={Goodhart Gap Benchmark: Detecting the Gap Between Understanding and Execution in LLMs},
|
| 210 |
author={Adam Kruger},
|
| 211 |
year={2026},
|
| 212 |
-
url={https://huggingface.co/datasets/Adam1010/goodhart-gap-benchmark}
|
|
|
|
| 213 |
}
|
| 214 |
```
|
| 215 |
|
| 216 |
-
##
|
| 217 |
|
| 218 |
-
|
| 219 |
|
| 220 |
-
##
|
| 221 |
-
|
| 222 |
-
We welcome contributions:
|
| 223 |
-
- New test cases in underrepresented domains
|
| 224 |
-
- Results from additional models
|
| 225 |
-
- Improved validators
|
| 226 |
-
- Translations to other languages
|
| 227 |
|
| 228 |
-
|
| 229 |
|
| 230 |
## Acknowledgments
|
| 231 |
|
| 232 |
-
|
|
|
|
| 233 |
- Goodhart's Law and its application to AI evaluation
|
| 234 |
-
- Work on multi-step reasoning in LLMs
|
| 235 |
-
- The distinction between System 1 and System 2 thinking
|
|
|
|
| 13 |
- llm-evaluation
|
| 14 |
- goodhart
|
| 15 |
- execution-vs-understanding
|
| 16 |
+
- consensus
|
| 17 |
+
- multi-model
|
| 18 |
size_categories:
|
| 19 |
+
- 1K<n<10K
|
| 20 |
---
|
| 21 |
|
| 22 |
# Goodhart Gap Benchmark
|
|
|
|
| 27 |
|
| 28 |
The Goodhart Gap Benchmark tests whether language models can correctly *execute* multi-step reasoning tasks that they can correctly *explain*. Named after Goodhart's Law ("When a measure becomes a target, it ceases to be a good measure"), this benchmark reveals a critical failure mode: models that understand procedures but fail to execute them.
|
| 29 |
|
| 30 |
+
## Data Sources
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
+
This benchmark combines two data sources:
|
| 33 |
|
| 34 |
+
### 1. CGRT Consensus Dataset (Primary)
|
| 35 |
+
**Source**: [Adam1010/cgrt-consensus-5model](https://huggingface.co/datasets/Adam1010/cgrt-consensus-5model)
|
| 36 |
|
| 37 |
+
| Metric | Value |
|
| 38 |
+
|--------|-------|
|
| 39 |
+
| Total problems | 61,678 |
|
| 40 |
+
| Models queried | 5 (Claude, GPT-4 — stored under the `codex` key, Gemini, DeepSeek, Qwen) |
|
| 41 |
+
| API cost | ~$1,000 |
|
| 42 |
+
| Disagreement cases | 8,050 |
|
| 43 |
+
| Contested (strongest) | 1,556 |
|
| 44 |
|
| 45 |
+
Each problem includes full reasoning traces from 5 frontier models, enabling analysis of where execution diverges despite similar understanding.
|
| 46 |
|
| 47 |
+
### 2. Programmatic Multi-Domain Problems
|
| 48 |
| Metric | Value |
|
| 49 |
|--------|-------|
|
| 50 |
| Total problems | 101 |
|
| 51 |
| Domains | 12 |
|
| 52 |
+
| Cost | $0 (generated) |
|
|
|
|
| 53 |
|
| 54 |
+
## Dataset Files
|
| 55 |
|
| 56 |
+
| File | Description | Count |
|
| 57 |
+
|------|-------------|-------|
|
| 58 |
+
| `combined_test.jsonl` | Main evaluation set (contested + programmatic) | 1,657 |
|
| 59 |
+
| `goodhart_disagreements.jsonl` | All disagreement cases from consensus | 8,050 |
|
| 60 |
+
| `goodhart_contested.jsonl` | Strongest Goodhart Gap cases | 1,556 |
|
| 61 |
+
| `test.jsonl` | Programmatic problems only | 101 |
|
| 62 |
|
| 63 |
+
## Key Finding
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
+
**Models that consistently show chain-of-thought execute correctly; models that give quick answers fail.**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
+
| Model | Financial Domain | Behavior |
|
| 68 |
+
|-------|------------------|----------|
|
| 69 |
+
| Claude 3.5 Haiku | 100% | Always shows work |
|
| 70 |
+
| Claude Sonnet 4 | 30% | Sometimes skips work |
|
| 71 |
+
| gpt-4o | 30% | Sometimes skips work |
|
| 72 |
+
| gpt-4o-mini | 0% | Usually skips work |
|
| 73 |
|
| 74 |
+
The financial domain (compound interest + tax) is the strongest Goodhart Gap detector.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
## Data Format
|
| 77 |
|
| 78 |
+
### Consensus-derived examples
|
| 79 |
+
```json
|
| 80 |
+
{
|
| 81 |
+
"id": "consensus_12345",
|
| 82 |
+
"domain": "math_consensus",
|
| 83 |
+
"problem": "A store sells apples for $2 each...",
|
| 84 |
+
"correct_answer": "15",
|
| 85 |
+
"source": "cgrt-consensus-5model",
|
| 86 |
+
"consensus_tier": "contested",
|
| 87 |
+
"model_responses": {
|
| 88 |
+
"claude": {"answer": "15", "response": "Step 1..."},
|
| 89 |
+
"codex": {"answer": "14", "response": "First..."},
|
| 90 |
+
"gemini": {"answer": "15", "response": "Let me..."},
|
| 91 |
+
"deepseek": {"answer": "16", "response": "..."},
|
| 92 |
+
"qwen": {"answer": "15", "response": "..."}
|
| 93 |
+
},
|
| 94 |
+
"difficulty": "hard"
|
| 95 |
+
}
|
| 96 |
+
```
|
| 97 |
|
| 98 |
+
### Programmatic examples
|
| 99 |
```json
|
| 100 |
{
|
| 101 |
"id": "math_discount_01",
|
| 102 |
"domain": "math_discount",
|
| 103 |
+
"problem": "A product costs $25 and is on 20% sale...",
|
| 104 |
"correct_answer": "15",
|
| 105 |
"explanation": "25 × 0.8 = 20.0, then 20.0 - 5 = 15.0",
|
| 106 |
+
"source": "programmatic",
|
| 107 |
"difficulty": "easy",
|
| 108 |
"steps": 2
|
| 109 |
}
|
| 110 |
```
|
| 111 |
|
| 112 |
+
## Consensus Tiers
|
| 113 |
+
|
| 114 |
+
| Tier | Description | Count |
|
| 115 |
+
|------|-------------|-------|
|
| 116 |
+
| **Gold** | All 5 models agree | 51,174 |
|
| 117 |
+
| **Silver** | 4/5 models agree | 5,766 |
|
| 118 |
+
| **Bronze** | 3/5 models agree | 3,182 |
|
| 119 |
+
| **Contested** | No majority (strongest Goodhart Gap) | 1,556 |
|
| 120 |
+
|
| 121 |
+
## Domains
|
| 122 |
+
|
| 123 |
+
### From Consensus Data
|
| 124 |
+
- Math word problems (GSM8K-style)
|
| 125 |
+
- Multi-step arithmetic
|
| 126 |
+
- Rate/ratio problems
|
| 127 |
+
|
| 128 |
+
### Programmatic Domains
|
| 129 |
+
| Domain | Count | Type |
|
| 130 |
+
|--------|-------|------|
|
| 131 |
+
| math_discount | 15 | Numerical |
|
| 132 |
+
| time | 13 | Numerical |
|
| 133 |
+
| financial | 10 | Numerical |
|
| 134 |
+
| logic | 8 | Numerical |
|
| 135 |
+
| recipe | 7 | Numerical |
|
| 136 |
+
| scheduling | 7 | Numerical |
|
| 137 |
+
| units | 7 | Numerical |
|
| 138 |
+
| spatial | 7 | Non-numerical |
|
| 139 |
+
| procedural | 6 | Non-numerical |
|
| 140 |
+
| text | 7 | Non-numerical |
|
| 141 |
+
| sequence | 7 | Non-numerical |
|
| 142 |
+
| causal | 7 | Non-numerical |
|
| 143 |
|
| 144 |
## Usage
|
| 145 |
|
| 146 |
### Quick Evaluation
|
|
|
|
| 147 |
```bash
|
| 148 |
+
# Evaluate on combined test set
|
| 149 |
+
python evaluate.py --provider anthropic --model claude-3-5-haiku-latest --dataset combined_test.jsonl
|
| 150 |
|
| 151 |
+
# Evaluate on contested only (hardest)
|
| 152 |
+
python evaluate.py --provider openai --model gpt-4o --dataset goodhart_contested.jsonl
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
```
|
| 154 |
|
| 155 |
### Python API
|
|
|
|
| 156 |
```python
|
| 157 |
import json
|
| 158 |
|
| 159 |
+
# Load combined test set
|
| 160 |
+
with open('data/combined_test.jsonl') as f:
|
| 161 |
+
problems = [json.loads(line) for line in f]
|
| 162 |
+
|
| 163 |
+
# Analyze consensus examples with model responses
|
| 164 |
+
for p in problems:
|
| 165 |
+
if p.get('source') == 'cgrt-consensus-5model':
|
| 166 |
+
# Has full model reasoning traces
|
| 167 |
+
for model, data in p['model_responses'].items():
|
| 168 |
+
print(f"{model}: {data['answer']}")
|
|
|
|
| 169 |
```
|
| 170 |
|
| 171 |
### With HuggingFace Datasets
|
|
|
|
| 172 |
```python
|
| 173 |
from datasets import load_dataset
|
| 174 |
|
| 175 |
+
dataset = load_dataset("Adam1010/goodhart-gap-benchmark")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
```
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
## Leaderboard
|
| 179 |
|
| 180 |
+
| Model | Provider | Pass Rate | Notes |
|
| 181 |
+
|-------|----------|-----------|-------|
|
| 182 |
+
| Claude 3.5 Haiku | Anthropic | 93% | Shows work consistently |
|
| 183 |
+
| Claude Sonnet 4 | Anthropic | 79% | |
|
| 184 |
+
| gpt-4o | OpenAI | 57% | |
|
| 185 |
+
| gpt-4o-mini | OpenAI | 36% | |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
## Why This Matters
|
| 188 |
|
| 189 |
### For AI Safety
|
| 190 |
+
- Models explaining correctly but executing incorrectly are harder to detect
|
| 191 |
+
- Gap between capability benchmarks and deployment readiness
|
| 192 |
+
- Critical for agentic AI systems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
### For Training
|
| 195 |
+
- Disagreement cases reveal where models need improvement
|
| 196 |
+
- Chain-of-thought consistency matters more than raw capability
|
| 197 |
+
- Smaller models (Haiku) can outperform larger ones through reliable execution
|
| 198 |
|
| 199 |
## Citation
|
| 200 |
|
|
|
|
| 203 |
title={Goodhart Gap Benchmark: Detecting the Gap Between Understanding and Execution in LLMs},
|
| 204 |
author={Adam Kruger},
|
| 205 |
year={2026},
|
| 206 |
+
url={https://huggingface.co/datasets/Adam1010/goodhart-gap-benchmark},
|
| 207 |
+
note={Built on cgrt-consensus-5model dataset}
|
| 208 |
}
|
| 209 |
```
|
| 210 |
|
| 211 |
+
## Related Datasets
|
| 212 |
|
| 213 |
+
- [Adam1010/cgrt-consensus-5model](https://huggingface.co/datasets/Adam1010/cgrt-consensus-5model) - Source consensus data
|
| 214 |
|
| 215 |
+
## License
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
|
| 217 |
+
MIT License - free for research and commercial use.
|
| 218 |
|
| 219 |
## Acknowledgments
|
| 220 |
|
| 221 |
+
- CGRT (Consensus-Guided Recursive Training) research
|
| 222 |
+
- 5-model consensus data collection (~$1000 in API calls)
|
| 223 |
- Goodhart's Law and its application to AI evaluation
|
|
|
|
|
|
create_combined_dataset.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Create Combined Goodhart Gap Benchmark
|
| 4 |
+
|
| 5 |
+
Combines:
|
| 6 |
+
1. cgrt-consensus-5model data (61,678 problems, ~$1000 in API calls)
|
| 7 |
+
2. Programmatic multi-domain problems (101 problems)
|
| 8 |
+
|
| 9 |
+
Focus: Disagreement cases where models show the "Goodhart Gap" -
|
| 10 |
+
understanding procedures but failing execution.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from collections import defaultdict
|
| 16 |
+
|
| 17 |
+
# Paths
# NOTE(review): CONSENSUS_DATA is an absolute, machine-specific path, so this
# script only runs on the original author's machine — consider promoting it to
# a CLI argument or environment variable.
CONSENSUS_DATA = Path("/home/adam/Mojo/Research/experiments/tau-bench/cgrt/data/consensus_cli_labels_enriched.jsonl")
# Programmatic problems live in the repo, relative to the working directory.
PROGRAMMATIC_DATA = Path("data/test.jsonl")
# All generated artifacts (disagreements, contested, combined, summary) go here.
OUTPUT_DIR = Path("data")
|
| 21 |
+
|
| 22 |
+
def load_consensus_data(path=None):
    """Load the cgrt-consensus-5model dataset from a JSONL file.

    Args:
        path: Optional override for the source file; defaults to the
            module-level ``CONSENSUS_DATA`` path.

    Returns:
        list[dict]: One dict per JSONL record, in file order.
    """
    # Only touch the module-level default when no override is given, so the
    # function stays usable (and testable) with an explicit path.
    source = CONSENSUS_DATA if path is None else Path(path)
    with open(source, encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) so json.loads never
        # receives an empty string.
        return [json.loads(line) for line in f if line.strip()]
|
| 29 |
+
|
| 30 |
+
def load_programmatic_data(path=None):
    """Load our programmatic test problems from a JSONL file.

    Args:
        path: Optional override for the source file; defaults to the
            module-level ``PROGRAMMATIC_DATA`` path.

    Returns:
        list[dict]: One dict per JSONL record, in file order.
    """
    # Only touch the module-level default when no override is given, so the
    # function stays usable (and testable) with an explicit path.
    source = PROGRAMMATIC_DATA if path is None else Path(path)
    with open(source, encoding="utf-8") as f:
        # Skip blank lines so a trailing newline cannot crash json.loads.
        return [json.loads(line) for line in f if line.strip()]
|
| 37 |
+
|
| 38 |
+
def extract_goodhart_examples(consensus_data):
    """
    Extract examples that demonstrate the Goodhart Gap:
    - Models that show work but get wrong answers
    - Disagreement between models despite similar reasoning
    - Contested problems where execution differs
    """
    models = ('claude', 'codex', 'gemini', 'deepseek', 'qwen')
    examples = []

    for record in consensus_data:
        # Only disagreement cases are interesting; records missing the flag
        # are treated as agreeing and skipped.
        if record.get('all_agree', True):
            continue

        # Per-model answer/response pairs, kept only when both fields exist.
        responses = {
            m: {'answer': record[f'{m}_answer'], 'response': record[f'{m}_response']}
            for m in models
            if f'{m}_answer' in record and f'{m}_response' in record
        }

        # Require at least 3 models so disagreement is meaningful.
        if len(responses) < 3:
            continue

        # Distinct non-empty answers characterize the disagreement.
        distinct_answers = {r['answer'] for r in responses.values() if r['answer']}

        examples.append({
            'id': f"consensus_{record['idx']}",
            'source': 'cgrt-consensus-5model',
            'question': record['question'],
            'majority_answer': record.get('majority_answer', ''),
            'agreement_score': record.get('agreement_score', 0),
            'consensus_tier': record.get('consensus_tier', 'unknown'),
            'num_unique_answers': len(distinct_answers),
            'model_responses': responses,
            'outlier_models': record.get('outlier_models', []),
            'difficulty_signal': record.get('difficulty_signal', 0),
            'goodhart_type': classify_goodhart_type(responses, record),
        })

    return examples
|
| 87 |
+
|
| 88 |
+
def classify_goodhart_type(answers, item):
    """Classify the type of Goodhart Gap exhibited.

    Args:
        answers: Mapping of model name -> {'answer': str, 'response': str}.
        item: Raw consensus record; only its 'consensus_tier' field is read.

    Returns:
        str: One of 'agreement', 'execution_divergence', 'partial_agreement',
        'minor_disagreement', or 'calculation_error'.
    """
    # Fix: the original built a `responses` list here that was never used;
    # only the set of distinct non-empty answers matters.
    answer_set = {a['answer'] for a in answers.values() if a['answer']}

    # All models converged — shouldn't happen in the disagreement set, but
    # guard against it rather than mislabel.
    if len(answer_set) == 1:
        return 'agreement'

    # Map consensus tier to a gap category; 'contested' (no majority) is the
    # strongest execution-divergence signal.
    tier = item.get('consensus_tier', '')
    if tier == 'contested':
        return 'execution_divergence'
    elif tier == 'bronze':
        return 'partial_agreement'
    elif tier == 'silver':
        return 'minor_disagreement'
    else:
        return 'calculation_error'
|
| 108 |
+
|
| 109 |
+
def create_combined_dataset():
    """Create the combined benchmark dataset.

    Pipeline: load the consensus and programmatic source files, extract
    disagreement cases, then write four artifacts into OUTPUT_DIR:
      1. goodhart_disagreements.jsonl - every disagreement case
      2. goodhart_contested.jsonl     - contested-tier cases only
      3. combined_test.jsonl          - contested + programmatic eval set
      4. combined_summary.json        - summary statistics

    Returns:
        dict: the summary statistics written to combined_summary.json.
    """

    print("Loading consensus data...")
    consensus_data = load_consensus_data()
    print(f" Loaded {len(consensus_data)} consensus problems")

    print("\nLoading programmatic data...")
    programmatic_data = load_programmatic_data()
    print(f" Loaded {len(programmatic_data)} programmatic problems")

    print("\nExtracting Goodhart Gap examples...")
    goodhart_examples = extract_goodhart_examples(consensus_data)
    print(f" Found {len(goodhart_examples)} disagreement cases")

    # Categorize by tier (tier value originates in the consensus records;
    # extract_goodhart_examples defaults missing tiers to 'unknown').
    by_tier = defaultdict(list)
    for ex in goodhart_examples:
        by_tier[ex['consensus_tier']].append(ex)

    print("\n By tier:")
    for tier, examples in sorted(by_tier.items()):
        print(f" {tier}: {len(examples)}")

    # Create output datasets
    OUTPUT_DIR.mkdir(exist_ok=True)

    # 1. Full disagreement dataset
    print("\nWriting full disagreement dataset...")
    with open(OUTPUT_DIR / "goodhart_disagreements.jsonl", 'w') as f:
        for ex in goodhart_examples:
            f.write(json.dumps(ex) + '\n')
    print(f" Wrote {len(goodhart_examples)} examples to goodhart_disagreements.jsonl")

    # 2. Contested subset (strongest Goodhart Gap cases)
    contested = by_tier.get('contested', [])
    print(f"\nWriting contested subset ({len(contested)} examples)...")
    with open(OUTPUT_DIR / "goodhart_contested.jsonl", 'w') as f:
        for ex in contested:
            f.write(json.dumps(ex) + '\n')

    # 3. Combined test set (contested + programmatic)
    print("\nCreating combined test set...")
    combined = []

    # Add contested examples (reformatted for evaluation)
    # NOTE(review): 'correct_answer' is the 5-model majority answer, which may
    # be '' for contested items with no majority — confirm downstream
    # evaluators handle an empty expected answer.
    for ex in contested:
        combined.append({
            'id': ex['id'],
            'domain': 'math_consensus',
            'problem': ex['question'],
            'correct_answer': ex['majority_answer'],
            'source': 'cgrt-consensus-5model',
            'consensus_tier': ex['consensus_tier'],
            'model_responses': ex['model_responses'],
            'difficulty': 'hard',
            'steps': 3  # Estimate
        })

    # Add programmatic examples
    # NOTE(review): mutates the loaded records in place; harmless here because
    # programmatic_data is not reused afterwards, but worth knowing.
    for ex in programmatic_data:
        ex['source'] = 'programmatic'
        combined.append(ex)

    with open(OUTPUT_DIR / "combined_test.jsonl", 'w') as f:
        for ex in combined:
            f.write(json.dumps(ex) + '\n')
    print(f" Wrote {len(combined)} examples to combined_test.jsonl")

    # 4. Summary statistics
    summary = {
        'total_consensus_problems': len(consensus_data),
        'total_disagreements': len(goodhart_examples),
        'contested_count': len(contested),
        'programmatic_count': len(programmatic_data),
        'combined_test_count': len(combined),
        'by_tier': {k: len(v) for k, v in by_tier.items()},
        'sources': {
            'cgrt-consensus-5model': 'https://huggingface.co/datasets/Adam1010/cgrt-consensus-5model',
            'programmatic': 'Python-generated multi-domain problems'
        },
        'cost_estimate': '$1000+ in API calls for consensus data'
    }

    with open(OUTPUT_DIR / "combined_summary.json", 'w') as f:
        json.dump(summary, f, indent=2)

    print("\n" + "="*50)
    print("COMBINED DATASET SUMMARY")
    print("="*50)
    print(f"Consensus source problems: {len(consensus_data)}")
    print(f"Disagreement cases: {len(goodhart_examples)}")
    print(f"Contested (strongest): {len(contested)}")
    print(f"Programmatic problems: {len(programmatic_data)}")
    print(f"Combined test set: {len(combined)}")
    print("="*50)

    return summary
|
| 207 |
+
|
| 208 |
+
# Script entry point: regenerate all combined-dataset artifacts in OUTPUT_DIR.
if __name__ == "__main__":
    create_combined_dataset()
|
data/combined_summary.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"total_consensus_problems": 61678,
|
| 3 |
+
"total_disagreements": 8050,
|
| 4 |
+
"contested_count": 1556,
|
| 5 |
+
"programmatic_count": 101,
|
| 6 |
+
"combined_test_count": 1657,
|
| 7 |
+
"by_tier": {
|
| 8 |
+
"silver": 3345,
|
| 9 |
+
"bronze": 3149,
|
| 10 |
+
"contested": 1556
|
| 11 |
+
},
|
| 12 |
+
"sources": {
|
| 13 |
+
"cgrt-consensus-5model": "https://huggingface.co/datasets/Adam1010/cgrt-consensus-5model",
|
| 14 |
+
"programmatic": "Python-generated multi-domain problems"
|
| 15 |
+
},
|
| 16 |
+
"cost_estimate": "$1000+ in API calls for consensus data"
|
| 17 |
+
}
|
data/combined_test.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:83f932367f73f47956f5b5a90814f26d7e59f19cc6f508f9f5fb4fd75c7f962f
|
| 3 |
+
size 16890961
|
data/goodhart_contested.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f76d0ea04bb9a2eb4242edaa90bb49c37f7880d0cc91375b0fc57fdaca8b4dab
|
| 3 |
+
size 17000548
|
data/goodhart_disagreements.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:342c69e26ac79b0e6274b084b2b9977d5b0a61aa93d339c14e9a38222b94315b
|
| 3 |
+
size 57061950
|