goodhart-gap-benchmark / data /combined_summary.json
Adam1010's picture
v2.0: Combined with cgrt-consensus-5model data (8,050 disagreements, 1,556 contested)
ca5e3d7 verified
{
"total_consensus_problems": 61678,
"total_disagreements": 8050,
"contested_count": 1556,
"programmatic_count": 101,
"combined_test_count": 1657,
"by_tier": {
"silver": 3345,
"bronze": 3149,
"contested": 1556
},
"sources": {
"cgrt-consensus-5model": "https://huggingface.co/datasets/Adam1010/cgrt-consensus-5model",
"programmatic": "Python-generated multi-domain problems"
},
"cost_estimate": "$1000+ in API calls for consensus data"
}