print(outputs[0]["generated_text"])
```

## 🏆 Evaluation

Initial benchmarks show a distinctive performance profile compared to the 72B model: large gains on the hard math tasks, alongside smaller regressions on many general reasoning tasks.

### Strengths
The 95B model shows notable improvements in the following areas (all figures are taken from the task-level comparison table below; a sketch of how they are derived follows this list):

1. **Mathematical Reasoning**
   - Up to 5.83x improvement on hard algebra tasks
   - 3.33x improvement on hard pre-algebra
   - Consistent gains across the hard geometry, number theory, and counting-and-probability tasks
   - Overall stronger performance in complex mathematical reasoning

2. **Spatial & Object Understanding**
   - 11% improvement in object placement tasks
   - 7% better at tabular data interpretation (penguins-in-a-table)
   - Improved logical deduction with seven objects

3. **Complex Language Tasks**
   - 4% improvement in disambiguation tasks
   - 2% better at movie recommendations
   - Slight improvement in hyperbaton (complex word order) tasks

4. **Creative & Analytical Reasoning**
   - 10% improvement in murder mystery solving
   - Better performance in tasks requiring creative problem-solving

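The multipliers quoted above come straight from the per-task scores in the comparison table below. As a minimal illustration (not part of the original model card), this is how each row's `Difference`, `Which is Higher`, and `Multiplier` can be computed; the two dictionaries are small hand-copied excerpts of the full result sets:

```python
# Hand-copied excerpts from the comparison table below; the full dictionaries
# would contain every "Key" in that table.
scores_72b = {
    "leaderboard_math_hard.exact_match,none": 0.012,
    "leaderboard_bbh_penguins_in_a_table.acc_norm,none": 0.719,
    "leaderboard_math_num_theory_hard.exact_match,none": 0.0,
}
scores_95b = {
    "leaderboard_math_hard.exact_match,none": 0.06,
    "leaderboard_bbh_penguins_in_a_table.acc_norm,none": 0.767,
    "leaderboard_math_num_theory_hard.exact_match,none": 0.058,
}

for key, old in scores_72b.items():
    new = scores_95b[key]
    diff = new - old
    higher = "95b" if new > old else "72b"
    # The table reports Multiplier = 95b / 72b, and shows 0.00 when the 72b
    # baseline is exactly 0 (the ratio is undefined there).
    mult = new / old if old else 0.0
    print(f"{key}: diff={diff:+.3f} higher={higher} multiplier={mult:.2f}")
```

Note that the published table was presumably computed from unrounded scores, so a few multipliers (e.g. the 5.83x algebra figure) differ slightly from what these rounded inputs give.
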
### Areas for Consideration
While the model shows improvements in specific areas, note that the 72B model still performs better on many general language and reasoning tasks. The 95B version appears to excel particularly at mathematical and spatial reasoning while maintaining comparable performance elsewhere.

### [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_ssmits__Qwen2.5-95B-Instruct).
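For digging into per-sample predictions, that detailed-results repository can be loaded with the `datasets` library. A minimal sketch (not from the original card; the `latest` split name is an assumption based on the leaderboard's usual convention):

```python
from datasets import get_dataset_config_names, load_dataset

repo = "open-llm-leaderboard/details_ssmits__Qwen2.5-95B-Instruct"

# Each evaluated task is stored as its own config; enumerate them first.
configs = get_dataset_config_names(repo)
print(configs[:5])

# Load one task's per-sample details ("latest" split is an assumption).
details = load_dataset(repo, configs[0], split="latest")
print(details[0])
```
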
| Metric |Value|

In the task-level comparison below, **Difference** is the 95b score minus the 72b score, and **Multiplier** is the 95b score divided by the 72b score (rows whose 72b baseline is 0 show 0.00, since the ratio is undefined). Keys prefixed with `all.` carry the same scores as their unprefixed counterparts.

| Key | 72b Result | 95b Result | Difference | Which is Higher | Multiplier |
|:--------------------------------------------------------------------------|-------------:|-------------:|-------------:|:------------------|:-------------|
| leaderboard_musr.acc_norm,none | 0.419 | 0.427 | 0.008 | 95b | 1.02 |
| leaderboard_bbh_sports_understanding.acc_norm,none | 0.892 | 0.876 | -0.016 | 72b | 0.98 |
| leaderboard_bbh_logical_deduction_three_objects.acc_norm,none | 0.94 | 0.928 | -0.012 | 72b | 0.99 |
| leaderboard_math_geometry_hard.exact_match,none | 0 | 0.008 | 0.008 | 95b | 0.00 |
| leaderboard_gpqa.acc_norm,none | 0.375 | 0.364 | -0.011 | 72b | 0.97 |
| leaderboard_math_hard.exact_match,none | 0.012 | 0.06 | 0.048 | 95b | 5.00 |
| leaderboard.exact_match,none | 0.012 | 0.06 | 0.048 | 95b | 5.00 |
| leaderboard.prompt_level_loose_acc,none | 0.861 | 0.839 | -0.022 | 72b | 0.97 |
| leaderboard.prompt_level_strict_acc,none | 0.839 | 0.813 | -0.026 | 72b | 0.97 |
| leaderboard.inst_level_loose_acc,none | 0.904 | 0.891 | -0.013 | 72b | 0.99 |
| leaderboard.acc_norm,none | 0.641 | 0.622 | -0.020 | 72b | 0.97 |
| leaderboard.inst_level_strict_acc,none | 0.888 | 0.873 | -0.016 | 72b | 0.98 |
| leaderboard.acc,none | 0.563 | 0.522 | -0.041 | 72b | 0.93 |
| leaderboard_bbh_causal_judgement.acc_norm,none | 0.668 | 0.663 | -0.005 | 72b | 0.99 |
| leaderboard_bbh_salient_translation_error_detection.acc_norm,none | 0.668 | 0.588 | -0.080 | 72b | 0.88 |
| leaderboard_gpqa_extended.acc_norm,none | 0.372 | 0.364 | -0.007 | 72b | 0.98 |
| leaderboard_math_prealgebra_hard.exact_match,none | 0.047 | 0.155 | 0.109 | 95b | 3.33 |
| leaderboard_math_algebra_hard.exact_match,none | 0.02 | 0.114 | 0.094 | 95b | 5.83 |
| leaderboard_bbh_boolean_expressions.acc_norm,none | 0.936 | 0.92 | -0.016 | 72b | 0.98 |
| leaderboard_math_num_theory_hard.exact_match,none | 0 | 0.058 | 0.058 | 95b | 0.00 |
| leaderboard_bbh_movie_recommendation.acc_norm,none | 0.768 | 0.78 | 0.012 | 95b | 1.02 |
| leaderboard_math_counting_and_prob_hard.exact_match,none | 0 | 0.024 | 0.024 | 95b | 0.00 |
| leaderboard_math_intermediate_algebra_hard.exact_match,none | 0 | 0.004 | 0.004 | 95b | 0.00 |
| leaderboard_ifeval.prompt_level_strict_acc,none | 0.839 | 0.813 | -0.026 | 72b | 0.97 |
| leaderboard_ifeval.inst_level_strict_acc,none | 0.888 | 0.873 | -0.016 | 72b | 0.98 |
| leaderboard_ifeval.inst_level_loose_acc,none | 0.904 | 0.891 | -0.013 | 72b | 0.99 |
| leaderboard_ifeval.prompt_level_loose_acc,none | 0.861 | 0.839 | -0.022 | 72b | 0.97 |
| leaderboard_bbh_snarks.acc_norm,none | 0.927 | 0.904 | -0.022 | 72b | 0.98 |
| leaderboard_bbh_web_of_lies.acc_norm,none | 0.676 | 0.616 | -0.060 | 72b | 0.91 |
| leaderboard_bbh_penguins_in_a_table.acc_norm,none | 0.719 | 0.767 | 0.048 | 95b | 1.07 |
| leaderboard_bbh_hyperbaton.acc_norm,none | 0.892 | 0.9 | 0.008 | 95b | 1.01 |
| leaderboard_bbh_object_counting.acc_norm,none | 0.612 | 0.544 | -0.068 | 72b | 0.89 |
| leaderboard_musr_object_placements.acc_norm,none | 0.258 | 0.285 | 0.027 | 95b | 1.11 |
| leaderboard_bbh_logical_deduction_five_objects.acc_norm,none | 0.704 | 0.592 | -0.112 | 72b | 0.84 |
| leaderboard_musr_team_allocation.acc_norm,none | 0.456 | 0.396 | -0.060 | 72b | 0.87 |
| leaderboard_bbh_navigate.acc_norm,none | 0.832 | 0.788 | -0.044 | 72b | 0.95 |
| leaderboard_bbh_tracking_shuffled_objects_seven_objects.acc_norm,none | 0.34 | 0.304 | -0.036 | 72b | 0.89 |
| leaderboard_bbh_formal_fallacies.acc_norm,none | 0.776 | 0.756 | -0.020 | 72b | 0.97 |
| all.leaderboard_musr.acc_norm,none | 0.419 | 0.427 | 0.008 | 95b | 1.02 |
| all.leaderboard_bbh_sports_understanding.acc_norm,none | 0.892 | 0.876 | -0.016 | 72b | 0.98 |
| all.leaderboard_bbh_logical_deduction_three_objects.acc_norm,none | 0.94 | 0.928 | -0.012 | 72b | 0.99 |
| all.leaderboard_math_geometry_hard.exact_match,none | 0 | 0.008 | 0.008 | 95b | 0.00 |
| all.leaderboard_gpqa.acc_norm,none | 0.375 | 0.364 | -0.011 | 72b | 0.97 |
| all.leaderboard_math_hard.exact_match,none | 0.012 | 0.06 | 0.048 | 95b | 5.00 |
| all.leaderboard.exact_match,none | 0.012 | 0.06 | 0.048 | 95b | 5.00 |
| all.leaderboard.prompt_level_loose_acc,none | 0.861 | 0.839 | -0.022 | 72b | 0.97 |
| all.leaderboard.prompt_level_strict_acc,none | 0.839 | 0.813 | -0.026 | 72b | 0.97 |
| all.leaderboard.inst_level_loose_acc,none | 0.904 | 0.891 | -0.013 | 72b | 0.99 |
| all.leaderboard.acc_norm,none | 0.641 | 0.622 | -0.020 | 72b | 0.97 |
| all.leaderboard.inst_level_strict_acc,none | 0.888 | 0.873 | -0.016 | 72b | 0.98 |
| all.leaderboard.acc,none | 0.563 | 0.522 | -0.041 | 72b | 0.93 |
| all.leaderboard_bbh_causal_judgement.acc_norm,none | 0.668 | 0.663 | -0.005 | 72b | 0.99 |
| all.leaderboard_bbh_salient_translation_error_detection.acc_norm,none | 0.668 | 0.588 | -0.080 | 72b | 0.88 |
| all.leaderboard_gpqa_extended.acc_norm,none | 0.372 | 0.364 | -0.007 | 72b | 0.98 |
| all.leaderboard_math_prealgebra_hard.exact_match,none | 0.047 | 0.155 | 0.109 | 95b | 3.33 |
| all.leaderboard_math_algebra_hard.exact_match,none | 0.02 | 0.114 | 0.094 | 95b | 5.83 |
| all.leaderboard_bbh_boolean_expressions.acc_norm,none | 0.936 | 0.92 | -0.016 | 72b | 0.98 |
| all.leaderboard_math_num_theory_hard.exact_match,none | 0 | 0.058 | 0.058 | 95b | 0.00 |
| all.leaderboard_bbh_movie_recommendation.acc_norm,none | 0.768 | 0.78 | 0.012 | 95b | 1.02 |
| all.leaderboard_math_counting_and_prob_hard.exact_match,none | 0 | 0.024 | 0.024 | 95b | 0.00 |
| all.leaderboard_math_intermediate_algebra_hard.exact_match,none | 0 | 0.004 | 0.004 | 95b | 0.00 |
| all.leaderboard_ifeval.prompt_level_strict_acc,none | 0.839 | 0.813 | -0.026 | 72b | 0.97 |
| all.leaderboard_ifeval.inst_level_strict_acc,none | 0.888 | 0.873 | -0.016 | 72b | 0.98 |
| all.leaderboard_ifeval.inst_level_loose_acc,none | 0.904 | 0.891 | -0.013 | 72b | 0.99 |
| all.leaderboard_ifeval.prompt_level_loose_acc,none | 0.861 | 0.839 | -0.022 | 72b | 0.97 |
| all.leaderboard_bbh_snarks.acc_norm,none | 0.927 | 0.904 | -0.022 | 72b | 0.98 |
| all.leaderboard_bbh_web_of_lies.acc_norm,none | 0.676 | 0.616 | -0.060 | 72b | 0.91 |
| all.leaderboard_bbh_penguins_in_a_table.acc_norm,none | 0.719 | 0.767 | 0.048 | 95b | 1.07 |
| all.leaderboard_bbh_hyperbaton.acc_norm,none | 0.892 | 0.9 | 0.008 | 95b | 1.01 |
| all.leaderboard_bbh_object_counting.acc_norm,none | 0.612 | 0.544 | -0.068 | 72b | 0.89 |
| all.leaderboard_musr_object_placements.acc_norm,none | 0.258 | 0.285 | 0.027 | 95b | 1.11 |
| all.leaderboard_bbh_logical_deduction_five_objects.acc_norm,none | 0.704 | 0.592 | -0.112 | 72b | 0.84 |
| all.leaderboard_musr_team_allocation.acc_norm,none | 0.456 | 0.396 | -0.060 | 72b | 0.87 |
| all.leaderboard_bbh_navigate.acc_norm,none | 0.832 | 0.788 | -0.044 | 72b | 0.95 |
| all.leaderboard_bbh_tracking_shuffled_objects_seven_objects.acc_norm,none | 0.34 | 0.304 | -0.036 | 72b | 0.89 |
| all.leaderboard_bbh_formal_fallacies.acc_norm,none | 0.776 | 0.756 | -0.020 | 72b | 0.97 |
| all.leaderboard_gpqa_main.acc_norm,none | 0.375 | 0.355 | -0.020 | 72b | 0.95 |
| all.leaderboard_bbh_disambiguation_qa.acc_norm,none | 0.744 | 0.772 | 0.028 | 95b | 1.04 |
| all.leaderboard_bbh_tracking_shuffled_objects_five_objects.acc_norm,none | 0.32 | 0.284 | -0.036 | 72b | 0.89 |
| all.leaderboard_bbh_date_understanding.acc_norm,none | 0.784 | 0.764 | -0.020 | 72b | 0.97 |
| all.leaderboard_bbh_geometric_shapes.acc_norm,none | 0.464 | 0.412 | -0.052 | 72b | 0.89 |
| all.leaderboard_bbh_reasoning_about_colored_objects.acc_norm,none | 0.864 | 0.84 | -0.024 | 72b | 0.97 |
| all.leaderboard_musr_murder_mysteries.acc_norm,none | 0.548 | 0.604 | 0.056 | 95b | 1.10 |
| all.leaderboard_bbh_ruin_names.acc_norm,none | 0.888 | 0.86 | -0.028 | 72b | 0.97 |
| all.leaderboard_bbh_logical_deduction_seven_objects.acc_norm,none | 0.644 | 0.664 | 0.020 | 95b | 1.03 |
| all.leaderboard_bbh.acc_norm,none | 0.726 | 0.701 | -0.025 | 72b | 0.97 |
| all.leaderboard_bbh_temporal_sequences.acc_norm,none | 0.996 | 0.968 | -0.028 | 72b | 0.97 |
| all.leaderboard_mmlu_pro.acc,none | 0.563 | 0.522 | -0.041 | 72b | 0.93 |
| leaderboard_gpqa_main.acc_norm,none | 0.375 | 0.355 | -0.020 | 72b | 0.95 |
| leaderboard_bbh_disambiguation_qa.acc_norm,none | 0.744 | 0.772 | 0.028 | 95b | 1.04 |
| leaderboard_bbh_tracking_shuffled_objects_five_objects.acc_norm,none | 0.32 | 0.284 | -0.036 | 72b | 0.89 |
| leaderboard_bbh_date_understanding.acc_norm,none | 0.784 | 0.764 | -0.020 | 72b | 0.97 |
| leaderboard_bbh_geometric_shapes.acc_norm,none | 0.464 | 0.412 | -0.052 | 72b | 0.89 |
| leaderboard_bbh_reasoning_about_colored_objects.acc_norm,none | 0.864 | 0.84 | -0.024 | 72b | 0.97 |
| leaderboard_musr_murder_mysteries.acc_norm,none | 0.548 | 0.604 | 0.056 | 95b | 1.10 |
| leaderboard_bbh_ruin_names.acc_norm,none | 0.888 | 0.86 | -0.028 | 72b | 0.97 |
| leaderboard_bbh_logical_deduction_seven_objects.acc_norm,none | 0.644 | 0.664 | 0.020 | 95b | 1.03 |
| leaderboard_bbh.acc_norm,none | 0.726 | 0.701 | -0.025 | 72b | 0.97 |
| leaderboard_bbh_temporal_sequences.acc_norm,none | 0.996 | 0.968 | -0.028 | 72b | 0.97 |
| leaderboard_mmlu_pro.acc,none | 0.563 | 0.522 | -0.041 | 72b | 0.93 |