Buckets:
| suite,model,model_slug,source_kind,label,artifact_count,generation_ok,generation_total,artifact_present,generation_duration_s,input_tokens,output_tokens,total_tokens,billing_tokens,reasoning_tokens,tool_use_tokens,cache_read_tokens,cache_write_tokens,cache_hit_tokens,total_cache_tokens,effective_input_tokens,display_input_tokens,usage_event_count,tool_calls,turn_count,self_check_attempted,self_check_ran,self_check_succeeded,self_check_runs,self_check_failed_runs,self_check_successful_runs,self_correction_edits,self_corrected_after_checker,self_correction_verified,assistant_turns_trace,deterministic_failures,deterministic_warnings,vlm_failures,vlm_warnings,deterministic_failure_units,deterministic_warning_units,vlm_failure_units,vlm_warning_units,generation_trace_count,vlm_trace_count,selected_record_path,quality_score,quality_score_basis,missing_final_artifacts,efficiency_score,quality_efficiency_score,rank_quality_efficiency | |
| publish,codexresponses.gpt-5.5,codexresponses-gpt-5-5,clean-final,skill-with-shell-codexresponses-gpt-5-5-publication-final,5,5,5,5,748.602,1190656,35085,1225741,1225741,4074,0,0,0,1018880,1018880,171776,1190656,62,83,62,5,5,5,11,2,9,1,1,2,62,0,0,1,1,0,0,1,1,5,5,results/publish/models/codexresponses-gpt-5-5,98.2,sum_of_five_20_point_task_scores,0,82.77,94.34,1 | |
| publish,opus47,opus47,clean-final,skill-with-shell-opus47-publication-final,5,5,5,5,872.87,1980822,60545,2041367,2041367,0,0,1547331,205103,0,1752434,228388,1980822,67,83,67,5,5,5,10,0,10,0,0,0,67,0,0,0,0,0,0,0,0,5,5,results/publish/models/opus47,100.0,sum_of_five_20_point_task_scores,0,76.32,94.08,2 | |
| publish,gpt-5.3-codex,gpt-5-3-codex,clean-final,skill-with-shell-gpt-5-3-codex-publication-final,5,5,5,5,372.521,1260130,28682,1288812,1288812,6974,0,0,0,1036288,1036288,223842,1260130,58,70,58,3,3,2,5,3,2,0,0,2,58,6,2,1,1,3,1,1,1,5,5,results/publish/models/gpt-5-3-codex,94.4,sum_of_five_20_point_task_scores,0,90.83,93.51,3 | |
| publish,glm51,glm51,clean-final,skill-with-shell-glm51-publication-final,5,5,5,5,767.449,1557545,52925,1610470,1610470,0,0,0,0,1221440,1221440,336105,1557545,62,74,62,5,5,4,6,1,5,0,0,0,62,2,0,2,4,1,0,1,2,5,5,results/publish/models/glm51,96.8,sum_of_five_20_point_task_scores,0,81.79,93.05,4 | |
| publish,codexresponses.gpt-5.4-mini,codexresponses-gpt-5-4-mini,clean-final,skill-with-shell-codexresponses-gpt-5-4-mini-publication-final,5,5,5,5,1155.803,2799895,87812,2887707,2887707,55592,0,0,0,2607104,2607104,192791,2799895,72,113,72,5,5,5,13,6,7,0,0,4,72,0,2,0,0,0,1,0,0,5,5,results/publish/models/codexresponses-gpt-5-4-mini,99.8,sum_of_five_20_point_task_scores,0,62.39,90.45,5 | |
| publish,kimi27,kimi27,clean-final,skill-with-shell-kimi27-publication-final,5,5,5,5,833.135,5848029,82781,5930810,5930810,0,0,0,0,4667885,4667885,1180144,5848029,87,104,87,5,5,5,10,2,8,0,0,2,87,0,0,0,0,0,0,0,0,5,5,results/publish/models/kimi27,100.0,sum_of_five_20_point_task_scores,0,53.1,88.28,6 | |
| publish,haiku45,haiku45,clean-final,skill-with-shell-haiku45-publication-final,5,3,5,5,370.199,907211,42193,949404,949404,0,0,450168,130377,0,580545,162656,743201,38,49,47,3,2,2,2,0,2,0,0,0,38,26,12,1,5,7,3,1,2,5,5,results/publish/models/haiku45,83.0,sum_of_five_20_point_task_scores,0,95.91,86.23,7 | |
| publish,gemini35flash,gemini35flash,clean-final,skill-with-shell-gemini35flash-publication-final,5,5,5,5,774.424,8095357,32386,8127743,8127743,78303,0,0,0,6936722,6936722,1158635,8095357,147,142,147,5,5,3,13,9,4,0,0,2,147,0,0,0,0,0,0,0,0,5,5,results/publish/models/gemini35flash,100.0,sum_of_five_20_point_task_scores,0,36.58,84.14,8 | |
| publish,sonnet46,sonnet46,clean-final,skill-with-shell-sonnet46-publication-final,5,5,5,5,2303.843,4878483,156614,5035097,5035097,0,0,4215364,372099,0,4587463,291020,4878483,93,108,93,5,5,5,11,2,9,0,0,2,93,0,0,0,0,0,0,0,0,5,5,results/publish/models/sonnet46,100.0,sum_of_five_20_point_task_scores,0,32.2,83.05,9 | |
| publish,glm52,glm52,clean-final,skill-with-shell-glm52-publication-final,5,5,5,5,2633.551,4239304,147485,4386789,4386789,0,0,0,0,3907136,3907136,332168,4239304,95,126,95,5,5,5,11,0,11,0,0,0,95,0,0,0,0,0,0,0,0,5,5,results/publish/models/glm52,100.0,sum_of_five_20_point_task_scores,0,27.26,81.81,10 | |
| publish,deepseek,deepseek,clean-final,skill-with-shell-deepseek-publication-final,5,4,5,5,1242.354,2535136,77564,2612700,2612700,0,0,0,0,2637696,2637696,180215,2817911,84,97,80,5,4,4,8,3,5,0,0,3,84,8,1,7,0,3,1,2,0,5,5,results/publish/models/deepseek,84.0,sum_of_five_20_point_task_scores,0,64.84,79.21,11 | |
| publish,kimi,kimi,clean-final,skill-with-shell-kimi-publication-final,5,4,5,5,1764.648,2600494,69995,2670489,2670489,0,0,0,0,2332928,2332928,267566,2600494,87,99,87,5,4,4,7,2,5,0,0,2,87,6,0,7,2,2,0,3,2,5,5,results/publish/models/kimi,83.8,sum_of_five_20_point_task_scores,0,55.33,76.68,12 | |
| publish,grok-4.3,grok-4-3,clean-final,skill-with-shell-grok-4-3-publication-final,5,2,5,5,284.258,575242,24310,599552,599552,8191,0,0,0,336000,336000,134147,470147,52,44,49,1,0,0,0,0,0,0,0,0,52,16,0,11,1,4,0,4,1,5,5,results/publish/models/grok-4-3,58.0,sum_of_five_20_point_task_scores,0,100.0,68.5,13 | |
| publish,codexspark,codexspark,clean-final,skill-with-shell-codexspark-publication-final,5,3,5,5,363.473,7093160,92660,7185820,7185820,58302,0,0,0,6181120,6181120,450555,6631675,167,174,159,3,2,2,5,0,5,0,0,0,167,14,6,5,0,4,2,2,0,5,5,results/publish/models/codexspark,72.2,sum_of_five_20_point_task_scores,0,43.66,65.06,14 | |
| publish,minimax27,minimax27,clean-final,skill-with-shell-minimax27-publication-final,5,1,5,5,1039.913,1255595,70938,1326533,1326533,0,0,0,0,841088,841088,485011,1326099,49,55,50,3,2,2,3,1,2,0,0,1,49,26,4,4,4,7,1,1,1,5,5,results/publish/models/minimax27,58.0,sum_of_five_20_point_task_scores,0,81.58,63.89,15 | |
Xet Storage Details
- Size:
- 5.69 kB
- Xet hash:
- 7a823e2605378539f90817d06f1de8edd51169d855a80bc3e739fdbc43ac5367
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.