{
"model": "Kirim-1-Math",
"version": "1.0.0",
"parameters": "30B",
"evaluation_date": "2024-12-13",
"temperature": 0.1,
"sampling": "greedy",
"mathematical_reasoning": {
"GSM8K": {
"accuracy": 0.942,
"total_questions": 1319,
"correct": 1242,
"comparison": {
"gpt4": 0.920,
"claude_3_opus": 0.915,
"best_open_source": 0.917
}
},
"MATH": {
"accuracy": 0.785,
"total_questions": 5000,
"correct": 3925,
"breakdown_by_difficulty": {
"level_1": 0.96,
"level_2": 0.92,
"level_3": 0.84,
"level_4": 0.71,
"level_5": 0.58
},
"breakdown_by_subject": {
"algebra": 0.89,
"counting_and_probability": 0.82,
"geometry": 0.76,
"intermediate_algebra": 0.81,
"number_theory": 0.78,
"prealgebra": 0.94,
"precalculus": 0.73
},
"comparison": {
"gpt4": 0.764,
"claude_3_opus": 0.752,
"best_open_source": 0.742
}
},
"MMLU_Math": {
"accuracy": 0.887,
"subjects": {
"abstract_algebra": 0.82,
"college_mathematics": 0.89,
"elementary_mathematics": 0.96,
"high_school_mathematics": 0.91,
"high_school_statistics": 0.85
}
},
"Minerva_Math": {
"accuracy": 0.452,
"total_questions": 272,
"correct": 123,
"note": "Complex competition-level problems"
},
"AMC10": {
"accuracy": 0.723,
"average_score": "18.1/25",
"comparison": {
"human_average": 0.48,
"gpt4": 0.695
}
},
"AMC12": {
"accuracy": 0.723,
"average_score": "18.1/25",
"comparison": {
"human_average": 0.42,
"gpt4": 0.695
}
},
"AIME": {
"accuracy": 0.387,
"average_score": "5.8/15",
"comparison": {
"human_qualifier_average": 0.40,
"gpt4": 0.352
}
}
},
"tool_calling_evaluation": {
"tool_selection_accuracy": {
"score": 0.968,
"description": "Correctly identifies which tool to use"
},
"parameter_extraction_accuracy": {
"score": 0.942,
"description": "Correctly extracts parameters for tool calls"
},
"execution_success_rate": {
"score": 0.925,
"description": "Tool calls execute without errors"
},
"result_integration_accuracy": {
"score": 0.951,
"description": "Correctly uses tool results in final answer"
},
"tool_usage_by_type": {
"calculator": {
"called": 5234,
"successful": 4872,
"success_rate": 0.931
},
"symbolic_solver": {
"called": 3421,
"successful": 3189,
"success_rate": 0.932
},
"derivative": {
"called": 1892,
"successful": 1756,
"success_rate": 0.928
},
"integrate": {
"called": 1654,
"successful": 1521,
"success_rate": 0.920
},
"code_executor": {
"called": 2341,
"successful": 2103,
"success_rate": 0.898
}
}
},
"code_generation": {
"HumanEval_Math": {
"pass_at_1": 0.783,
"pass_at_10": 0.921,
"language": "Python"
},
"MBPP_Math": {
"pass_at_1": 0.756,
"pass_at_10": 0.894
},
"SymPy_Tasks": {
"accuracy": 0.825,
"tasks": "symbolic_manipulation"
},
"NumPy_Tasks": {
"accuracy": 0.756,
"tasks": "numerical_computation"
}
},
"multilingual_math": {
"chinese_math_problems": {
"accuracy": 0.891,
"total": 1000,
"correct": 891,
"sources": ["Gaokao", "Chinese_Olympiad"]
},
"english_math_problems": {
"accuracy": 0.887,
"total": 1000,
"correct": 887
},
"cross_lingual_consistency": {
"score": 0.965,
"description": "Same problem in different languages yields same answer"
}
},
"reasoning_quality": {
"step_by_step_accuracy": {
"score": 0.912,
"description": "Each reasoning step is logically sound"
},
"proof_validity": {
"score": 0.834,
"description": "Mathematical proofs are formally valid"
},
"notation_correctness": {
"score": 0.956,
"description": "Mathematical notation is used correctly"
},
"latex_formatting": {
"score": 0.978,
"description": "LaTeX output is properly formatted"
}
},
"performance_metrics": {
"inference_speed": {
"tokens_per_second": 45,
"hardware": "A100 80GB",
"batch_size": 1
},
"memory_usage": {
"bf16": "60GB",
"int8": "30GB",
"int4": "20GB"
},
"latency": {
"mean_ms": 89,
"p50_ms": 82,
"p95_ms": 145,
"p99_ms": 203
}
},
"comparison_with_baselines": {
"overall_math_score": {
"kirim_1_math": 0.847,
"gpt4": 0.826,
"claude_3_opus": 0.814,
"gemini_1_5_pro": 0.798,
"llama_3_70b": 0.742,
"mistral_large": 0.735
}
},
"limitations": {
"observed_failures": [
"Complex multi-variable calculus problems",
"Abstract topology proofs",
"Very large numerical computations without tools",
"Problems requiring visual/geometric intuition",
"Extremely novel mathematical concepts"
],
"error_rate_by_difficulty": {
"elementary": 0.04,
"high_school": 0.08,
"undergraduate": 0.15,
"graduate": 0.28,
"research": 0.45
}
},
"notes": {
"evaluation_methodology": "All benchmarks run with temperature=0.1 for deterministic results",
"tool_calling": "Tool calling enabled for all evaluations",
"verification": "Results verified by automated test suites and manual review",
"reproducibility": "Seeds fixed for reproducible results"
}
}