{
"model": "Kirim-1-Math",
"version": "1.0.0",
"parameters": "30B",
"evaluation_date": "2024-12-13",
"temperature": 0.1,
"sampling": "greedy",
"mathematical_reasoning": {
"GSM8K": {
"accuracy": 0.942,
"total_questions": 1319,
"correct": 1242,
"comparison": {
"gpt4": 0.920,
"claude_3_opus": 0.915,
"best_open_source": 0.917
}
},
"MATH": {
"accuracy": 0.785,
"total_questions": 5000,
"correct": 3925,
"breakdown_by_difficulty": {
"level_1": 0.96,
"level_2": 0.92,
"level_3": 0.84,
"level_4": 0.71,
"level_5": 0.58
},
"breakdown_by_subject": {
"algebra": 0.89,
"counting_and_probability": 0.82,
"geometry": 0.76,
"intermediate_algebra": 0.81,
"number_theory": 0.78,
"prealgebra": 0.94,
"precalculus": 0.73
},
"comparison": {
"gpt4": 0.764,
"claude_3_opus": 0.752,
"best_open_source": 0.742
}
},
"MMLU_Math": {
"accuracy": 0.887,
"subjects": {
"abstract_algebra": 0.82,
"college_mathematics": 0.89,
"elementary_mathematics": 0.96,
"high_school_mathematics": 0.91,
"high_school_statistics": 0.85
}
},
"Minerva_Math": {
"accuracy": 0.452,
"total_questions": 272,
"correct": 123,
"note": "Complex competition-level problems"
},
"AMC10": {
"accuracy": 0.723,
"average_score": "18.1/25",
"comparison": {
"human_average": 0.48,
"gpt4": 0.695
}
},
"AMC12": {
"accuracy": 0.723,
"average_score": "18.1/25",
"comparison": {
"human_average": 0.42,
"gpt4": 0.695
}
},
"AIME": {
"accuracy": 0.387,
"average_score": "5.8/15",
"comparison": {
"human_qualifier_average": 0.40,
"gpt4": 0.352
}
}
},
"tool_calling_evaluation": {
"tool_selection_accuracy": {
"score": 0.968,
"description": "Correctly identifies which tool to use"
},
"parameter_extraction_accuracy": {
"score": 0.942,
"description": "Correctly extracts parameters for tool calls"
},
"execution_success_rate": {
"score": 0.925,
"description": "Tool calls execute without errors"
},
"result_integration_accuracy": {
"score": 0.951,
"description": "Correctly uses tool results in final answer"
},
"tool_usage_by_type": {
"calculator": {
"called": 5234,
"successful": 4872,
"success_rate": 0.931
},
"symbolic_solver": {
"called": 3421,
"successful": 3189,
"success_rate": 0.932
},
"derivative": {
"called": 1892,
"successful": 1756,
"success_rate": 0.928
},
"integrate": {
"called": 1654,
"successful": 1521,
"success_rate": 0.920
},
"code_executor": {
"called": 2341,
"successful": 2103,
"success_rate": 0.898
}
}
},
"code_generation": {
"HumanEval_Math": {
"pass_at_1": 0.783,
"pass_at_10": 0.921,
"language": "Python"
},
"MBPP_Math": {
"pass_at_1": 0.756,
"pass_at_10": 0.894
},
"SymPy_Tasks": {
"accuracy": 0.825,
"tasks": "symbolic_manipulation"
},
"NumPy_Tasks": {
"accuracy": 0.756,
"tasks": "numerical_computation"
}
},
"multilingual_math": {
"chinese_math_problems": {
"accuracy": 0.891,
"total": 1000,
"correct": 891,
"sources": ["Gaokao", "Chinese_Olympiad"]
},
"english_math_problems": {
"accuracy": 0.887,
"total": 1000,
"correct": 887
},
"cross_lingual_consistency": {
"score": 0.965,
"description": "Same problem in different languages yields same answer"
}
},
"reasoning_quality": {
"step_by_step_accuracy": {
"score": 0.912,
"description": "Each reasoning step is logically sound"
},
"proof_validity": {
"score": 0.834,
"description": "Mathematical proofs are formally valid"
},
"notation_correctness": {
"score": 0.956,
"description": "Mathematical notation is used correctly"
},
"latex_formatting": {
"score": 0.978,
"description": "LaTeX output is properly formatted"
}
},
"performance_metrics": {
"inference_speed": {
"tokens_per_second": 45,
"hardware": "A100 80GB",
"batch_size": 1
},
"memory_usage": {
"bf16": "60GB",
"int8": "30GB",
"int4": "20GB"
},
"latency": {
"mean_ms": 89,
"p50_ms": 82,
"p95_ms": 145,
"p99_ms": 203
}
},
"comparison_with_baselines": {
"overall_math_score": {
"kirim_1_math": 0.847,
"gpt4": 0.826,
"claude_3_opus": 0.814,
"gemini_1_5_pro": 0.798,
"llama_3_70b": 0.742,
"mistral_large": 0.735
}
},
"limitations": {
"observed_failures": [
"Complex multi-variable calculus problems",
"Abstract topology proofs",
"Very large numerical computations without tools",
"Problems requiring visual/geometric intuition",
"Extremely novel mathematical concepts"
],
"error_rate_by_difficulty": {
"elementary": 0.04,
"high_school": 0.08,
"undergraduate": 0.15,
"graduate": 0.28,
"research": 0.45
}
},
"notes": {
"evaluation_methodology": "All benchmarks run with temperature=0.1 for deterministic results",
"tool_calling": "Tool calling enabled for all evaluations",
"verification": "Results verified by automated test suites and manual review",
"reproducibility": "Seeds fixed for reproducible results"
}
}