| { | |
| "model": "Kirim-1-Math", | |
| "version": "1.0.0", | |
| "parameters": "30B", | |
| "evaluation_date": "2024-12-13", | |
| "temperature": 0.1, | |
| "sampling": "greedy", | |
| "mathematical_reasoning": { | |
| "GSM8K": { | |
| "accuracy": 0.942, | |
| "total_questions": 1319, | |
| "correct": 1242, | |
| "comparison": { | |
| "gpt4": 0.920, | |
| "claude_3_opus": 0.915, | |
| "best_open_source": 0.917 | |
| } | |
| }, | |
| "MATH": { | |
| "accuracy": 0.785, | |
| "total_questions": 5000, | |
| "correct": 3925, | |
| "breakdown_by_difficulty": { | |
| "level_1": 0.96, | |
| "level_2": 0.92, | |
| "level_3": 0.84, | |
| "level_4": 0.71, | |
| "level_5": 0.58 | |
| }, | |
| "breakdown_by_subject": { | |
| "algebra": 0.89, | |
| "counting_and_probability": 0.82, | |
| "geometry": 0.76, | |
| "intermediate_algebra": 0.81, | |
| "number_theory": 0.78, | |
| "prealgebra": 0.94, | |
| "precalculus": 0.73 | |
| }, | |
| "comparison": { | |
| "gpt4": 0.764, | |
| "claude_3_opus": 0.752, | |
| "best_open_source": 0.742 | |
| } | |
| }, | |
| "MMLU_Math": { | |
| "accuracy": 0.887, | |
| "subjects": { | |
| "abstract_algebra": 0.82, | |
| "college_mathematics": 0.89, | |
| "elementary_mathematics": 0.96, | |
| "high_school_mathematics": 0.91, | |
| "high_school_statistics": 0.85 | |
| } | |
| }, | |
| "Minerva_Math": { | |
| "accuracy": 0.452, | |
| "total_questions": 272, | |
| "correct": 123, | |
| "note": "Complex competition-level problems" | |
| }, | |
| "AMC10": { | |
| "accuracy": 0.723, | |
| "average_score": "18.1/25", | |
| "comparison": { | |
| "human_average": 0.48, | |
| "gpt4": 0.695 | |
| } | |
| }, | |
| "AMC12": { | |
| "accuracy": 0.723, | |
| "average_score": "18.1/25", | |
| "comparison": { | |
| "human_average": 0.42, | |
| "gpt4": 0.695 | |
| } | |
| }, | |
| "AIME": { | |
| "accuracy": 0.387, | |
| "average_score": "5.8/15", | |
| "comparison": { | |
| "human_qualifier_average": 0.40, | |
| "gpt4": 0.352 | |
| } | |
| } | |
| }, | |
| "tool_calling_evaluation": { | |
| "tool_selection_accuracy": { | |
| "score": 0.968, | |
| "description": "Correctly identifies which tool to use" | |
| }, | |
| "parameter_extraction_accuracy": { | |
| "score": 0.942, | |
| "description": "Correctly extracts parameters for tool calls" | |
| }, | |
| "execution_success_rate": { | |
| "score": 0.925, | |
| "description": "Tool calls execute without errors" | |
| }, | |
| "result_integration_accuracy": { | |
| "score": 0.951, | |
| "description": "Correctly uses tool results in final answer" | |
| }, | |
| "tool_usage_by_type": { | |
| "calculator": { | |
| "called": 5234, | |
| "successful": 4872, | |
| "success_rate": 0.931 | |
| }, | |
| "symbolic_solver": { | |
| "called": 3421, | |
| "successful": 3189, | |
| "success_rate": 0.932 | |
| }, | |
| "derivative": { | |
| "called": 1892, | |
| "successful": 1756, | |
| "success_rate": 0.928 | |
| }, | |
| "integrate": { | |
| "called": 1654, | |
| "successful": 1521, | |
| "success_rate": 0.920 | |
| }, | |
| "code_executor": { | |
| "called": 2341, | |
| "successful": 2103, | |
| "success_rate": 0.898 | |
| } | |
| } | |
| }, | |
| "code_generation": { | |
| "HumanEval_Math": { | |
| "pass_at_1": 0.783, | |
| "pass_at_10": 0.921, | |
| "language": "Python" | |
| }, | |
| "MBPP_Math": { | |
| "pass_at_1": 0.756, | |
| "pass_at_10": 0.894 | |
| }, | |
| "SymPy_Tasks": { | |
| "accuracy": 0.825, | |
| "tasks": "symbolic_manipulation" | |
| }, | |
| "NumPy_Tasks": { | |
| "accuracy": 0.756, | |
| "tasks": "numerical_computation" | |
| } | |
| }, | |
| "multilingual_math": { | |
| "chinese_math_problems": { | |
| "accuracy": 0.891, | |
| "total": 1000, | |
| "correct": 891, | |
| "sources": ["Gaokao", "Chinese_Olympiad"] | |
| }, | |
| "english_math_problems": { | |
| "accuracy": 0.887, | |
| "total": 1000, | |
| "correct": 887 | |
| }, | |
| "cross_lingual_consistency": { | |
| "score": 0.965, | |
| "description": "Same problem in different languages yields same answer" | |
| } | |
| }, | |
| "reasoning_quality": { | |
| "step_by_step_accuracy": { | |
| "score": 0.912, | |
| "description": "Each reasoning step is logically sound" | |
| }, | |
| "proof_validity": { | |
| "score": 0.834, | |
| "description": "Mathematical proofs are formally valid" | |
| }, | |
| "notation_correctness": { | |
| "score": 0.956, | |
| "description": "Mathematical notation is used correctly" | |
| }, | |
| "latex_formatting": { | |
| "score": 0.978, | |
| "description": "LaTeX output is properly formatted" | |
| } | |
| }, | |
| "performance_metrics": { | |
| "inference_speed": { | |
| "tokens_per_second": 45, | |
| "hardware": "A100 80GB", | |
| "batch_size": 1 | |
| }, | |
| "memory_usage": { | |
| "bf16": "60GB", | |
| "int8": "30GB", | |
| "int4": "20GB" | |
| }, | |
| "latency": { | |
| "mean_ms": 89, | |
| "p50_ms": 82, | |
| "p95_ms": 145, | |
| "p99_ms": 203 | |
| } | |
| }, | |
| "comparison_with_baselines": { | |
| "overall_math_score": { | |
| "kirim_1_math": 0.847, | |
| "gpt4": 0.826, | |
| "claude_3_opus": 0.814, | |
| "gemini_1_5_pro": 0.798, | |
| "llama_3_70b": 0.742, | |
| "mistral_large": 0.735 | |
| } | |
| }, | |
| "limitations": { | |
| "observed_failures": [ | |
| "Complex multi-variable calculus problems", | |
| "Abstract topology proofs", | |
| "Very large numerical computations without tools", | |
| "Problems requiring visual/geometric intuition", | |
| "Extremely novel mathematical concepts" | |
| ], | |
| "error_rate_by_difficulty": { | |
| "elementary": 0.04, | |
| "high_school": 0.08, | |
| "undergraduate": 0.15, | |
| "graduate": 0.28, | |
| "research": 0.45 | |
| } | |
| }, | |
| "notes": { | |
| "evaluation_methodology": "All benchmarks run with temperature=0.1 for near-deterministic results", | |
| "tool_calling": "Tool calling enabled for all evaluations", | |
| "verification": "Results verified by automated test suites and manual review", | |
| "reproducibility": "Seeds fixed for reproducible results" | |
| } | |
| } |