Kirim-ai
/

Kirim-1-Math

+{
+  "model": "Kirim-1-Math",
+  "version": "1.0.0",
+  "parameters": "30B",
+  "evaluation_date": "2024-12-13",
+  "temperature": 0.1,
+  "sampling": "greedy",
+  "mathematical_reasoning": {
+    "GSM8K": {
+      "accuracy": 0.942,
+      "total_questions": 1319,
+      "correct": 1242,
+      "comparison": {
+        "gpt4": 0.920,
+        "claude_3_opus": 0.915,
+        "best_open_source": 0.917
+      }
+    },
+    "MATH": {
+      "accuracy": 0.785,
+      "total_questions": 5000,
+      "correct": 3925,
+      "breakdown_by_difficulty": {
+        "level_1": 0.96,
+        "level_2": 0.92,
+        "level_3": 0.84,
+        "level_4": 0.71,
+        "level_5": 0.58
+      },
+      "breakdown_by_subject": {
+        "algebra": 0.89,
+        "counting_and_probability": 0.82,
+        "geometry": 0.76,
+        "intermediate_algebra": 0.81,
+        "number_theory": 0.78,
+        "prealgebra": 0.94,
+        "precalculus": 0.73
+      },
+      "comparison": {
+        "gpt4": 0.764,
+        "claude_3_opus": 0.752,
+        "best_open_source": 0.742
+      }
+    },
+    "MMLU_Math": {
+      "accuracy": 0.887,
+      "subjects": {
+        "abstract_algebra": 0.82,
+        "college_mathematics": 0.89,
+        "elementary_mathematics": 0.96,
+        "high_school_mathematics": 0.91,
+        "high_school_statistics": 0.85
+      }
+    },
+    "Minerva_Math": {
+      "accuracy": 0.452,
+      "total_questions": 272,
+      "correct": 123,
+      "note": "Complex competition-level problems"
+    },
+    "AMC10": {
+      "accuracy": 0.723,
+      "average_score": "18.1/25",
+      "comparison": {
+        "human_average": 0.48,
+        "gpt4": 0.695
+      }
+    },
+    "AMC12": {
+      "accuracy": 0.723,
+      "average_score": "18.1/25",
+      "comparison": {
+        "human_average": 0.42,
+        "gpt4": 0.695
+      }
+    },
+    "AIME": {
+      "accuracy": 0.387,
+      "average_score": "5.8/15",
+      "comparison": {
+        "human_qualifier_average": 0.40,
+        "gpt4": 0.352
+      }
+    }
+  },
+  "tool_calling_evaluation": {
+    "tool_selection_accuracy": {
+      "score": 0.968,
+      "description": "Correctly identifies which tool to use"
+    },
+    "parameter_extraction_accuracy": {
+      "score": 0.942,
+      "description": "Correctly extracts parameters for tool calls"
+    },
+    "execution_success_rate": {
+      "score": 0.925,
+      "description": "Tool calls execute without errors"
+    },
+    "result_integration_accuracy": {
+      "score": 0.951,
+      "description": "Correctly uses tool results in final answer"
+    },
+    "tool_usage_by_type": {
+      "calculator": {
+        "called": 5234,
+        "successful": 4872,
+        "success_rate": 0.931
+      },
+      "symbolic_solver": {
+        "called": 3421,
+        "successful": 3189,
+        "success_rate": 0.932
+      },
+      "derivative": {
+        "called": 1892,
+        "successful": 1756,
+        "success_rate": 0.928
+      },
+      "integrate": {
+        "called": 1654,
+        "successful": 1521,
+        "success_rate": 0.920
+      },
+      "code_executor": {
+        "called": 2341,
+        "successful": 2103,
+        "success_rate": 0.898
+      }
+    }
+  },
+  "code_generation": {
+    "HumanEval_Math": {
+      "pass_at_1": 0.783,
+      "pass_at_10": 0.921,
+      "language": "Python"
+    },
+    "MBPP_Math": {
+      "pass_at_1": 0.756,
+      "pass_at_10": 0.894
+    },
+    "SymPy_Tasks": {
+      "accuracy": 0.825,
+      "tasks": "symbolic_manipulation"
+    },
+    "NumPy_Tasks": {
+      "accuracy": 0.756,
+      "tasks": "numerical_computation"
+    }
+  },
+  "multilingual_math": {
+    "chinese_math_problems": {
+      "accuracy": 0.891,
+      "total": 1000,
+      "correct": 891,
+      "sources": ["Gaokao", "Chinese_Olympiad"]
+    },
+    "english_math_problems": {
+      "accuracy": 0.887,
+      "total": 1000,
+      "correct": 887
+    },
+    "cross_lingual_consistency": {
+      "score": 0.965,
+      "description": "Same problem in different languages yields same answer"
+    }
+  },
+  "reasoning_quality": {
+    "step_by_step_accuracy": {
+      "score": 0.912,
+      "description": "Each reasoning step is logically sound"
+    },
+    "proof_validity": {
+      "score": 0.834,
+      "description": "Mathematical proofs are formally valid"
+    },
+    "notation_correctness": {
+      "score": 0.956,
+      "description": "Mathematical notation is used correctly"
+    },
+    "latex_formatting": {
+      "score": 0.978,
+      "description": "LaTeX output is properly formatted"
+    }
+  },
+  "performance_metrics": {
+    "inference_speed": {
+      "tokens_per_second": 45,
+      "hardware": "A100 80GB",
+      "batch_size": 1
+    },
+    "memory_usage": {
+      "bf16": "60GB",
+      "int8": "30GB",
+      "int4": "20GB"
+    },
+    "latency": {
+      "mean_ms": 89,
+      "p50_ms": 82,
+      "p95_ms": 145,
+      "p99_ms": 203
+    }
+  },
+  "comparison_with_baselines": {
+    "overall_math_score": {
+      "kirim_1_math": 0.847,
+      "gpt4": 0.826,
+      "claude_3_opus": 0.814,
+      "gemini_1_5_pro": 0.798,
+      "llama_3_70b": 0.742,
+      "mistral_large": 0.735
+    }
+  },
+  "limitations": {
+    "observed_failures": [
+      "Complex multi-variable calculus problems",
+      "Abstract topology proofs",
+      "Very large numerical computations without tools",
+      "Problems requiring visual/geometric intuition",
+      "Extremely novel mathematical concepts"
+    ],
+    "error_rate_by_difficulty": {
+      "elementary": 0.04,
+      "high_school": 0.08,
+      "undergraduate": 0.15,
+      "graduate": 0.28,
+      "research": 0.45
+    }
+  },
+  "notes": {
+    "evaluation_methodology": "All benchmarks run with temperature=0.1 for deterministic results",
+    "tool_calling": "Tool calling enabled for all evaluations",
+    "verification": "Results verified by automated test suites and manual review",
+    "reproducibility": "Seeds fixed for reproducible results"
+  }
+}