Spaces:
Running
Running
api model results on unified exam appending
Browse files- model_results.json +48 -0
model_results.json
CHANGED
|
@@ -809,5 +809,53 @@
|
|
| 809 |
}
|
| 810 |
]
|
| 811 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
}
|
| 813 |
]
|
|
|
|
| 809 |
}
|
| 810 |
]
|
| 811 |
}
|
| 812 |
+
},
|
| 813 |
+
{
|
| 814 |
+
"model_name": "gemini-2.5-flash",
|
| 815 |
+
"results": {
|
| 816 |
+
"mmlu_results": [],
|
| 817 |
+
"unified_exam_results": [
|
| 818 |
+
{
|
| 819 |
+
"category": "Average",
|
| 820 |
+
"score": 9.5
|
| 821 |
+
}
|
| 822 |
+
]
|
| 823 |
+
}
|
| 824 |
+
},
|
| 825 |
+
{
|
| 826 |
+
"model_name": "gemini-2.5-pro",
|
| 827 |
+
"results": {
|
| 828 |
+
"mmlu_results": [],
|
| 829 |
+
"unified_exam_results": [
|
| 830 |
+
{
|
| 831 |
+
"category": "Average",
|
| 832 |
+
"score": 11.25
|
| 833 |
+
}
|
| 834 |
+
]
|
| 835 |
+
}
|
| 836 |
+
},
|
| 837 |
+
{
|
| 838 |
+
"model_name": "gpt-4.1-2025-04-14",
|
| 839 |
+
"results": {
|
| 840 |
+
"mmlu_results": [],
|
| 841 |
+
"unified_exam_results": [
|
| 842 |
+
{
|
| 843 |
+
"category": "Average",
|
| 844 |
+
"score": 8.0
|
| 845 |
+
}
|
| 846 |
+
]
|
| 847 |
+
}
|
| 848 |
+
},
|
| 849 |
+
{
|
| 850 |
+
"model_name": "claude-sonnet-4-20250514",
|
| 851 |
+
"results": {
|
| 852 |
+
"mmlu_results": [],
|
| 853 |
+
"unified_exam_results": [
|
| 854 |
+
{
|
| 855 |
+
"category": "Average",
|
| 856 |
+
"score": 7.0
|
| 857 |
+
}
|
| 858 |
+
]
|
| 859 |
+
}
|
| 860 |
}
|
| 861 |
]
|