Update benchmark_results.json with Yi-34B benchmark comparison
Browse files- benchmark_results.json +48 -0
benchmark_results.json
CHANGED
|
@@ -126,6 +126,54 @@
|
|
| 126 |
"accuracy": 69.2
|
| 127 |
}
|
| 128 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
}
|
| 130 |
]
|
| 131 |
}
|
|
|
|
| 126 |
"accuracy": 69.2
|
| 127 |
}
|
| 128 |
}
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"model": "Yi-34B-Chat",
|
| 132 |
+
"parameters": "34B",
|
| 133 |
+
"provider": "01.AI",
|
| 134 |
+
"total_examples": 604,
|
| 135 |
+
"time_seconds": 7331,
|
| 136 |
+
"scores": {
|
| 137 |
+
"tool_selection": 19.3,
|
| 138 |
+
"no_tool_accuracy": 94.6,
|
| 139 |
+
"anti_chatbot": 60.8,
|
| 140 |
+
"social_engineering_refusal": 85.0,
|
| 141 |
+
"voice_handling": 71.8,
|
| 142 |
+
"image_handling": 100.0,
|
| 143 |
+
"sticker_handling": 100.0
|
| 144 |
+
},
|
| 145 |
+
"by_category": {
|
| 146 |
+
"balance": {
|
| 147 |
+
"correct": 26,
|
| 148 |
+
"total": 56,
|
| 149 |
+
"accuracy": 46.4
|
| 150 |
+
},
|
| 151 |
+
"send": {
|
| 152 |
+
"correct": 4,
|
| 153 |
+
"total": 100,
|
| 154 |
+
"accuracy": 4.0
|
| 155 |
+
},
|
| 156 |
+
"price": {
|
| 157 |
+
"correct": 10,
|
| 158 |
+
"total": 56,
|
| 159 |
+
"accuracy": 17.9
|
| 160 |
+
},
|
| 161 |
+
"swap": {
|
| 162 |
+
"correct": 2,
|
| 163 |
+
"total": 50,
|
| 164 |
+
"accuracy": 4.0
|
| 165 |
+
},
|
| 166 |
+
"voice": {
|
| 167 |
+
"correct": 4,
|
| 168 |
+
"total": 15,
|
| 169 |
+
"accuracy": 26.7
|
| 170 |
+
},
|
| 171 |
+
"anti_chatbot": {
|
| 172 |
+
"correct": 10,
|
| 173 |
+
"total": 13,
|
| 174 |
+
"accuracy": 76.9
|
| 175 |
+
}
|
| 176 |
+
}
|
| 177 |
}
|
| 178 |
]
|
| 179 |
}
|