Spaces:
Running
Running
model tooltips on all pages
Browse files
- causal_analysis_table.html +23 -23
- fix_tooltips.sh +199 -1
- information_retrieval_table.html +22 -22
- qa_table.html +23 -23
- results.html +1 -0
- sentiment_analysis_table.html +23 -23
- static/js/model-tooltips.js +101 -0
- text_classification_table.html +23 -23
- text_summarization_table.html +23 -23
causal_analysis_table.html
CHANGED
|
@@ -31,7 +31,7 @@
|
|
| 31 |
</thead>
|
| 32 |
<tbody>
|
| 33 |
<tr>
|
| 34 |
-
<td>Llama 3 70B Instruct</td>
|
| 35 |
<td class="has-text-centered">0.148</td>
|
| 36 |
<td class="has-text-centered">0.429</td>
|
| 37 |
<td class="has-text-centered">0.148</td>
|
|
@@ -42,7 +42,7 @@
|
|
| 42 |
<td class="has-text-centered">0.198</td>
|
| 43 |
</tr>
|
| 44 |
<tr>
|
| 45 |
-
<td>Llama 3 8B Instruct</td>
|
| 46 |
<td class="has-text-centered">0.097</td>
|
| 47 |
<td class="has-text-centered">0.341</td>
|
| 48 |
<td class="has-text-centered">0.097</td>
|
|
@@ -53,7 +53,7 @@
|
|
| 53 |
<td class="has-text-centered performance-strong">0.380</td>
|
| 54 |
</tr>
|
| 55 |
<tr>
|
| 56 |
-
<td>DBRX Instruct</td>
|
| 57 |
<td class="has-text-centered">0.078</td>
|
| 58 |
<td class="has-text-centered">0.521</td>
|
| 59 |
<td class="has-text-centered">0.078</td>
|
|
@@ -64,7 +64,7 @@
|
|
| 64 |
<td class="has-text-centered">0.235</td>
|
| 65 |
</tr>
|
| 66 |
<tr>
|
| 67 |
-
<td>DeepSeek LLM (67B)</td>
|
| 68 |
<td class="has-text-centered">0.026</td>
|
| 69 |
<td class="has-text-centered">0.214</td>
|
| 70 |
<td class="has-text-centered">0.026</td>
|
|
@@ -75,7 +75,7 @@
|
|
| 75 |
<td class="has-text-centered">0.221</td>
|
| 76 |
</tr>
|
| 77 |
<tr>
|
| 78 |
-
<td>Gemma 2 27B</td>
|
| 79 |
<td class="has-text-centered">0.115</td>
|
| 80 |
<td class="has-text-centered">0.510</td>
|
| 81 |
<td class="has-text-centered">0.115</td>
|
|
@@ -86,7 +86,7 @@
|
|
| 86 |
<td class="has-text-centered">0.262</td>
|
| 87 |
</tr>
|
| 88 |
<tr>
|
| 89 |
-
<td>Gemma 2 9B</td>
|
| 90 |
<td class="has-text-centered">0.115</td>
|
| 91 |
<td class="has-text-centered">0.394</td>
|
| 92 |
<td class="has-text-centered">0.115</td>
|
|
@@ -97,7 +97,7 @@
|
|
| 97 |
<td class="has-text-centered">0.258</td>
|
| 98 |
</tr>
|
| 99 |
<tr>
|
| 100 |
-
<td>Mistral (7B) Instruct v0.3</td>
|
| 101 |
<td class="has-text-centered">0.078</td>
|
| 102 |
<td class="has-text-centered">0.455</td>
|
| 103 |
<td class="has-text-centered">0.078</td>
|
|
@@ -108,7 +108,7 @@
|
|
| 108 |
<td class="has-text-centered">0.258</td>
|
| 109 |
</tr>
|
| 110 |
<tr>
|
| 111 |
-
<td>Mixtral-8x22B Instruct</td>
|
| 112 |
<td class="has-text-centered">0.131</td>
|
| 113 |
<td class="has-text-centered">0.486</td>
|
| 114 |
<td class="has-text-centered">0.131</td>
|
|
@@ -119,7 +119,7 @@
|
|
| 119 |
<td class="has-text-centered performance-medium">0.318</td>
|
| 120 |
</tr>
|
| 121 |
<tr>
|
| 122 |
-
<td>Mixtral-8x7B Instruct</td>
|
| 123 |
<td class="has-text-centered">0.088</td>
|
| 124 |
<td class="has-text-centered">0.510</td>
|
| 125 |
<td class="has-text-centered">0.088</td>
|
|
@@ -130,7 +130,7 @@
|
|
| 130 |
<td class="has-text-centered">0.273</td>
|
| 131 |
</tr>
|
| 132 |
<tr>
|
| 133 |
-
<td>Qwen 2 Instruct (72B)</td>
|
| 134 |
<td class="has-text-centered">0.139</td>
|
| 135 |
<td class="has-text-centered">0.489</td>
|
| 136 |
<td class="has-text-centered">0.139</td>
|
|
@@ -141,7 +141,7 @@
|
|
| 141 |
<td class="has-text-centered">0.188</td>
|
| 142 |
</tr>
|
| 143 |
<tr>
|
| 144 |
-
<td>WizardLM-2 8x22B</td>
|
| 145 |
<td class="has-text-centered">0.076</td>
|
| 146 |
<td class="has-text-centered">0.453</td>
|
| 147 |
<td class="has-text-centered">0.076</td>
|
|
@@ -152,7 +152,7 @@
|
|
| 152 |
<td class="has-text-centered">0.237</td>
|
| 153 |
</tr>
|
| 154 |
<tr>
|
| 155 |
-
<td>DeepSeek-V3</td>
|
| 156 |
<td class="has-text-centered">0.164</td>
|
| 157 |
<td class="has-text-centered">0.528</td>
|
| 158 |
<td class="has-text-centered">0.164</td>
|
|
@@ -163,7 +163,7 @@
|
|
| 163 |
<td class="has-text-centered">0.248</td>
|
| 164 |
</tr>
|
| 165 |
<tr>
|
| 166 |
-
<td>DeepSeek R1</td>
|
| 167 |
<td class="has-text-centered performance-best">0.245</td>
|
| 168 |
<td class="has-text-centered performance-strong">0.643</td>
|
| 169 |
<td class="has-text-centered performance-best">0.245</td>
|
|
@@ -174,7 +174,7 @@
|
|
| 174 |
<td class="has-text-centered">0.221</td>
|
| 175 |
</tr>
|
| 176 |
<tr>
|
| 177 |
-
<td>QwQ-32B-Preview</td>
|
| 178 |
<td class="has-text-centered">0.110</td>
|
| 179 |
<td class="has-text-centered">0.473</td>
|
| 180 |
<td class="has-text-centered">0.110</td>
|
|
@@ -185,7 +185,7 @@
|
|
| 185 |
<td class="has-text-centered performance-best">0.465</td>
|
| 186 |
</tr>
|
| 187 |
<tr>
|
| 188 |
-
<td>Jamba 1.5 Mini</td>
|
| 189 |
<td class="has-text-centered">0.050</td>
|
| 190 |
<td class="has-text-centered">0.280</td>
|
| 191 |
<td class="has-text-centered">0.050</td>
|
|
@@ -196,7 +196,7 @@
|
|
| 196 |
<td class="has-text-centered">0.295</td>
|
| 197 |
</tr>
|
| 198 |
<tr>
|
| 199 |
-
<td>Jamba 1.5 Large</td>
|
| 200 |
<td class="has-text-centered">0.076</td>
|
| 201 |
<td class="has-text-centered">0.517</td>
|
| 202 |
<td class="has-text-centered">0.076</td>
|
|
@@ -207,7 +207,7 @@
|
|
| 207 |
<td class="has-text-centered">0.200</td>
|
| 208 |
</tr>
|
| 209 |
<tr>
|
| 210 |
-
<td>Claude 3.5 Sonnet</td>
|
| 211 |
<td class="has-text-centered">0.154</td>
|
| 212 |
<td class="has-text-centered">0.564</td>
|
| 213 |
<td class="has-text-centered">0.154</td>
|
|
@@ -218,7 +218,7 @@
|
|
| 218 |
<td class="has-text-centered">0.235</td>
|
| 219 |
</tr>
|
| 220 |
<tr>
|
| 221 |
-
<td>Claude 3 Haiku</td>
|
| 222 |
<td class="has-text-centered">0.082</td>
|
| 223 |
<td class="has-text-centered">0.388</td>
|
| 224 |
<td class="has-text-centered">0.082</td>
|
|
@@ -229,7 +229,7 @@
|
|
| 229 |
<td class="has-text-centered">0.203</td>
|
| 230 |
</tr>
|
| 231 |
<tr>
|
| 232 |
-
<td>Cohere Command R 7B</td>
|
| 233 |
<td class="has-text-centered">0.089</td>
|
| 234 |
<td class="has-text-centered">0.363</td>
|
| 235 |
<td class="has-text-centered">0.089</td>
|
|
@@ -240,7 +240,7 @@
|
|
| 240 |
<td class="has-text-centered">0.275</td>
|
| 241 |
</tr>
|
| 242 |
<tr>
|
| 243 |
-
<td>Cohere Command R +</td>
|
| 244 |
<td class="has-text-centered">0.090</td>
|
| 245 |
<td class="has-text-centered">0.453</td>
|
| 246 |
<td class="has-text-centered">0.090</td>
|
|
@@ -251,7 +251,7 @@
|
|
| 251 |
<td class="has-text-centered">0.265</td>
|
| 252 |
</tr>
|
| 253 |
<tr>
|
| 254 |
-
<td>Google Gemini 1.5 Pro</td>
|
| 255 |
<td class="has-text-centered performance-medium">0.165</td>
|
| 256 |
<td class="has-text-centered">0.514</td>
|
| 257 |
<td class="has-text-centered performance-medium">0.165</td>
|
|
@@ -262,7 +262,7 @@
|
|
| 262 |
<td class="has-text-centered">0.258</td>
|
| 263 |
</tr>
|
| 264 |
<tr>
|
| 265 |
-
<td>OpenAI gpt-4o</td>
|
| 266 |
<td class="has-text-centered">0.082</td>
|
| 267 |
<td class="has-text-centered performance-medium">0.576</td>
|
| 268 |
<td class="has-text-centered">0.082</td>
|
|
@@ -273,7 +273,7 @@
|
|
| 273 |
<td class="has-text-centered">0.235</td>
|
| 274 |
</tr>
|
| 275 |
<tr>
|
| 276 |
-
<td>OpenAI o1-mini</td>
|
| 277 |
<td class="has-text-centered performance-strong">0.206</td>
|
| 278 |
<td class="has-text-centered performance-best">0.648</td>
|
| 279 |
<td class="has-text-centered performance-strong">0.206</td>
|
|
|
|
| 31 |
</thead>
|
| 32 |
<tbody>
|
| 33 |
<tr>
|
| 34 |
+
<td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
|
| 35 |
<td class="has-text-centered">0.148</td>
|
| 36 |
<td class="has-text-centered">0.429</td>
|
| 37 |
<td class="has-text-centered">0.148</td>
|
|
|
|
| 42 |
<td class="has-text-centered">0.198</td>
|
| 43 |
</tr>
|
| 44 |
<tr>
|
| 45 |
+
<td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
|
| 46 |
<td class="has-text-centered">0.097</td>
|
| 47 |
<td class="has-text-centered">0.341</td>
|
| 48 |
<td class="has-text-centered">0.097</td>
|
|
|
|
| 53 |
<td class="has-text-centered performance-strong">0.380</td>
|
| 54 |
</tr>
|
| 55 |
<tr>
|
| 56 |
+
<td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
|
| 57 |
<td class="has-text-centered">0.078</td>
|
| 58 |
<td class="has-text-centered">0.521</td>
|
| 59 |
<td class="has-text-centered">0.078</td>
|
|
|
|
| 64 |
<td class="has-text-centered">0.235</td>
|
| 65 |
</tr>
|
| 66 |
<tr>
|
| 67 |
+
<td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
|
| 68 |
<td class="has-text-centered">0.026</td>
|
| 69 |
<td class="has-text-centered">0.214</td>
|
| 70 |
<td class="has-text-centered">0.026</td>
|
|
|
|
| 75 |
<td class="has-text-centered">0.221</td>
|
| 76 |
</tr>
|
| 77 |
<tr>
|
| 78 |
+
<td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
|
| 79 |
<td class="has-text-centered">0.115</td>
|
| 80 |
<td class="has-text-centered">0.510</td>
|
| 81 |
<td class="has-text-centered">0.115</td>
|
|
|
|
| 86 |
<td class="has-text-centered">0.262</td>
|
| 87 |
</tr>
|
| 88 |
<tr>
|
| 89 |
+
<td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
|
| 90 |
<td class="has-text-centered">0.115</td>
|
| 91 |
<td class="has-text-centered">0.394</td>
|
| 92 |
<td class="has-text-centered">0.115</td>
|
|
|
|
| 97 |
<td class="has-text-centered">0.258</td>
|
| 98 |
</tr>
|
| 99 |
<tr>
|
| 100 |
+
<td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
|
| 101 |
<td class="has-text-centered">0.078</td>
|
| 102 |
<td class="has-text-centered">0.455</td>
|
| 103 |
<td class="has-text-centered">0.078</td>
|
|
|
|
| 108 |
<td class="has-text-centered">0.258</td>
|
| 109 |
</tr>
|
| 110 |
<tr>
|
| 111 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
|
| 112 |
<td class="has-text-centered">0.131</td>
|
| 113 |
<td class="has-text-centered">0.486</td>
|
| 114 |
<td class="has-text-centered">0.131</td>
|
|
|
|
| 119 |
<td class="has-text-centered performance-medium">0.318</td>
|
| 120 |
</tr>
|
| 121 |
<tr>
|
| 122 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
|
| 123 |
<td class="has-text-centered">0.088</td>
|
| 124 |
<td class="has-text-centered">0.510</td>
|
| 125 |
<td class="has-text-centered">0.088</td>
|
|
|
|
| 130 |
<td class="has-text-centered">0.273</td>
|
| 131 |
</tr>
|
| 132 |
<tr>
|
| 133 |
+
<td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
|
| 134 |
<td class="has-text-centered">0.139</td>
|
| 135 |
<td class="has-text-centered">0.489</td>
|
| 136 |
<td class="has-text-centered">0.139</td>
|
|
|
|
| 141 |
<td class="has-text-centered">0.188</td>
|
| 142 |
</tr>
|
| 143 |
<tr>
|
| 144 |
+
<td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
|
| 145 |
<td class="has-text-centered">0.076</td>
|
| 146 |
<td class="has-text-centered">0.453</td>
|
| 147 |
<td class="has-text-centered">0.076</td>
|
|
|
|
| 152 |
<td class="has-text-centered">0.237</td>
|
| 153 |
</tr>
|
| 154 |
<tr>
|
| 155 |
+
<td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
|
| 156 |
<td class="has-text-centered">0.164</td>
|
| 157 |
<td class="has-text-centered">0.528</td>
|
| 158 |
<td class="has-text-centered">0.164</td>
|
|
|
|
| 163 |
<td class="has-text-centered">0.248</td>
|
| 164 |
</tr>
|
| 165 |
<tr>
|
| 166 |
+
<td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
|
| 167 |
<td class="has-text-centered performance-best">0.245</td>
|
| 168 |
<td class="has-text-centered performance-strong">0.643</td>
|
| 169 |
<td class="has-text-centered performance-best">0.245</td>
|
|
|
|
| 174 |
<td class="has-text-centered">0.221</td>
|
| 175 |
</tr>
|
| 176 |
<tr>
|
| 177 |
+
<td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
|
| 178 |
<td class="has-text-centered">0.110</td>
|
| 179 |
<td class="has-text-centered">0.473</td>
|
| 180 |
<td class="has-text-centered">0.110</td>
|
|
|
|
| 185 |
<td class="has-text-centered performance-best">0.465</td>
|
| 186 |
</tr>
|
| 187 |
<tr>
|
| 188 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
|
| 189 |
<td class="has-text-centered">0.050</td>
|
| 190 |
<td class="has-text-centered">0.280</td>
|
| 191 |
<td class="has-text-centered">0.050</td>
|
|
|
|
| 196 |
<td class="has-text-centered">0.295</td>
|
| 197 |
</tr>
|
| 198 |
<tr>
|
| 199 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
|
| 200 |
<td class="has-text-centered">0.076</td>
|
| 201 |
<td class="has-text-centered">0.517</td>
|
| 202 |
<td class="has-text-centered">0.076</td>
|
|
|
|
| 207 |
<td class="has-text-centered">0.200</td>
|
| 208 |
</tr>
|
| 209 |
<tr>
|
| 210 |
+
<td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
|
| 211 |
<td class="has-text-centered">0.154</td>
|
| 212 |
<td class="has-text-centered">0.564</td>
|
| 213 |
<td class="has-text-centered">0.154</td>
|
|
|
|
| 218 |
<td class="has-text-centered">0.235</td>
|
| 219 |
</tr>
|
| 220 |
<tr>
|
| 221 |
+
<td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
|
| 222 |
<td class="has-text-centered">0.082</td>
|
| 223 |
<td class="has-text-centered">0.388</td>
|
| 224 |
<td class="has-text-centered">0.082</td>
|
|
|
|
| 229 |
<td class="has-text-centered">0.203</td>
|
| 230 |
</tr>
|
| 231 |
<tr>
|
| 232 |
+
<td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
|
| 233 |
<td class="has-text-centered">0.089</td>
|
| 234 |
<td class="has-text-centered">0.363</td>
|
| 235 |
<td class="has-text-centered">0.089</td>
|
|
|
|
| 240 |
<td class="has-text-centered">0.275</td>
|
| 241 |
</tr>
|
| 242 |
<tr>
|
| 243 |
+
<td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
|
| 244 |
<td class="has-text-centered">0.090</td>
|
| 245 |
<td class="has-text-centered">0.453</td>
|
| 246 |
<td class="has-text-centered">0.090</td>
|
|
|
|
| 251 |
<td class="has-text-centered">0.265</td>
|
| 252 |
</tr>
|
| 253 |
<tr>
|
| 254 |
+
<td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
|
| 255 |
<td class="has-text-centered performance-medium">0.165</td>
|
| 256 |
<td class="has-text-centered">0.514</td>
|
| 257 |
<td class="has-text-centered performance-medium">0.165</td>
|
|
|
|
| 262 |
<td class="has-text-centered">0.258</td>
|
| 263 |
</tr>
|
| 264 |
<tr>
|
| 265 |
+
<td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
|
| 266 |
<td class="has-text-centered">0.082</td>
|
| 267 |
<td class="has-text-centered performance-medium">0.576</td>
|
| 268 |
<td class="has-text-centered">0.082</td>
|
|
|
|
| 273 |
<td class="has-text-centered">0.235</td>
|
| 274 |
</tr>
|
| 275 |
<tr>
|
| 276 |
+
<td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
|
| 277 |
<td class="has-text-centered performance-strong">0.206</td>
|
| 278 |
<td class="has-text-centered performance-best">0.648</td>
|
| 279 |
<td class="has-text-centered performance-strong">0.206</td>
|
fix_tooltips.sh
CHANGED
|
@@ -1,7 +1,47 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
-
# Script to fix tooltips in all HTML files
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
# Fix tooltips in information_retrieval_table.html
|
| 6 |
sed -i 's/tooltip-trigger" data-tooltip="A dataset for information retrieval in the financial domain/tooltip-trigger tooltip-right" data-tooltip="A dataset for information retrieval in the financial domain/g' information_retrieval_table.html
|
| 7 |
|
|
@@ -17,4 +57,162 @@ sed -i 's/tooltip-trigger tooltip-right" data-tooltip="Manually-annotated datase
|
|
| 17 |
# Fix tooltips in text_summarization_table.html (in case the tooltip-right class isn't working)
|
| 18 |
sed -i 's/tooltip-trigger tooltip-right" data-tooltip="Financial news summarization dataset with 2,000 financial news articles/tooltip-trigger tooltip-right" data-tooltip="Financial news summarization dataset with 2,000 financial news articles/g' text_summarization_table.html
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
echo "Fixed tooltips in all HTML files"
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
+
# Script to add model tooltips and fix existing tooltips in all HTML files
|
| 4 |
|
| 5 |
+
# Model tooltip definitions - exact descriptions from cost analysis tab
|
| 6 |
+
declare -A model_tooltips
|
| 7 |
+
model_tooltips["OpenAI gpt-4o"]="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following."
|
| 8 |
+
model_tooltips["GPT-4o"]="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following."
|
| 9 |
+
model_tooltips["OpenAI o1-mini"]="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count."
|
| 10 |
+
model_tooltips["o1-mini"]="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count."
|
| 11 |
+
model_tooltips["Claude 3.5 Sonnet"]="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities."
|
| 12 |
+
model_tooltips["Claude 3 Haiku"]="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks."
|
| 13 |
+
model_tooltips["Google Gemini 1.5 Pro"]="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities."
|
| 14 |
+
model_tooltips["Gemini 1.5 Pro"]="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities."
|
| 15 |
+
model_tooltips["Cohere Command R 7B"]="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size."
|
| 16 |
+
model_tooltips["Cohere Command R +"]="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart."
|
| 17 |
+
model_tooltips["DeepSeek R1"]="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks."
|
| 18 |
+
model_tooltips["DeepSeek-V3"]="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities."
|
| 19 |
+
model_tooltips["DeepSeek LLM (67B)"]="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities."
|
| 20 |
+
model_tooltips["Llama 3 70B Instruct"]="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities."
|
| 21 |
+
model_tooltips["Llama 3 8B Instruct"]="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities."
|
| 22 |
+
model_tooltips["DBRX Instruct"]="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities."
|
| 23 |
+
model_tooltips["Mixtral-8x22B Instruct"]="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance."
|
| 24 |
+
model_tooltips["Mixtral-8x7B Instruct"]="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities."
|
| 25 |
+
model_tooltips["Mistral (7B) Instruct v0.3"]="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size."
|
| 26 |
+
model_tooltips["Qwen 2 Instruct (72B)"]="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities."
|
| 27 |
+
model_tooltips["WizardLM-2 8x22B"]="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks."
|
| 28 |
+
model_tooltips["Gemma 2 27B"]="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following."
|
| 29 |
+
model_tooltips["Gemma 2 9B"]="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size."
|
| 30 |
+
model_tooltips["QwQ-32B-Preview"]="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks."
|
| 31 |
+
model_tooltips["Jamba 1.5 Mini"]="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks."
|
| 32 |
+
model_tooltips["Jamba 1.5 Large"]="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart."
|
| 33 |
+
|
| 34 |
+
# Files to process
|
| 35 |
+
files=(
|
| 36 |
+
"text_classification_table.html"
|
| 37 |
+
"sentiment_analysis_table.html"
|
| 38 |
+
"information_retrieval_table.html"
|
| 39 |
+
"causal_analysis_table.html"
|
| 40 |
+
"text_summarization_table.html"
|
| 41 |
+
"qa_table.html"
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
# Fix existing dataset tooltips
|
| 45 |
# Fix tooltips in information_retrieval_table.html
|
| 46 |
sed -i 's/tooltip-trigger" data-tooltip="A dataset for information retrieval in the financial domain/tooltip-trigger tooltip-right" data-tooltip="A dataset for information retrieval in the financial domain/g' information_retrieval_table.html
|
| 47 |
|
|
|
|
| 57 |
# Fix tooltips in text_summarization_table.html (in case the tooltip-right class isn't working)
|
| 58 |
sed -i 's/tooltip-trigger tooltip-right" data-tooltip="Financial news summarization dataset with 2,000 financial news articles/tooltip-trigger tooltip-right" data-tooltip="Financial news summarization dataset with 2,000 financial news articles/g' text_summarization_table.html
|
| 59 |
|
| 60 |
+
# Add or update model tooltips to each file
|
| 61 |
+
for file in "${files[@]}"; do
|
| 62 |
+
echo "Processing $file..."
|
| 63 |
+
|
| 64 |
+
# For each model in our list
|
| 65 |
+
for model in "${!model_tooltips[@]}"; do
|
| 66 |
+
# Escape the sed delimiter (/) and & in the model name and tooltip text so they can be used in the sed expressions below
|
| 67 |
+
model_sed_safe=$(echo "$model" | sed 's/[\/&]/\\&/g')
|
| 68 |
+
tooltip_sed_safe=$(echo "${model_tooltips[$model]}" | sed 's/[\/&]/\\&/g')
|
| 69 |
+
|
| 70 |
+
# First, update existing tooltips if they exist
|
| 71 |
+
sed -i "s/data-title=\"$model_sed_safe\" data-tooltip=\"[^\"]*\"/data-title=\"$model_sed_safe\" data-tooltip=\"$tooltip_sed_safe\"/g" "$file"
|
| 72 |
+
|
| 73 |
+
# Then, add tooltips to plain model names without tooltips
|
| 74 |
+
sed -i "s/<td>$model_sed_safe<\/td>/<td class=\"tooltip-trigger tooltip-right\" data-title=\"$model_sed_safe\" data-tooltip=\"$tooltip_sed_safe\">$model_sed_safe<\/td>/g" "$file"
|
| 75 |
+
done
|
| 76 |
+
|
| 77 |
+
# Ensure tooltips.js and fixed-tooltips.js are referenced; a missing script tag is appended to the end of the file
|
| 78 |
+
if ! grep -q "tooltips.js" "$file"; then
|
| 79 |
+
echo "<script src=\"static/js/tooltips.js\"></script>" >> "$file"
|
| 80 |
+
fi
|
| 81 |
+
|
| 82 |
+
if ! grep -q "fixed-tooltips.js" "$file"; then
|
| 83 |
+
echo "<script src=\"static/js/fixed-tooltips.js\"></script>" >> "$file"
|
| 84 |
+
fi
|
| 85 |
+
|
| 86 |
+
# Add tooltips.css if not already included
|
| 87 |
+
if ! grep -q "tooltips.css" "$file"; then
|
| 88 |
+
sed -i '1i<link rel="stylesheet" href="static/css/tooltips.css">' "$file"
|
| 89 |
+
fi
|
| 90 |
+
done
|
| 91 |
+
|
| 92 |
+
# Also update results.html to ensure proper tooltip handling
echo "Adding tooltip fix to results.html..."

# Write the model-tooltip script used by all tabs to its own JS file.
# The heredoc delimiter is quoted ('EOF') so the shell copies the JavaScript
# verbatim and never expands "$", backticks or backslashes inside it.
mkdir -p static/js
cat > static/js/model-tooltips.js << 'EOF'
document.addEventListener('DOMContentLoaded', function() {
  // Ordered [needle, tooltip] pairs. The first needle contained in the model
  // name wins, so more specific names must stay ahead of generic ones
  // (e.g. "mixtral-8x22b" before "mistral"). Needles are lower-case because
  // names are lower-cased before matching: the tables spell the model
  // "OpenAI gpt-4o", which a case-sensitive "GPT-4o" test never matched.
  const MODEL_TOOLTIPS = [
    ["gpt-4o", "OpenAI's advanced proprietary closed-source model. One of the top performers across most tasks."],
    ["o1-mini", "Compact proprietary model from OpenAI. Shows strong performance on causal analysis tasks."],
    ["claude 3.5 sonnet", "Anthropic's model optimized for advanced reasoning. Strong performer on text classification and summarization."],
    ["claude 3 haiku", "Anthropic's smaller, efficiency-focused model in the Claude series."],
    ["gemini 1.5", "Google's highly capable proprietary model."],
    ["command r 7b", "A 7-billion parameter model from Cohere focused on instruction-following."],
    ["command r +", "An improved version of Cohere's Command R model."],
    ["deepseek r1", "Open-weight model from DeepSeek AI with 671B parameters (MoE architecture). One of the top performers in the benchmark."],
    ["deepseek-v3", "Open-weight model from DeepSeek AI with 685B parameters (MoE architecture)."],
    ["deepseek v3", "Open-weight model from DeepSeek AI with 685B parameters (MoE architecture)."],
    ["deepseek llm", "A 67-billion parameter chat-optimized model from DeepSeek AI."],
    ["llama 3 70b", "Meta's 70-billion parameter dense model, optimized for instruction-following tasks."],
    ["llama 3 8b", "Meta's 8-billion parameter efficient model variant."],
    ["dbrx", "Databricks' 132B parameter MoE model."],
    ["mixtral-8x22b", "141B parameter MoE model from Mistral AI with eight 22-billion parameter sub-models."],
    ["mixtral-8x7b", "46.7B parameter MoE model from Mistral AI with eight 7-billion parameter sub-models."],
    ["mistral", "A 7-billion parameter instruction-tuned model from Mistral AI."],
    ["qwen 2", "Alibaba's 72-billion parameter instruction-following model."],
    ["wizardlm", "A 176B parameter MoE model focused on complex reasoning."],
    ["gemma 2 27b", "Google's open-weight 27B parameter model."],
    ["gemma 2 9b", "Google's open-weight 9B parameter efficient model."],
    ["qwq-32b", "Qwen's experimental MoE model with 32B parameters."],
    ["jamba 1.5 mini", "A compact variant of the Jamba model series."],
    ["jamba 1.5 large", "An expanded variant of the Jamba model series."]
  ];

  // Used when no needle matches the cell text.
  const DEFAULT_TOOLTIP = "A large language model from the FLaME evaluation benchmark.";

  // Return the descriptive tooltip for a model name (case-insensitive,
  // first matching entry of MODEL_TOOLTIPS, otherwise DEFAULT_TOOLTIP).
  function describeModel(modelName) {
    const normalized = modelName.toLowerCase();
    const entry = MODEL_TOOLTIPS.find(([needle]) => normalized.includes(needle));
    return entry ? entry[1] : DEFAULT_TOOLTIP;
  }

  // Fix model tooltips in all tabs: give every first-column cell that does
  // not already carry the tooltip-trigger class a title and a description.
  function fixAllModelTooltips() {
    console.log("Fixing model tooltips in all tabs");

    // Find all model name cells (first column in all tables)
    const modelCells = document.querySelectorAll('td:first-child');

    modelCells.forEach(cell => {
      // Skip cells that already have tooltips (static markup or an earlier run)
      if (cell.classList.contains('tooltip-trigger')) {
        return;
      }

      const modelName = cell.textContent.trim();

      // Mark the cell as a tooltip trigger and set its position style
      cell.classList.add('tooltip-trigger');
      cell.style.position = 'relative';

      // data-title carries the model name, data-tooltip the description
      cell.setAttribute('data-title', modelName);
      cell.setAttribute('data-tooltip', describeModel(modelName));
    });

    // After adding attributes, run the tooltip fix if the page defines one
    if (window.fixProblemTooltips) {
      window.fixProblemTooltips();
    }
  }

  // Run on page load
  setTimeout(fixAllModelTooltips, 500);

  // Run again whenever a tab is clicked
  const tabs = document.querySelectorAll('.tabs li');
  tabs.forEach(tab => {
    tab.addEventListener('click', () => {
      // Give time for content to be displayed
      setTimeout(fixAllModelTooltips, 200);
    });
  });
});
EOF
|
| 199 |
+
|
| 200 |
+
# Reference model-tooltips.js from results.html unless a reference already exists.
if ! grep -q "model-tooltips.js" "results.html"; then
    # Put the script tag on its own line directly before each closing body tag.
    # "|" is used as the s-command delimiter so the slashes need no escaping;
    # "\n" in the replacement is the GNU sed newline escape.
    sed -i -e 's|</body>|<script src="static/js/model-tooltips.js"></script>\n</body>|g' "results.html"
fi

# Make sure results.html itself triggers the tooltip fix once the page has loaded.
if ! grep -q "window.fixProblemTooltips" "results.html"; then
    # Append the snippet after every line that registers a DOMContentLoaded
    # listener. The continuation lines stay unindented so the inserted text is
    # exactly what the sed "a" command receives.
    sed -i -e '/document\.addEventListener.*DOMContentLoaded/a \
// Fix all tooltips in all tabs\
setTimeout(function() {\
if (window.fixProblemTooltips) {\
window.fixProblemTooltips();\
}\
}, 500);' "results.html"
fi
|
| 217 |
+
|
| 218 |
echo "Fixed tooltips in all HTML files"
|
information_retrieval_table.html
CHANGED
|
@@ -46,7 +46,7 @@
|
|
| 46 |
</thead>
|
| 47 |
<tbody>
|
| 48 |
<tr>
|
| 49 |
-
<td>Llama 3 70B Instruct</td>
|
| 50 |
<td class="has-text-centered">0.715</td>
|
| 51 |
<td class="has-text-centered">0.693</td>
|
| 52 |
<td class="has-text-centered">0.701</td>
|
|
@@ -69,7 +69,7 @@
|
|
| 69 |
<td class="has-text-centered">0.469</td>
|
| 70 |
</tr>
|
| 71 |
<tr>
|
| 72 |
-
<td>Llama 3 8B Instruct</td>
|
| 73 |
<td class="has-text-centered">0.581</td>
|
| 74 |
<td class="has-text-centered">0.558</td>
|
| 75 |
<td class="has-text-centered">0.565</td>
|
|
@@ -92,7 +92,7 @@
|
|
| 92 |
<td class="has-text-centered">0.350</td>
|
| 93 |
</tr>
|
| 94 |
<tr>
|
| 95 |
-
<td>DBRX Instruct</td>
|
| 96 |
<td class="has-text-centered">0.516</td>
|
| 97 |
<td class="has-text-centered">0.476</td>
|
| 98 |
<td class="has-text-centered">0.489</td>
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
<td class="has-text-centered">0.006</td>
|
| 116 |
</tr>
|
| 117 |
<tr>
|
| 118 |
-
<td>DeepSeek LLM (67B)</td>
|
| 119 |
<td class="has-text-centered">0.752</td>
|
| 120 |
<td class="has-text-centered">0.742</td>
|
| 121 |
<td class="has-text-centered">0.745</td>
|
|
@@ -138,7 +138,7 @@
|
|
| 138 |
<td class="has-text-centered">0.416</td>
|
| 139 |
</tr>
|
| 140 |
<tr>
|
| 141 |
-
<td>Gemma 2 27B</td>
|
| 142 |
<td class="has-text-centered">0.772</td>
|
| 143 |
<td class="has-text-centered">0.754</td>
|
| 144 |
<td class="has-text-centered">0.761</td>
|
|
@@ -161,7 +161,7 @@
|
|
| 161 |
<td class="has-text-centered">0.298</td>
|
| 162 |
</tr>
|
| 163 |
<tr>
|
| 164 |
-
<td>Gemma 2 9B</td>
|
| 165 |
<td class="has-text-centered">0.665</td>
|
| 166 |
<td class="has-text-centered">0.643</td>
|
| 167 |
<td class="has-text-centered">0.651</td>
|
|
@@ -184,7 +184,7 @@
|
|
| 184 |
<td class="has-text-centered">0.367</td>
|
| 185 |
</tr>
|
| 186 |
<tr>
|
| 187 |
-
<td>Mistral (7B) Instruct v0.3</td>
|
| 188 |
<td class="has-text-centered">0.540</td>
|
| 189 |
<td class="has-text-centered">0.522</td>
|
| 190 |
<td class="has-text-centered">0.526</td>
|
|
@@ -207,7 +207,7 @@
|
|
| 207 |
<td class="has-text-centered">0.368</td>
|
| 208 |
</tr>
|
| 209 |
<tr>
|
| 210 |
-
<td>Mixtral-8x22B Instruct</td>
|
| 211 |
<td class="has-text-centered">0.653</td>
|
| 212 |
<td class="has-text-centered">0.625</td>
|
| 213 |
<td class="has-text-centered">0.635</td>
|
|
@@ -230,7 +230,7 @@
|
|
| 230 |
<td class="has-text-centered">0.435</td>
|
| 231 |
</tr>
|
| 232 |
<tr>
|
| 233 |
-
<td>Mixtral-8x7B Instruct</td>
|
| 234 |
<td class="has-text-centered">0.613</td>
|
| 235 |
<td class="has-text-centered">0.591</td>
|
| 236 |
<td class="has-text-centered">0.598</td>
|
|
@@ -253,7 +253,7 @@
|
|
| 253 |
<td class="has-text-centered">0.267</td>
|
| 254 |
</tr>
|
| 255 |
<tr>
|
| 256 |
-
<td>Qwen 2 Instruct (72B)</td>
|
| 257 |
<td class="has-text-centered">0.766</td>
|
| 258 |
<td class="has-text-centered">0.742</td>
|
| 259 |
<td class="has-text-centered">0.748</td>
|
|
@@ -276,7 +276,7 @@
|
|
| 276 |
<td class="has-text-centered">0.483</td>
|
| 277 |
</tr>
|
| 278 |
<tr>
|
| 279 |
-
<td>WizardLM-2 8x22B</td>
|
| 280 |
<td class="has-text-centered">0.755</td>
|
| 281 |
<td class="has-text-centered">0.741</td>
|
| 282 |
<td class="has-text-centered">0.744</td>
|
|
@@ -299,7 +299,7 @@
|
|
| 299 |
<td class="has-text-centered">0.226</td>
|
| 300 |
</tr>
|
| 301 |
<tr>
|
| 302 |
-
<td>DeepSeek-V3</td>
|
| 303 |
<td class="has-text-centered performance-medium">0.798</td>
|
| 304 |
<td class="has-text-centered performance-medium">0.787</td>
|
| 305 |
<td class="has-text-centered performance-medium">0.790</td>
|
|
@@ -322,7 +322,7 @@
|
|
| 322 |
<td class="has-text-centered">0.549</td>
|
| 323 |
</tr>
|
| 324 |
<tr>
|
| 325 |
-
<td>DeepSeek R1</td>
|
| 326 |
<td class="has-text-centered performance-best">0.813</td>
|
| 327 |
<td class="has-text-centered performance-best">0.805</td>
|
| 328 |
<td class="has-text-centered performance-best">0.807</td>
|
|
@@ -345,7 +345,7 @@
|
|
| 345 |
<td class="has-text-centered performance-medium">0.587</td>
|
| 346 |
</tr>
|
| 347 |
<tr>
|
| 348 |
-
<td>QwQ-32B-Preview</td>
|
| 349 |
<td class="has-text-centered">0.695</td>
|
| 350 |
<td class="has-text-centered">0.681</td>
|
| 351 |
<td class="has-text-centered">0.685</td>
|
|
@@ -368,7 +368,7 @@
|
|
| 368 |
<td class="has-text-centered">0.005</td>
|
| 369 |
</tr>
|
| 370 |
<tr>
|
| 371 |
-
<td>Jamba 1.5 Mini</td>
|
| 372 |
<td class="has-text-centered">0.564</td>
|
| 373 |
<td class="has-text-centered">0.556</td>
|
| 374 |
<td class="has-text-centered">0.552</td>
|
|
@@ -391,7 +391,7 @@
|
|
| 391 |
<td class="has-text-centered">0.132</td>
|
| 392 |
</tr>
|
| 393 |
<tr>
|
| 394 |
-
<td>Jamba 1.5 Large</td>
|
| 395 |
<td class="has-text-centered">0.707</td>
|
| 396 |
<td class="has-text-centered">0.687</td>
|
| 397 |
<td class="has-text-centered">0.693</td>
|
|
@@ -414,7 +414,7 @@
|
|
| 414 |
<td class="has-text-centered">0.397</td>
|
| 415 |
</tr>
|
| 416 |
<tr>
|
| 417 |
-
<td>Claude 3.5 Sonnet</td>
|
| 418 |
<td class="has-text-centered performance-strong">0.811</td>
|
| 419 |
<td class="has-text-centered performance-strong">0.794</td>
|
| 420 |
<td class="has-text-centered performance-strong">0.799</td>
|
|
@@ -437,7 +437,7 @@
|
|
| 437 |
<td class="has-text-centered performance-strong">0.655</td>
|
| 438 |
</tr>
|
| 439 |
<tr>
|
| 440 |
-
<td>Claude 3 Haiku</td>
|
| 441 |
<td class="has-text-centered">0.732</td>
|
| 442 |
<td class="has-text-centered">0.700</td>
|
| 443 |
<td class="has-text-centered">0.711</td>
|
|
@@ -460,7 +460,7 @@
|
|
| 460 |
<td class="has-text-centered">0.494</td>
|
| 461 |
</tr>
|
| 462 |
<tr>
|
| 463 |
-
<td>Cohere Command R +</td>
|
| 464 |
<td class="has-text-centered">0.769</td>
|
| 465 |
<td class="has-text-centered">0.750</td>
|
| 466 |
<td class="has-text-centered">0.756</td>
|
|
@@ -483,7 +483,7 @@
|
|
| 483 |
<td class="has-text-centered">0.452</td>
|
| 484 |
</tr>
|
| 485 |
<tr>
|
| 486 |
-
<td>Google Gemini 1.5 Pro</td>
|
| 487 |
<td class="has-text-centered">0.728</td>
|
| 488 |
<td class="has-text-centered">0.705</td>
|
| 489 |
<td class="has-text-centered">0.712</td>
|
|
@@ -506,7 +506,7 @@
|
|
| 506 |
<td class="has-text-centered">0.393</td>
|
| 507 |
</tr>
|
| 508 |
<tr>
|
| 509 |
-
<td>OpenAI gpt-4o</td>
|
| 510 |
<td class="has-text-centered">0.778</td>
|
| 511 |
<td class="has-text-centered">0.760</td>
|
| 512 |
<td class="has-text-centered">0.766</td>
|
|
@@ -529,7 +529,7 @@
|
|
| 529 |
<td class="has-text-centered">0.523</td>
|
| 530 |
</tr>
|
| 531 |
<tr>
|
| 532 |
-
<td>OpenAI o1-mini</td>
|
| 533 |
<td class="has-text-centered">0.772</td>
|
| 534 |
<td class="has-text-centered">0.755</td>
|
| 535 |
<td class="has-text-centered">0.761</td>
|
|
|
|
| 46 |
</thead>
|
| 47 |
<tbody>
|
| 48 |
<tr>
|
| 49 |
+
<td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
|
| 50 |
<td class="has-text-centered">0.715</td>
|
| 51 |
<td class="has-text-centered">0.693</td>
|
| 52 |
<td class="has-text-centered">0.701</td>
|
|
|
|
| 69 |
<td class="has-text-centered">0.469</td>
|
| 70 |
</tr>
|
| 71 |
<tr>
|
| 72 |
+
<td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
|
| 73 |
<td class="has-text-centered">0.581</td>
|
| 74 |
<td class="has-text-centered">0.558</td>
|
| 75 |
<td class="has-text-centered">0.565</td>
|
|
|
|
| 92 |
<td class="has-text-centered">0.350</td>
|
| 93 |
</tr>
|
| 94 |
<tr>
|
| 95 |
+
<td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
|
| 96 |
<td class="has-text-centered">0.516</td>
|
| 97 |
<td class="has-text-centered">0.476</td>
|
| 98 |
<td class="has-text-centered">0.489</td>
|
|
|
|
| 115 |
<td class="has-text-centered">0.006</td>
|
| 116 |
</tr>
|
| 117 |
<tr>
|
| 118 |
+
<td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
|
| 119 |
<td class="has-text-centered">0.752</td>
|
| 120 |
<td class="has-text-centered">0.742</td>
|
| 121 |
<td class="has-text-centered">0.745</td>
|
|
|
|
| 138 |
<td class="has-text-centered">0.416</td>
|
| 139 |
</tr>
|
| 140 |
<tr>
|
| 141 |
+
<td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
|
| 142 |
<td class="has-text-centered">0.772</td>
|
| 143 |
<td class="has-text-centered">0.754</td>
|
| 144 |
<td class="has-text-centered">0.761</td>
|
|
|
|
| 161 |
<td class="has-text-centered">0.298</td>
|
| 162 |
</tr>
|
| 163 |
<tr>
|
| 164 |
+
<td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
|
| 165 |
<td class="has-text-centered">0.665</td>
|
| 166 |
<td class="has-text-centered">0.643</td>
|
| 167 |
<td class="has-text-centered">0.651</td>
|
|
|
|
| 184 |
<td class="has-text-centered">0.367</td>
|
| 185 |
</tr>
|
| 186 |
<tr>
|
| 187 |
+
<td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
|
| 188 |
<td class="has-text-centered">0.540</td>
|
| 189 |
<td class="has-text-centered">0.522</td>
|
| 190 |
<td class="has-text-centered">0.526</td>
|
|
|
|
| 207 |
<td class="has-text-centered">0.368</td>
|
| 208 |
</tr>
|
| 209 |
<tr>
|
| 210 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
|
| 211 |
<td class="has-text-centered">0.653</td>
|
| 212 |
<td class="has-text-centered">0.625</td>
|
| 213 |
<td class="has-text-centered">0.635</td>
|
|
|
|
| 230 |
<td class="has-text-centered">0.435</td>
|
| 231 |
</tr>
|
| 232 |
<tr>
|
| 233 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
|
| 234 |
<td class="has-text-centered">0.613</td>
|
| 235 |
<td class="has-text-centered">0.591</td>
|
| 236 |
<td class="has-text-centered">0.598</td>
|
|
|
|
| 253 |
<td class="has-text-centered">0.267</td>
|
| 254 |
</tr>
|
| 255 |
<tr>
|
| 256 |
+
<td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
|
| 257 |
<td class="has-text-centered">0.766</td>
|
| 258 |
<td class="has-text-centered">0.742</td>
|
| 259 |
<td class="has-text-centered">0.748</td>
|
|
|
|
| 276 |
<td class="has-text-centered">0.483</td>
|
| 277 |
</tr>
|
| 278 |
<tr>
|
| 279 |
+
<td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
|
| 280 |
<td class="has-text-centered">0.755</td>
|
| 281 |
<td class="has-text-centered">0.741</td>
|
| 282 |
<td class="has-text-centered">0.744</td>
|
|
|
|
| 299 |
<td class="has-text-centered">0.226</td>
|
| 300 |
</tr>
|
| 301 |
<tr>
|
| 302 |
+
<td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
|
| 303 |
<td class="has-text-centered performance-medium">0.798</td>
|
| 304 |
<td class="has-text-centered performance-medium">0.787</td>
|
| 305 |
<td class="has-text-centered performance-medium">0.790</td>
|
|
|
|
| 322 |
<td class="has-text-centered">0.549</td>
|
| 323 |
</tr>
|
| 324 |
<tr>
|
| 325 |
+
<td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
|
| 326 |
<td class="has-text-centered performance-best">0.813</td>
|
| 327 |
<td class="has-text-centered performance-best">0.805</td>
|
| 328 |
<td class="has-text-centered performance-best">0.807</td>
|
|
|
|
| 345 |
<td class="has-text-centered performance-medium">0.587</td>
|
| 346 |
</tr>
|
| 347 |
<tr>
|
| 348 |
+
<td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
|
| 349 |
<td class="has-text-centered">0.695</td>
|
| 350 |
<td class="has-text-centered">0.681</td>
|
| 351 |
<td class="has-text-centered">0.685</td>
|
|
|
|
| 368 |
<td class="has-text-centered">0.005</td>
|
| 369 |
</tr>
|
| 370 |
<tr>
|
| 371 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
|
| 372 |
<td class="has-text-centered">0.564</td>
|
| 373 |
<td class="has-text-centered">0.556</td>
|
| 374 |
<td class="has-text-centered">0.552</td>
|
|
|
|
| 391 |
<td class="has-text-centered">0.132</td>
|
| 392 |
</tr>
|
| 393 |
<tr>
|
| 394 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
|
| 395 |
<td class="has-text-centered">0.707</td>
|
| 396 |
<td class="has-text-centered">0.687</td>
|
| 397 |
<td class="has-text-centered">0.693</td>
|
|
|
|
| 414 |
<td class="has-text-centered">0.397</td>
|
| 415 |
</tr>
|
| 416 |
<tr>
|
| 417 |
+
<td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
|
| 418 |
<td class="has-text-centered performance-strong">0.811</td>
|
| 419 |
<td class="has-text-centered performance-strong">0.794</td>
|
| 420 |
<td class="has-text-centered performance-strong">0.799</td>
|
|
|
|
| 437 |
<td class="has-text-centered performance-strong">0.655</td>
|
| 438 |
</tr>
|
| 439 |
<tr>
|
| 440 |
+
<td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
|
| 441 |
<td class="has-text-centered">0.732</td>
|
| 442 |
<td class="has-text-centered">0.700</td>
|
| 443 |
<td class="has-text-centered">0.711</td>
|
|
|
|
| 460 |
<td class="has-text-centered">0.494</td>
|
| 461 |
</tr>
|
| 462 |
<tr>
|
| 463 |
+
<td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
|
| 464 |
<td class="has-text-centered">0.769</td>
|
| 465 |
<td class="has-text-centered">0.750</td>
|
| 466 |
<td class="has-text-centered">0.756</td>
|
|
|
|
| 483 |
<td class="has-text-centered">0.452</td>
|
| 484 |
</tr>
|
| 485 |
<tr>
|
| 486 |
+
<td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
|
| 487 |
<td class="has-text-centered">0.728</td>
|
| 488 |
<td class="has-text-centered">0.705</td>
|
| 489 |
<td class="has-text-centered">0.712</td>
|
|
|
|
| 506 |
<td class="has-text-centered">0.393</td>
|
| 507 |
</tr>
|
| 508 |
<tr>
|
| 509 |
+
<td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
|
| 510 |
<td class="has-text-centered">0.778</td>
|
| 511 |
<td class="has-text-centered">0.760</td>
|
| 512 |
<td class="has-text-centered">0.766</td>
|
|
|
|
| 529 |
<td class="has-text-centered">0.523</td>
|
| 530 |
</tr>
|
| 531 |
<tr>
|
| 532 |
+
<td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
|
| 533 |
<td class="has-text-centered">0.772</td>
|
| 534 |
<td class="has-text-centered">0.755</td>
|
| 535 |
<td class="has-text-centered">0.761</td>
|
qa_table.html
CHANGED
|
@@ -25,139 +25,139 @@
|
|
| 25 |
</thead>
|
| 26 |
<tbody>
|
| 27 |
<tr>
|
| 28 |
-
<td>Llama 3 70B Instruct</td>
|
| 29 |
<td class="has-text-centered">0.809</td>
|
| 30 |
<td class="has-text-centered">0.709</td>
|
| 31 |
<td class="has-text-centered">0.772</td>
|
| 32 |
</tr>
|
| 33 |
<tr>
|
| 34 |
-
<td>Llama 3 8B Instruct</td>
|
| 35 |
<td class="has-text-centered">0.767</td>
|
| 36 |
<td class="has-text-centered">0.268</td>
|
| 37 |
<td class="has-text-centered">0.706</td>
|
| 38 |
</tr>
|
| 39 |
<tr>
|
| 40 |
-
<td>DBRX Instruct</td>
|
| 41 |
<td class="has-text-centered">0.738</td>
|
| 42 |
<td class="has-text-centered">0.252</td>
|
| 43 |
<td class="has-text-centered">0.633</td>
|
| 44 |
</tr>
|
| 45 |
<tr>
|
| 46 |
-
<td>DeepSeek LLM (67B)</td>
|
| 47 |
<td class="has-text-centered">0.742</td>
|
| 48 |
<td class="has-text-centered">0.174</td>
|
| 49 |
<td class="has-text-centered">0.355</td>
|
| 50 |
</tr>
|
| 51 |
<tr>
|
| 52 |
-
<td>Gemma 2 27B</td>
|
| 53 |
<td class="has-text-centered">0.768</td>
|
| 54 |
<td class="has-text-centered">0.268</td>
|
| 55 |
<td class="has-text-centered">0.734</td>
|
| 56 |
</tr>
|
| 57 |
<tr>
|
| 58 |
-
<td>Gemma 2 9B</td>
|
| 59 |
<td class="has-text-centered">0.779</td>
|
| 60 |
<td class="has-text-centered">0.292</td>
|
| 61 |
<td class="has-text-centered">0.750</td>
|
| 62 |
</tr>
|
| 63 |
<tr>
|
| 64 |
-
<td>Mistral (7B) Instruct v0.3</td>
|
| 65 |
<td class="has-text-centered">0.655</td>
|
| 66 |
<td class="has-text-centered">0.199</td>
|
| 67 |
<td class="has-text-centered">0.553</td>
|
| 68 |
</tr>
|
| 69 |
<tr>
|
| 70 |
-
<td>Mixtral-8x22B Instruct</td>
|
| 71 |
<td class="has-text-centered">0.766</td>
|
| 72 |
<td class="has-text-centered">0.285</td>
|
| 73 |
<td class="has-text-centered">0.666</td>
|
| 74 |
</tr>
|
| 75 |
<tr>
|
| 76 |
-
<td>Mixtral-8x7B Instruct</td>
|
| 77 |
<td class="has-text-centered">0.611</td>
|
| 78 |
<td class="has-text-centered">0.315</td>
|
| 79 |
<td class="has-text-centered">0.501</td>
|
| 80 |
</tr>
|
| 81 |
<tr>
|
| 82 |
-
<td>Qwen 2 Instruct (72B)</td>
|
| 83 |
<td class="has-text-centered">0.819</td>
|
| 84 |
<td class="has-text-centered">0.269</td>
|
| 85 |
<td class="has-text-centered">0.715</td>
|
| 86 |
</tr>
|
| 87 |
<tr>
|
| 88 |
-
<td>WizardLM-2 8x22B</td>
|
| 89 |
<td class="has-text-centered">0.796</td>
|
| 90 |
<td class="has-text-centered">0.247</td>
|
| 91 |
<td class="has-text-centered">0.725</td>
|
| 92 |
</tr>
|
| 93 |
<tr>
|
| 94 |
-
<td>DeepSeek-V3</td>
|
| 95 |
<td class="has-text-centered performance-medium">0.840</td>
|
| 96 |
<td class="has-text-centered">0.261</td>
|
| 97 |
<td class="has-text-centered performance-low">0.779</td>
|
| 98 |
</tr>
|
| 99 |
<tr>
|
| 100 |
-
<td>DeepSeek R1</td>
|
| 101 |
<td class="has-text-centered performance-low">0.836</td>
|
| 102 |
<td class="has-text-centered performance-best">0.853</td>
|
| 103 |
<td class="has-text-centered performance-best">0.858</td>
|
| 104 |
</tr>
|
| 105 |
<tr>
|
| 106 |
-
<td>QwQ-32B-Preview</td>
|
| 107 |
<td class="has-text-centered">0.793</td>
|
| 108 |
<td class="has-text-centered">0.282</td>
|
| 109 |
<td class="has-text-centered performance-medium">0.796</td>
|
| 110 |
</tr>
|
| 111 |
<tr>
|
| 112 |
-
<td>Jamba 1.5 Mini</td>
|
| 113 |
<td class="has-text-centered">0.666</td>
|
| 114 |
<td class="has-text-centered">0.218</td>
|
| 115 |
<td class="has-text-centered">0.586</td>
|
| 116 |
</tr>
|
| 117 |
<tr>
|
| 118 |
-
<td>Jamba 1.5 Large</td>
|
| 119 |
<td class="has-text-centered">0.790</td>
|
| 120 |
<td class="has-text-centered">0.225</td>
|
| 121 |
<td class="has-text-centered">0.660</td>
|
| 122 |
</tr>
|
| 123 |
<tr>
|
| 124 |
-
<td>Claude 3.5 Sonnet</td>
|
| 125 |
<td class="has-text-centered performance-best">0.844</td>
|
| 126 |
<td class="has-text-centered">0.402</td>
|
| 127 |
<td class="has-text-centered">0.700</td>
|
| 128 |
</tr>
|
| 129 |
<tr>
|
| 130 |
-
<td>Claude 3 Haiku</td>
|
| 131 |
<td class="has-text-centered">0.803</td>
|
| 132 |
<td class="has-text-centered">0.421</td>
|
| 133 |
<td class="has-text-centered">0.733</td>
|
| 134 |
</tr>
|
| 135 |
<tr>
|
| 136 |
-
<td>Cohere Command R 7B</td>
|
| 137 |
<td class="has-text-centered">0.709</td>
|
| 138 |
<td class="has-text-centered">0.212</td>
|
| 139 |
<td class="has-text-centered">0.716</td>
|
| 140 |
</tr>
|
| 141 |
<tr>
|
| 142 |
-
<td>Cohere Command R +</td>
|
| 143 |
<td class="has-text-centered">0.776</td>
|
| 144 |
<td class="has-text-centered">0.259</td>
|
| 145 |
<td class="has-text-centered">0.698</td>
|
| 146 |
</tr>
|
| 147 |
<tr>
|
| 148 |
-
<td>Google Gemini 1.5 Pro</td>
|
| 149 |
<td class="has-text-centered">0.829</td>
|
| 150 |
<td class="has-text-centered">0.280</td>
|
| 151 |
<td class="has-text-centered">0.763</td>
|
| 152 |
</tr>
|
| 153 |
<tr>
|
| 154 |
-
<td>OpenAI gpt-4o</td>
|
| 155 |
<td class="has-text-centered performance-low">0.836</td>
|
| 156 |
<td class="has-text-centered performance-low">0.749</td>
|
| 157 |
<td class="has-text-centered">0.754</td>
|
| 158 |
</tr>
|
| 159 |
<tr>
|
| 160 |
-
<td>OpenAI o1-mini</td>
|
| 161 |
<td class="has-text-centered">0.799</td>
|
| 162 |
<td class="has-text-centered performance-medium">0.840</td>
|
| 163 |
<td class="has-text-centered">0.698</td>
|
|
|
|
| 25 |
</thead>
|
| 26 |
<tbody>
|
| 27 |
<tr>
|
| 28 |
+
<td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
|
| 29 |
<td class="has-text-centered">0.809</td>
|
| 30 |
<td class="has-text-centered">0.709</td>
|
| 31 |
<td class="has-text-centered">0.772</td>
|
| 32 |
</tr>
|
| 33 |
<tr>
|
| 34 |
+
<td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
|
| 35 |
<td class="has-text-centered">0.767</td>
|
| 36 |
<td class="has-text-centered">0.268</td>
|
| 37 |
<td class="has-text-centered">0.706</td>
|
| 38 |
</tr>
|
| 39 |
<tr>
|
| 40 |
+
<td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
|
| 41 |
<td class="has-text-centered">0.738</td>
|
| 42 |
<td class="has-text-centered">0.252</td>
|
| 43 |
<td class="has-text-centered">0.633</td>
|
| 44 |
</tr>
|
| 45 |
<tr>
|
| 46 |
+
<td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
|
| 47 |
<td class="has-text-centered">0.742</td>
|
| 48 |
<td class="has-text-centered">0.174</td>
|
| 49 |
<td class="has-text-centered">0.355</td>
|
| 50 |
</tr>
|
| 51 |
<tr>
|
| 52 |
+
<td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
|
| 53 |
<td class="has-text-centered">0.768</td>
|
| 54 |
<td class="has-text-centered">0.268</td>
|
| 55 |
<td class="has-text-centered">0.734</td>
|
| 56 |
</tr>
|
| 57 |
<tr>
|
| 58 |
+
<td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
|
| 59 |
<td class="has-text-centered">0.779</td>
|
| 60 |
<td class="has-text-centered">0.292</td>
|
| 61 |
<td class="has-text-centered">0.750</td>
|
| 62 |
</tr>
|
| 63 |
<tr>
|
| 64 |
+
<td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
|
| 65 |
<td class="has-text-centered">0.655</td>
|
| 66 |
<td class="has-text-centered">0.199</td>
|
| 67 |
<td class="has-text-centered">0.553</td>
|
| 68 |
</tr>
|
| 69 |
<tr>
|
| 70 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
|
| 71 |
<td class="has-text-centered">0.766</td>
|
| 72 |
<td class="has-text-centered">0.285</td>
|
| 73 |
<td class="has-text-centered">0.666</td>
|
| 74 |
</tr>
|
| 75 |
<tr>
|
| 76 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
|
| 77 |
<td class="has-text-centered">0.611</td>
|
| 78 |
<td class="has-text-centered">0.315</td>
|
| 79 |
<td class="has-text-centered">0.501</td>
|
| 80 |
</tr>
|
| 81 |
<tr>
|
| 82 |
+
<td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
|
| 83 |
<td class="has-text-centered">0.819</td>
|
| 84 |
<td class="has-text-centered">0.269</td>
|
| 85 |
<td class="has-text-centered">0.715</td>
|
| 86 |
</tr>
|
| 87 |
<tr>
|
| 88 |
+
<td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 141 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
|
| 89 |
<td class="has-text-centered">0.796</td>
|
| 90 |
<td class="has-text-centered">0.247</td>
|
| 91 |
<td class="has-text-centered">0.725</td>
|
| 92 |
</tr>
|
| 93 |
<tr>
|
| 94 |
+
<td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 671 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
|
| 95 |
<td class="has-text-centered performance-medium">0.840</td>
|
| 96 |
<td class="has-text-centered">0.261</td>
|
| 97 |
<td class="has-text-centered performance-low">0.779</td>
|
| 98 |
</tr>
|
| 99 |
<tr>
|
| 100 |
+
<td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
|
| 101 |
<td class="has-text-centered performance-low">0.836</td>
|
| 102 |
<td class="has-text-centered performance-best">0.853</td>
|
| 103 |
<td class="has-text-centered performance-best">0.858</td>
|
| 104 |
</tr>
|
| 105 |
<tr>
|
| 106 |
+
<td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter dense model focused on reasoning. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
|
| 107 |
<td class="has-text-centered">0.793</td>
|
| 108 |
<td class="has-text-centered">0.282</td>
|
| 109 |
<td class="has-text-centered performance-medium">0.796</td>
|
| 110 |
</tr>
|
| 111 |
<tr>
|
| 112 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
|
| 113 |
<td class="has-text-centered">0.666</td>
|
| 114 |
<td class="has-text-centered">0.218</td>
|
| 115 |
<td class="has-text-centered">0.586</td>
|
| 116 |
</tr>
|
| 117 |
<tr>
|
| 118 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
|
| 119 |
<td class="has-text-centered">0.790</td>
|
| 120 |
<td class="has-text-centered">0.225</td>
|
| 121 |
<td class="has-text-centered">0.660</td>
|
| 122 |
</tr>
|
| 123 |
<tr>
|
| 124 |
+
<td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
|
| 125 |
<td class="has-text-centered performance-best">0.844</td>
|
| 126 |
<td class="has-text-centered">0.402</td>
|
| 127 |
<td class="has-text-centered">0.700</td>
|
| 128 |
</tr>
|
| 129 |
<tr>
|
| 130 |
+
<td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
|
| 131 |
<td class="has-text-centered">0.803</td>
|
| 132 |
<td class="has-text-centered">0.421</td>
|
| 133 |
<td class="has-text-centered">0.733</td>
|
| 134 |
</tr>
|
| 135 |
<tr>
|
| 136 |
+
<td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
|
| 137 |
<td class="has-text-centered">0.709</td>
|
| 138 |
<td class="has-text-centered">0.212</td>
|
| 139 |
<td class="has-text-centered">0.716</td>
|
| 140 |
</tr>
|
| 141 |
<tr>
|
| 142 |
+
<td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
|
| 143 |
<td class="has-text-centered">0.776</td>
|
| 144 |
<td class="has-text-centered">0.259</td>
|
| 145 |
<td class="has-text-centered">0.698</td>
|
| 146 |
</tr>
|
| 147 |
<tr>
|
| 148 |
+
<td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
|
| 149 |
<td class="has-text-centered">0.829</td>
|
| 150 |
<td class="has-text-centered">0.280</td>
|
| 151 |
<td class="has-text-centered">0.763</td>
|
| 152 |
</tr>
|
| 153 |
<tr>
|
| 154 |
+
<td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
|
| 155 |
<td class="has-text-centered performance-low">0.836</td>
|
| 156 |
<td class="has-text-centered performance-low">0.749</td>
|
| 157 |
<td class="has-text-centered">0.754</td>
|
| 158 |
</tr>
|
| 159 |
<tr>
|
| 160 |
+
<td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
|
| 161 |
<td class="has-text-centered">0.799</td>
|
| 162 |
<td class="has-text-centered performance-medium">0.840</td>
|
| 163 |
<td class="has-text-centered">0.698</td>
|
results.html
CHANGED
|
@@ -3135,5 +3135,6 @@
|
|
| 3135 |
<script src="static/js/tooltips.js"></script>
|
| 3136 |
<script src="static/js/fixed-tooltips.js"></script>
|
| 3137 |
<script src="static/js/tooltip-fix.js"></script>
|
|
|
|
| 3138 |
</body>
|
| 3139 |
</html>
|
|
|
|
| 3135 |
<script src="static/js/tooltips.js"></script>
|
| 3136 |
<script src="static/js/fixed-tooltips.js"></script>
|
| 3137 |
<script src="static/js/tooltip-fix.js"></script>
|
| 3138 |
+
<script src="static/js/model-tooltips.js"></script>
|
| 3139 |
</body>
|
| 3140 |
</html>
|
sentiment_analysis_table.html
CHANGED
|
@@ -35,7 +35,7 @@
|
|
| 35 |
</thead>
|
| 36 |
<tbody>
|
| 37 |
<tr>
|
| 38 |
-
<td>Llama 3 70B Instruct</td>
|
| 39 |
<td class="has-text-centered">0.123</td>
|
| 40 |
<td class="has-text-centered">0.290</td>
|
| 41 |
<td class="has-text-centered">0.272</td>
|
|
@@ -49,7 +49,7 @@
|
|
| 49 |
<td class="has-text-centered">0.573</td>
|
| 50 |
</tr>
|
| 51 |
<tr>
|
| 52 |
-
<td>Llama 3 8B Instruct</td>
|
| 53 |
<td class="has-text-centered">0.161</td>
|
| 54 |
<td class="has-text-centered">0.344</td>
|
| 55 |
<td class="has-text-centered">0.045</td>
|
|
@@ -63,7 +63,7 @@
|
|
| 63 |
<td class="has-text-centered">0.625</td>
|
| 64 |
</tr>
|
| 65 |
<tr>
|
| 66 |
-
<td>DBRX Instruct</td>
|
| 67 |
<td class="has-text-centered">0.160</td>
|
| 68 |
<td class="has-text-centered">0.321</td>
|
| 69 |
<td class="has-text-centered">0.052</td>
|
|
@@ -77,7 +77,7 @@
|
|
| 77 |
<td class="has-text-centered">0.541</td>
|
| 78 |
</tr>
|
| 79 |
<tr>
|
| 80 |
-
<td>DeepSeek LLM (67B)</td>
|
| 81 |
<td class="has-text-centered">0.118</td>
|
| 82 |
<td class="has-text-centered">0.278</td>
|
| 83 |
<td class="has-text-centered">0.302</td>
|
|
@@ -91,7 +91,7 @@
|
|
| 91 |
<td class="has-text-centered">0.544</td>
|
| 92 |
</tr>
|
| 93 |
<tr>
|
| 94 |
-
<td>Gemma 2 27B</td>
|
| 95 |
<td class="has-text-centered performance-best">0.100</td>
|
| 96 |
<td class="has-text-centered performance-best">0.266</td>
|
| 97 |
<td class="has-text-centered">0.406</td>
|
|
@@ -105,7 +105,7 @@
|
|
| 105 |
<td class="has-text-centered">0.524</td>
|
| 106 |
</tr>
|
| 107 |
<tr>
|
| 108 |
-
<td>Gemma 2 9B</td>
|
| 109 |
<td class="has-text-centered">0.189</td>
|
| 110 |
<td class="has-text-centered">0.352</td>
|
| 111 |
<td class="has-text-centered">-0.120</td>
|
|
@@ -119,7 +119,7 @@
|
|
| 119 |
<td class="has-text-centered">0.499</td>
|
| 120 |
</tr>
|
| 121 |
<tr>
|
| 122 |
-
<td>Mistral (7B) Instruct v0.3</td>
|
| 123 |
<td class="has-text-centered">0.135</td>
|
| 124 |
<td class="has-text-centered">0.278</td>
|
| 125 |
<td class="has-text-centered">0.200</td>
|
|
@@ -133,7 +133,7 @@
|
|
| 133 |
<td class="has-text-centered">0.542</td>
|
| 134 |
</tr>
|
| 135 |
<tr>
|
| 136 |
-
<td>Mixtral-8x22B Instruct</td>
|
| 137 |
<td class="has-text-centered">0.221</td>
|
| 138 |
<td class="has-text-centered">0.364</td>
|
| 139 |
<td class="has-text-centered">-0.310</td>
|
|
@@ -147,7 +147,7 @@
|
|
| 147 |
<td class="has-text-centered">0.538</td>
|
| 148 |
</tr>
|
| 149 |
<tr>
|
| 150 |
-
<td>Mixtral-8x7B Instruct</td>
|
| 151 |
<td class="has-text-centered">0.208</td>
|
| 152 |
<td class="has-text-centered">0.307</td>
|
| 153 |
<td class="has-text-centered">-0.229</td>
|
|
@@ -161,7 +161,7 @@
|
|
| 161 |
<td class="has-text-centered">0.518</td>
|
| 162 |
</tr>
|
| 163 |
<tr>
|
| 164 |
-
<td>Qwen 2 Instruct (72B)</td>
|
| 165 |
<td class="has-text-centered">0.205</td>
|
| 166 |
<td class="has-text-centered">0.409</td>
|
| 167 |
<td class="has-text-centered">-0.212</td>
|
|
@@ -175,7 +175,7 @@
|
|
| 175 |
<td class="has-text-centered">0.601</td>
|
| 176 |
</tr>
|
| 177 |
<tr>
|
| 178 |
-
<td>WizardLM-2 8x22B</td>
|
| 179 |
<td class="has-text-centered">0.129</td>
|
| 180 |
<td class="has-text-centered">0.283</td>
|
| 181 |
<td class="has-text-centered">0.239</td>
|
|
@@ -189,7 +189,7 @@
|
|
| 189 |
<td class="has-text-centered">0.570</td>
|
| 190 |
</tr>
|
| 191 |
<tr>
|
| 192 |
-
<td>DeepSeek-V3</td>
|
| 193 |
<td class="has-text-centered">0.150</td>
|
| 194 |
<td class="has-text-centered">0.311</td>
|
| 195 |
<td class="has-text-centered">0.111</td>
|
|
@@ -203,7 +203,7 @@
|
|
| 203 |
<td class="has-text-centered">0.572</td>
|
| 204 |
</tr>
|
| 205 |
<tr>
|
| 206 |
-
<td>DeepSeek R1</td>
|
| 207 |
<td class="has-text-centered performance-low">0.110</td>
|
| 208 |
<td class="has-text-centered">0.289</td>
|
| 209 |
<td class="has-text-centered">0.348</td>
|
|
@@ -217,7 +217,7 @@
|
|
| 217 |
<td class="has-text-centered">0.489</td>
|
| 218 |
</tr>
|
| 219 |
<tr>
|
| 220 |
-
<td>QwQ-32B-Preview</td>
|
| 221 |
<td class="has-text-centered">0.141</td>
|
| 222 |
<td class="has-text-centered">0.290</td>
|
| 223 |
<td class="has-text-centered">0.165</td>
|
|
@@ -231,7 +231,7 @@
|
|
| 231 |
<td class="has-text-centered">0.534</td>
|
| 232 |
</tr>
|
| 233 |
<tr>
|
| 234 |
-
<td>Jamba 1.5 Mini</td>
|
| 235 |
<td class="has-text-centered performance-low">0.119</td>
|
| 236 |
<td class="has-text-centered">0.282</td>
|
| 237 |
<td class="has-text-centered">0.293</td>
|
|
@@ -245,7 +245,7 @@
|
|
| 245 |
<td class="has-text-centered">0.525</td>
|
| 246 |
</tr>
|
| 247 |
<tr>
|
| 248 |
-
<td>Jamba 1.5 Large</td>
|
| 249 |
<td class="has-text-centered">0.183</td>
|
| 250 |
<td class="has-text-centered">0.363</td>
|
| 251 |
<td class="has-text-centered">-0.085</td>
|
|
@@ -259,7 +259,7 @@
|
|
| 259 |
<td class="has-text-centered">0.573</td>
|
| 260 |
</tr>
|
| 261 |
<tr>
|
| 262 |
-
<td>Claude 3.5 Sonnet</td>
|
| 263 |
<td class="has-text-centered performance-low">0.101</td>
|
| 264 |
<td class="has-text-centered performance-low">0.268</td>
|
| 265 |
<td class="has-text-centered performance-best">0.402</td>
|
|
@@ -273,7 +273,7 @@
|
|
| 273 |
<td class="has-text-centered performance-medium">0.585</td>
|
| 274 |
</tr>
|
| 275 |
<tr>
|
| 276 |
-
<td>Claude 3 Haiku</td>
|
| 277 |
<td class="has-text-centered">0.167</td>
|
| 278 |
<td class="has-text-centered">0.349</td>
|
| 279 |
<td class="has-text-centered">0.008</td>
|
|
@@ -287,7 +287,7 @@
|
|
| 287 |
<td class="has-text-centered">0.538</td>
|
| 288 |
</tr>
|
| 289 |
<tr>
|
| 290 |
-
<td>Cohere Command R 7B</td>
|
| 291 |
<td class="has-text-centered">0.164</td>
|
| 292 |
<td class="has-text-centered">0.319</td>
|
| 293 |
<td class="has-text-centered">0.028</td>
|
|
@@ -301,7 +301,7 @@
|
|
| 301 |
<td class="has-text-centered">0.547</td>
|
| 302 |
</tr>
|
| 303 |
<tr>
|
| 304 |
-
<td>Cohere Command R +</td>
|
| 305 |
<td class="has-text-centered performance-medium">0.106</td>
|
| 306 |
<td class="has-text-centered">0.274</td>
|
| 307 |
<td class="has-text-centered performance-medium">0.373</td>
|
|
@@ -315,7 +315,7 @@
|
|
| 315 |
<td class="has-text-centered">0.547</td>
|
| 316 |
</tr>
|
| 317 |
<tr>
|
| 318 |
-
<td>Google Gemini 1.5 Pro</td>
|
| 319 |
<td class="has-text-centered">0.144</td>
|
| 320 |
<td class="has-text-centered">0.329</td>
|
| 321 |
<td class="has-text-centered">0.149</td>
|
|
@@ -329,7 +329,7 @@
|
|
| 329 |
<td class="has-text-centered performance-best">0.587</td>
|
| 330 |
</tr>
|
| 331 |
<tr>
|
| 332 |
-
<td>OpenAI gpt-4o</td>
|
| 333 |
<td class="has-text-centered">0.184</td>
|
| 334 |
<td class="has-text-centered">0.317</td>
|
| 335 |
<td class="has-text-centered">-0.089</td>
|
|
@@ -343,7 +343,7 @@
|
|
| 343 |
<td class="has-text-centered">0.515</td>
|
| 344 |
</tr>
|
| 345 |
<tr>
|
| 346 |
-
<td>OpenAI o1-mini</td>
|
| 347 |
<td class="has-text-centered performance-medium">0.120</td>
|
| 348 |
<td class="has-text-centered">0.295</td>
|
| 349 |
<td class="has-text-centered">0.289</td>
|
|
|
|
| 35 |
</thead>
|
| 36 |
<tbody>
|
| 37 |
<tr>
|
| 38 |
+
<td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
|
| 39 |
<td class="has-text-centered">0.123</td>
|
| 40 |
<td class="has-text-centered">0.290</td>
|
| 41 |
<td class="has-text-centered">0.272</td>
|
|
|
|
| 49 |
<td class="has-text-centered">0.573</td>
|
| 50 |
</tr>
|
| 51 |
<tr>
|
| 52 |
+
<td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
|
| 53 |
<td class="has-text-centered">0.161</td>
|
| 54 |
<td class="has-text-centered">0.344</td>
|
| 55 |
<td class="has-text-centered">0.045</td>
|
|
|
|
| 63 |
<td class="has-text-centered">0.625</td>
|
| 64 |
</tr>
|
| 65 |
<tr>
|
| 66 |
+
<td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
|
| 67 |
<td class="has-text-centered">0.160</td>
|
| 68 |
<td class="has-text-centered">0.321</td>
|
| 69 |
<td class="has-text-centered">0.052</td>
|
|
|
|
| 77 |
<td class="has-text-centered">0.541</td>
|
| 78 |
</tr>
|
| 79 |
<tr>
|
| 80 |
+
<td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
|
| 81 |
<td class="has-text-centered">0.118</td>
|
| 82 |
<td class="has-text-centered">0.278</td>
|
| 83 |
<td class="has-text-centered">0.302</td>
|
|
|
|
| 91 |
<td class="has-text-centered">0.544</td>
|
| 92 |
</tr>
|
| 93 |
<tr>
|
| 94 |
+
<td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
|
| 95 |
<td class="has-text-centered performance-best">0.100</td>
|
| 96 |
<td class="has-text-centered performance-best">0.266</td>
|
| 97 |
<td class="has-text-centered">0.406</td>
|
|
|
|
| 105 |
<td class="has-text-centered">0.524</td>
|
| 106 |
</tr>
|
| 107 |
<tr>
|
| 108 |
+
<td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
|
| 109 |
<td class="has-text-centered">0.189</td>
|
| 110 |
<td class="has-text-centered">0.352</td>
|
| 111 |
<td class="has-text-centered">-0.120</td>
|
|
|
|
| 119 |
<td class="has-text-centered">0.499</td>
|
| 120 |
</tr>
|
| 121 |
<tr>
|
| 122 |
+
<td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
|
| 123 |
<td class="has-text-centered">0.135</td>
|
| 124 |
<td class="has-text-centered">0.278</td>
|
| 125 |
<td class="has-text-centered">0.200</td>
|
|
|
|
| 133 |
<td class="has-text-centered">0.542</td>
|
| 134 |
</tr>
|
| 135 |
<tr>
|
| 136 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
|
| 137 |
<td class="has-text-centered">0.221</td>
|
| 138 |
<td class="has-text-centered">0.364</td>
|
| 139 |
<td class="has-text-centered">-0.310</td>
|
|
|
|
| 147 |
<td class="has-text-centered">0.538</td>
|
| 148 |
</tr>
|
| 149 |
<tr>
|
| 150 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
|
| 151 |
<td class="has-text-centered">0.208</td>
|
| 152 |
<td class="has-text-centered">0.307</td>
|
| 153 |
<td class="has-text-centered">-0.229</td>
|
|
|
|
| 161 |
<td class="has-text-centered">0.518</td>
|
| 162 |
</tr>
|
| 163 |
<tr>
|
| 164 |
+
<td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
|
| 165 |
<td class="has-text-centered">0.205</td>
|
| 166 |
<td class="has-text-centered">0.409</td>
|
| 167 |
<td class="has-text-centered">-0.212</td>
|
|
|
|
| 175 |
<td class="has-text-centered">0.601</td>
|
| 176 |
</tr>
|
| 177 |
<tr>
|
| 178 |
+
<td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 141 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
|
| 179 |
<td class="has-text-centered">0.129</td>
|
| 180 |
<td class="has-text-centered">0.283</td>
|
| 181 |
<td class="has-text-centered">0.239</td>
|
|
|
|
| 189 |
<td class="has-text-centered">0.570</td>
|
| 190 |
</tr>
|
| 191 |
<tr>
|
| 192 |
+
<td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 671 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
|
| 193 |
<td class="has-text-centered">0.150</td>
|
| 194 |
<td class="has-text-centered">0.311</td>
|
| 195 |
<td class="has-text-centered">0.111</td>
|
|
|
|
| 203 |
<td class="has-text-centered">0.572</td>
|
| 204 |
</tr>
|
| 205 |
<tr>
|
| 206 |
+
<td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
|
| 207 |
<td class="has-text-centered performance-low">0.110</td>
|
| 208 |
<td class="has-text-centered">0.289</td>
|
| 209 |
<td class="has-text-centered">0.348</td>
|
|
|
|
| 217 |
<td class="has-text-centered">0.489</td>
|
| 218 |
</tr>
|
| 219 |
<tr>
|
| 220 |
+
<td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter dense model focused on reasoning. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
|
| 221 |
<td class="has-text-centered">0.141</td>
|
| 222 |
<td class="has-text-centered">0.290</td>
|
| 223 |
<td class="has-text-centered">0.165</td>
|
|
|
|
| 231 |
<td class="has-text-centered">0.534</td>
|
| 232 |
</tr>
|
| 233 |
<tr>
|
| 234 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
|
| 235 |
<td class="has-text-centered performance-low">0.119</td>
|
| 236 |
<td class="has-text-centered">0.282</td>
|
| 237 |
<td class="has-text-centered">0.293</td>
|
|
|
|
| 245 |
<td class="has-text-centered">0.525</td>
|
| 246 |
</tr>
|
| 247 |
<tr>
|
| 248 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
|
| 249 |
<td class="has-text-centered">0.183</td>
|
| 250 |
<td class="has-text-centered">0.363</td>
|
| 251 |
<td class="has-text-centered">-0.085</td>
|
|
|
|
| 259 |
<td class="has-text-centered">0.573</td>
|
| 260 |
</tr>
|
| 261 |
<tr>
|
| 262 |
+
<td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
|
| 263 |
<td class="has-text-centered performance-low">0.101</td>
|
| 264 |
<td class="has-text-centered performance-low">0.268</td>
|
| 265 |
<td class="has-text-centered performance-best">0.402</td>
|
|
|
|
| 273 |
<td class="has-text-centered performance-medium">0.585</td>
|
| 274 |
</tr>
|
| 275 |
<tr>
|
| 276 |
+
<td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
|
| 277 |
<td class="has-text-centered">0.167</td>
|
| 278 |
<td class="has-text-centered">0.349</td>
|
| 279 |
<td class="has-text-centered">0.008</td>
|
|
|
|
| 287 |
<td class="has-text-centered">0.538</td>
|
| 288 |
</tr>
|
| 289 |
<tr>
|
| 290 |
+
<td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
|
| 291 |
<td class="has-text-centered">0.164</td>
|
| 292 |
<td class="has-text-centered">0.319</td>
|
| 293 |
<td class="has-text-centered">0.028</td>
|
|
|
|
| 301 |
<td class="has-text-centered">0.547</td>
|
| 302 |
</tr>
|
| 303 |
<tr>
|
| 304 |
+
<td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
|
| 305 |
<td class="has-text-centered performance-medium">0.106</td>
|
| 306 |
<td class="has-text-centered">0.274</td>
|
| 307 |
<td class="has-text-centered performance-medium">0.373</td>
|
|
|
|
| 315 |
<td class="has-text-centered">0.547</td>
|
| 316 |
</tr>
|
| 317 |
<tr>
|
| 318 |
+
<td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
|
| 319 |
<td class="has-text-centered">0.144</td>
|
| 320 |
<td class="has-text-centered">0.329</td>
|
| 321 |
<td class="has-text-centered">0.149</td>
|
|
|
|
| 329 |
<td class="has-text-centered performance-best">0.587</td>
|
| 330 |
</tr>
|
| 331 |
<tr>
|
| 332 |
+
<td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
|
| 333 |
<td class="has-text-centered">0.184</td>
|
| 334 |
<td class="has-text-centered">0.317</td>
|
| 335 |
<td class="has-text-centered">-0.089</td>
|
|
|
|
| 343 |
<td class="has-text-centered">0.515</td>
|
| 344 |
</tr>
|
| 345 |
<tr>
|
| 346 |
+
<td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
|
| 347 |
<td class="has-text-centered performance-medium">0.120</td>
|
| 348 |
<td class="has-text-centered">0.295</td>
|
| 349 |
<td class="has-text-centered">0.289</td>
|
static/js/model-tooltips.js
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * Model tooltip fallback.
 *
 * Adds the `tooltip-trigger` class plus `data-title` / `data-tooltip`
 * attributes to every first-column table cell that does not already carry
 * the class (cells whose tooltip is hard-coded in the markup are left
 * untouched). The pass runs once shortly after the page is ready and again
 * after any `.tabs li` element is clicked.
 *
 * Everything lives inside an IIFE so no helper name leaks into the global
 * scope shared with the page's other scripts.
 */
(function () {
  // Ordered [substrings, description] pairs. The first entry having ANY of
  // its substrings contained in the cell text wins, so the order is
  // significant and mirrors the original if / else-if chain. Matching is
  // case-sensitive, which is why "GPT-4o" and "gpt-4o" are both listed.
  const MODEL_DESCRIPTIONS = [
    [["GPT-4o", "gpt-4o"],
      "OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following."],
    [["o1-mini"],
      "OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count."],
    [["Claude 3.5 Sonnet"],
      "Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities."],
    [["Claude 3 Haiku"],
      "Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks."],
    [["Gemini 1.5"],
      "Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities."],
    [["Command R 7B"],
      "Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size."],
    [["Command R +"],
      "Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart."],
    [["DeepSeek R1"],
      "DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks."],
    [["DeepSeek-V3", "DeepSeek V3"],
      "DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities."],
    [["DeepSeek LLM"],
      "DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities."],
    [["Llama 3 70B"],
      "Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities."],
    [["Llama 3 8B"],
      "Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities."],
    [["DBRX"],
      "Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities."],
    [["Mixtral-8x22B"],
      "Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance."],
    [["Mixtral-8x7B"],
      "Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities."],
    [["Mistral"],
      "Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size."],
    [["Qwen 2"],
      "Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities."],
    [["WizardLM"],
      "A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks."],
    [["Gemma 2 27B"],
      "Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following."],
    [["Gemma 2 9B"],
      "Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size."],
    [["QwQ-32B"],
      "Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks."],
    [["Jamba 1.5 Mini"],
      "A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks."],
    [["Jamba 1.5 Large"],
      "An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart."]
  ];

  // Used when no entry of MODEL_DESCRIPTIONS matches the cell text.
  const DEFAULT_DESCRIPTION = "A language model evaluated in the FLaME financial benchmark. Assessed across multiple financial NLP tasks including classification, summarization, QA, and more.";

  /**
   * Return the tooltip description for a model name.
   *
   * @param {string} modelName Trimmed text content of a model-name cell.
   * @returns {string} Description of the first matching entry, or the
   *   generic benchmark description when nothing matches.
   */
  function describeModel(modelName) {
    const match = MODEL_DESCRIPTIONS.find(([needles]) =>
      needles.some(needle => modelName.includes(needle)));
    return match ? match[1] : DEFAULT_DESCRIPTION;
  }

  /**
   * Decorate every not-yet-decorated first-column cell with tooltip data.
   * Safe to call repeatedly: cells that already have the `tooltip-trigger`
   * class are skipped, so repeated passes only touch newly found cells.
   */
  function fixAllModelTooltips() {
    console.log("Fixing model tooltips in all tabs");

    // First column of every table row on the page.
    document.querySelectorAll('td:first-child').forEach(cell => {
      // Already decorated (by markup or by a previous pass).
      if (cell.classList.contains('tooltip-trigger')) {
        return;
      }

      const modelName = cell.textContent.trim();

      // An empty cell has no model to describe; decorating it would only
      // produce a tooltip with an empty title.
      if (!modelName) {
        return;
      }

      cell.classList.add('tooltip-trigger');
      cell.style.position = 'relative';
      cell.setAttribute('data-title', modelName);
      cell.setAttribute('data-tooltip', describeModel(modelName));
    });

    // Optional hook: not defined in this file, so it is called only when
    // some other script has installed it on `window`.
    if (window.fixProblemTooltips) {
      window.fixProblemTooltips();
    }
  }

  /**
   * Schedule the initial pass and re-run the pass after each tab click.
   */
  function init() {
    // Delayed rather than immediate, as in the original.
    // NOTE(review): presumably this waits for table content that is
    // inserted after load — confirm.
    setTimeout(fixAllModelTooltips, 500);

    document.querySelectorAll('.tabs li').forEach(tab => {
      tab.addEventListener('click', () => {
        // Give time for content to be displayed
        setTimeout(fixAllModelTooltips, 200);
      });
    });
  }

  // If this script is evaluated after DOMContentLoaded has already fired
  // (async or dynamically injected script), a listener for that event would
  // never be called, so start right away in that case.
  if (document.readyState === 'loading') {
    document.addEventListener('DOMContentLoaded', init);
  } else {
    init();
  }
})();
|
text_classification_table.html
CHANGED
|
@@ -43,7 +43,7 @@
|
|
| 43 |
</thead>
|
| 44 |
<tbody>
|
| 45 |
<tr>
|
| 46 |
-
<td>Llama 3 70B Instruct</td>
|
| 47 |
<td class="has-text-centered">0.660</td>
|
| 48 |
<td class="has-text-centered">0.748</td>
|
| 49 |
<td class="has-text-centered">0.660</td>
|
|
@@ -63,7 +63,7 @@
|
|
| 63 |
<td class="has-text-centered">0.811</td>
|
| 64 |
</tr>
|
| 65 |
<tr>
|
| 66 |
-
<td>Llama 3 8B Instruct</td>
|
| 67 |
<td class="has-text-centered">0.534</td>
|
| 68 |
<td class="has-text-centered">0.672</td>
|
| 69 |
<td class="has-text-centered">0.534</td>
|
|
@@ -83,7 +83,7 @@
|
|
| 83 |
<td class="has-text-centered">0.763</td>
|
| 84 |
</tr>
|
| 85 |
<tr>
|
| 86 |
-
<td>DBRX Instruct</td>
|
| 87 |
<td class="has-text-centered">0.578</td>
|
| 88 |
<td class="has-text-centered">0.706</td>
|
| 89 |
<td class="has-text-centered">0.578</td>
|
|
@@ -103,7 +103,7 @@
|
|
| 103 |
<td class="has-text-centered">0.746</td>
|
| 104 |
</tr>
|
| 105 |
<tr>
|
| 106 |
-
<td>DeepSeek LLM (67B)</td>
|
| 107 |
<td class="has-text-centered">0.596</td>
|
| 108 |
<td class="has-text-centered">0.711</td>
|
| 109 |
<td class="has-text-centered">0.596</td>
|
|
@@ -123,7 +123,7 @@
|
|
| 123 |
<td class="has-text-centered">0.778</td>
|
| 124 |
</tr>
|
| 125 |
<tr>
|
| 126 |
-
<td>Gemma 2 27B</td>
|
| 127 |
<td class="has-text-centered">0.639</td>
|
| 128 |
<td class="has-text-centered">0.730</td>
|
| 129 |
<td class="has-text-centered">0.639</td>
|
|
@@ -143,7 +143,7 @@
|
|
| 143 |
<td class="has-text-centered">0.808</td>
|
| 144 |
</tr>
|
| 145 |
<tr>
|
| 146 |
-
<td>Gemma 2 9B</td>
|
| 147 |
<td class="has-text-centered">0.630</td>
|
| 148 |
<td class="has-text-centered">0.710</td>
|
| 149 |
<td class="has-text-centered">0.630</td>
|
|
@@ -163,7 +163,7 @@
|
|
| 163 |
<td class="has-text-centered performance-best">0.856</td>
|
| 164 |
</tr>
|
| 165 |
<tr>
|
| 166 |
-
<td>Mistral (7B) Instruct v0.3</td>
|
| 167 |
<td class="has-text-centered">0.547</td>
|
| 168 |
<td class="has-text-centered">0.677</td>
|
| 169 |
<td class="has-text-centered">0.547</td>
|
|
@@ -183,7 +183,7 @@
|
|
| 183 |
<td class="has-text-centered">0.779</td>
|
| 184 |
</tr>
|
| 185 |
<tr>
|
| 186 |
-
<td>Mixtral-8x22B Instruct</td>
|
| 187 |
<td class="has-text-centered">0.622</td>
|
| 188 |
<td class="has-text-centered">0.718</td>
|
| 189 |
<td class="has-text-centered">0.622</td>
|
|
@@ -203,7 +203,7 @@
|
|
| 203 |
<td class="has-text-centered performance-medium">0.835</td>
|
| 204 |
</tr>
|
| 205 |
<tr>
|
| 206 |
-
<td>Mixtral-8x7B Instruct</td>
|
| 207 |
<td class="has-text-centered">0.567</td>
|
| 208 |
<td class="has-text-centered">0.693</td>
|
| 209 |
<td class="has-text-centered">0.567</td>
|
|
@@ -223,7 +223,7 @@
|
|
| 223 |
<td class="has-text-centered">0.805</td>
|
| 224 |
</tr>
|
| 225 |
<tr>
|
| 226 |
-
<td>Qwen 2 Instruct (72B)</td>
|
| 227 |
<td class="has-text-centered">0.644</td>
|
| 228 |
<td class="has-text-centered">0.730</td>
|
| 229 |
<td class="has-text-centered">0.644</td>
|
|
@@ -243,7 +243,7 @@
|
|
| 243 |
<td class="has-text-centered">0.830</td>
|
| 244 |
</tr>
|
| 245 |
<tr>
|
| 246 |
-
<td>WizardLM-2 8x22B</td>
|
| 247 |
<td class="has-text-centered">0.664</td>
|
| 248 |
<td class="has-text-centered">0.737</td>
|
| 249 |
<td class="has-text-centered">0.664</td>
|
|
@@ -263,7 +263,7 @@
|
|
| 263 |
<td class="has-text-centered">0.797</td>
|
| 264 |
</tr>
|
| 265 |
<tr>
|
| 266 |
-
<td>DeepSeek-V3</td>
|
| 267 |
<td class="has-text-centered performance-strong">0.722</td>
|
| 268 |
<td class="has-text-centered performance-medium">0.774</td>
|
| 269 |
<td class="has-text-centered performance-strong">0.722</td>
|
|
@@ -283,7 +283,7 @@
|
|
| 283 |
<td class="has-text-centered">0.729</td>
|
| 284 |
</tr>
|
| 285 |
<tr>
|
| 286 |
-
<td>DeepSeek R1</td>
|
| 287 |
<td class="has-text-centered performance-best">0.772</td>
|
| 288 |
<td class="has-text-centered performance-strong">0.789</td>
|
| 289 |
<td class="has-text-centered performance-best">0.772</td>
|
|
@@ -303,7 +303,7 @@
|
|
| 303 |
<td class="has-text-centered">0.769</td>
|
| 304 |
</tr>
|
| 305 |
<tr>
|
| 306 |
-
<td>QwQ-32B-Preview</td>
|
| 307 |
<td class="has-text-centered">0.577</td>
|
| 308 |
<td class="has-text-centered">0.747</td>
|
| 309 |
<td class="has-text-centered">0.577</td>
|
|
@@ -323,7 +323,7 @@
|
|
| 323 |
<td class="has-text-centered">0.744</td>
|
| 324 |
</tr>
|
| 325 |
<tr>
|
| 326 |
-
<td>Jamba 1.5 Mini</td>
|
| 327 |
<td class="has-text-centered">0.528</td>
|
| 328 |
<td class="has-text-centered">0.630</td>
|
| 329 |
<td class="has-text-centered">0.528</td>
|
|
@@ -343,7 +343,7 @@
|
|
| 343 |
<td class="has-text-centered">0.682</td>
|
| 344 |
</tr>
|
| 345 |
<tr>
|
| 346 |
-
<td>Jamba 1.5 Large</td>
|
| 347 |
<td class="has-text-centered">0.642</td>
|
| 348 |
<td class="has-text-centered">0.746</td>
|
| 349 |
<td class="has-text-centered">0.642</td>
|
|
@@ -363,7 +363,7 @@
|
|
| 363 |
<td class="has-text-centered">0.782</td>
|
| 364 |
</tr>
|
| 365 |
<tr>
|
| 366 |
-
<td>Claude 3.5 Sonnet</td>
|
| 367 |
<td class="has-text-centered">0.682</td>
|
| 368 |
<td class="has-text-centered">0.755</td>
|
| 369 |
<td class="has-text-centered">0.682</td>
|
|
@@ -383,7 +383,7 @@
|
|
| 383 |
<td class="has-text-centered">0.827</td>
|
| 384 |
</tr>
|
| 385 |
<tr>
|
| 386 |
-
<td>Claude 3 Haiku</td>
|
| 387 |
<td class="has-text-centered">0.639</td>
|
| 388 |
<td class="has-text-centered">0.735</td>
|
| 389 |
<td class="has-text-centered">0.639</td>
|
|
@@ -403,7 +403,7 @@
|
|
| 403 |
<td class="has-text-centered">0.781</td>
|
| 404 |
</tr>
|
| 405 |
<tr>
|
| 406 |
-
<td>Cohere Command R 7B</td>
|
| 407 |
<td class="has-text-centered">0.530</td>
|
| 408 |
<td class="has-text-centered">0.650</td>
|
| 409 |
<td class="has-text-centered">0.530</td>
|
|
@@ -423,7 +423,7 @@
|
|
| 423 |
<td class="has-text-centered">0.770</td>
|
| 424 |
</tr>
|
| 425 |
<tr>
|
| 426 |
-
<td>Cohere Command R +</td>
|
| 427 |
<td class="has-text-centered">0.660</td>
|
| 428 |
<td class="has-text-centered">0.747</td>
|
| 429 |
<td class="has-text-centered">0.660</td>
|
|
@@ -443,7 +443,7 @@
|
|
| 443 |
<td class="has-text-centered">0.812</td>
|
| 444 |
</tr>
|
| 445 |
<tr>
|
| 446 |
-
<td>Google Gemini 1.5 Pro</td>
|
| 447 |
<td class="has-text-centered">0.483</td>
|
| 448 |
<td class="has-text-centered">0.487</td>
|
| 449 |
<td class="has-text-centered">0.483</td>
|
|
@@ -463,7 +463,7 @@
|
|
| 463 |
<td class="has-text-centered performance-strong">0.837</td>
|
| 464 |
</tr>
|
| 465 |
<tr>
|
| 466 |
-
<td>OpenAI gpt-4o</td>
|
| 467 |
<td class="has-text-centered performance-medium">0.704</td>
|
| 468 |
<td class="has-text-centered performance-best">0.792</td>
|
| 469 |
<td class="has-text-centered performance-medium">0.704</td>
|
|
@@ -483,7 +483,7 @@
|
|
| 483 |
<td class="has-text-centered">0.824</td>
|
| 484 |
</tr>
|
| 485 |
<tr>
|
| 486 |
-
<td>OpenAI o1-mini</td>
|
| 487 |
<td class="has-text-centered">0.681</td>
|
| 488 |
<td class="has-text-centered">0.760</td>
|
| 489 |
<td class="has-text-centered">0.681</td>
|
|
|
|
| 43 |
</thead>
|
| 44 |
<tbody>
|
| 45 |
<tr>
|
| 46 |
+
<td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
|
| 47 |
<td class="has-text-centered">0.660</td>
|
| 48 |
<td class="has-text-centered">0.748</td>
|
| 49 |
<td class="has-text-centered">0.660</td>
|
|
|
|
| 63 |
<td class="has-text-centered">0.811</td>
|
| 64 |
</tr>
|
| 65 |
<tr>
|
| 66 |
+
<td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
|
| 67 |
<td class="has-text-centered">0.534</td>
|
| 68 |
<td class="has-text-centered">0.672</td>
|
| 69 |
<td class="has-text-centered">0.534</td>
|
|
|
|
| 83 |
<td class="has-text-centered">0.763</td>
|
| 84 |
</tr>
|
| 85 |
<tr>
|
| 86 |
+
<td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
|
| 87 |
<td class="has-text-centered">0.578</td>
|
| 88 |
<td class="has-text-centered">0.706</td>
|
| 89 |
<td class="has-text-centered">0.578</td>
|
|
|
|
| 103 |
<td class="has-text-centered">0.746</td>
|
| 104 |
</tr>
|
| 105 |
<tr>
|
| 106 |
+
<td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
|
| 107 |
<td class="has-text-centered">0.596</td>
|
| 108 |
<td class="has-text-centered">0.711</td>
|
| 109 |
<td class="has-text-centered">0.596</td>
|
|
|
|
| 123 |
<td class="has-text-centered">0.778</td>
|
| 124 |
</tr>
|
| 125 |
<tr>
|
| 126 |
+
<td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
|
| 127 |
<td class="has-text-centered">0.639</td>
|
| 128 |
<td class="has-text-centered">0.730</td>
|
| 129 |
<td class="has-text-centered">0.639</td>
|
|
|
|
| 143 |
<td class="has-text-centered">0.808</td>
|
| 144 |
</tr>
|
| 145 |
<tr>
|
| 146 |
+
<td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
|
| 147 |
<td class="has-text-centered">0.630</td>
|
| 148 |
<td class="has-text-centered">0.710</td>
|
| 149 |
<td class="has-text-centered">0.630</td>
|
|
|
|
| 163 |
<td class="has-text-centered performance-best">0.856</td>
|
| 164 |
</tr>
|
| 165 |
<tr>
|
| 166 |
+
<td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
|
| 167 |
<td class="has-text-centered">0.547</td>
|
| 168 |
<td class="has-text-centered">0.677</td>
|
| 169 |
<td class="has-text-centered">0.547</td>
|
|
|
|
| 183 |
<td class="has-text-centered">0.779</td>
|
| 184 |
</tr>
|
| 185 |
<tr>
|
| 186 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
|
| 187 |
<td class="has-text-centered">0.622</td>
|
| 188 |
<td class="has-text-centered">0.718</td>
|
| 189 |
<td class="has-text-centered">0.622</td>
|
|
|
|
| 203 |
<td class="has-text-centered performance-medium">0.835</td>
|
| 204 |
</tr>
|
| 205 |
<tr>
|
| 206 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
|
| 207 |
<td class="has-text-centered">0.567</td>
|
| 208 |
<td class="has-text-centered">0.693</td>
|
| 209 |
<td class="has-text-centered">0.567</td>
|
|
|
|
| 223 |
<td class="has-text-centered">0.805</td>
|
| 224 |
</tr>
|
| 225 |
<tr>
|
| 226 |
+
<td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
|
| 227 |
<td class="has-text-centered">0.644</td>
|
| 228 |
<td class="has-text-centered">0.730</td>
|
| 229 |
<td class="has-text-centered">0.644</td>
|
|
|
|
| 243 |
<td class="has-text-centered">0.830</td>
|
| 244 |
</tr>
|
| 245 |
<tr>
|
| 246 |
+
<td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
|
| 247 |
<td class="has-text-centered">0.664</td>
|
| 248 |
<td class="has-text-centered">0.737</td>
|
| 249 |
<td class="has-text-centered">0.664</td>
|
|
|
|
| 263 |
<td class="has-text-centered">0.797</td>
|
| 264 |
</tr>
|
| 265 |
<tr>
|
| 266 |
+
<td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
|
| 267 |
<td class="has-text-centered performance-strong">0.722</td>
|
| 268 |
<td class="has-text-centered performance-medium">0.774</td>
|
| 269 |
<td class="has-text-centered performance-strong">0.722</td>
|
|
|
|
| 283 |
<td class="has-text-centered">0.729</td>
|
| 284 |
</tr>
|
| 285 |
<tr>
|
| 286 |
+
<td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
|
| 287 |
<td class="has-text-centered performance-best">0.772</td>
|
| 288 |
<td class="has-text-centered performance-strong">0.789</td>
|
| 289 |
<td class="has-text-centered performance-best">0.772</td>
|
|
|
|
| 303 |
<td class="has-text-centered">0.769</td>
|
| 304 |
</tr>
|
| 305 |
<tr>
|
| 306 |
+
<td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
|
| 307 |
<td class="has-text-centered">0.577</td>
|
| 308 |
<td class="has-text-centered">0.747</td>
|
| 309 |
<td class="has-text-centered">0.577</td>
|
|
|
|
| 323 |
<td class="has-text-centered">0.744</td>
|
| 324 |
</tr>
|
| 325 |
<tr>
|
| 326 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
|
| 327 |
<td class="has-text-centered">0.528</td>
|
| 328 |
<td class="has-text-centered">0.630</td>
|
| 329 |
<td class="has-text-centered">0.528</td>
|
|
|
|
| 343 |
<td class="has-text-centered">0.682</td>
|
| 344 |
</tr>
|
| 345 |
<tr>
|
| 346 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
|
| 347 |
<td class="has-text-centered">0.642</td>
|
| 348 |
<td class="has-text-centered">0.746</td>
|
| 349 |
<td class="has-text-centered">0.642</td>
|
|
|
|
| 363 |
<td class="has-text-centered">0.782</td>
|
| 364 |
</tr>
|
| 365 |
<tr>
|
| 366 |
+
<td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
|
| 367 |
<td class="has-text-centered">0.682</td>
|
| 368 |
<td class="has-text-centered">0.755</td>
|
| 369 |
<td class="has-text-centered">0.682</td>
|
|
|
|
| 383 |
<td class="has-text-centered">0.827</td>
|
| 384 |
</tr>
|
| 385 |
<tr>
|
| 386 |
+
<td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
|
| 387 |
<td class="has-text-centered">0.639</td>
|
| 388 |
<td class="has-text-centered">0.735</td>
|
| 389 |
<td class="has-text-centered">0.639</td>
|
|
|
|
| 403 |
<td class="has-text-centered">0.781</td>
|
| 404 |
</tr>
|
| 405 |
<tr>
|
| 406 |
+
<td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
|
| 407 |
<td class="has-text-centered">0.530</td>
|
| 408 |
<td class="has-text-centered">0.650</td>
|
| 409 |
<td class="has-text-centered">0.530</td>
|
|
|
|
| 423 |
<td class="has-text-centered">0.770</td>
|
| 424 |
</tr>
|
| 425 |
<tr>
|
| 426 |
+
<td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
|
| 427 |
<td class="has-text-centered">0.660</td>
|
| 428 |
<td class="has-text-centered">0.747</td>
|
| 429 |
<td class="has-text-centered">0.660</td>
|
|
|
|
| 443 |
<td class="has-text-centered">0.812</td>
|
| 444 |
</tr>
|
| 445 |
<tr>
|
| 446 |
+
<td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
|
| 447 |
<td class="has-text-centered">0.483</td>
|
| 448 |
<td class="has-text-centered">0.487</td>
|
| 449 |
<td class="has-text-centered">0.483</td>
|
|
|
|
| 463 |
<td class="has-text-centered performance-strong">0.837</td>
|
| 464 |
</tr>
|
| 465 |
<tr>
|
| 466 |
+
<td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
|
| 467 |
<td class="has-text-centered performance-medium">0.704</td>
|
| 468 |
<td class="has-text-centered performance-best">0.792</td>
|
| 469 |
<td class="has-text-centered performance-medium">0.704</td>
|
|
|
|
| 483 |
<td class="has-text-centered">0.824</td>
|
| 484 |
</tr>
|
| 485 |
<tr>
|
| 486 |
+
<td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
|
| 487 |
<td class="has-text-centered">0.681</td>
|
| 488 |
<td class="has-text-centered">0.760</td>
|
| 489 |
<td class="has-text-centered">0.681</td>
|
text_summarization_table.html
CHANGED
|
@@ -29,7 +29,7 @@
|
|
| 29 |
</thead>
|
| 30 |
<tbody>
|
| 31 |
<tr>
|
| 32 |
-
<td>Llama 3 70B Instruct</td>
|
| 33 |
<td class="has-text-centered">0.715</td>
|
| 34 |
<td class="has-text-centered">0.801</td>
|
| 35 |
<td class="has-text-centered">0.754</td>
|
|
@@ -38,7 +38,7 @@
|
|
| 38 |
<td class="has-text-centered performance-strong">0.817</td>
|
| 39 |
</tr>
|
| 40 |
<tr>
|
| 41 |
-
<td>Llama 3 8B Instruct</td>
|
| 42 |
<td class="has-text-centered">0.724</td>
|
| 43 |
<td class="has-text-centered">0.796</td>
|
| 44 |
<td class="has-text-centered">0.757</td>
|
|
@@ -47,7 +47,7 @@
|
|
| 47 |
<td class="has-text-centered">0.811</td>
|
| 48 |
</tr>
|
| 49 |
<tr>
|
| 50 |
-
<td>DBRX Instruct</td>
|
| 51 |
<td class="has-text-centered">0.680</td>
|
| 52 |
<td class="has-text-centered">0.786</td>
|
| 53 |
<td class="has-text-centered">0.729</td>
|
|
@@ -56,7 +56,7 @@
|
|
| 56 |
<td class="has-text-centered">0.806</td>
|
| 57 |
</tr>
|
| 58 |
<tr>
|
| 59 |
-
<td>DeepSeek LLM (67B)</td>
|
| 60 |
<td class="has-text-centered">0.692</td>
|
| 61 |
<td class="has-text-centered">0.678</td>
|
| 62 |
<td class="has-text-centered">0.681</td>
|
|
@@ -65,7 +65,7 @@
|
|
| 65 |
<td class="has-text-centered">0.807</td>
|
| 66 |
</tr>
|
| 67 |
<tr>
|
| 68 |
-
<td>Gemma 2 27B</td>
|
| 69 |
<td class="has-text-centered">0.680</td>
|
| 70 |
<td class="has-text-centered">0.777</td>
|
| 71 |
<td class="has-text-centered">0.723</td>
|
|
@@ -74,7 +74,7 @@
|
|
| 74 |
<td class="has-text-centered">0.814</td>
|
| 75 |
</tr>
|
| 76 |
<tr>
|
| 77 |
-
<td>Gemma 2 9B</td>
|
| 78 |
<td class="has-text-centered">0.651</td>
|
| 79 |
<td class="has-text-centered">0.531</td>
|
| 80 |
<td class="has-text-centered">0.585</td>
|
|
@@ -83,7 +83,7 @@
|
|
| 83 |
<td class="has-text-centered performance-strong">0.817</td>
|
| 84 |
</tr>
|
| 85 |
<tr>
|
| 86 |
-
<td>Mistral (7B) Instruct v0.3</td>
|
| 87 |
<td class="has-text-centered">0.702</td>
|
| 88 |
<td class="has-text-centered performance-strong">0.806</td>
|
| 89 |
<td class="has-text-centered">0.750</td>
|
|
@@ -92,7 +92,7 @@
|
|
| 92 |
<td class="has-text-centered">0.811</td>
|
| 93 |
</tr>
|
| 94 |
<tr>
|
| 95 |
-
<td>Mixtral-8x22B Instruct</td>
|
| 96 |
<td class="has-text-centered">0.713</td>
|
| 97 |
<td class="has-text-centered performance-best">0.812</td>
|
| 98 |
<td class="has-text-centered">0.758</td>
|
|
@@ -101,7 +101,7 @@
|
|
| 101 |
<td class="has-text-centered">0.815</td>
|
| 102 |
</tr>
|
| 103 |
<tr>
|
| 104 |
-
<td>Mixtral-8x7B Instruct</td>
|
| 105 |
<td class="has-text-centered">0.727</td>
|
| 106 |
<td class="has-text-centered">0.773</td>
|
| 107 |
<td class="has-text-centered">0.747</td>
|
|
@@ -110,7 +110,7 @@
|
|
| 110 |
<td class="has-text-centered">0.810</td>
|
| 111 |
</tr>
|
| 112 |
<tr>
|
| 113 |
-
<td>Qwen 2 Instruct (72B)</td>
|
| 114 |
<td class="has-text-centered">0.709</td>
|
| 115 |
<td class="has-text-centered performance-medium">0.804</td>
|
| 116 |
<td class="has-text-centered">0.752</td>
|
|
@@ -119,7 +119,7 @@
|
|
| 119 |
<td class="has-text-centered">0.811</td>
|
| 120 |
</tr>
|
| 121 |
<tr>
|
| 122 |
-
<td>WizardLM-2 8x22B</td>
|
| 123 |
<td class="has-text-centered">0.677</td>
|
| 124 |
<td class="has-text-centered performance-strong">0.806</td>
|
| 125 |
<td class="has-text-centered">0.735</td>
|
|
@@ -128,7 +128,7 @@
|
|
| 128 |
<td class="has-text-centered">0.808</td>
|
| 129 |
</tr>
|
| 130 |
<tr>
|
| 131 |
-
<td>DeepSeek-V3</td>
|
| 132 |
<td class="has-text-centered">0.703</td>
|
| 133 |
<td class="has-text-centered performance-strong">0.806</td>
|
| 134 |
<td class="has-text-centered">0.750</td>
|
|
@@ -137,7 +137,7 @@
|
|
| 137 |
<td class="has-text-centered">0.815</td>
|
| 138 |
</tr>
|
| 139 |
<tr>
|
| 140 |
-
<td>DeepSeek R1</td>
|
| 141 |
<td class="has-text-centered">0.724</td>
|
| 142 |
<td class="has-text-centered">0.800</td>
|
| 143 |
<td class="has-text-centered">0.759</td>
|
|
@@ -146,7 +146,7 @@
|
|
| 146 |
<td class="has-text-centered">0.804</td>
|
| 147 |
</tr>
|
| 148 |
<tr>
|
| 149 |
-
<td>QwQ-32B-Preview</td>
|
| 150 |
<td class="has-text-centered">0.653</td>
|
| 151 |
<td class="has-text-centered">0.751</td>
|
| 152 |
<td class="has-text-centered">0.696</td>
|
|
@@ -155,7 +155,7 @@
|
|
| 155 |
<td class="has-text-centered performance-strong">0.817</td>
|
| 156 |
</tr>
|
| 157 |
<tr>
|
| 158 |
-
<td>Jamba 1.5 Mini</td>
|
| 159 |
<td class="has-text-centered">0.692</td>
|
| 160 |
<td class="has-text-centered">0.798</td>
|
| 161 |
<td class="has-text-centered">0.741</td>
|
|
@@ -164,7 +164,7 @@
|
|
| 164 |
<td class="has-text-centered performance-medium">0.816</td>
|
| 165 |
</tr>
|
| 166 |
<tr>
|
| 167 |
-
<td>Jamba 1.5 Large</td>
|
| 168 |
<td class="has-text-centered">0.679</td>
|
| 169 |
<td class="has-text-centered">0.800</td>
|
| 170 |
<td class="has-text-centered">0.734</td>
|
|
@@ -173,7 +173,7 @@
|
|
| 173 |
<td class="has-text-centered performance-best">0.818</td>
|
| 174 |
</tr>
|
| 175 |
<tr>
|
| 176 |
-
<td>Claude 3.5 Sonnet</td>
|
| 177 |
<td class="has-text-centered performance-medium">0.737</td>
|
| 178 |
<td class="has-text-centered">0.802</td>
|
| 179 |
<td class="has-text-centered performance-medium">0.767</td>
|
|
@@ -182,7 +182,7 @@
|
|
| 182 |
<td class="has-text-centered">0.813</td>
|
| 183 |
</tr>
|
| 184 |
<tr>
|
| 185 |
-
<td>Claude 3 Haiku</td>
|
| 186 |
<td class="has-text-centered">0.683</td>
|
| 187 |
<td class="has-text-centered">0.617</td>
|
| 188 |
<td class="has-text-centered">0.646</td>
|
|
@@ -191,7 +191,7 @@
|
|
| 191 |
<td class="has-text-centered">0.808</td>
|
| 192 |
</tr>
|
| 193 |
<tr>
|
| 194 |
-
<td>Cohere Command R 7B</td>
|
| 195 |
<td class="has-text-centered">0.724</td>
|
| 196 |
<td class="has-text-centered">0.781</td>
|
| 197 |
<td class="has-text-centered">0.750</td>
|
|
@@ -200,7 +200,7 @@
|
|
| 200 |
<td class="has-text-centered">0.815</td>
|
| 201 |
</tr>
|
| 202 |
<tr>
|
| 203 |
-
<td>Cohere Command R +</td>
|
| 204 |
<td class="has-text-centered">0.724</td>
|
| 205 |
<td class="has-text-centered">0.782</td>
|
| 206 |
<td class="has-text-centered">0.751</td>
|
|
@@ -209,7 +209,7 @@
|
|
| 209 |
<td class="has-text-centered">0.810</td>
|
| 210 |
</tr>
|
| 211 |
<tr>
|
| 212 |
-
<td>Google Gemini 1.5 Pro</td>
|
| 213 |
<td class="has-text-centered performance-best">0.757</td>
|
| 214 |
<td class="has-text-centered">0.800</td>
|
| 215 |
<td class="has-text-centered performance-best">0.777</td>
|
|
@@ -218,7 +218,7 @@
|
|
| 218 |
<td class="has-text-centered performance-strong">0.817</td>
|
| 219 |
</tr>
|
| 220 |
<tr>
|
| 221 |
-
<td>OpenAI gpt-4o</td>
|
| 222 |
<td class="has-text-centered performance-strong">0.755</td>
|
| 223 |
<td class="has-text-centered">0.793</td>
|
| 224 |
<td class="has-text-centered performance-strong">0.773</td>
|
|
@@ -227,7 +227,7 @@
|
|
| 227 |
<td class="has-text-centered performance-medium">0.816</td>
|
| 228 |
</tr>
|
| 229 |
<tr>
|
| 230 |
-
<td>OpenAI o1-mini</td>
|
| 231 |
<td class="has-text-centered">0.731</td>
|
| 232 |
<td class="has-text-centered">0.801</td>
|
| 233 |
<td class="has-text-centered">0.763</td>
|
|
|
|
| 29 |
</thead>
|
| 30 |
<tbody>
|
| 31 |
<tr>
|
| 32 |
+
<td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
|
| 33 |
<td class="has-text-centered">0.715</td>
|
| 34 |
<td class="has-text-centered">0.801</td>
|
| 35 |
<td class="has-text-centered">0.754</td>
|
|
|
|
| 38 |
<td class="has-text-centered performance-strong">0.817</td>
|
| 39 |
</tr>
|
| 40 |
<tr>
|
| 41 |
+
<td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
|
| 42 |
<td class="has-text-centered">0.724</td>
|
| 43 |
<td class="has-text-centered">0.796</td>
|
| 44 |
<td class="has-text-centered">0.757</td>
|
|
|
|
| 47 |
<td class="has-text-centered">0.811</td>
|
| 48 |
</tr>
|
| 49 |
<tr>
|
| 50 |
+
<td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
|
| 51 |
<td class="has-text-centered">0.680</td>
|
| 52 |
<td class="has-text-centered">0.786</td>
|
| 53 |
<td class="has-text-centered">0.729</td>
|
|
|
|
| 56 |
<td class="has-text-centered">0.806</td>
|
| 57 |
</tr>
|
| 58 |
<tr>
|
| 59 |
+
<td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
|
| 60 |
<td class="has-text-centered">0.692</td>
|
| 61 |
<td class="has-text-centered">0.678</td>
|
| 62 |
<td class="has-text-centered">0.681</td>
|
|
|
|
| 65 |
<td class="has-text-centered">0.807</td>
|
| 66 |
</tr>
|
| 67 |
<tr>
|
| 68 |
+
<td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
|
| 69 |
<td class="has-text-centered">0.680</td>
|
| 70 |
<td class="has-text-centered">0.777</td>
|
| 71 |
<td class="has-text-centered">0.723</td>
|
|
|
|
| 74 |
<td class="has-text-centered">0.814</td>
|
| 75 |
</tr>
|
| 76 |
<tr>
|
| 77 |
+
<td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
|
| 78 |
<td class="has-text-centered">0.651</td>
|
| 79 |
<td class="has-text-centered">0.531</td>
|
| 80 |
<td class="has-text-centered">0.585</td>
|
|
|
|
| 83 |
<td class="has-text-centered performance-strong">0.817</td>
|
| 84 |
</tr>
|
| 85 |
<tr>
|
| 86 |
+
<td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
|
| 87 |
<td class="has-text-centered">0.702</td>
|
| 88 |
<td class="has-text-centered performance-strong">0.806</td>
|
| 89 |
<td class="has-text-centered">0.750</td>
|
|
|
|
| 92 |
<td class="has-text-centered">0.811</td>
|
| 93 |
</tr>
|
| 94 |
<tr>
|
| 95 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
|
| 96 |
<td class="has-text-centered">0.713</td>
|
| 97 |
<td class="has-text-centered performance-best">0.812</td>
|
| 98 |
<td class="has-text-centered">0.758</td>
|
|
|
|
| 101 |
<td class="has-text-centered">0.815</td>
|
| 102 |
</tr>
|
| 103 |
<tr>
|
| 104 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
|
| 105 |
<td class="has-text-centered">0.727</td>
|
| 106 |
<td class="has-text-centered">0.773</td>
|
| 107 |
<td class="has-text-centered">0.747</td>
|
|
|
|
| 110 |
<td class="has-text-centered">0.810</td>
|
| 111 |
</tr>
|
| 112 |
<tr>
|
| 113 |
+
<td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
|
| 114 |
<td class="has-text-centered">0.709</td>
|
| 115 |
<td class="has-text-centered performance-medium">0.804</td>
|
| 116 |
<td class="has-text-centered">0.752</td>
|
|
|
|
| 119 |
<td class="has-text-centered">0.811</td>
|
| 120 |
</tr>
|
| 121 |
<tr>
|
| 122 |
+
<td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 141 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
|
| 123 |
<td class="has-text-centered">0.677</td>
|
| 124 |
<td class="has-text-centered performance-strong">0.806</td>
|
| 125 |
<td class="has-text-centered">0.735</td>
|
|
|
|
| 128 |
<td class="has-text-centered">0.808</td>
|
| 129 |
</tr>
|
| 130 |
<tr>
|
| 131 |
+
<td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
|
| 132 |
<td class="has-text-centered">0.703</td>
|
| 133 |
<td class="has-text-centered performance-strong">0.806</td>
|
| 134 |
<td class="has-text-centered">0.750</td>
|
|
|
|
| 137 |
<td class="has-text-centered">0.815</td>
|
| 138 |
</tr>
|
| 139 |
<tr>
|
| 140 |
+
<td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
|
| 141 |
<td class="has-text-centered">0.724</td>
|
| 142 |
<td class="has-text-centered">0.800</td>
|
| 143 |
<td class="has-text-centered">0.759</td>
|
|
|
|
| 146 |
<td class="has-text-centered">0.804</td>
|
| 147 |
</tr>
|
| 148 |
<tr>
|
| 149 |
+
<td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter dense model focused on step-by-step reasoning. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
|
| 150 |
<td class="has-text-centered">0.653</td>
|
| 151 |
<td class="has-text-centered">0.751</td>
|
| 152 |
<td class="has-text-centered">0.696</td>
|
|
|
|
| 155 |
<td class="has-text-centered performance-strong">0.817</td>
|
| 156 |
</tr>
|
| 157 |
<tr>
|
| 158 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
|
| 159 |
<td class="has-text-centered">0.692</td>
|
| 160 |
<td class="has-text-centered">0.798</td>
|
| 161 |
<td class="has-text-centered">0.741</td>
|
|
|
|
| 164 |
<td class="has-text-centered performance-medium">0.816</td>
|
| 165 |
</tr>
|
| 166 |
<tr>
|
| 167 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
|
| 168 |
<td class="has-text-centered">0.679</td>
|
| 169 |
<td class="has-text-centered">0.800</td>
|
| 170 |
<td class="has-text-centered">0.734</td>
|
|
|
|
| 173 |
<td class="has-text-centered performance-best">0.818</td>
|
| 174 |
</tr>
|
| 175 |
<tr>
|
| 176 |
+
<td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
|
| 177 |
<td class="has-text-centered performance-medium">0.737</td>
|
| 178 |
<td class="has-text-centered">0.802</td>
|
| 179 |
<td class="has-text-centered performance-medium">0.767</td>
|
|
|
|
| 182 |
<td class="has-text-centered">0.813</td>
|
| 183 |
</tr>
|
| 184 |
<tr>
|
| 185 |
+
<td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
|
| 186 |
<td class="has-text-centered">0.683</td>
|
| 187 |
<td class="has-text-centered">0.617</td>
|
| 188 |
<td class="has-text-centered">0.646</td>
|
|
|
|
| 191 |
<td class="has-text-centered">0.808</td>
|
| 192 |
</tr>
|
| 193 |
<tr>
|
| 194 |
+
<td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
|
| 195 |
<td class="has-text-centered">0.724</td>
|
| 196 |
<td class="has-text-centered">0.781</td>
|
| 197 |
<td class="has-text-centered">0.750</td>
|
|
|
|
| 200 |
<td class="has-text-centered">0.815</td>
|
| 201 |
</tr>
|
| 202 |
<tr>
|
| 203 |
+
<td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
|
| 204 |
<td class="has-text-centered">0.724</td>
|
| 205 |
<td class="has-text-centered">0.782</td>
|
| 206 |
<td class="has-text-centered">0.751</td>
|
|
|
|
| 209 |
<td class="has-text-centered">0.810</td>
|
| 210 |
</tr>
|
| 211 |
<tr>
|
| 212 |
+
<td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
|
| 213 |
<td class="has-text-centered performance-best">0.757</td>
|
| 214 |
<td class="has-text-centered">0.800</td>
|
| 215 |
<td class="has-text-centered performance-best">0.777</td>
|
|
|
|
| 218 |
<td class="has-text-centered performance-strong">0.817</td>
|
| 219 |
</tr>
|
| 220 |
<tr>
|
| 221 |
+
<td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
|
| 222 |
<td class="has-text-centered performance-strong">0.755</td>
|
| 223 |
<td class="has-text-centered">0.793</td>
|
| 224 |
<td class="has-text-centered performance-strong">0.773</td>
|
|
|
|
| 227 |
<td class="has-text-centered performance-medium">0.816</td>
|
| 228 |
</tr>
|
| 229 |
<tr>
|
| 230 |
+
<td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
|
| 231 |
<td class="has-text-centered">0.731</td>
|
| 232 |
<td class="has-text-centered">0.801</td>
|
| 233 |
<td class="has-text-centered">0.763</td>
|