Spaces:
Running
Running
| <link rel="stylesheet" href="static/css/tooltips.css"> | |
| <style> | |
| .tooltip-right:hover::after { | |
| left: auto; | |
| right: 100%; | |
| margin-left: 0; | |
| margin-right: 10px; | |
| } | |
| </style> | |
| <!-- Text Summarization --> | |
| <div id="text-summarization" class="tab-content"> | |
| <h2 class="title is-4">Text Summarization Task Results</h2> | |
| <div class="results-table"> | |
| <table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth"> | |
| <thead> | |
| <tr> | |
| <th rowspan="2">Model</th> | |
| <th colspan="3" class="has-text-centered tooltip-trigger" data-title="ECTSum" data-tooltip="ECTSum contains 2,425 document-summary pairs featuring earnings call transcripts paired with concise bullet-point summaries extracted from Reuters articles. The summarization task requires extracting and condensing key financial information from lengthy corporate communications into brief, informative points.">ECTSum</th> | |
| <th colspan="3" class="has-text-centered tooltip-trigger tooltip-right" style="position: relative;" data-title="EDTSum" data-tooltip="EDTSum consists of 2,000 financial news articles paired with their headlines as ground-truth summaries for evaluating text summarization. The task challenges models to condense complex financial news articles into concise, informative headlines that capture the essential information.">EDTSum</th> | |
| </tr> | |
| <tr> | |
| <th class="has-text-centered">BERTScore Precision</th> | |
| <th class="has-text-centered">BERTScore Recall</th> | |
| <th class="has-text-centered">BERTScore F1</th> | |
| <th class="has-text-centered">BERTScore Precision</th> | |
| <th class="has-text-centered">BERTScore Recall</th> | |
| <th class="has-text-centered">BERTScore F1</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td> | |
| <td class="has-text-centered">0.715</td> | |
| <td class="has-text-centered">0.801</td> | |
| <td class="has-text-centered">0.754</td> | |
| <td class="has-text-centered">0.793</td> | |
| <td class="has-text-centered performance-medium">0.844</td> | |
| <td class="has-text-centered performance-strong">0.817</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td> | |
| <td class="has-text-centered">0.724</td> | |
| <td class="has-text-centered">0.796</td> | |
| <td class="has-text-centered">0.757</td> | |
| <td class="has-text-centered">0.785</td> | |
| <td class="has-text-centered">0.841</td> | |
| <td class="has-text-centered">0.811</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td> | |
| <td class="has-text-centered">0.680</td> | |
| <td class="has-text-centered">0.786</td> | |
| <td class="has-text-centered">0.729</td> | |
| <td class="has-text-centered">0.774</td> | |
| <td class="has-text-centered">0.843</td> | |
| <td class="has-text-centered">0.806</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td> | |
| <td class="has-text-centered">0.692</td> | |
| <td class="has-text-centered">0.678</td> | |
| <td class="has-text-centered">0.681</td> | |
| <td class="has-text-centered">0.779</td> | |
| <td class="has-text-centered">0.840</td> | |
| <td class="has-text-centered">0.807</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td> | |
| <td class="has-text-centered">0.680</td> | |
| <td class="has-text-centered">0.777</td> | |
| <td class="has-text-centered">0.723</td> | |
| <td class="has-text-centered performance-strong">0.801</td> | |
| <td class="has-text-centered">0.829</td> | |
| <td class="has-text-centered">0.814</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td> | |
| <td class="has-text-centered">0.651</td> | |
| <td class="has-text-centered">0.531</td> | |
| <td class="has-text-centered">0.585</td> | |
| <td class="has-text-centered performance-best">0.803</td> | |
| <td class="has-text-centered">0.833</td> | |
| <td class="has-text-centered performance-strong">0.817</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td> | |
| <td class="has-text-centered">0.702</td> | |
| <td class="has-text-centered performance-strong">0.806</td> | |
| <td class="has-text-centered">0.750</td> | |
| <td class="has-text-centered">0.783</td> | |
| <td class="has-text-centered">0.842</td> | |
| <td class="has-text-centered">0.811</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td> | |
| <td class="has-text-centered">0.713</td> | |
| <td class="has-text-centered performance-best">0.812</td> | |
| <td class="has-text-centered">0.758</td> | |
| <td class="has-text-centered">0.790</td> | |
| <td class="has-text-centered">0.843</td> | |
| <td class="has-text-centered">0.815</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td> | |
| <td class="has-text-centered">0.727</td> | |
| <td class="has-text-centered">0.773</td> | |
| <td class="has-text-centered">0.747</td> | |
| <td class="has-text-centered">0.785</td> | |
| <td class="has-text-centered">0.839</td> | |
| <td class="has-text-centered">0.810</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td> | |
| <td class="has-text-centered">0.709</td> | |
| <td class="has-text-centered performance-medium">0.804</td> | |
| <td class="has-text-centered">0.752</td> | |
| <td class="has-text-centered">0.781</td> | |
| <td class="has-text-centered performance-strong">0.846</td> | |
| <td class="has-text-centered">0.811</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td> | |
| <td class="has-text-centered">0.677</td> | |
| <td class="has-text-centered performance-strong">0.806</td> | |
| <td class="has-text-centered">0.735</td> | |
| <td class="has-text-centered">0.774</td> | |
| <td class="has-text-centered performance-best">0.847</td> | |
| <td class="has-text-centered">0.808</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td> | |
| <td class="has-text-centered">0.703</td> | |
| <td class="has-text-centered performance-strong">0.806</td> | |
| <td class="has-text-centered">0.750</td> | |
| <td class="has-text-centered">0.791</td> | |
| <td class="has-text-centered">0.842</td> | |
| <td class="has-text-centered">0.815</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td> | |
| <td class="has-text-centered">0.724</td> | |
| <td class="has-text-centered">0.800</td> | |
| <td class="has-text-centered">0.759</td> | |
| <td class="has-text-centered">0.770</td> | |
| <td class="has-text-centered">0.843</td> | |
| <td class="has-text-centered">0.804</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td> | |
| <td class="has-text-centered">0.653</td> | |
| <td class="has-text-centered">0.751</td> | |
| <td class="has-text-centered">0.696</td> | |
| <td class="has-text-centered">0.797</td> | |
| <td class="has-text-centered">0.841</td> | |
| <td class="has-text-centered performance-strong">0.817</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td> | |
| <td class="has-text-centered">0.692</td> | |
| <td class="has-text-centered">0.798</td> | |
| <td class="has-text-centered">0.741</td> | |
| <td class="has-text-centered">0.798</td> | |
| <td class="has-text-centered">0.838</td> | |
| <td class="has-text-centered performance-medium">0.816</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td> | |
| <td class="has-text-centered">0.679</td> | |
| <td class="has-text-centered">0.800</td> | |
| <td class="has-text-centered">0.734</td> | |
| <td class="has-text-centered">0.799</td> | |
| <td class="has-text-centered">0.841</td> | |
| <td class="has-text-centered performance-best">0.818</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td> | |
| <td class="has-text-centered performance-medium">0.737</td> | |
| <td class="has-text-centered">0.802</td> | |
| <td class="has-text-centered performance-medium">0.767</td> | |
| <td class="has-text-centered">0.786</td> | |
| <td class="has-text-centered">0.843</td> | |
| <td class="has-text-centered">0.813</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td> | |
| <td class="has-text-centered">0.683</td> | |
| <td class="has-text-centered">0.617</td> | |
| <td class="has-text-centered">0.646</td> | |
| <td class="has-text-centered">0.778</td> | |
| <td class="has-text-centered performance-medium">0.844</td> | |
| <td class="has-text-centered">0.808</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td> | |
| <td class="has-text-centered">0.724</td> | |
| <td class="has-text-centered">0.781</td> | |
| <td class="has-text-centered">0.750</td> | |
| <td class="has-text-centered">0.790</td> | |
| <td class="has-text-centered performance-medium">0.844</td> | |
| <td class="has-text-centered">0.815</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td> | |
| <td class="has-text-centered">0.724</td> | |
| <td class="has-text-centered">0.782</td> | |
| <td class="has-text-centered">0.751</td> | |
| <td class="has-text-centered">0.789</td> | |
| <td class="has-text-centered">0.834</td> | |
| <td class="has-text-centered">0.810</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td> | |
| <td class="has-text-centered performance-best">0.757</td> | |
| <td class="has-text-centered">0.800</td> | |
| <td class="has-text-centered performance-best">0.777</td> | |
| <td class="has-text-centered performance-medium">0.800</td> | |
| <td class="has-text-centered">0.836</td> | |
| <td class="has-text-centered performance-strong">0.817</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td> | |
| <td class="has-text-centered performance-strong">0.755</td> | |
| <td class="has-text-centered">0.793</td> | |
| <td class="has-text-centered performance-strong">0.773</td> | |
| <td class="has-text-centered">0.795</td> | |
| <td class="has-text-centered">0.840</td> | |
| <td class="has-text-centered performance-medium">0.816</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td> | |
| <td class="has-text-centered">0.731</td> | |
| <td class="has-text-centered">0.801</td> | |
| <td class="has-text-centered">0.763</td> | |
| <td class="has-text-centered">0.795</td> | |
| <td class="has-text-centered">0.840</td> | |
| <td class="has-text-centered performance-medium">0.816</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <div class="content is-small mt-4"> | |
| <p><strong>Note:</strong> Color highlighting indicates performance ranking: | |
| <span class="performance-best"> Best </span>, | |
| <span class="performance-strong"> Strong </span>, | |
| <span class="performance-medium"> Good </span> | |
| </p> | |
| </div> | |
| </div> | |
| </div><script src="static/js/tooltips.js"></script> | |
| <script src="static/js/fixed-tooltips.js"></script> | |