Spaces:

gtfintechlab
/

FLaME

Running

File size: 16,946 Bytes

<link rel="stylesheet" href="static/css/tooltips.css">
<style>
/* Page-local override for tooltips on triggers carrying .tooltip-right
   (used on the right-most header group, EDTSum): anchor the ::after bubble
   to the trigger's left side instead of its right.
   The priority flag must be written as a bare "!important". An escaped
   "\!important" is tokenized as an ordinary identifier, which makes every
   value invalid and causes the browser to drop the declaration.
   NOTE(review): presumably !important is needed to win over rules in
   static/css/tooltips.css; confirm against that stylesheet. */
.tooltip-right:hover::after {
  left: auto !important;
  right: 100% !important;
  margin-left: 0 !important;
  margin-right: 10px !important;
}
</style>
<!-- Text Summarization -->
<div id="text-summarization" class="tab-content">
  <h2 class="title is-4">Text Summarization Task Results</h2>
  <div class="results-table">
    <table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth">
      <thead>
        <!-- Header row 1: the model column (spans both header rows) and one
             group header per dataset, each covering three metric columns.
             scope="col" makes the header/data association explicit for
             assistive technology; it has no visual effect. -->
        <tr>
          <th rowspan="2" scope="col">Model</th>
          <th colspan="3" scope="col" class="has-text-centered tooltip-trigger" data-title="ECTSum" data-tooltip="ECTSum contains 2,425 document-summary pairs featuring earnings call transcripts paired with concise bullet-point summaries extracted from Reuters articles. The summarization task requires extracting and condensing key financial information from lengthy corporate communications into brief, informative points.">ECTSum</th>
          <th colspan="3" scope="col" class="has-text-centered tooltip-trigger tooltip-right" style="position: relative;" data-title="EDTSum" data-tooltip="EDTSum consists of 2,000 financial news articles paired with their headlines as ground-truth summaries for evaluating text summarization. The task challenges models to condense complex financial news articles into concise, informative headlines that capture the essential information.">EDTSum</th>
        </tr>
        <!-- Header row 2: the same three BERTScore metrics, first for ECTSum
             (columns 2 to 4), then for EDTSum (columns 5 to 7). -->
        <tr>
          <th scope="col" class="has-text-centered">BERTScore Precision</th>
          <th scope="col" class="has-text-centered">BERTScore Recall</th>
          <th scope="col" class="has-text-centered">BERTScore F1</th>
          <th scope="col" class="has-text-centered">BERTScore Precision</th>
          <th scope="col" class="has-text-centered">BERTScore Recall</th>
          <th scope="col" class="has-text-centered">BERTScore F1</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
          <td class="has-text-centered">0.715</td>
          <td class="has-text-centered">0.801</td>
          <td class="has-text-centered">0.754</td>
          <td class="has-text-centered">0.793</td>
          <td class="has-text-centered performance-medium">0.844</td>
          <td class="has-text-centered performance-strong">0.817</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
          <td class="has-text-centered">0.724</td>
          <td class="has-text-centered">0.796</td>
          <td class="has-text-centered">0.757</td>
          <td class="has-text-centered">0.785</td>
          <td class="has-text-centered">0.841</td>
          <td class="has-text-centered">0.811</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
          <td class="has-text-centered">0.680</td>
          <td class="has-text-centered">0.786</td>
          <td class="has-text-centered">0.729</td>
          <td class="has-text-centered">0.774</td>
          <td class="has-text-centered">0.843</td>
          <td class="has-text-centered">0.806</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
          <td class="has-text-centered">0.692</td>
          <td class="has-text-centered">0.678</td>
          <td class="has-text-centered">0.681</td>
          <td class="has-text-centered">0.779</td>
          <td class="has-text-centered">0.840</td>
          <td class="has-text-centered">0.807</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
          <td class="has-text-centered">0.680</td>
          <td class="has-text-centered">0.777</td>
          <td class="has-text-centered">0.723</td>
          <td class="has-text-centered performance-strong">0.801</td>
          <td class="has-text-centered">0.829</td>
          <td class="has-text-centered">0.814</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
          <td class="has-text-centered">0.651</td>
          <td class="has-text-centered">0.531</td>
          <td class="has-text-centered">0.585</td>
          <td class="has-text-centered performance-best">0.803</td>
          <td class="has-text-centered">0.833</td>
          <td class="has-text-centered performance-strong">0.817</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
          <td class="has-text-centered">0.702</td>
          <td class="has-text-centered performance-strong">0.806</td>
          <td class="has-text-centered">0.750</td>
          <td class="has-text-centered">0.783</td>
          <td class="has-text-centered">0.842</td>
          <td class="has-text-centered">0.811</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
          <td class="has-text-centered">0.713</td>
          <td class="has-text-centered performance-best">0.812</td>
          <td class="has-text-centered">0.758</td>
          <td class="has-text-centered">0.790</td>
          <td class="has-text-centered">0.843</td>
          <td class="has-text-centered">0.815</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
          <td class="has-text-centered">0.727</td>
          <td class="has-text-centered">0.773</td>
          <td class="has-text-centered">0.747</td>
          <td class="has-text-centered">0.785</td>
          <td class="has-text-centered">0.839</td>
          <td class="has-text-centered">0.810</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
          <td class="has-text-centered">0.709</td>
          <td class="has-text-centered performance-medium">0.804</td>
          <td class="has-text-centered">0.752</td>
          <td class="has-text-centered">0.781</td>
          <td class="has-text-centered performance-strong">0.846</td>
          <td class="has-text-centered">0.811</td>
        </tr>
        <tr>
          <!-- Tooltip parameter count is the 8x22B MoE total (141 billion), matching
               the Mixtral-8x22B row; 8 x 22 = 176 overcounts the total. -->
          <td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 141 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
          <td class="has-text-centered">0.677</td>
          <td class="has-text-centered performance-strong">0.806</td>
          <td class="has-text-centered">0.735</td>
          <td class="has-text-centered">0.774</td>
          <td class="has-text-centered performance-best">0.847</td>
          <td class="has-text-centered">0.808</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
          <td class="has-text-centered">0.703</td>
          <td class="has-text-centered performance-strong">0.806</td>
          <td class="has-text-centered">0.750</td>
          <td class="has-text-centered">0.791</td>
          <td class="has-text-centered">0.842</td>
          <td class="has-text-centered">0.815</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
          <td class="has-text-centered">0.724</td>
          <td class="has-text-centered">0.800</td>
          <td class="has-text-centered">0.759</td>
          <td class="has-text-centered">0.770</td>
          <td class="has-text-centered">0.843</td>
          <td class="has-text-centered">0.804</td>
        </tr>
        <tr>
          <!-- QwQ-32B-Preview is a dense (non-MoE) reasoning-focused research
               preview; the tooltip text reflects that. -->
          <td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter dense model focused on reasoning. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
          <td class="has-text-centered">0.653</td>
          <td class="has-text-centered">0.751</td>
          <td class="has-text-centered">0.696</td>
          <td class="has-text-centered">0.797</td>
          <td class="has-text-centered">0.841</td>
          <td class="has-text-centered performance-strong">0.817</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
          <td class="has-text-centered">0.692</td>
          <td class="has-text-centered">0.798</td>
          <td class="has-text-centered">0.741</td>
          <td class="has-text-centered">0.798</td>
          <td class="has-text-centered">0.838</td>
          <td class="has-text-centered performance-medium">0.816</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
          <td class="has-text-centered">0.679</td>
          <td class="has-text-centered">0.800</td>
          <td class="has-text-centered">0.734</td>
          <td class="has-text-centered">0.799</td>
          <td class="has-text-centered">0.841</td>
          <td class="has-text-centered performance-best">0.818</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
          <td class="has-text-centered performance-medium">0.737</td>
          <td class="has-text-centered">0.802</td>
          <td class="has-text-centered performance-medium">0.767</td>
          <td class="has-text-centered">0.786</td>
          <td class="has-text-centered">0.843</td>
          <td class="has-text-centered">0.813</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
          <td class="has-text-centered">0.683</td>
          <td class="has-text-centered">0.617</td>
          <td class="has-text-centered">0.646</td>
          <td class="has-text-centered">0.778</td>
          <td class="has-text-centered performance-medium">0.844</td>
          <td class="has-text-centered">0.808</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
          <td class="has-text-centered">0.724</td>
          <td class="has-text-centered">0.781</td>
          <td class="has-text-centered">0.750</td>
          <td class="has-text-centered">0.790</td>
          <td class="has-text-centered performance-medium">0.844</td>
          <td class="has-text-centered">0.815</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
          <td class="has-text-centered">0.724</td>
          <td class="has-text-centered">0.782</td>
          <td class="has-text-centered">0.751</td>
          <td class="has-text-centered">0.789</td>
          <td class="has-text-centered">0.834</td>
          <td class="has-text-centered">0.810</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
          <td class="has-text-centered performance-best">0.757</td>
          <td class="has-text-centered">0.800</td>
          <td class="has-text-centered performance-best">0.777</td>
          <td class="has-text-centered performance-medium">0.800</td>
          <td class="has-text-centered">0.836</td>
          <td class="has-text-centered performance-strong">0.817</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
          <td class="has-text-centered performance-strong">0.755</td>
          <td class="has-text-centered">0.793</td>
          <td class="has-text-centered performance-strong">0.773</td>
          <td class="has-text-centered">0.795</td>
          <td class="has-text-centered">0.840</td>
          <td class="has-text-centered performance-medium">0.816</td>
        </tr>
        <tr>
          <td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
          <td class="has-text-centered">0.731</td>
          <td class="has-text-centered">0.801</td>
          <td class="has-text-centered">0.763</td>
          <td class="has-text-centered">0.795</td>
          <td class="has-text-centered">0.840</td>
          <td class="has-text-centered performance-medium">0.816</td>
        </tr>
      </tbody>
    </table>
    <div class="content is-small mt-4">
      <!-- Legend for the highlight classes applied to cells in the results table.
           NOTE(review): the swatches here use performance-best / performance-medium /
           performance-low, while the table cells use performance-best /
           performance-strong / performance-medium for 1st / 2nd / 3rd place.
           Confirm against the stylesheet which class set is defined and align
           the legend (or the cells) so "Strong" and "Good" show the same colors
           as the cells they describe. -->
      <p><strong>Note:</strong> Color highlighting indicates performance ranking: 
        <span class="performance-best">&nbsp;Best&nbsp;</span>, 
        <span class="performance-medium">&nbsp;Strong&nbsp;</span>,
        <span class="performance-low">&nbsp;Good&nbsp;</span>
      </p>
    </div>
  </div>
</div><script src="static/js/tooltips.js"></script>
<script src="static/js/fixed-tooltips.js"></script>