Spaces:
Running
Running
| <!doctype html> | |
| <html lang="en"> | |
| <head> | |
| <meta charset="utf-8"> | |
| <!-- Page metadata: search description/keywords, responsive viewport, and title. --> | |
| <meta name="description" | |
| content="FLaME: Holistic Financial Language Model Evaluation - A comprehensive benchmarking suite for evaluating language models on financial NLP tasks."> | |
| <meta name="keywords" content="FLaME, Finance, Language Models, NLP, Benchmarking, FinNLP"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1"> | |
| <title>FLaME: Holistic Financial Language Model Evaluation</title> | |
| <link rel="icon" href="./static/images/FLaME_favicon.png" type="image/png"> | |
| <!-- Stylesheets: web fonts, Bulma and its carousel/slider extensions, icon fonts, then page-specific CSS. --> | |
| <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" | |
| rel="stylesheet"> | |
| <link rel="stylesheet" href="./static/css/bulma.min.css"> | |
| <link rel="stylesheet" href="./static/css/bulma-carousel.min.css"> | |
| <link rel="stylesheet" href="./static/css/bulma-slider.min.css"> | |
| <link rel="stylesheet" href="./static/css/fontawesome.all.min.css"> | |
| <link rel="stylesheet" | |
| href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css"> | |
| <link rel="stylesheet" href="./static/css/index.css"> | |
| <link rel="stylesheet" href="./static/css/results.css"> | |
| <link rel="stylesheet" href="./static/css/tooltips.css"> | |
| <!-- NOTE(review): this is a second icon declaration (a PNG icon is declared above); confirm which favicon is intended. --> | |
| <link rel="icon" href="./static/images/favicon.svg"> | |
| <!-- Scripts: jQuery is loaded first; only Font Awesome is deferred. The original load order is kept as-is. --> | |
| <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script> | |
| <script defer src="./static/js/fontawesome.all.min.js"></script> | |
| <script src="./static/js/bulma-carousel.min.js"></script> | |
| <script src="./static/js/bulma-slider.min.js"></script> | |
| <script src="./static/js/index.js"></script> | |
| <script src="./static/js/results.js"></script> | |
| <script src="./static/js/tooltips.js"></script> | |
| <script src="./static/js/fixed-tooltips.js"></script> | |
| <script src="./static/js/tooltip-fix.js"></script> | |
| <script src="./static/js/model-tooltips.js"></script> | |
| </head> | |
| <body> | |
| <!-- Navigation bar: fixed-top Bulma navbar with the brand link, a burger toggle for the collapsible menu, in-page section links, and a GitHub button. --> | |
| <nav class="navbar is-fixed-top" role="navigation" aria-label="main navigation" style="background-color: #004d99; box-shadow: 0 2px 10px rgba(0,0,0,0.1);"> | |
| <div class="container"> | |
| <div class="navbar-brand"> | |
| <a class="navbar-item" href="#" style="color: white;"> | |
| <!-- Empty alt: the adjacent "FLaME" text already names the link. --> | |
| <img src="./static/images/FLaME_favicon.png" alt="" style="width: 30px; height: 30px;"> | |
| <span class="flame">FLaME</span> | |
| </a> | |
| <!-- Burger toggle; aria-controls points at the menu it targets (same id as data-target). --> | |
| <a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false" aria-controls="navbarMenu" data-target="navbarMenu"> | |
| <span aria-hidden="true"></span> | |
| <span aria-hidden="true"></span> | |
| <span aria-hidden="true"></span> | |
| </a> | |
| </div> | |
| <div id="navbarMenu" class="navbar-menu" style="background-color: #004d99;"> | |
| <div class="navbar-end"> | |
| <a class="navbar-item" href="#abstract" style="color: white;"> | |
| Abstract | |
| </a> | |
| <a class="navbar-item" href="#findings" style="color: white;"> | |
| Findings | |
| </a> | |
| <a class="navbar-item" href="#methodology" style="color: white;"> | |
| Methodology | |
| </a> | |
| <a class="navbar-item" href="#results" style="color: white;"> | |
| Results | |
| </a> | |
| <a class="navbar-item" href="#framework" style="color: white;"> | |
| Framework | |
| </a> | |
| <a class="navbar-item" href="#BibTeX" style="color: white;"> | |
| Citation | |
| </a> | |
| <div class="navbar-item" style="color: white;"> | |
| <div class="buttons"> | |
| <!-- External link opened in a new tab: rel prevents the new page from reaching window.opener. --> | |
| <a class="button is-info" href="https://github.com/gtfintechlab/FLaME" target="_blank" rel="noopener noreferrer"> | |
| <span class="icon"> | |
| <i class="fab fa-github" aria-hidden="true"></i> | |
| </span> | |
| <span>GitHub</span> | |
| </a> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </nav> | |
| <!-- Title hero: paper title, author list with affiliation, and resource links (paper PDF, code, HuggingFace). --> | |
| <section class="hero has-text-centered pt-6"> | |
| <div class="hero-body"> | |
| <div class="container is-max-desktop"> | |
| <div class="columns is-centered"> | |
| <div class="column has-text-centered"> | |
| <h1 class="title is-1 publication-title">FLaME: Holistic Financial Language Model Evaluation</h1> | |
| <!-- NOTE(review): the author links below are placeholders (href="#"); replace with real profile URLs. --> | |
| <div class="is-size-5 publication-authors"> | |
| <span class="author-block"> | |
| <a href="#" target="_blank">Glenn Matlin</a><sup>1</sup>,</span> | |
| <span class="author-block"> | |
| <a href="#" target="_blank">Mika Okamoto</a><sup>1</sup>,</span> | |
| <span class="author-block"> | |
| <a href="#" target="_blank">Huzaifa Pardawala</a><sup>1</sup>,</span> | |
| <span class="author-block"> | |
| <a href="#" target="_blank">Yang Yang</a><sup>1</sup>,</span> | |
| <span class="author-block"> | |
| <a href="#" target="_blank">Sudheer Chava</a><sup>1</sup> | |
| </span> | |
| </div> | |
| <div class="is-size-5 publication-authors"> | |
| <span class="author-block"><sup>1</sup>Georgia Institute of Technology</span> | |
| </div> | |
| <div class="column has-text-centered"> | |
| <div class="publication-links"> | |
| <!-- PDF Link. --> | |
| <span class="link-block"> | |
| <a href="FLaME/FLaME.pdf" target="_blank" | |
| class="external-link button is-normal is-rounded is-dark"> | |
| <span class="icon"> | |
| <i class="fas fa-file-pdf" aria-hidden="true"></i> | |
| </span> | |
| <span>Paper</span> | |
| </a> | |
| </span> | |
| <!-- ArXiv Link --> | |
| <!-- <span class="link-block"> | |
| <a href="https://arxiv.org/abs/" target="_blank" | |
| class="external-link button is-normal is-rounded is-dark"> | |
| <span class="icon"> | |
| <i class="ai ai-arxiv"></i> | |
| </span> | |
| <span>arXiv</span> | |
| </a> | |
| </span> --> | |
| <!-- Code Link. --> | |
| <span class="link-block"> | |
| <a href="https://github.com/gtfintechlab/FLaME" target="_blank" rel="noopener noreferrer" | |
| class="external-link button is-normal is-rounded is-dark"> | |
| <span class="icon"> | |
| <i class="fab fa-github" aria-hidden="true"></i> | |
| </span> | |
| <span>Code</span> | |
| </a> | |
| </span> | |
| <!-- HuggingFace Link. --> | |
| <span class="link-block"> | |
| <a href="https://huggingface.co/gtfintechlab/" target="_blank" rel="noopener noreferrer" | |
| class="external-link button is-normal is-rounded is-dark"> | |
| <span class="icon"> | |
| <!-- Empty alt: the visible "HuggingFace" label next to the logo already names the link. --> | |
| <img src="./static/images/huggingface_logo.svg" alt="" width="20" height="20"> | |
| </span> | |
| <span>HuggingFace</span> | |
| </a> | |
| <!-- Closes the HuggingFace link-block span, which was previously left unclosed. --> | |
| </span> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- Teaser: one-line summary, the overview figure, and two cards showing the task and domain taxonomy figures. --> | |
| <section class="hero teaser"> | |
| <div class="container is-max-desktop"> | |
| <div class="hero-body"> | |
| <h2 class="subtitle has-text-centered mb-5"> | |
| <span class="flame">FLaME</span> is the first comprehensive benchmarking suite for evaluating language models on financial NLP tasks. | |
| </h2> | |
| <div class="has-text-centered"> | |
| <!-- Main overview figure --> | |
| <div class="figure-container mb-5"> | |
| <figure class="image"> | |
| <!-- Overview flow figure (PNG). --> | |
| <img src="FLaME/content/figures/FLaME-fig_overview_flow.png" alt="FLaME Evaluation Framework"> | |
| <figcaption class="has-text-centered mt-2 is-italic"> | |
| <strong>Figure 1:</strong> Functional Overview of FLaME. The evaluation framework includes a comprehensive taxonomy, carefully selected datasets, diverse models, and standardized metrics. | |
| </figcaption> | |
| </figure> | |
| </div> | |
| <!-- Framework visualization with cards --> | |
| <div class="columns is-multiline mt-5"> | |
| <!-- Task Taxonomy --> | |
| <div class="column is-half"> | |
| <div class="card has-background-light h-100"> | |
| <div class="card-header"> | |
| <p class="card-header-title"> | |
| <span class="icon mr-2"><i class="fas fa-tasks" aria-hidden="true"></i></span> | |
| Task Taxonomy | |
| </p> | |
| </div> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <figure class="image"> | |
| <!-- Task taxonomy figure (PNG). --> | |
| <img src="FLaME/content/figures/FLaME-fig_methodology_tasks.png" alt="FLaME Task Taxonomy"> | |
| </figure> | |
| <p class="mt-3"><strong>Task Taxonomy:</strong> Illustrative breakdown for each of the six core NLP task categories (Classification, Sentiment Analysis, Information Retrieval, Causal Analysis, Summarization, and Question Answering). Each category encompasses specialized variants depending on data format, user needs, and domain constraints.</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Domain Taxonomy --> | |
| <div class="column is-half"> | |
| <div class="card has-background-light h-100"> | |
| <div class="card-header"> | |
| <p class="card-header-title"> | |
| <span class="icon mr-2"><i class="fas fa-building" aria-hidden="true"></i></span> | |
| Domain Taxonomy | |
| </p> | |
| </div> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <figure class="image"> | |
| <!-- Domain taxonomy figure (PNG). --> | |
| <img src="FLaME/content/figures/FLaME-fig_methodology_domain.png" alt="FLaME Domain Taxonomy"> | |
| </figure> | |
| <p class="mt-3"><strong>Domain Taxonomy:</strong> Holistic taxonomy for FLaME. Unlike previous FinNLP benchmarks that were tied to specific tasks with single metrics, FLaME takes a comprehensive approach by mapping the full space of tasks, scenarios, and metrics across multiple dimensions for complete analysis.</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- Abstract section (target of the #abstract nav link): abstract text followed by three headline statistics. --> | |
| <section id="abstract" class="section content-section"> | |
| <div class="container is-max-desktop"> | |
| <!-- Abstract. --> | |
| <div class="columns is-centered"> | |
| <div class="column is-four-fifths"> | |
| <h2 class="title is-3 section-title has-text-centered">Abstract</h2> | |
| <div class="content has-text-justified abstract-content"> | |
| <div class="abstract-box p-5 has-background-white-ter"> | |
| <p> | |
| Language Models (LMs) have demonstrated impressive capabilities with core Natural Language Processing (NLP) tasks. The effectiveness of LMs for highly specialized knowledge-intensive tasks in finance remains difficult to assess due to major gaps in the methodologies of existing evaluation frameworks, which have caused an erroneous belief in a far lower bound of LMs' performance on common Finance NLP (FinNLP) tasks. | |
| </p> | |
| <p> | |
| To address this issue, we present <span class="flame">FLaME</span> (Financial Language Model Evaluation), the first comprehensive benchmark suite for evaluating language models on financial NLP tasks. <span class="flame">FLaME</span> includes a standardized evaluation framework, carefully selected datasets across six core financial tasks, and extensive testing across both open and proprietary LMs. Our benchmark assesses models on text classification, sentiment analysis, information retrieval, causal analysis, summarization, and question answering within the financial domain. | |
| </p> | |
| <p> | |
| Our results reveal significant variations in model performance across different financial tasks and domains, with no single model emerging as universally superior. We also find that performance does not always correlate with model size, and that open-weight models can offer strong cost-performance efficiency compared to their proprietary counterparts. <span class="flame">FLaME</span> provides a foundation for building, testing, and advancing specialized NLP models for finance. | |
| </p> | |
| </div> | |
| <!-- Headline statistics; the icons are decorative, so they are hidden from assistive technology. --> | |
| <div class="paper-stats has-text-centered mt-5"> | |
| <div class="columns is-mobile is-multiline"> | |
| <div class="column"> | |
| <div class="stat-box p-3"> | |
| <span class="icon is-large"><i class="fas fa-tasks fa-2x" aria-hidden="true"></i></span> | |
| <p class="is-size-5 mt-2"><strong>6</strong></p> | |
| <p>FinNLP Task Categories</p> | |
| </div> | |
| </div> | |
| <div class="column"> | |
| <div class="stat-box p-3"> | |
| <span class="icon is-large"><i class="fas fa-database fa-2x" aria-hidden="true"></i></span> | |
| <p class="is-size-5 mt-2"><strong>19</strong></p> | |
| <p>Datasets</p> | |
| </div> | |
| </div> | |
| <div class="column"> | |
| <div class="stat-box p-3"> | |
| <span class="icon is-large"><i class="fas fa-robot fa-2x" aria-hidden="true"></i></span> | |
| <p class="is-size-5 mt-2"><strong>24</strong></p> | |
| <p>LLMs Evaluated</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!--/ Abstract. --> | |
| </div> | |
| </section> | |
| <!-- Key Findings section (target of the #findings nav link): an intro notification and six finding cards in a three-per-row grid. --> | |
| <section id="findings" class="section content-section"> | |
| <div class="container is-max-desktop"> | |
| <!-- Key Findings --> | |
| <div class="columns is-centered"> | |
| <div class="column is-full"> | |
| <h2 class="title is-3 section-title has-text-centered">Key Findings</h2> | |
| <div class="content"> | |
| <div class="notification is-light mb-5"> | |
| <p class="is-size-5 has-text-centered mb-4"> | |
| Our comprehensive evaluation reveals several important insights about language model performance on financial tasks: | |
| </p> | |
| </div> | |
| <!-- Cards for key findings --> | |
| <!-- NOTE(review): card titles are h4 directly under the section's h2 (h3 is skipped); the level is left unchanged because a different level may pick up different .content heading styles. --> | |
| <div class="columns is-multiline"> | |
| <!-- Finding 1 --> | |
| <div class="column is-one-third"> | |
| <div class="card h-100"> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <span class="icon is-large has-text-primary mb-3"> | |
| <i class="fas fa-trophy fa-2x" aria-hidden="true"></i> | |
| </span> | |
| <h4 class="title is-5">No Universal Winner</h4> | |
| <p>No single language model performs best across all financial NLP tasks</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Finding 2 --> | |
| <div class="column is-one-third"> | |
| <div class="card h-100"> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <span class="icon is-large has-text-primary mb-3"> | |
| <i class="fas fa-chart-line fa-2x" aria-hidden="true"></i> | |
| </span> | |
| <h4 class="title is-5">Domain Variations</h4> | |
| <p>Performance varies significantly based on domain and task structure</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Finding 3 --> | |
| <div class="column is-one-third"> | |
| <div class="card h-100"> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <span class="icon is-large has-text-primary mb-3"> | |
| <i class="fas fa-balance-scale fa-2x" aria-hidden="true"></i> | |
| </span> | |
| <h4 class="title is-5">Cost Efficiency</h4> | |
| <p>Open-weight and mid-scale models show strong cost/performance efficiency</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Finding 4 --> | |
| <div class="column is-one-third"> | |
| <div class="card h-100"> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <span class="icon is-large has-text-primary mb-3"> | |
| <i class="fas fa-calculator fa-2x" aria-hidden="true"></i> | |
| </span> | |
| <h4 class="title is-5">Numeric Challenges</h4> | |
| <p>Numeric reasoning tasks (FNXL, ConvFinQA) remain challenging for all models</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Finding 5 --> | |
| <div class="column is-one-third"> | |
| <div class="card h-100"> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <span class="icon is-large has-text-primary mb-3"> | |
| <i class="fas fa-ruler fa-2x" aria-hidden="true"></i> | |
| </span> | |
| <h4 class="title is-5">Size Doesn't Matter</h4> | |
| <p>There's often no consistent correlation between model size and performance</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Finding 6 --> | |
| <div class="column is-one-third"> | |
| <div class="card h-100"> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <span class="icon is-large has-text-primary mb-3"> | |
| <i class="fas fa-award fa-2x" aria-hidden="true"></i> | |
| </span> | |
| <h4 class="title is-5">Top Performers</h4> | |
| <p>DeepSeek R1, OpenAI o1-mini, and Claude 3.5 Sonnet demonstrate strong capabilities</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!--/ Key Findings --> | |
| </div> | |
| </section> | |
| <!-- Methodology section (target of the #methodology nav link): a four-step workflow strip followed by four detail cards (taxonomy, datasets, models, evaluation process). --> | |
| <section id="methodology" class="section content-section"> | |
| <div class="container is-max-desktop"> | |
| <!-- Methodology --> | |
| <div class="columns is-centered"> | |
| <div class="column is-full"> | |
| <h2 class="title is-3 section-title has-text-centered">Methodology</h2> | |
| <div class="content"> | |
| <div class="notification is-primary is-light mb-5"> | |
| <p class="is-size-5 has-text-centered mb-0"> | |
| Our methodological framework consists of four key components designed to provide comprehensive evaluation of language models on financial tasks. | |
| </p> | |
| </div> | |
| <!-- Methodology workflow diagram; step icons are decorative and hidden from assistive technology. --> | |
| <div class="methodology-diagram has-text-centered mb-6"> | |
| <div class="columns is-mobile is-multiline"> | |
| <div class="column is-3"> | |
| <div class="methodology-step"> | |
| <div class="step-number">1</div> | |
| <div class="step-icon"> | |
| <span class="icon is-large"><i class="fas fa-sitemap fa-2x" aria-hidden="true"></i></span> | |
| </div> | |
| <h5 class="title is-6 mt-2">Taxonomy</h5> | |
| <p class="is-size-7">Define task categories</p> | |
| </div> | |
| </div> | |
| <div class="column is-3"> | |
| <div class="methodology-step"> | |
| <div class="step-number">2</div> | |
| <div class="step-icon"> | |
| <span class="icon is-large"><i class="fas fa-database fa-2x" aria-hidden="true"></i></span> | |
| </div> | |
| <h5 class="title is-6 mt-2">Datasets</h5> | |
| <p class="is-size-7">Select &amp; prepare data</p> | |
| </div> | |
| </div> | |
| <div class="column is-3"> | |
| <div class="methodology-step"> | |
| <div class="step-number">3</div> | |
| <div class="step-icon"> | |
| <span class="icon is-large"><i class="fas fa-robot fa-2x" aria-hidden="true"></i></span> | |
| </div> | |
| <h5 class="title is-6 mt-2">Models</h5> | |
| <p class="is-size-7">Benchmark LMs</p> | |
| </div> | |
| </div> | |
| <div class="column is-3"> | |
| <div class="methodology-step"> | |
| <div class="step-number">4</div> | |
| <div class="step-icon"> | |
| <span class="icon is-large"><i class="fas fa-chart-bar fa-2x" aria-hidden="true"></i></span> | |
| </div> | |
| <h5 class="title is-6 mt-2">Evaluation</h5> | |
| <p class="is-size-7">Measure performance</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Methodology detailed info --> | |
| <div class="columns is-multiline"> | |
| <!-- Taxonomy --> | |
| <div class="column is-6"> | |
| <div class="card h-100"> | |
| <div class="card-header"> | |
| <p class="card-header-title"> | |
| <span class="icon mr-2"><i class="fas fa-sitemap" aria-hidden="true"></i></span> | |
| Taxonomy | |
| </p> | |
| </div> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <p>We developed a scenario-based taxonomy organizing financial NLP tasks along three dimensions:</p> | |
| <ul style="list-style-type: none;"> | |
| <li><strong>Tasks:</strong> Six core tasks - text classification, sentiment analysis, information retrieval, causal analysis, text summarization, and question answering</li> | |
| <li><strong>Domains:</strong> Categorized by what (type of data), who (data source), where (origination), when (time period), how (generation method), and why (purpose)</li> | |
| <li><strong>Languages:</strong> Currently focusing on English with identified need for multilingual expansion</li> | |
| </ul> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Datasets --> | |
| <div class="column is-6"> | |
| <div class="card h-100"> | |
| <div class="card-header"> | |
| <p class="card-header-title"> | |
| <span class="icon mr-2"><i class="fas fa-database" aria-hidden="true"></i></span> | |
| Datasets | |
| </p> | |
| </div> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <p>We carefully selected datasets based on:</p> | |
| <ul style="list-style-type: none;"> | |
| <li><strong>Domain relevance:</strong> Majority of content directly related to finance</li> | |
| <li><strong>Licensing:</strong> Fair usage licensing and proper attribution</li> | |
| <li><strong>Quality:</strong> Transparent sourcing with minimal risk of label corruption</li> | |
| <li><strong>Complexity:</strong> Exercises real financial knowledge, not trivial tasks</li> | |
| </ul> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Models; ampersands in the list text are escaped as &amp;amp; entities. --> | |
| <div class="column is-6"> | |
| <div class="card h-100"> | |
| <div class="card-header"> | |
| <p class="card-header-title"> | |
| <span class="icon mr-2"><i class="fas fa-robot" aria-hidden="true"></i></span> | |
| Models | |
| </p> | |
| </div> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <p>We evaluated diverse language models across two categories:</p> | |
| <div class="columns"> | |
| <div class="column is-half"> | |
| <h6 class="title is-6">Proprietary</h6> | |
| <ul style="list-style-type: none;"> | |
| <li>GPT-4o &amp; o1-mini</li> | |
| <li>Gemini-1.5 Pro</li> | |
| <li>Claude 3.5 Sonnet</li> | |
| <li>Claude 3 Haiku</li> | |
| <li>Cohere Command (R 7B, R+)</li> | |
| <li>Jamba 1.5 Mini &amp; Large</li> | |
| </ul> | |
| </div> | |
| <div class="column is-half"> | |
| <h6 class="title is-6">Open-weight</h6> | |
| <ul style="list-style-type: none;"> | |
| <li>Llama-3 (8B, 70B)</li> | |
| <li>DeepSeek V3 &amp; R-1</li> | |
| <li>DBRX Instruct</li> | |
| <li>Qwen-2 (72B) &amp; QwQ-32B</li> | |
| <li>Mistral (7B, 8x7B, 8x22B)</li> | |
| <li>Gemma-2 (9B, 27B)</li> | |
| <li>WizardLM-2 (8x22B)</li> | |
| </ul> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Evaluation Process --> | |
| <div class="column is-6"> | |
| <div class="card h-100"> | |
| <div class="card-header"> | |
| <p class="card-header-title"> | |
| <span class="icon mr-2"><i class="fas fa-chart-bar" aria-hidden="true"></i></span> | |
| Evaluation Process | |
| </p> | |
| </div> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <p>Our two-stage evaluation approach includes:</p> | |
| <ol> | |
| <li><strong>Generation:</strong> Language model generates responses to task-specific inputs</li> | |
| <li><strong>Extraction:</strong> Separate process identifies relevant output using structured pattern matching</li> | |
| </ol> | |
| <p>Pipeline stages:</p> | |
| <ul style="list-style-type: none;"> | |
| <li>Configuration of tasks, datasets, and parameters</li> | |
| <li>Model interaction via local instantiation or API</li> | |
| <li>Post-processing and structured output extraction</li> | |
| <li>Task-specific metric computation and logging</li> | |
| </ul> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!--/ Methodology --> | |
| </div> | |
| </section> | |
| <section id="results" class="section content-section"> | |
| <div class="container is-max-desktop"> | |
| <!-- Results --> | |
| <div class="columns is-centered"> | |
| <div class="column is-full"> | |
| <h2 class="title is-3 section-title has-text-centered">Results</h2> | |
| <div class="content"> | |
| <div class="notification is-info is-light mb-5"> | |
| <p class="is-size-5 has-text-centered mb-0"> | |
| Our comprehensive evaluation reveals significant performance variations across different financial tasks and models. | |
| </p> | |
| </div> | |
| </div> | |
| <section class="section"> | |
| <div class="container"> | |
| <h1 class="title is-2">FLaME: Financial Language Model Evaluation Results</h1> | |
| <div class="content"> | |
| <p class="is-size-5"> | |
| This page presents the results of the FLaME evaluation across various financial NLP tasks. | |
| Each tab shows performance metrics for different task categories. | |
| </p> | |
| </div> | |
| <div class="tabs-container"> | |
| <div class="tabs is-centered is-boxed"> | |
| <ul> | |
| <li class="is-active" data-tab="main"> | |
| <a> | |
| <span>All Tasks</span> | |
| </a> | |
| </li> | |
| <li data-tab="causal-analysis"> | |
| <a> | |
| <span>Causal Analysis</span> | |
| </a> | |
| </li> | |
| <li data-tab="information-retrieval"> | |
| <a> | |
| <span>Information Retrieval</span> | |
| </a> | |
| </li> | |
| <li data-tab="question-answering"> | |
| <a> | |
| <span>Question Answering</span> | |
| </a> | |
| </li> | |
| <li data-tab="sentiment-analysis"> | |
| <a> | |
| <span>Sentiment Analysis</span> | |
| </a> | |
| </li> | |
| <li data-tab="text-classification"> | |
| <a> | |
| <span>Text Classification</span> | |
| </a> | |
| </li> | |
| <li data-tab="text-summarization"> | |
| <a> | |
| <span>Text Summarization</span> | |
| </a> | |
| </li> | |
| <li data-tab="cost-analysis"> | |
| <a> | |
| <span>Cost Analysis</span> | |
| </a> | |
| </li> | |
| </ul> | |
| </div> | |
| <div id="main" class="tab-content"> | |
| <h2 class="title is-4">Overall Performance Across All Tasks</h2> | |
| <div class="results-table"> | |
| <table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth"> | |
| <thead> | |
| <tr> | |
| <th>Model</th> | |
| <th colspan="4" class="has-text-centered column-border-left">Information Retrieval</th> | |
| <th class="has-text-centered column-border-left">*</th> | |
| <th colspan="3" class="has-text-centered column-border-left">Sentiment Analysis</th> | |
| <th colspan="2" class="has-text-centered column-border-left">Causal Analysis</th> | |
| <th colspan="5" class="has-text-centered column-border-left">Text Classification</th> | |
| <th colspan="3" class="has-text-centered column-border-left">Question Answering</th> | |
| <th colspan="2" class="has-text-centered column-border-left">Summarization</th> | |
| </tr> | |
| <tr> | |
| <th>Dataset</th> | |
| <th class="has-text-centered tooltip-trigger column-border-left" data-title="FiNER-ORD" data-tooltip="FiNER-ORD is a manually annotated named entity recognition dataset comprising financial news articles with detailed entity annotations. The task requires identifying and correctly classifying person, location, and organization entities in financial contexts.">FiNER<span class="metric-label">F1</span></th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FinRED" data-tooltip="FinRED is a specialized relation extraction dataset created from financial news and earnings call transcripts using distance supervision based on Wikidata triplets. The task involves identifying and extracting financial relationships between entities to understand connections in financial contexts.">FR<span class="metric-label">F1</span></th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="ReFiND" data-tooltip="ReFiND is a comprehensive relation extraction dataset containing approximately 29,000 annotated instances with 22 distinct relation types across 8 entity pair categories from various financial documents. The task requires identifying specific relationships between financial entities in complex documents like SEC filings.">RD<span class="metric-label">F1</span></th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FNXL" data-tooltip="FNXL contains 79,088 sentences with 142,922 annotated numerals extracted from SEC 10-K reports and categorized under 2,794 distinct numerical labels. The information extraction task requires identifying, categorizing and understanding the financial significance of numerical entities in regulatory filings.">FNXL<span class="metric-label">F1</span></th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FinEntity" data-tooltip="FinEntity consists of 979 financial news paragraphs containing 2,131 manually-annotated financial entities with sentiment classifications. The task involves identifying companies and asset classes in financial texts while determining the associated sentiment expressed toward each entity.">FE<span class="metric-label">F1</span></th> | |
| <th class="has-text-centered tooltip-trigger column-border-left" data-title="FiQA Task 1" data-tooltip="FiQA Task 1 focuses on aspect-based financial sentiment analysis in microblog posts and news headlines using a continuous scale from -1 (negative) to 1 (positive). The regression task requires models to accurately predict the sentiment score that reflects investor perception of financial texts.">FiQA<span class="metric-label">MSE</span></th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="SubjECTive-QA" data-tooltip="SubjECTive-QA contains 49,446 annotations across 2,747 question-answer pairs extracted from 120 earnings call transcripts. The multi-label classification task involves analyzing six subjective features in financial discourse: assertiveness, cautiousness, optimism, specificity, clarity, and relevance.">SQA<span class="metric-label">F1</span></th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="Financial Phrase Bank" data-tooltip="Financial Phrase Bank (FPB) contains 4,840 sentences from financial news articles categorized as positive, negative, or neutral by 16 finance experts using majority voting. The sentiment classification task requires understanding how these statements might influence investor perception of stock prices.">FPB<span class="metric-label">F1</span></th> | |
| <th class="has-text-centered tooltip-trigger column-border-left" data-title="FinCausal (CD)" data-tooltip="FinCausal Causal Discovery (CD) contains 29,444 text sections from financial news, with 2,136 annotated as expressing causal relationships. The task involves extracting precise cause and effect spans from financial texts that contain causal relationships.">CD<span class="metric-label">F1</span></th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FinCausal (CC)" data-tooltip="FinCausal Causality Classification (CC) consists of 29,444 text sections from financial news with binary annotations indicating causal relationships. The classification task requires determining whether a given financial text section contains a causal relationship (1) or not (0).">CC<span class="metric-label">F1</span></th> | |
| <th class="has-text-centered tooltip-trigger column-border-left" data-title="Banking77" data-tooltip="Banking77 is a fine-grained dataset comprising 13,083 customer service queries annotated with 77 unique intents from the banking domain. The task involves accurately classifying each customer query into the correct intent category to improve automated banking support systems.">B77<span class="metric-label">F1</span></th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FinBench" data-tooltip="FinBench is a comprehensive evaluation dataset containing 333,000 labeled instances that combines tabular data and profile text for financial risk prediction. The task requires models to predict financial outcomes across three key risk categories: default, fraud, and customer churn.">FB<span class="metric-label">F1</span></th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FOMC" data-tooltip="FOMC is a dataset containing Federal Open Market Committee speeches, meeting minutes, and press conference transcripts spanning from 1996 to 2022. The classification task involves determining whether the monetary policy stance expressed in each document is hawkish (tighter monetary policy) or dovish (looser monetary policy).">FOMC<span class="metric-label">F1</span></th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="NumClaim" data-tooltip="NumClaim is an expert-annotated dataset for detecting and analyzing fine-grained investor claims within financial narratives that contain numerical information. The task requires identifying and categorizing claims containing numerals in analyst reports and earnings call transcripts for investment decision making.">NC<span class="metric-label">F1</span></th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="Headlines" data-tooltip="Headlines is a dataset containing 11,412 human-annotated financial news headlines focused on commodities, particularly gold, spanning from 2000 to 2019. The classification task involves identifying binary indicators for price mentions and directional price movements in these concise financial texts.">HL<span class="metric-label">Acc</span></th> | |
| <!-- The question-answering group starts at ConvFinQA: body rows put column-border-left on the first of their three QA cells, so the header border goes here too. --> | |
| <th class="has-text-centered tooltip-trigger tooltip-right column-border-left" data-title="ConvFinQA" data-tooltip="ConvFinQA is a multi-turn question answering dataset with 3,892 conversations containing 14,115 questions that explore chains of numerical reasoning in financial contexts. The conversational task requires maintaining context while performing sequential numerical operations to answer increasingly complex financial questions.">CFQA<span class="metric-label">Acc</span></th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FinQA" data-tooltip="FinQA contains 8,281 question-answer pairs derived from financial reports that require numerical reasoning over tabular financial data. The question-answering task features multi-step reasoning challenges with full annotation of reasoning programs to solve complex financial queries.">FinQA<span class="metric-label">Acc</span></th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="TATQA" data-tooltip="TATQA is a large-scale question answering dataset for hybrid data sources that combines tables and text from financial reports. The task emphasizes numerical reasoning operations across multiple formats, requiring models to integrate information from structured and unstructured sources to answer financial questions.">TQA<span class="metric-label">Acc</span></th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right column-border-left" data-title="ECTSum" data-tooltip="ECTSum contains 2,425 document-summary pairs featuring earnings call transcripts paired with concise bullet-point summaries extracted from Reuters articles. The summarization task requires extracting and condensing key financial information from lengthy corporate communications into brief, informative points.">ECTSum<span class="metric-label">BERT-F1</span></th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="EDTSum" data-tooltip="EDTSum consists of 2,000 financial news articles paired with their headlines as ground-truth summaries for evaluating text summarization. The task challenges models to condense complex financial news articles into concise, informative headlines that capture the essential information.">EDTSum<span class="metric-label">BERT-F1</span></th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>Llama 3 70B Instruct</td> | |
| <td class="column-border-left">.701</td><td>.332</td><td>.883</td><td>.020</td> | |
| <td class="column-border-left">.469</td><td>.123</td><td>.535</td><td>.902</td> | |
| <td class="column-border-left">.142</td><td>.192</td> | |
| <td class="column-border-left">.645</td><td>.309</td><td>.652</td><td>.386</td><td>.811</td> | |
| <td class="column-border-left">.709</td><td>.809</td><td>.772</td> | |
| <td class="column-border-left">.754</td><td class="performance-medium">.817</td> | |
| </tr> | |
| <tr> | |
| <td>Llama 3 8B Instruct</td> | |
| <td class="column-border-left">.565</td><td>.289</td><td>.705</td><td>.003</td> | |
| <td class="column-border-left">.350</td><td>.161</td><td class="performance-best">.600</td><td>.698</td> | |
| <td class="column-border-left">.049</td><td>.234</td> | |
| <td class="column-border-left">.512</td><td>.659</td><td>.497</td><td>.511</td><td>.763</td> | |
| <td class="column-border-left">.268</td><td>.767</td><td>.706</td> | |
| <td class="column-border-left">.757</td><td>.811</td> | |
| </tr> | |
| <tr> | |
| <td>DBRX Instruct</td> | |
| <td class="column-border-left">.489</td><td>.304</td><td>.778</td><td>.009</td> | |
| <td class="column-border-left">.006</td><td>.160</td><td>.436</td><td>.499</td> | |
| <td class="column-border-left">.087</td><td>.231</td> | |
| <td class="column-border-left">.574</td><td>.483</td><td>.193</td><td>.319</td><td>.746</td> | |
| <td class="column-border-left">.252</td><td>.738</td><td>.633</td> | |
| <td class="column-border-left">.729</td><td>.806</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek LLM (67B)</td> | |
| <td class="column-border-left">.745</td><td>.334</td><td>.879</td><td>.007</td> | |
| <td class="column-border-left">.416</td><td>.118</td><td>.462</td><td>.811</td> | |
| <td class="column-border-left">.025</td><td>.193</td> | |
| <td class="column-border-left">.578</td><td>.492</td><td>.407</td><td>.151</td><td>.778</td> | |
| <td class="column-border-left">.174</td><td>.742</td><td>.355</td> | |
| <td class="column-border-left">.681</td><td>.807</td> | |
| </tr> | |
| <tr> | |
| <td>Gemma 2 27B</td> | |
| <td class="column-border-left">.761</td><td>.356</td><td>.902</td><td>.006</td> | |
| <td class="column-border-left">.298</td><td class="performance-best">.100</td><td>.515</td><td>.884</td> | |
| <td class="column-border-left">.133</td><td>.242</td> | |
| <td class="column-border-left">.621</td><td>.538</td><td>.620</td><td>.408</td><td>.808</td> | |
| <td class="column-border-left">.268</td><td>.768</td><td>.734</td> | |
| <td class="column-border-left">.723</td><td>.814</td> | |
| </tr> | |
| <tr> | |
| <td>Gemma 2 9B</td> | |
| <td class="column-border-left">.651</td><td>.331</td><td>.892</td><td>.005</td> | |
| <td class="column-border-left">.367</td><td>.189</td><td>.491</td><td class="performance-medium">.940</td> | |
| <td class="column-border-left">.105</td><td>.207</td> | |
| <td class="column-border-left">.609</td><td>.541</td><td>.519</td><td>.365</td><td class="performance-best">.856</td> | |
| <td class="column-border-left">.292</td><td>.779</td><td>.750</td> | |
| <td class="column-border-left">.585</td><td class="performance-medium">.817</td> | |
| </tr> | |
| <tr> | |
| <td>Mistral (7B) Instruct v0.3</td> | |
| <td class="column-border-left">.526</td><td>.276</td><td>.771</td><td>.004</td> | |
| <td class="column-border-left">.368</td><td>.135</td><td>.522</td><td>.841</td> | |
| <td class="column-border-left">.052</td><td>.227</td> | |
| <td class="column-border-left">.528</td><td>.503</td><td>.542</td><td>.412</td><td>.779</td> | |
| <td class="column-border-left">.199</td><td>.655</td><td>.553</td> | |
| <td class="column-border-left">.750</td><td>.811</td> | |
| </tr> | |
| <tr> | |
| <td>Mixtral-8x22B Instruct</td> | |
| <td class="column-border-left">.635</td><td>.367</td><td>.811</td><td>.009</td> | |
| <td class="column-border-left">.435</td><td>.221</td><td>.510</td><td>.776</td> | |
| <td class="column-border-left">.125</td><td class="performance-best">.308</td> | |
| <td class="column-border-left">.602</td><td>.221</td><td>.465</td><td>.513</td><td class="performance-low">.835</td> | |
| <td class="column-border-left">.285</td><td>.766</td><td>.666</td> | |
| <td class="column-border-left">.758</td><td>.815</td> | |
| </tr> | |
| <tr> | |
| <td>Mixtral-8x7B Instruct</td> | |
| <td class="column-border-left">.598</td><td>.282</td><td>.845</td><td>.009</td> | |
| <td class="column-border-left">.267</td><td>.208</td><td>.498</td><td>.893</td> | |
| <td class="column-border-left">.055</td><td>.229</td> | |
| <td class="column-border-left">.547</td><td>.396</td><td>.603</td><td>.583</td><td>.805</td> | |
| <td class="column-border-left">.315</td><td>.611</td><td>.501</td> | |
| <td class="column-border-left">.747</td><td>.810</td> | |
| </tr> | |
| <tr> | |
| <td>Qwen 2 Instruct (72B)</td> | |
| <td class="column-border-left">.748</td><td>.348</td><td>.854</td><td>.012</td> | |
| <td class="column-border-left">.483</td><td>.205</td><td>.576</td><td>.901</td> | |
| <td class="column-border-left">.190</td><td>.184</td> | |
| <td class="column-border-left">.627</td><td>.495</td><td>.605</td><td>.639</td><td>.830</td> | |
| <td class="column-border-left">.269</td><td>.819</td><td>.715</td> | |
| <td class="column-border-left">.752</td><td>.811</td> | |
| </tr> | |
| <tr> | |
| <td>WizardLM-2 8x22B</td> | |
| <td class="column-border-left">.744</td><td>.355</td><td>.852</td><td>.008</td> | |
| <td class="column-border-left">.226</td><td>.129</td><td>.566</td><td>.779</td> | |
| <td class="column-border-left">.114</td><td>.201</td> | |
| <td class="column-border-left">.648</td><td>.500</td><td>.505</td><td>.272</td><td>.797</td> | |
| <td class="column-border-left">.247</td><td>.796</td><td>.725</td> | |
| <td class="column-border-left">.735</td><td>.808</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek-V3</td> | |
| <td class="column-border-left performance-low">.790</td><td class="performance-medium">.437</td><td>.934</td><td class="performance-low">.045</td> | |
| <td class="column-border-left">.549</td><td>.150</td><td class="performance-low">.583</td><td>.814</td> | |
| <td class="column-border-left performance-low">.198</td><td>.170</td> | |
| <td class="column-border-left performance-medium">.714</td><td>.487</td><td>.578</td><td>.675</td><td>.729</td> | |
| <td class="column-border-left">.261</td><td class="performance-medium">.840</td><td class="performance-low">.779</td> | |
| <td class="column-border-left">.750</td><td>.815</td> | |
| </tr> | |
| <tr> | |
| <td><strong>DeepSeek R1</strong></td> | |
| <td class="column-border-left performance-best">.807</td><td>.393</td><td class="performance-best">.952</td><td class="performance-best">.057</td> | |
| <td class="column-border-left performance-low">.587</td><td>.110</td><td>.499</td><td>.902</td> | |
| <td class="column-border-left performance-best">.337</td><td>.202</td> | |
| <td class="column-border-left performance-best">.763</td><td>.419</td><td class="performance-medium">.670</td><td>.688</td><td>.769</td> | |
| <td class="column-border-left performance-best">.853</td><td class="performance-low">.836</td><td class="performance-best">.858</td> | |
| <td class="column-border-left">.759</td><td>.804</td> | |
| </tr> | |
| <tr> | |
| <td><strong>QwQ-32B-Preview</strong></td> | |
| <td class="column-border-left">.685</td><td>.270</td><td>.656</td><td>.001</td> | |
| <td class="column-border-left">.005</td><td>.141</td><td>.550</td><td>.815</td> | |
| <td class="column-border-left">.131</td><td>.220</td> | |
| <td class="column-border-left">.613</td><td class="performance-medium">.784</td><td>.555</td><td>.020</td><td>.744</td> | |
| <td class="column-border-left">.282</td><td>.793</td><td class="performance-medium">.796</td> | |
| <td class="column-border-left">.696</td><td class="performance-medium">.817</td> | |
| </tr> | |
| <tr> | |
| <td>Jamba 1.5 Mini</td> | |
| <td class="column-border-left">.552</td><td>.284</td><td>.844</td><td>.005</td> | |
| <td class="column-border-left">.132</td><td>.119</td><td>.418</td><td>.765</td> | |
| <td class="column-border-left">.043</td><td class="performance-medium">.270</td> | |
| <td class="column-border-left">.508</td><td class="performance-best">.898</td><td>.499</td><td>.151</td><td>.682</td> | |
| <td class="column-border-left">.218</td><td>.666</td><td>.586</td> | |
| <td class="column-border-left">.741</td><td>.816</td> | |
| </tr> | |
| <tr> | |
| <td>Jamba 1.5 Large</td> | |
| <td class="column-border-left">.693</td><td>.341</td><td>.862</td><td>.005</td> | |
| <td class="column-border-left">.397</td><td>.183</td><td>.582</td><td>.798</td> | |
| <td class="column-border-left">.074</td><td>.176</td> | |
| <td class="column-border-left">.628</td><td>.618</td><td>.550</td><td>.541</td><td>.782</td> | |
| <td class="column-border-left">.225</td><td>.790</td><td>.660</td> | |
| <td class="column-border-left">.734</td><td class="performance-best">.818</td> | |
| </tr> | |
| <tr> | |
| <td>Claude 3.5 Sonnet</td> | |
| <td class="column-border-left performance-medium">.799</td><td class="performance-best">.439</td><td>.891</td><td class="performance-medium">.047</td> | |
| <td class="column-border-left performance-medium">.655</td><td class="performance-medium">.101</td><td>.553</td><td class="performance-best">.944</td> | |
| <td class="column-border-left">.196</td><td>.197</td> | |
| <td class="column-border-left">.668</td><td>.634</td><td class="performance-best">.674</td><td class="performance-low">.692</td><td>.827</td> | |
| <td class="column-border-left">.402</td><td class="performance-best">.844</td><td>.700</td> | |
| <td class="column-border-left performance-low">.767</td><td>.813</td> | |
| </tr> | |
| <tr> | |
| <td>Claude 3 Haiku</td> | |
| <td class="column-border-left">.711</td><td>.285</td><td>.883</td><td>.015</td> | |
| <td class="column-border-left">.494</td><td>.167</td><td>.463</td><td>.908</td> | |
| <td class="column-border-left">.081</td><td>.200</td> | |
| <td class="column-border-left">.622</td><td>.022</td><td>.631</td><td>.558</td><td>.781</td> | |
| <td class="column-border-left">.421</td><td>.803</td><td>.733</td> | |
| <td class="column-border-left">.646</td><td>.808</td> | |
| </tr> | |
| <tr> | |
| <td>Cohere Command R 7B</td> | |
| <td class="column-border-left">.748</td><td>.194</td><td>.845</td><td>.018</td> | |
| <td class="column-border-left">.441</td><td>.164</td><td>.532</td><td>.840</td> | |
| <td class="column-border-left">.057</td><td class="performance-low">.255</td> | |
| <td class="column-border-left">.516</td><td class="performance-low">.762</td><td>.459</td><td>.068</td><td>.770</td> | |
| <td class="column-border-left">.212</td><td>.709</td><td>.716</td> | |
| <td class="column-border-left">.750</td><td>.815</td> | |
| </tr> | |
| <tr> | |
| <td>Cohere Command R +</td> | |
| <td class="column-border-left">.756</td><td>.333</td><td>.922</td><td>.021</td> | |
| <td class="column-border-left">.452</td><td class="performance-low">.106</td><td>.533</td><td>.699</td> | |
| <td class="column-border-left">.080</td><td>.238</td> | |
| <td class="column-border-left">.651</td><td>.684</td><td>.393</td><td>.118</td><td>.812</td> | |
| <td class="column-border-left">.259</td><td>.776</td><td>.698</td> | |
| <td class="column-border-left">.751</td><td>.810</td> | |
| </tr> | |
| <tr> | |
| <td>Google Gemini 1.5 Pro</td> | |
| <td class="column-border-left">.712</td><td>.374</td><td class="performance-medium">.944</td><td>.019</td> | |
| <td class="column-border-left">.393</td><td>.144</td><td class="performance-medium">.593</td><td>.885</td> | |
| <td class="column-border-left">.196</td><td>.217</td> | |
| <td class="column-border-left">.418</td><td>.336</td><td>.579</td><td>.525</td><td class="performance-medium">.837</td> | |
| <td class="column-border-left">.280</td><td>.829</td><td>.763</td> | |
| <td class="column-border-left performance-best">.777</td><td class="performance-medium">.817</td> | |
| </tr> | |
| <tr> | |
| <td>OpenAI gpt-4o</td> | |
| <td class="column-border-left">.766</td><td>.399</td><td class="performance-low">.942</td><td>.037</td> | |
| <td class="column-border-left">.523</td><td>.184</td><td>.541</td><td class="performance-low">.928</td> | |
| <td class="column-border-left">.130</td><td>.222</td> | |
| <td class="column-border-left performance-low">.710</td><td>.524</td><td class="performance-low">.664</td><td class="performance-best">.750</td><td>.824</td> | |
| <td class="column-border-left performance-low">.749</td><td>.836</td><td>.754</td> | |
| <td class="column-border-left performance-medium">.773</td><td>.816</td> | |
| </tr> | |
| <tr> | |
| <td><strong>OpenAI o1-mini</strong></td> | |
| <td class="column-border-left">.761</td><td class="performance-low">.403</td><td>.876</td><td>.010</td> | |
| <td class="column-border-left performance-best">.662</td><td>.120</td><td>.542</td><td>.917</td> | |
| <td class="column-border-left performance-medium">.289</td><td>.209</td> | |
| <td class="column-border-left">.670</td><td>.612</td><td>.635</td><td class="performance-medium">.720</td><td>.769</td> | |
| <td class="column-border-left performance-medium">.840</td><td>.799</td><td>.698</td> | |
| <td class="column-border-left">.763</td><td>.816</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <!-- Legend: maps the performance-best / performance-medium / performance-low highlight classes used on table cells to the labels Best / Strong / Good. --> | |
| <div class="content is-small mt-4"> | |
| <p><strong>Note:</strong> Color highlighting indicates performance ranking: | |
| <span class="performance-best"> Best </span>, | |
| <span class="performance-medium"> Strong </span>, | |
| <span class="performance-low"> Good </span> | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Causal Analysis tab content --> | |
| <!-- One row per model, eight metric columns in two groups of four. --> | |
| <!-- NOTE: the sub-column order differs between the groups. CD is Accuracy, Precision, Recall, F1; CC is Precision, Recall, F1, Accuracy. --> | |
| <!-- Highlighting is per column: performance-best, performance-medium and performance-low mark the highest, second-highest and third-highest value, matching the legend below the table. --> | |
| <div id="causal-analysis" class="tab-content"> | |
| <h2 class="title is-4">Causal Analysis Task Results</h2> | |
| <div class="results-table"> | |
| <table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth"> | |
| <thead> | |
| <tr> | |
| <th rowspan="2" scope="col">Model</th> | |
| <th colspan="4" scope="colgroup" class="has-text-centered tooltip-trigger tooltip-right" data-title="FinCausal (CD)" data-tooltip="FinCausal Causal Detection (CD) contains 29,444 text sections from financial news, with 2,136 annotated as expressing causal relationships. The task involves extracting precise cause and effect spans from financial texts that contain causal relationships.">Causal Detection (CD)</th> | |
| <th colspan="4" scope="colgroup" class="has-text-centered tooltip-trigger tooltip-right" data-title="FinCausal (CC)" data-tooltip="FinCausal Causality Classification (CC) consists of 29,444 text sections from financial news with binary annotations indicating causal relationships. The classification task requires determining whether a given financial text section contains a causal relationship (1) or not (0).">Causal Classification (CC)</th> | |
| </tr> | |
| <tr> | |
| <th class="has-text-centered" scope="col">Accuracy</th> | |
| <th class="has-text-centered" scope="col">Precision</th> | |
| <th class="has-text-centered" scope="col">Recall</th> | |
| <th class="has-text-centered" scope="col">F1</th> | |
| <th class="has-text-centered" scope="col">Precision</th> | |
| <th class="has-text-centered" scope="col">Recall</th> | |
| <th class="has-text-centered" scope="col">F1</th> | |
| <th class="has-text-centered" scope="col">Accuracy</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>Llama 3 70B Instruct</td> | |
| <td class="has-text-centered">0.148</td> | |
| <td class="has-text-centered">0.429</td> | |
| <td class="has-text-centered">0.148</td> | |
| <td class="has-text-centered">0.142</td> | |
| <td class="has-text-centered">0.241</td> | |
| <td class="has-text-centered">0.329</td> | |
| <td class="has-text-centered">0.192</td> | |
| <td class="has-text-centered">0.198</td> | |
| </tr> | |
| <tr> | |
| <td>Llama 3 8B Instruct</td> | |
| <td class="has-text-centered">0.097</td> | |
| <td class="has-text-centered">0.341</td> | |
| <td class="has-text-centered">0.097</td> | |
| <td class="has-text-centered">0.049</td> | |
| <td class="has-text-centered">0.232</td> | |
| <td class="has-text-centered">0.241</td> | |
| <td class="has-text-centered">0.234</td> | |
| <td class="has-text-centered performance-medium">0.380</td> | |
| </tr> | |
| <tr> | |
| <td>DBRX Instruct</td> | |
| <td class="has-text-centered">0.078</td> | |
| <td class="has-text-centered">0.521</td> | |
| <td class="has-text-centered">0.078</td> | |
| <td class="has-text-centered">0.087</td> | |
| <td class="has-text-centered">0.276</td> | |
| <td class="has-text-centered">0.313</td> | |
| <td class="has-text-centered">0.231</td> | |
| <td class="has-text-centered">0.235</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek LLM (67B)</td> | |
| <td class="has-text-centered">0.026</td> | |
| <td class="has-text-centered">0.214</td> | |
| <td class="has-text-centered">0.026</td> | |
| <td class="has-text-centered">0.025</td> | |
| <td class="has-text-centered">0.141</td> | |
| <td class="has-text-centered">0.328</td> | |
| <td class="has-text-centered">0.193</td> | |
| <td class="has-text-centered">0.221</td> | |
| </tr> | |
| <tr> | |
| <td>Gemma 2 27B</td> | |
| <td class="has-text-centered">0.115</td> | |
| <td class="has-text-centered">0.510</td> | |
| <td class="has-text-centered">0.115</td> | |
| <td class="has-text-centered">0.133</td> | |
| <td class="has-text-centered">0.309</td> | |
| <td class="has-text-centered">0.310</td> | |
| <td class="has-text-centered">0.242</td> | |
| <td class="has-text-centered">0.262</td> | |
| </tr> | |
| <tr> | |
| <td>Gemma 2 9B</td> | |
| <td class="has-text-centered">0.115</td> | |
| <td class="has-text-centered">0.394</td> | |
| <td class="has-text-centered">0.115</td> | |
| <td class="has-text-centered">0.105</td> | |
| <td class="has-text-centered">0.275</td> | |
| <td class="has-text-centered">0.294</td> | |
| <td class="has-text-centered">0.207</td> | |
| <td class="has-text-centered">0.258</td> | |
| </tr> | |
| <tr> | |
| <td>Mistral (7B) Instruct v0.3</td> | |
| <td class="has-text-centered">0.078</td> | |
| <td class="has-text-centered">0.455</td> | |
| <td class="has-text-centered">0.078</td> | |
| <td class="has-text-centered">0.052</td> | |
| <td class="has-text-centered">0.339</td> | |
| <td class="has-text-centered performance-best">0.361</td> | |
| <td class="has-text-centered">0.227</td> | |
| <td class="has-text-centered">0.258</td> | |
| </tr> | |
| <tr> | |
| <td>Mixtral-8x22B Instruct</td> | |
| <td class="has-text-centered">0.131</td> | |
| <td class="has-text-centered">0.486</td> | |
| <td class="has-text-centered">0.131</td> | |
| <td class="has-text-centered">0.125</td> | |
| <td class="has-text-centered">0.344</td> | |
| <td class="has-text-centered">0.310</td> | |
| <td class="has-text-centered performance-best">0.308</td> | |
| <td class="has-text-centered performance-low">0.318</td> | |
| </tr> | |
| <tr> | |
| <td>Mixtral-8x7B Instruct</td> | |
| <td class="has-text-centered">0.088</td> | |
| <td class="has-text-centered">0.510</td> | |
| <td class="has-text-centered">0.088</td> | |
| <td class="has-text-centered">0.055</td> | |
| <td class="has-text-centered">0.308</td> | |
| <td class="has-text-centered">0.314</td> | |
| <td class="has-text-centered">0.229</td> | |
| <td class="has-text-centered">0.273</td> | |
| </tr> | |
| <tr> | |
| <td>Qwen 2 Instruct (72B)</td> | |
| <td class="has-text-centered">0.139</td> | |
| <td class="has-text-centered">0.489</td> | |
| <td class="has-text-centered">0.139</td> | |
| <td class="has-text-centered">0.190</td> | |
| <td class="has-text-centered">0.208</td> | |
| <td class="has-text-centered">0.330</td> | |
| <td class="has-text-centered">0.184</td> | |
| <td class="has-text-centered">0.188</td> | |
| </tr> | |
| <tr> | |
| <td>WizardLM-2 8x22B</td> | |
| <td class="has-text-centered">0.076</td> | |
| <td class="has-text-centered">0.453</td> | |
| <td class="has-text-centered">0.076</td> | |
| <td class="has-text-centered">0.114</td> | |
| <td class="has-text-centered">0.263</td> | |
| <td class="has-text-centered">0.347</td> | |
| <td class="has-text-centered">0.201</td> | |
| <td class="has-text-centered">0.237</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek-V3</td> | |
| <td class="has-text-centered">0.164</td> | |
| <td class="has-text-centered">0.528</td> | |
| <td class="has-text-centered">0.164</td> | |
| <td class="has-text-centered performance-low">0.198</td> | |
| <td class="has-text-centered">0.194</td> | |
| <td class="has-text-centered">0.327</td> | |
| <td class="has-text-centered">0.170</td> | |
| <td class="has-text-centered">0.248</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek R1</td> | |
| <td class="has-text-centered performance-best">0.245</td> | |
| <td class="has-text-centered performance-medium">0.643</td> | |
| <td class="has-text-centered performance-best">0.245</td> | |
| <td class="has-text-centered performance-best">0.337</td> | |
| <td class="has-text-centered performance-best">0.385</td> | |
| <td class="has-text-centered">0.318</td> | |
| <td class="has-text-centered">0.202</td> | |
| <td class="has-text-centered">0.221</td> | |
| </tr> | |
| <tr> | |
| <td>QwQ-32B-Preview</td> | |
| <td class="has-text-centered">0.110</td> | |
| <td class="has-text-centered">0.473</td> | |
| <td class="has-text-centered">0.110</td> | |
| <td class="has-text-centered">0.131</td> | |
| <td class="has-text-centered">0.193</td> | |
| <td class="has-text-centered">0.262</td> | |
| <td class="has-text-centered">0.220</td> | |
| <td class="has-text-centered performance-best">0.465</td> | |
| </tr> | |
| <tr> | |
| <td>Jamba 1.5 Mini</td> | |
| <td class="has-text-centered">0.050</td> | |
| <td class="has-text-centered">0.280</td> | |
| <td class="has-text-centered">0.050</td> | |
| <td class="has-text-centered">0.043</td> | |
| <td class="has-text-centered">0.323</td> | |
| <td class="has-text-centered">0.283</td> | |
| <td class="has-text-centered performance-medium">0.270</td> | |
| <td class="has-text-centered">0.295</td> | |
| </tr> | |
| <tr> | |
| <td>Jamba 1.5 Large</td> | |
| <td class="has-text-centered">0.076</td> | |
| <td class="has-text-centered">0.517</td> | |
| <td class="has-text-centered">0.076</td> | |
| <td class="has-text-centered">0.074</td> | |
| <td class="has-text-centered">0.268</td> | |
| <td class="has-text-centered">0.248</td> | |
| <td class="has-text-centered">0.176</td> | |
| <td class="has-text-centered">0.200</td> | |
| </tr> | |
| <tr> | |
| <td>Claude 3.5 Sonnet</td> | |
| <td class="has-text-centered">0.154</td> | |
| <td class="has-text-centered">0.564</td> | |
| <td class="has-text-centered">0.154</td> | |
| <td class="has-text-centered">0.196</td> | |
| <td class="has-text-centered">0.259</td> | |
| <td class="has-text-centered">0.336</td> | |
| <td class="has-text-centered">0.197</td> | |
| <td class="has-text-centered">0.235</td> | |
| </tr> | |
| <tr> | |
| <td>Claude 3 Haiku</td> | |
| <td class="has-text-centered">0.082</td> | |
| <td class="has-text-centered">0.388</td> | |
| <td class="has-text-centered">0.082</td> | |
| <td class="has-text-centered">0.081</td> | |
| <td class="has-text-centered performance-low">0.369</td> | |
| <td class="has-text-centered">0.347</td> | |
| <td class="has-text-centered">0.200</td> | |
| <td class="has-text-centered">0.203</td> | |
| </tr> | |
| <tr> | |
| <td>Cohere Command R 7B</td> | |
| <td class="has-text-centered">0.089</td> | |
| <td class="has-text-centered">0.363</td> | |
| <td class="has-text-centered">0.089</td> | |
| <td class="has-text-centered">0.057</td> | |
| <td class="has-text-centered performance-medium">0.379</td> | |
| <td class="has-text-centered performance-low">0.356</td> | |
| <td class="has-text-centered performance-low">0.255</td> | |
| <td class="has-text-centered">0.275</td> | |
| </tr> | |
| <tr> | |
| <td>Cohere Command R +</td> | |
| <td class="has-text-centered">0.090</td> | |
| <td class="has-text-centered">0.453</td> | |
| <td class="has-text-centered">0.090</td> | |
| <td class="has-text-centered">0.080</td> | |
| <td class="has-text-centered">0.353</td> | |
| <td class="has-text-centered">0.336</td> | |
| <td class="has-text-centered">0.238</td> | |
| <td class="has-text-centered">0.265</td> | |
| </tr> | |
| <tr> | |
| <td>Google Gemini 1.5 Pro</td> | |
| <td class="has-text-centered performance-low">0.165</td> | |
| <td class="has-text-centered">0.514</td> | |
| <td class="has-text-centered performance-low">0.165</td> | |
| <td class="has-text-centered">0.196</td> | |
| <td class="has-text-centered">0.265</td> | |
| <td class="has-text-centered performance-medium">0.357</td> | |
| <td class="has-text-centered">0.217</td> | |
| <td class="has-text-centered">0.258</td> | |
| </tr> | |
| <tr> | |
| <td>OpenAI gpt-4o</td> | |
| <td class="has-text-centered">0.082</td> | |
| <td class="has-text-centered performance-low">0.576</td> | |
| <td class="has-text-centered">0.082</td> | |
| <td class="has-text-centered">0.130</td> | |
| <td class="has-text-centered">0.254</td> | |
| <td class="has-text-centered">0.327</td> | |
| <td class="has-text-centered">0.222</td> | |
| <td class="has-text-centered">0.235</td> | |
| </tr> | |
| <tr> | |
| <td>OpenAI o1-mini</td> | |
| <td class="has-text-centered performance-medium">0.206</td> | |
| <td class="has-text-centered performance-best">0.648</td> | |
| <td class="has-text-centered performance-medium">0.206</td> | |
| <td class="has-text-centered performance-medium">0.289</td> | |
| <td class="has-text-centered">0.325</td> | |
| <td class="has-text-centered">0.316</td> | |
| <td class="has-text-centered">0.209</td> | |
| <td class="has-text-centered">0.233</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <div class="content is-small mt-4"> | |
| <p><strong>Note:</strong> Color highlighting indicates performance ranking: | |
| <span class="performance-best"> Best </span>, | |
| <span class="performance-medium"> Strong </span>, | |
| <span class="performance-low"> Good </span> | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Information Retrieval tab content --> | |
| <div id="information-retrieval" class="tab-content"> | |
| <h2 class="title is-4">Information Retrieval Task Results</h2> | |
| <div class="results-table"> | |
| <table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth"> | |
| <thead> | |
| <tr> | |
| <th rowspan="2">Model</th> | |
| <th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="FiNER-ORD" data-tooltip="FiNER-ORD is a manually annotated named entity recognition dataset comprising financial news articles with detailed entity annotations. The task requires identifying and correctly classifying person, location, and organization entities in financial contexts.">FiNER-ORD</th> | |
| <th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="FinRED" data-tooltip="FinRED is a specialized relation extraction dataset created from financial news and earnings call transcripts using distance supervision based on Wikidata triplets. The task involves identifying and extracting financial relationships between entities to understand connections in financial contexts.">FinRED</th> | |
| <th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="ReFiND" data-tooltip="ReFiND is a comprehensive relation extraction dataset containing approximately 29,000 annotated instances with 22 distinct relation types across 8 entity pair categories from various financial documents. The task requires identifying specific relationships between financial entities in complex documents like SEC filings.">ReFiND</th> | |
| <!-- FNXL stands for Financial Numeric Extreme Labelling (numeral tagging in filings); TODO confirm the wording against the dataset paper. --> | |
| <th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="FNXL" data-tooltip="FNXL (Financial Numeric Extreme Labelling) is a dataset of sentences from corporate 10-K filings in which numerals are annotated with labels drawn from a taxonomy of 2,794 financial concepts. The task requires assigning the correct fine-grained label to each numeral, an extreme classification problem over a very large label space.">FNXL</th> | |
| <th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="FinEntity" data-tooltip="FinEntity consists of 979 financial news paragraphs containing 2,131 manually-annotated financial entities with sentiment classifications. The task involves identifying companies and asset classes in financial texts while determining the associated sentiment expressed toward each entity.">FinEntity</th> | |
| </tr> | |
| <tr> | |
| <th class="has-text-centered">Precision</th> | |
| <th class="has-text-centered">Recall</th> | |
| <th class="has-text-centered">F1</th> | |
| <th class="has-text-centered">Accuracy</th> | |
| <th class="has-text-centered">Accuracy</th> | |
| <th class="has-text-centered">Precision</th> | |
| <th class="has-text-centered">Recall</th> | |
| <th class="has-text-centered">F1</th> | |
| <th class="has-text-centered">Accuracy</th> | |
| <th class="has-text-centered">Precision</th> | |
| <th class="has-text-centered">Recall</th> | |
| <th class="has-text-centered">F1</th> | |
| <th class="has-text-centered">Precision</th> | |
| <th class="has-text-centered">Recall</th> | |
| <th class="has-text-centered">F1</th> | |
| <th class="has-text-centered">Accuracy</th> | |
| <th class="has-text-centered">Precision</th> | |
| <th class="has-text-centered">Recall</th> | |
| <th class="has-text-centered">Accuracy</th> | |
| <th class="has-text-centered">F1</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>Llama 3 70B Instruct</td> | |
| <td class="has-text-centered">0.715</td> | |
| <td class="has-text-centered">0.693</td> | |
| <td class="has-text-centered">0.701</td> | |
| <td class="has-text-centered">0.911</td> | |
| <td class="has-text-centered">0.314</td> | |
| <td class="has-text-centered performance-medium">0.454</td> | |
| <td class="has-text-centered">0.314</td> | |
| <td class="has-text-centered">0.332</td> | |
| <td class="has-text-centered">0.879</td> | |
| <td class="has-text-centered">0.904</td> | |
| <td class="has-text-centered">0.879</td> | |
| <td class="has-text-centered">0.883</td> | |
| <td class="has-text-centered">0.015</td> | |
| <td class="has-text-centered">0.030</td> | |
| <td class="has-text-centered">0.020</td> | |
| <td class="has-text-centered">0.010</td> | |
| <td class="has-text-centered">0.474</td> | |
| <td class="has-text-centered">0.485</td> | |
| <td class="has-text-centered">0.485</td> | |
| <td class="has-text-centered">0.469</td> | |
| </tr> | |
| <tr> | |
| <td>Llama 3 8B Instruct</td> | |
| <td class="has-text-centered">0.581</td> | |
| <td class="has-text-centered">0.558</td> | |
| <td class="has-text-centered">0.565</td> | |
| <td class="has-text-centered">0.854</td> | |
| <td class="has-text-centered">0.296</td> | |
| <td class="has-text-centered">0.357</td> | |
| <td class="has-text-centered">0.296</td> | |
| <td class="has-text-centered">0.289</td> | |
| <td class="has-text-centered">0.723</td> | |
| <td class="has-text-centered">0.755</td> | |
| <td class="has-text-centered">0.723</td> | |
| <td class="has-text-centered">0.705</td> | |
| <td class="has-text-centered">0.003</td> | |
| <td class="has-text-centered">0.004</td> | |
| <td class="has-text-centered">0.003</td> | |
| <td class="has-text-centered">0.002</td> | |
| <td class="has-text-centered">0.301</td> | |
| <td class="has-text-centered">0.478</td> | |
| <td class="has-text-centered">0.478</td> | |
| <td class="has-text-centered">0.350</td> | |
| </tr> | |
| <tr> | |
| <td>DBRX Instruct</td> | |
| <td class="has-text-centered">0.516</td> | |
| <td class="has-text-centered">0.476</td> | |
| <td class="has-text-centered">0.489</td> | |
| <td class="has-text-centered">0.802</td> | |
| <td class="has-text-centered">0.329</td> | |
| <td class="has-text-centered">0.371</td> | |
| <td class="has-text-centered">0.329</td> | |
| <td class="has-text-centered">0.304</td> | |
| <td class="has-text-centered">0.766</td> | |
| <td class="has-text-centered">0.825</td> | |
| <td class="has-text-centered">0.766</td> | |
| <td class="has-text-centered">0.778</td> | |
| <td class="has-text-centered">0.008</td> | |
| <td class="has-text-centered">0.011</td> | |
| <td class="has-text-centered">0.009</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.004</td> | |
| <td class="has-text-centered">0.014</td> | |
| <td class="has-text-centered">0.014</td> | |
| <td class="has-text-centered">0.006</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek LLM (67B)</td> | |
| <td class="has-text-centered">0.752</td> | |
| <td class="has-text-centered">0.742</td> | |
| <td class="has-text-centered">0.745</td> | |
| <td class="has-text-centered">0.917</td> | |
| <td class="has-text-centered">0.344</td> | |
| <td class="has-text-centered">0.403</td> | |
| <td class="has-text-centered">0.344</td> | |
| <td class="has-text-centered">0.334</td> | |
| <td class="has-text-centered">0.874</td> | |
| <td class="has-text-centered">0.890</td> | |
| <td class="has-text-centered">0.874</td> | |
| <td class="has-text-centered">0.879</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.009</td> | |
| <td class="has-text-centered">0.007</td> | |
| <td class="has-text-centered">0.003</td> | |
| <td class="has-text-centered">0.456</td> | |
| <td class="has-text-centered">0.405</td> | |
| <td class="has-text-centered">0.405</td> | |
| <td class="has-text-centered">0.416</td> | |
| </tr> | |
| <tr> | |
| <td>Gemma 2 27B</td> | |
| <td class="has-text-centered">0.772</td> | |
| <td class="has-text-centered">0.754</td> | |
| <td class="has-text-centered">0.761</td> | |
| <!-- Third-highest value in this column (behind 0.945 and 0.944), so it takes the "Good" tier. --> | |
| <td class="has-text-centered performance-low">0.923</td> | |
| <td class="has-text-centered">0.352</td> | |
| <td class="has-text-centered">0.437</td> | |
| <td class="has-text-centered">0.352</td> | |
| <td class="has-text-centered">0.356</td> | |
| <td class="has-text-centered">0.897</td> | |
| <td class="has-text-centered">0.914</td> | |
| <td class="has-text-centered">0.897</td> | |
| <td class="has-text-centered">0.902</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.008</td> | |
| <td class="has-text-centered">0.006</td> | |
| <td class="has-text-centered">0.003</td> | |
| <td class="has-text-centered">0.320</td> | |
| <td class="has-text-centered">0.295</td> | |
| <td class="has-text-centered">0.295</td> | |
| <td class="has-text-centered">0.298</td> | |
| </tr> | |
| <tr> | |
| <td>Gemma 2 9B</td> | |
| <td class="has-text-centered">0.665</td> | |
| <td class="has-text-centered">0.643</td> | |
| <td class="has-text-centered">0.651</td> | |
| <td class="has-text-centered">0.886</td> | |
| <td class="has-text-centered">0.336</td> | |
| <td class="has-text-centered">0.373</td> | |
| <td class="has-text-centered">0.336</td> | |
| <td class="has-text-centered">0.331</td> | |
| <td class="has-text-centered">0.885</td> | |
| <td class="has-text-centered">0.902</td> | |
| <td class="has-text-centered">0.885</td> | |
| <td class="has-text-centered">0.892</td> | |
| <td class="has-text-centered">0.004</td> | |
| <td class="has-text-centered">0.008</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.003</td> | |
| <td class="has-text-centered">0.348</td> | |
| <td class="has-text-centered">0.419</td> | |
| <td class="has-text-centered">0.419</td> | |
| <td class="has-text-centered">0.367</td> | |
| </tr> | |
| <tr> | |
| <td>Mistral (7B) Instruct</td> | |
| <td class="has-text-centered">0.540</td> | |
| <td class="has-text-centered">0.522</td> | |
| <td class="has-text-centered">0.526</td> | |
| <td class="has-text-centered">0.806</td> | |
| <td class="has-text-centered">0.278</td> | |
| <td class="has-text-centered">0.383</td> | |
| <td class="has-text-centered">0.278</td> | |
| <td class="has-text-centered">0.276</td> | |
| <td class="has-text-centered">0.767</td> | |
| <td class="has-text-centered">0.817</td> | |
| <td class="has-text-centered">0.767</td> | |
| <td class="has-text-centered">0.771</td> | |
| <td class="has-text-centered">0.004</td> | |
| <td class="has-text-centered">0.006</td> | |
| <td class="has-text-centered">0.004</td> | |
| <td class="has-text-centered">0.002</td> | |
| <td class="has-text-centered">0.337</td> | |
| <td class="has-text-centered">0.477</td> | |
| <td class="has-text-centered">0.477</td> | |
| <td class="has-text-centered">0.368</td> | |
| </tr> | |
| <tr> | |
| <td>Mixtral-8x22B Instruct</td> | |
| <td class="has-text-centered">0.653</td> | |
| <td class="has-text-centered">0.625</td> | |
| <td class="has-text-centered">0.635</td> | |
| <td class="has-text-centered">0.870</td> | |
| <td class="has-text-centered">0.381</td> | |
| <td class="has-text-centered">0.414</td> | |
| <td class="has-text-centered">0.381</td> | |
| <td class="has-text-centered">0.367</td> | |
| <td class="has-text-centered">0.807</td> | |
| <td class="has-text-centered">0.847</td> | |
| <td class="has-text-centered">0.807</td> | |
| <td class="has-text-centered">0.811</td> | |
| <td class="has-text-centered">0.010</td> | |
| <td class="has-text-centered">0.008</td> | |
| <td class="has-text-centered">0.009</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.428</td> | |
| <td class="has-text-centered">0.481</td> | |
| <td class="has-text-centered">0.481</td> | |
| <td class="has-text-centered">0.435</td> | |
| </tr> | |
| <tr> | |
| <td>Mixtral-8x7B Instruct</td> | |
| <td class="has-text-centered">0.613</td> | |
| <td class="has-text-centered">0.591</td> | |
| <td class="has-text-centered">0.598</td> | |
| <td class="has-text-centered">0.875</td> | |
| <td class="has-text-centered">0.291</td> | |
| <td class="has-text-centered">0.376</td> | |
| <td class="has-text-centered">0.291</td> | |
| <td class="has-text-centered">0.282</td> | |
| <td class="has-text-centered">0.840</td> | |
| <td class="has-text-centered">0.863</td> | |
| <td class="has-text-centered">0.840</td> | |
| <td class="has-text-centered">0.845</td> | |
| <td class="has-text-centered">0.007</td> | |
| <td class="has-text-centered">0.012</td> | |
| <td class="has-text-centered">0.009</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.251</td> | |
| <td class="has-text-centered">0.324</td> | |
| <td class="has-text-centered">0.324</td> | |
| <td class="has-text-centered">0.267</td> | |
| </tr> | |
| <tr> | |
| <td>Qwen 2 Instruct (72B)</td> | |
| <td class="has-text-centered">0.766</td> | |
| <td class="has-text-centered">0.742</td> | |
| <td class="has-text-centered">0.748</td> | |
| <td class="has-text-centered">0.899</td> | |
| <td class="has-text-centered">0.365</td> | |
| <td class="has-text-centered">0.407</td> | |
| <td class="has-text-centered">0.365</td> | |
| <td class="has-text-centered">0.348</td> | |
| <td class="has-text-centered">0.850</td> | |
| <td class="has-text-centered">0.881</td> | |
| <td class="has-text-centered">0.850</td> | |
| <td class="has-text-centered">0.854</td> | |
| <td class="has-text-centered">0.010</td> | |
| <td class="has-text-centered">0.016</td> | |
| <td class="has-text-centered">0.012</td> | |
| <td class="has-text-centered">0.006</td> | |
| <td class="has-text-centered">0.468</td> | |
| <td class="has-text-centered">0.530</td> | |
| <td class="has-text-centered">0.530</td> | |
| <td class="has-text-centered">0.483</td> | |
| </tr> | |
| <tr> | |
| <td>WizardLM-2 8x22B</td> | |
| <td class="has-text-centered">0.755</td> | |
| <td class="has-text-centered">0.741</td> | |
| <td class="has-text-centered">0.744</td> | |
| <td class="has-text-centered">0.920</td> | |
| <td class="has-text-centered">0.362</td> | |
| <td class="has-text-centered">0.397</td> | |
| <td class="has-text-centered">0.362</td> | |
| <td class="has-text-centered">0.355</td> | |
| <td class="has-text-centered">0.846</td> | |
| <td class="has-text-centered">0.874</td> | |
| <td class="has-text-centered">0.846</td> | |
| <td class="has-text-centered">0.852</td> | |
| <td class="has-text-centered">0.008</td> | |
| <td class="has-text-centered">0.009</td> | |
| <td class="has-text-centered">0.008</td> | |
| <td class="has-text-centered">0.004</td> | |
| <td class="has-text-centered">0.222</td> | |
| <td class="has-text-centered">0.247</td> | |
| <td class="has-text-centered">0.247</td> | |
| <td class="has-text-centered">0.226</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek-V3</td> | |
| <td class="has-text-centered performance-low">0.798</td> | |
| <td class="has-text-centered performance-low">0.787</td> | |
| <td class="has-text-centered performance-low">0.790</td> | |
| <td class="has-text-centered performance-best">0.945</td> | |
| <td class="has-text-centered performance-medium">0.450</td> | |
| <td class="has-text-centered performance-medium">0.463</td> | |
| <td class="has-text-centered performance-medium">0.450</td> | |
| <td class="has-text-centered performance-medium">0.437</td> | |
| <td class="has-text-centered">0.927</td> | |
| <td class="has-text-centered performance-low">0.943</td> | |
| <td class="has-text-centered">0.927</td> | |
| <td class="has-text-centered">0.934</td> | |
| <td class="has-text-centered performance-medium">0.034</td> | |
| <td class="has-text-centered performance-low">0.067</td> | |
| <td class="has-text-centered performance-low">0.045</td> | |
| <td class="has-text-centered performance-low">0.023</td> | |
| <td class="has-text-centered">0.563</td> | |
| <td class="has-text-centered">0.544</td> | |
| <td class="has-text-centered">0.544</td> | |
| <td class="has-text-centered">0.549</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek R1</td> | |
| <td class="has-text-centered performance-best">0.813</td> | |
| <td class="has-text-centered performance-best">0.805</td> | |
| <td class="has-text-centered performance-best">0.807</td> | |
| <td class="has-text-centered performance-medium">0.944</td> | |
| <td class="has-text-centered performance-low">0.412</td> | |
| <td class="has-text-centered">0.424</td> | |
| <td class="has-text-centered performance-low">0.412</td> | |
| <td class="has-text-centered">0.393</td> | |
| <td class="has-text-centered performance-best">0.946</td> | |
| <td class="has-text-centered performance-best">0.960</td> | |
| <td class="has-text-centered performance-best">0.946</td> | |
| <td class="has-text-centered performance-best">0.952</td> | |
| <td class="has-text-centered performance-best">0.044</td> | |
| <td class="has-text-centered performance-best">0.082</td> | |
| <td class="has-text-centered performance-best">0.057</td> | |
| <td class="has-text-centered performance-best">0.029</td> | |
| <td class="has-text-centered performance-low">0.600</td> | |
| <td class="has-text-centered performance-low">0.586</td> | |
| <td class="has-text-centered performance-low">0.586</td> | |
| <td class="has-text-centered performance-low">0.587</td> | |
| </tr> | |
| <tr> | |
| <td>QwQ-32B-Preview</td> | |
| <td class="has-text-centered">0.695</td> | |
| <td class="has-text-centered">0.681</td> | |
| <td class="has-text-centered">0.685</td> | |
| <td class="has-text-centered">0.907</td> | |
| <td class="has-text-centered">0.278</td> | |
| <td class="has-text-centered">0.396</td> | |
| <td class="has-text-centered">0.278</td> | |
| <td class="has-text-centered">0.270</td> | |
| <td class="has-text-centered">0.680</td> | |
| <td class="has-text-centered">0.770</td> | |
| <td class="has-text-centered">0.680</td> | |
| <td class="has-text-centered">0.656</td> | |
| <td class="has-text-centered">0.001</td> | |
| <td class="has-text-centered">0.001</td> | |
| <td class="has-text-centered">0.001</td> | |
| <td class="has-text-centered">0.000</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.005</td> | |
| </tr> | |
| <tr> | |
| <td>Jamba 1.5 Mini</td> | |
| <td class="has-text-centered">0.564</td> | |
| <td class="has-text-centered">0.556</td> | |
| <td class="has-text-centered">0.552</td> | |
| <td class="has-text-centered">0.818</td> | |
| <td class="has-text-centered">0.308</td> | |
| <td class="has-text-centered">0.450</td> | |
| <td class="has-text-centered">0.308</td> | |
| <td class="has-text-centered">0.284</td> | |
| <td class="has-text-centered">0.830</td> | |
| <td class="has-text-centered">0.864</td> | |
| <td class="has-text-centered">0.830</td> | |
| <td class="has-text-centered">0.844</td> | |
| <td class="has-text-centered">0.004</td> | |
| <td class="has-text-centered">0.006</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.003</td> | |
| <td class="has-text-centered">0.119</td> | |
| <td class="has-text-centered">0.182</td> | |
| <td class="has-text-centered">0.182</td> | |
| <td class="has-text-centered">0.132</td> | |
| </tr> | |
| <tr> | |
| <td>Jamba 1.5 Large</td> | |
| <td class="has-text-centered">0.707</td> | |
| <td class="has-text-centered">0.687</td> | |
| <td class="has-text-centered">0.693</td> | |
| <td class="has-text-centered">0.883</td> | |
| <td class="has-text-centered">0.341</td> | |
| <td class="has-text-centered">0.452</td> | |
| <td class="has-text-centered">0.341</td> | |
| <td class="has-text-centered">0.341</td> | |
| <td class="has-text-centered">0.856</td> | |
| <td class="has-text-centered">0.890</td> | |
| <td class="has-text-centered">0.856</td> | |
| <td class="has-text-centered">0.862</td> | |
| <td class="has-text-centered">0.004</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.002</td> | |
| <td class="has-text-centered">0.403</td> | |
| <td class="has-text-centered">0.414</td> | |
| <td class="has-text-centered">0.414</td> | |
| <td class="has-text-centered">0.397</td> | |
| </tr> | |
| <tr> | |
| <td>Claude 3.5 Sonnet</td> | |
| <td class="has-text-centered performance-medium">0.811</td> | |
| <td class="has-text-centered performance-medium">0.794</td> | |
| <td class="has-text-centered performance-medium">0.799</td> | |
| <td class="has-text-centered">0.922</td> | |
| <td class="has-text-centered performance-best">0.455</td> | |
| <td class="has-text-centered performance-best">0.465</td> | |
| <td class="has-text-centered performance-best">0.455</td> | |
| <td class="has-text-centered performance-best">0.439</td> | |
| <td class="has-text-centered">0.873</td> | |
| <td class="has-text-centered">0.927</td> | |
| <td class="has-text-centered">0.873</td> | |
| <td class="has-text-centered">0.891</td> | |
| <td class="has-text-centered performance-medium">0.034</td> | |
| <td class="has-text-centered performance-medium">0.080</td> | |
| <td class="has-text-centered performance-medium">0.047</td> | |
| <td class="has-text-centered performance-medium">0.024</td> | |
| <td class="has-text-centered performance-medium">0.658</td> | |
| <td class="has-text-centered performance-medium">0.668</td> | |
| <td class="has-text-centered performance-medium">0.668</td> | |
| <td class="has-text-centered performance-medium">0.655</td> | |
| </tr> | |
| <tr> | |
| <td>Claude 3 Haiku</td> | |
| <td class="has-text-centered">0.732</td> | |
| <td class="has-text-centered">0.700</td> | |
| <td class="has-text-centered">0.711</td> | |
| <td class="has-text-centered">0.895</td> | |
| <td class="has-text-centered">0.294</td> | |
| <td class="has-text-centered">0.330</td> | |
| <td class="has-text-centered">0.294</td> | |
| <td class="has-text-centered">0.285</td> | |
| <td class="has-text-centered">0.879</td> | |
| <td class="has-text-centered">0.917</td> | |
| <td class="has-text-centered">0.879</td> | |
| <td class="has-text-centered">0.883</td> | |
| <td class="has-text-centered">0.011</td> | |
| <td class="has-text-centered">0.022</td> | |
| <td class="has-text-centered">0.015</td> | |
| <td class="has-text-centered">0.008</td> | |
| <td class="has-text-centered">0.498</td> | |
| <td class="has-text-centered">0.517</td> | |
| <td class="has-text-centered">0.517</td> | |
| <td class="has-text-centered">0.494</td> | |
| </tr> | |
| <tr> | |
| <td>Cohere Command R +</td> | |
| <td class="has-text-centered">0.769</td> | |
| <td class="has-text-centered">0.750</td> | |
| <td class="has-text-centered">0.756</td> | |
| <td class="has-text-centered">0.902</td> | |
| <td class="has-text-centered">0.353</td> | |
| <td class="has-text-centered">0.405</td> | |
| <td class="has-text-centered">0.353</td> | |
| <td class="has-text-centered">0.333</td> | |
| <td class="has-text-centered">0.917</td> | |
| <td class="has-text-centered">0.930</td> | |
| <td class="has-text-centered">0.917</td> | |
| <td class="has-text-centered">0.922</td> | |
| <td class="has-text-centered">0.016</td> | |
| <td class="has-text-centered">0.032</td> | |
| <td class="has-text-centered">0.021</td> | |
| <td class="has-text-centered">0.011</td> | |
| <td class="has-text-centered">0.462</td> | |
| <td class="has-text-centered">0.459</td> | |
| <td class="has-text-centered">0.459</td> | |
| <td class="has-text-centered">0.452</td> | |
| </tr> | |
| <tr> | |
| <td>Google Gemini 1.5 Pro</td> | |
| <td class="has-text-centered">0.728</td> | |
| <td class="has-text-centered">0.705</td> | |
| <td class="has-text-centered">0.712</td> | |
| <td class="has-text-centered">0.891</td> | |
| <td class="has-text-centered">0.373</td> | |
| <td class="has-text-centered">0.436</td> | |
| <td class="has-text-centered">0.373</td> | |
| <td class="has-text-centered">0.374</td> | |
| <td class="has-text-centered performance-medium">0.934</td> | |
| <td class="has-text-centered performance-medium">0.955</td> | |
| <td class="has-text-centered performance-medium">0.934</td> | |
| <td class="has-text-centered performance-medium">0.944</td> | |
| <td class="has-text-centered">0.014</td> | |
| <td class="has-text-centered">0.028</td> | |
| <td class="has-text-centered">0.019</td> | |
| <td class="has-text-centered">0.010</td> | |
| <td class="has-text-centered">0.399</td> | |
| <td class="has-text-centered">0.400</td> | |
| <td class="has-text-centered">0.400</td> | |
| <td class="has-text-centered">0.393</td> | |
| </tr> | |
| <tr> | |
| <td>OpenAI gpt-4o</td> | |
| <td class="has-text-centered">0.778</td> | |
| <td class="has-text-centered">0.760</td> | |
| <td class="has-text-centered">0.766</td> | |
| <td class="has-text-centered">0.911</td> | |
| <td class="has-text-centered">0.402</td> | |
| <td class="has-text-centered">0.445</td> | |
| <td class="has-text-centered">0.402</td> | |
| <td class="has-text-centered">0.399</td> | |
| <td class="has-text-centered performance-low">0.931</td> | |
| <td class="has-text-centered performance-medium">0.955</td> | |
| <td class="has-text-centered performance-low">0.931</td> | |
| <td class="has-text-centered performance-low">0.942</td> | |
| <td class="has-text-centered performance-low">0.027</td> | |
| <td class="has-text-centered">0.056</td> | |
| <td class="has-text-centered">0.037</td> | |
| <td class="has-text-centered">0.019</td> | |
| <td class="has-text-centered">0.537</td> | |
| <td class="has-text-centered">0.517</td> | |
| <td class="has-text-centered">0.517</td> | |
| <td class="has-text-centered">0.523</td> | |
| </tr> | |
| <tr> | |
| <td>OpenAI o1-mini</td> | |
| <td class="has-text-centered">0.772</td> | |
| <td class="has-text-centered">0.755</td> | |
| <td class="has-text-centered">0.761</td> | |
| <td class="has-text-centered">0.922</td> | |
| <td class="has-text-centered">0.407</td> | |
| <td class="has-text-centered">0.444</td> | |
| <td class="has-text-centered">0.407</td> | |
| <td class="has-text-centered performance-low">0.403</td> | |
| <td class="has-text-centered">0.867</td> | |
| <td class="has-text-centered">0.900</td> | |
| <td class="has-text-centered">0.867</td> | |
| <td class="has-text-centered">0.876</td> | |
| <td class="has-text-centered">0.007</td> | |
| <td class="has-text-centered">0.015</td> | |
| <td class="has-text-centered">0.010</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered performance-best">0.661</td> | |
| <td class="has-text-centered performance-best">0.681</td> | |
| <td class="has-text-centered performance-best">0.681</td> | |
| <td class="has-text-centered performance-best">0.662</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <div class="content is-small mt-4"> | |
| <p><strong>Note:</strong> Color highlighting indicates performance ranking: | |
| <span class="performance-best"> Best </span>, | |
| <span class="performance-medium"> Strong </span>, | |
| <span class="performance-low"> Good </span> | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Question Answering tab content --> | |
| <div id="question-answering" class="tab-content"> | |
| <h2 class="title is-4">Question Answering Task Results</h2> | |
| <div class="results-table"> | |
| <table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth"> | |
| <thead> | |
| <!-- Two-row header: "Model" spans both rows; the grouped cell labels the three dataset columns, all reported as accuracy. --> | |
| <!-- scope="col" ties every header cell to the data cells beneath it for assistive technology. --> | |
| <tr> | |
| <th rowspan="2" scope="col">Model</th> | |
| <th colspan="3" class="has-text-centered" scope="col">Datasets (Accuracy)</th> | |
| </tr> | |
| <tr> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FinQA" data-tooltip="FinQA contains 8,281 question-answer pairs derived from financial reports that require numerical reasoning over tabular financial data. The question-answering task features multi-step reasoning challenges with full annotation of reasoning programs to solve complex financial queries." scope="col">FinQA</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="ConvFinQA" data-tooltip="ConvFinQA is a multi-turn question answering dataset with 3,892 conversations containing 14,115 questions that explore chains of numerical reasoning in financial contexts. The conversational task requires maintaining context while performing sequential numerical operations to answer increasingly complex financial questions." scope="col">ConvFinQA</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="TATQA" data-tooltip="TATQA is a large-scale question answering dataset for hybrid data sources that combines tables and text from financial reports. The task emphasizes numerical reasoning operations across multiple formats, requiring models to integrate information from structured and unstructured sources to answer financial questions." scope="col">TATQA</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>Llama 3 70B Instruct</td> | |
| <td class="has-text-centered">0.809</td> | |
| <td class="has-text-centered">0.709</td> | |
| <td class="has-text-centered">0.772</td> | |
| </tr> | |
| <tr> | |
| <td>Llama 3 8B Instruct</td> | |
| <td class="has-text-centered">0.767</td> | |
| <td class="has-text-centered">0.268</td> | |
| <td class="has-text-centered">0.706</td> | |
| </tr> | |
| <tr> | |
| <td>DBRX Instruct</td> | |
| <td class="has-text-centered">0.738</td> | |
| <td class="has-text-centered">0.252</td> | |
| <td class="has-text-centered">0.633</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek LLM (67B)</td> | |
| <td class="has-text-centered">0.742</td> | |
| <td class="has-text-centered">0.174</td> | |
| <td class="has-text-centered">0.355</td> | |
| </tr> | |
| <tr> | |
| <td>Gemma 2 27B</td> | |
| <td class="has-text-centered">0.768</td> | |
| <td class="has-text-centered">0.268</td> | |
| <td class="has-text-centered">0.734</td> | |
| </tr> | |
| <tr> | |
| <td>Gemma 2 9B</td> | |
| <td class="has-text-centered">0.779</td> | |
| <td class="has-text-centered">0.292</td> | |
| <td class="has-text-centered">0.750</td> | |
| </tr> | |
| <tr> | |
| <td>Mistral (7B) Instruct v0.3</td> | |
| <td class="has-text-centered">0.655</td> | |
| <td class="has-text-centered">0.199</td> | |
| <td class="has-text-centered">0.553</td> | |
| </tr> | |
| <tr> | |
| <td>Mixtral-8x22B Instruct</td> | |
| <td class="has-text-centered">0.766</td> | |
| <td class="has-text-centered">0.285</td> | |
| <td class="has-text-centered">0.666</td> | |
| </tr> | |
| <tr> | |
| <td>Mixtral-8x7B Instruct</td> | |
| <td class="has-text-centered">0.611</td> | |
| <td class="has-text-centered">0.315</td> | |
| <td class="has-text-centered">0.501</td> | |
| </tr> | |
| <tr> | |
| <td>Qwen 2 Instruct (72B)</td> | |
| <td class="has-text-centered">0.819</td> | |
| <td class="has-text-centered">0.269</td> | |
| <td class="has-text-centered">0.715</td> | |
| </tr> | |
| <tr> | |
| <td>WizardLM-2 8x22B</td> | |
| <td class="has-text-centered">0.796</td> | |
| <td class="has-text-centered">0.247</td> | |
| <td class="has-text-centered">0.725</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek-V3</td> | |
| <td class="has-text-centered performance-medium">0.840</td> | |
| <td class="has-text-centered">0.261</td> | |
| <td class="has-text-centered performance-low">0.779</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek R1</td> | |
| <td class="has-text-centered performance-low">0.836</td> | |
| <td class="has-text-centered performance-best">0.853</td> | |
| <td class="has-text-centered performance-best">0.858</td> | |
| </tr> | |
| <tr> | |
| <td>QwQ-32B-Preview</td> | |
| <td class="has-text-centered">0.793</td> | |
| <td class="has-text-centered">0.282</td> | |
| <td class="has-text-centered performance-medium">0.796</td> | |
| </tr> | |
| <tr> | |
| <td>Jamba 1.5 Mini</td> | |
| <td class="has-text-centered">0.666</td> | |
| <td class="has-text-centered">0.218</td> | |
| <td class="has-text-centered">0.586</td> | |
| </tr> | |
| <tr> | |
| <td>Jamba 1.5 Large</td> | |
| <td class="has-text-centered">0.790</td> | |
| <td class="has-text-centered">0.225</td> | |
| <td class="has-text-centered">0.660</td> | |
| </tr> | |
| <tr> | |
| <td>Claude 3.5 Sonnet</td> | |
| <td class="has-text-centered performance-best">0.844</td> | |
| <td class="has-text-centered">0.402</td> | |
| <td class="has-text-centered">0.700</td> | |
| </tr> | |
| <tr> | |
| <td>Claude 3 Haiku</td> | |
| <td class="has-text-centered">0.803</td> | |
| <td class="has-text-centered">0.421</td> | |
| <td class="has-text-centered">0.733</td> | |
| </tr> | |
| <tr> | |
| <td>Cohere Command R 7B</td> | |
| <td class="has-text-centered">0.709</td> | |
| <td class="has-text-centered">0.212</td> | |
| <td class="has-text-centered">0.716</td> | |
| </tr> | |
| <tr> | |
| <td>Cohere Command R +</td> | |
| <td class="has-text-centered">0.776</td> | |
| <td class="has-text-centered">0.259</td> | |
| <td class="has-text-centered">0.698</td> | |
| </tr> | |
| <tr> | |
| <td>Google Gemini 1.5 Pro</td> | |
| <td class="has-text-centered">0.829</td> | |
| <td class="has-text-centered">0.280</td> | |
| <td class="has-text-centered">0.763</td> | |
| </tr> | |
| <tr> | |
| <td>OpenAI gpt-4o</td> | |
| <td class="has-text-centered performance-low">0.836</td> | |
| <td class="has-text-centered performance-low">0.749</td> | |
| <td class="has-text-centered">0.754</td> | |
| </tr> | |
| <tr> | |
| <td>OpenAI o1-mini</td> | |
| <td class="has-text-centered">0.799</td> | |
| <td class="has-text-centered performance-medium">0.840</td> | |
| <td class="has-text-centered">0.698</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <div class="content is-small mt-4"> | |
| <p><strong>Note:</strong> Color highlighting indicates performance ranking: | |
| <span class="performance-best"> Best </span>, | |
| <span class="performance-medium"> Strong </span>, | |
| <span class="performance-low"> Good </span> | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Sentiment Analysis tab content --> | |
| <div id="sentiment-analysis" class="tab-content"> | |
| <h2 class="title is-4">Sentiment Analysis Task Results</h2> | |
| <div class="results-table"> | |
| <table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth"> | |
| <thead> | |
| <!-- Two-row header: row 1 names each dataset and carries its tooltip text; row 2 lists the metric columns in dataset order | |
| (3 for FiQA Task 1, then 4 each for FinEntity, SubjECTive-QA and Financial Phrase Bank). --> | |
| <!-- Every tooltip header uses data-title + data-tooltip; scope="col" ties header cells to the data cells beneath them. --> | |
| <tr> | |
| <th rowspan="2" scope="col">Model</th> | |
| <th colspan="3" class="has-text-centered tooltip-trigger tooltip-right" data-title="FiQA Task 1" data-tooltip="FiQA Task 1 focuses on aspect-based financial sentiment analysis in microblog posts and news headlines using a continuous scale from -1 (negative) to 1 (positive). The regression task requires models to accurately predict the sentiment score that reflects investor perception of financial texts." scope="col">FiQA Task 1</th> | |
| <th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="FinEntity" data-tooltip="FinEntity consists of 979 financial news paragraphs containing 2,131 manually-annotated financial entities with sentiment classifications. The task involves identifying companies and asset classes in financial texts while determining the associated sentiment expressed toward each entity." scope="col">FinEntity</th> | |
| <th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="SubjECTive-QA" data-tooltip="SubjECTive-QA contains 49,446 annotations across 2,747 question-answer pairs extracted from 120 earnings call transcripts. The multi-label classification task involves analyzing six subjective features in financial discourse: assertiveness, cautiousness, optimism, specificity, clarity, and relevance." scope="col">SubjECTive-QA</th> | |
| <th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="Financial Phrase Bank" data-tooltip="Financial Phrase Bank (FPB) contains 4,840 sentences from financial news articles categorized as positive, negative, or neutral by 16 finance experts using majority voting. The sentiment classification task requires understanding how these statements might influence investor perception of stock prices." scope="col">Financial Phrase Bank (FPB)</th> | |
| </tr> | |
| <tr> | |
| <th class="has-text-centered" scope="col">MSE</th> | |
| <th class="has-text-centered" scope="col">MAE</th> | |
| <th class="has-text-centered" scope="col">r² Score</th> | |
| <th class="has-text-centered" scope="col">Precision</th> | |
| <th class="has-text-centered" scope="col">Recall</th> | |
| <th class="has-text-centered" scope="col">Accuracy</th> | |
| <th class="has-text-centered" scope="col">F1</th> | |
| <th class="has-text-centered" scope="col">Precision</th> | |
| <th class="has-text-centered" scope="col">Recall</th> | |
| <th class="has-text-centered" scope="col">F1</th> | |
| <th class="has-text-centered" scope="col">Accuracy</th> | |
| <th class="has-text-centered" scope="col">Accuracy</th> | |
| <th class="has-text-centered" scope="col">Precision</th> | |
| <th class="has-text-centered" scope="col">Recall</th> | |
| <th class="has-text-centered" scope="col">F1</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>Llama 3 70B Instruct</td> | |
| <td class="has-text-centered">0.123</td> | |
| <td class="has-text-centered">0.290</td> | |
| <td class="has-text-centered">0.272</td> | |
| <td class="has-text-centered">0.474</td> | |
| <td class="has-text-centered">0.485</td> | |
| <td class="has-text-centered">0.485</td> | |
| <td class="has-text-centered">0.469</td> | |
| <td class="has-text-centered">0.652</td> | |
| <td class="has-text-centered">0.573</td> | |
| <td class="has-text-centered">0.535</td> | |
| <td class="has-text-centered">0.573</td> | |
| <td class="has-text-centered">0.901</td> | |
| <td class="has-text-centered">0.904</td> | |
| <td class="has-text-centered">0.901</td> | |
| <td class="has-text-centered">0.902</td> | |
| </tr> | |
| <tr> | |
| <td>Llama 3 8B Instruct</td> | |
| <td class="has-text-centered">0.161</td> | |
| <td class="has-text-centered">0.344</td> | |
| <td class="has-text-centered">0.045</td> | |
| <td class="has-text-centered">0.301</td> | |
| <td class="has-text-centered">0.478</td> | |
| <td class="has-text-centered">0.478</td> | |
| <td class="has-text-centered">0.350</td> | |
| <td class="has-text-centered">0.635</td> | |
| <td class="has-text-centered performance-best">0.625</td> | |
| <td class="has-text-centered performance-best">0.600</td> | |
| <td class="has-text-centered performance-best">0.625</td> | |
| <td class="has-text-centered">0.738</td> | |
| <td class="has-text-centered">0.801</td> | |
| <td class="has-text-centered">0.738</td> | |
| <td class="has-text-centered">0.698</td> | |
| </tr> | |
| <tr> | |
| <td>DBRX Instruct</td> | |
| <td class="has-text-centered">0.160</td> | |
| <td class="has-text-centered">0.321</td> | |
| <td class="has-text-centered">0.052</td> | |
| <td class="has-text-centered">0.004</td> | |
| <td class="has-text-centered">0.014</td> | |
| <td class="has-text-centered">0.014</td> | |
| <td class="has-text-centered">0.006</td> | |
| <td class="has-text-centered performance-low">0.654</td> | |
| <td class="has-text-centered">0.541</td> | |
| <td class="has-text-centered">0.436</td> | |
| <td class="has-text-centered">0.541</td> | |
| <td class="has-text-centered">0.524</td> | |
| <td class="has-text-centered">0.727</td> | |
| <td class="has-text-centered">0.524</td> | |
| <td class="has-text-centered">0.499</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek LLM (67B)</td> | |
| <td class="has-text-centered">0.118</td> | |
| <td class="has-text-centered">0.278</td> | |
| <td class="has-text-centered">0.302</td> | |
| <td class="has-text-centered">0.456</td> | |
| <td class="has-text-centered">0.405</td> | |
| <td class="has-text-centered">0.405</td> | |
| <td class="has-text-centered">0.416</td> | |
| <td class="has-text-centered performance-best">0.676</td> | |
| <td class="has-text-centered">0.544</td> | |
| <td class="has-text-centered">0.462</td> | |
| <td class="has-text-centered">0.544</td> | |
| <td class="has-text-centered">0.815</td> | |
| <td class="has-text-centered">0.867</td> | |
| <td class="has-text-centered">0.815</td> | |
| <td class="has-text-centered">0.811</td> | |
| </tr> | |
| <tr> | |
| <td>Gemma 2 27B</td> | |
| <!-- FiQA Task 1: MSE and MAE are errors (lower is better); r² is higher-is-better, and 0.406 is the highest r² in this column --> | |
| <td class="has-text-centered performance-best">0.100</td> | |
| <td class="has-text-centered performance-best">0.266</td> | |
| <td class="has-text-centered performance-best">0.406</td> | |
| <td class="has-text-centered">0.320</td> | |
| <td class="has-text-centered">0.295</td> | |
| <td class="has-text-centered">0.295</td> | |
| <td class="has-text-centered">0.298</td> | |
| <td class="has-text-centered">0.562</td> | |
| <td class="has-text-centered">0.524</td> | |
| <td class="has-text-centered">0.515</td> | |
| <td class="has-text-centered">0.524</td> | |
| <td class="has-text-centered">0.890</td> | |
| <td class="has-text-centered">0.896</td> | |
| <td class="has-text-centered">0.890</td> | |
| <td class="has-text-centered">0.884</td> | |
| </tr> | |
| <tr> | |
| <td>Gemma 2 9B</td> | |
| <td class="has-text-centered">0.189</td> | |
| <td class="has-text-centered">0.352</td> | |
| <td class="has-text-centered">-0.120</td> | |
| <td class="has-text-centered">0.348</td> | |
| <td class="has-text-centered">0.419</td> | |
| <td class="has-text-centered">0.419</td> | |
| <td class="has-text-centered">0.367</td> | |
| <td class="has-text-centered">0.570</td> | |
| <td class="has-text-centered">0.499</td> | |
| <td class="has-text-centered">0.491</td> | |
| <td class="has-text-centered">0.499</td> | |
| <td class="has-text-centered performance-medium">0.940</td> | |
| <td class="has-text-centered performance-medium">0.941</td> | |
| <td class="has-text-centered performance-medium">0.940</td> | |
| <td class="has-text-centered performance-medium">0.940</td> | |
| </tr> | |
| <tr> | |
| <td>Mistral (7B) Instruct v0.3</td> | |
| <td class="has-text-centered">0.135</td> | |
| <td class="has-text-centered">0.278</td> | |
| <td class="has-text-centered">0.200</td> | |
| <td class="has-text-centered">0.337</td> | |
| <td class="has-text-centered">0.477</td> | |
| <td class="has-text-centered">0.477</td> | |
| <td class="has-text-centered">0.368</td> | |
| <td class="has-text-centered">0.607</td> | |
| <td class="has-text-centered">0.542</td> | |
| <td class="has-text-centered">0.522</td> | |
| <td class="has-text-centered">0.542</td> | |
| <td class="has-text-centered">0.847</td> | |
| <td class="has-text-centered">0.854</td> | |
| <td class="has-text-centered">0.847</td> | |
| <td class="has-text-centered">0.841</td> | |
| </tr> | |
| <!-- r² is higher-is-better: the negative r² values in the next three rows are the lowest in the column, so they carry no ranking highlight --> | |
| <tr> | |
| <td>Mixtral-8x22B Instruct</td> | |
| <td class="has-text-centered">0.221</td> | |
| <td class="has-text-centered">0.364</td> | |
| <td class="has-text-centered">-0.310</td> | |
| <td class="has-text-centered">0.428</td> | |
| <td class="has-text-centered">0.481</td> | |
| <td class="has-text-centered">0.481</td> | |
| <td class="has-text-centered">0.435</td> | |
| <td class="has-text-centered">0.614</td> | |
| <td class="has-text-centered">0.538</td> | |
| <td class="has-text-centered">0.510</td> | |
| <td class="has-text-centered">0.538</td> | |
| <td class="has-text-centered">0.768</td> | |
| <td class="has-text-centered">0.845</td> | |
| <td class="has-text-centered">0.768</td> | |
| <td class="has-text-centered">0.776</td> | |
| </tr> | |
| <tr> | |
| <td>Mixtral-8x7B Instruct</td> | |
| <td class="has-text-centered">0.208</td> | |
| <td class="has-text-centered">0.307</td> | |
| <td class="has-text-centered">-0.229</td> | |
| <td class="has-text-centered">0.251</td> | |
| <td class="has-text-centered">0.324</td> | |
| <td class="has-text-centered">0.324</td> | |
| <td class="has-text-centered">0.267</td> | |
| <td class="has-text-centered">0.611</td> | |
| <td class="has-text-centered">0.518</td> | |
| <td class="has-text-centered">0.498</td> | |
| <td class="has-text-centered">0.518</td> | |
| <td class="has-text-centered">0.896</td> | |
| <td class="has-text-centered">0.898</td> | |
| <td class="has-text-centered">0.896</td> | |
| <td class="has-text-centered">0.893</td> | |
| </tr> | |
| <tr> | |
| <td>Qwen 2 Instruct (72B)</td> | |
| <td class="has-text-centered">0.205</td> | |
| <td class="has-text-centered">0.409</td> | |
| <td class="has-text-centered">-0.212</td> | |
| <td class="has-text-centered">0.468</td> | |
| <td class="has-text-centered">0.530</td> | |
| <td class="has-text-centered">0.530</td> | |
| <td class="has-text-centered">0.483</td> | |
| <td class="has-text-centered">0.644</td> | |
| <td class="has-text-centered performance-medium">0.601</td> | |
| <td class="has-text-centered">0.576</td> | |
| <td class="has-text-centered performance-medium">0.601</td> | |
| <td class="has-text-centered">0.904</td> | |
| <td class="has-text-centered">0.908</td> | |
| <td class="has-text-centered">0.904</td> | |
| <td class="has-text-centered">0.901</td> | |
| </tr> | |
| <tr> | |
| <td>WizardLM-2 8x22B</td> | |
| <td class="has-text-centered">0.129</td> | |
| <td class="has-text-centered">0.283</td> | |
| <td class="has-text-centered">0.239</td> | |
| <td class="has-text-centered">0.222</td> | |
| <td class="has-text-centered">0.247</td> | |
| <td class="has-text-centered">0.247</td> | |
| <td class="has-text-centered">0.226</td> | |
| <td class="has-text-centered">0.611</td> | |
| <td class="has-text-centered">0.570</td> | |
| <td class="has-text-centered">0.566</td> | |
| <td class="has-text-centered">0.570</td> | |
| <td class="has-text-centered">0.765</td> | |
| <td class="has-text-centered">0.853</td> | |
| <td class="has-text-centered">0.765</td> | |
| <td class="has-text-centered">0.779</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek-V3</td> | |
| <td class="has-text-centered">0.150</td> | |
| <td class="has-text-centered">0.311</td> | |
| <td class="has-text-centered">0.111</td> | |
| <td class="has-text-centered">0.563</td> | |
| <td class="has-text-centered">0.544</td> | |
| <td class="has-text-centered">0.544</td> | |
| <td class="has-text-centered">0.549</td> | |
| <td class="has-text-centered">0.640</td> | |
| <td class="has-text-centered">0.572</td> | |
| <td class="has-text-centered performance-low">0.583</td> | |
| <td class="has-text-centered">0.572</td> | |
| <td class="has-text-centered">0.828</td> | |
| <td class="has-text-centered">0.851</td> | |
| <td class="has-text-centered">0.828</td> | |
| <td class="has-text-centered">0.814</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek R1</td> | |
| <td class="has-text-centered">0.110</td> | |
| <td class="has-text-centered">0.289</td> | |
| <td class="has-text-centered">0.348</td> | |
| <td class="has-text-centered performance-low">0.600</td> | |
| <td class="has-text-centered performance-low">0.586</td> | |
| <td class="has-text-centered performance-low">0.586</td> | |
| <td class="has-text-centered performance-low">0.587</td> | |
| <td class="has-text-centered">0.644</td> | |
| <td class="has-text-centered">0.489</td> | |
| <td class="has-text-centered">0.499</td> | |
| <td class="has-text-centered">0.489</td> | |
| <td class="has-text-centered">0.904</td> | |
| <td class="has-text-centered">0.907</td> | |
| <td class="has-text-centered">0.904</td> | |
| <td class="has-text-centered">0.902</td> | |
| </tr> | |
| <tr> | |
| <td>QwQ-32B-Preview</td> | |
| <td class="has-text-centered">0.141</td> | |
| <td class="has-text-centered">0.290</td> | |
| <td class="has-text-centered">0.165</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.005</td> | |
| <td class="has-text-centered">0.629</td> | |
| <td class="has-text-centered">0.534</td> | |
| <td class="has-text-centered">0.550</td> | |
| <td class="has-text-centered">0.534</td> | |
| <td class="has-text-centered">0.812</td> | |
| <td class="has-text-centered">0.827</td> | |
| <td class="has-text-centered">0.812</td> | |
| <td class="has-text-centered">0.815</td> | |
| </tr> | |
| <tr> | |
| <td>Jamba 1.5 Mini</td> | |
| <td class="has-text-centered">0.119</td> | |
| <td class="has-text-centered">0.282</td> | |
| <td class="has-text-centered">0.293</td> | |
| <td class="has-text-centered">0.119</td> | |
| <td class="has-text-centered">0.182</td> | |
| <td class="has-text-centered">0.182</td> | |
| <td class="has-text-centered">0.132</td> | |
| <td class="has-text-centered">0.380</td> | |
| <td class="has-text-centered">0.525</td> | |
| <td class="has-text-centered">0.418</td> | |
| <td class="has-text-centered">0.525</td> | |
| <td class="has-text-centered">0.784</td> | |
| <td class="has-text-centered">0.814</td> | |
| <td class="has-text-centered">0.784</td> | |
| <td class="has-text-centered">0.765</td> | |
| </tr> | |
| <tr> | |
| <td>Jamba 1.5 Large</td> | |
| <td class="has-text-centered">0.183</td> | |
| <td class="has-text-centered">0.363</td> | |
| <td class="has-text-centered">-0.085</td> | |
| <td class="has-text-centered">0.403</td> | |
| <td class="has-text-centered">0.414</td> | |
| <td class="has-text-centered">0.414</td> | |
| <td class="has-text-centered">0.397</td> | |
| <td class="has-text-centered">0.635</td> | |
| <td class="has-text-centered">0.573</td> | |
| <td class="has-text-centered">0.582</td> | |
| <td class="has-text-centered">0.573</td> | |
| <td class="has-text-centered">0.824</td> | |
| <td class="has-text-centered">0.850</td> | |
| <td class="has-text-centered">0.824</td> | |
| <td class="has-text-centered">0.798</td> | |
| </tr> | |
| <tr> | |
| <td>Claude 3.5 Sonnet</td> | |
| <!-- FiQA Task 1: r² is higher-is-better; 0.402 is the second-highest r² in this column --> | |
| <td class="has-text-centered performance-medium">0.101</td> | |
| <td class="has-text-centered performance-medium">0.268</td> | |
| <td class="has-text-centered performance-medium">0.402</td> | |
| <td class="has-text-centered performance-medium">0.658</td> | |
| <td class="has-text-centered performance-medium">0.668</td> | |
| <td class="has-text-centered performance-medium">0.668</td> | |
| <td class="has-text-centered performance-medium">0.655</td> | |
| <td class="has-text-centered">0.634</td> | |
| <td class="has-text-centered">0.585</td> | |
| <td class="has-text-centered">0.553</td> | |
| <td class="has-text-centered">0.585</td> | |
| <td class="has-text-centered performance-best">0.944</td> | |
| <td class="has-text-centered performance-best">0.945</td> | |
| <td class="has-text-centered performance-best">0.944</td> | |
| <td class="has-text-centered performance-best">0.944</td> | |
| </tr> | |
| <tr> | |
| <td>Claude 3 Haiku</td> | |
| <td class="has-text-centered">0.167</td> | |
| <td class="has-text-centered">0.349</td> | |
| <td class="has-text-centered">0.008</td> | |
| <td class="has-text-centered">0.498</td> | |
| <td class="has-text-centered">0.517</td> | |
| <td class="has-text-centered">0.517</td> | |
| <td class="has-text-centered">0.494</td> | |
| <td class="has-text-centered">0.619</td> | |
| <td class="has-text-centered">0.538</td> | |
| <td class="has-text-centered">0.463</td> | |
| <td class="has-text-centered">0.538</td> | |
| <td class="has-text-centered">0.907</td> | |
| <td class="has-text-centered">0.913</td> | |
| <td class="has-text-centered">0.907</td> | |
| <td class="has-text-centered">0.908</td> | |
| </tr> | |
| <tr> | |
| <td>Cohere Command R 7B</td> | |
| <td class="has-text-centered">0.164</td> | |
| <td class="has-text-centered">0.319</td> | |
| <td class="has-text-centered">0.028</td> | |
| <td class="has-text-centered">0.457</td> | |
| <td class="has-text-centered">0.446</td> | |
| <td class="has-text-centered">0.446</td> | |
| <td class="has-text-centered">0.441</td> | |
| <td class="has-text-centered">0.609</td> | |
| <td class="has-text-centered">0.547</td> | |
| <td class="has-text-centered">0.532</td> | |
| <td class="has-text-centered">0.547</td> | |
| <td class="has-text-centered">0.835</td> | |
| <td class="has-text-centered">0.861</td> | |
| <td class="has-text-centered">0.835</td> | |
| <td class="has-text-centered">0.840</td> | |
| </tr> | |
| <tr> | |
| <td>Cohere Command R +</td> | |
| <!-- FiQA Task 1: r² is higher-is-better; 0.373 is the third-highest r² in this column --> | |
| <td class="has-text-centered performance-low">0.106</td> | |
| <td class="has-text-centered performance-low">0.274</td> | |
| <td class="has-text-centered performance-low">0.373</td> | |
| <td class="has-text-centered">0.462</td> | |
| <td class="has-text-centered">0.459</td> | |
| <td class="has-text-centered">0.459</td> | |
| <td class="has-text-centered">0.452</td> | |
| <td class="has-text-centered">0.608</td> | |
| <td class="has-text-centered">0.547</td> | |
| <td class="has-text-centered">0.533</td> | |
| <td class="has-text-centered">0.547</td> | |
| <td class="has-text-centered">0.741</td> | |
| <td class="has-text-centered">0.806</td> | |
| <td class="has-text-centered">0.741</td> | |
| <td class="has-text-centered">0.699</td> | |
| </tr> | |
| <tr> | |
| <td>Google Gemini 1.5 Pro</td> | |
| <td class="has-text-centered">0.144</td> | |
| <td class="has-text-centered">0.329</td> | |
| <td class="has-text-centered">0.149</td> | |
| <td class="has-text-centered">0.399</td> | |
| <td class="has-text-centered">0.400</td> | |
| <td class="has-text-centered">0.400</td> | |
| <td class="has-text-centered">0.393</td> | |
| <td class="has-text-centered">0.642</td> | |
| <td class="has-text-centered performance-low">0.587</td> | |
| <td class="has-text-centered performance-medium">0.593</td> | |
| <td class="has-text-centered performance-low">0.587</td> | |
| <td class="has-text-centered">0.890</td> | |
| <td class="has-text-centered">0.895</td> | |
| <td class="has-text-centered">0.890</td> | |
| <td class="has-text-centered">0.885</td> | |
| </tr> | |
| <tr> | |
| <td>OpenAI gpt-4o</td> | |
| <td class="has-text-centered">0.184</td> | |
| <td class="has-text-centered">0.317</td> | |
| <td class="has-text-centered">-0.089</td> | |
| <td class="has-text-centered">0.537</td> | |
| <td class="has-text-centered">0.517</td> | |
| <td class="has-text-centered">0.517</td> | |
| <td class="has-text-centered">0.523</td> | |
| <td class="has-text-centered">0.639</td> | |
| <td class="has-text-centered">0.515</td> | |
| <td class="has-text-centered">0.541</td> | |
| <td class="has-text-centered">0.515</td> | |
| <td class="has-text-centered performance-low">0.929</td> | |
| <td class="has-text-centered performance-low">0.931</td> | |
| <td class="has-text-centered performance-low">0.929</td> | |
| <td class="has-text-centered performance-low">0.928</td> | |
| </tr> | |
| <tr> | |
| <td>OpenAI o1-mini</td> | |
| <td class="has-text-centered">0.120</td> | |
| <td class="has-text-centered">0.295</td> | |
| <td class="has-text-centered">0.289</td> | |
| <td class="has-text-centered performance-best">0.661</td> | |
| <td class="has-text-centered performance-best">0.681</td> | |
| <td class="has-text-centered performance-best">0.681</td> | |
| <td class="has-text-centered performance-best">0.662</td> | |
| <td class="has-text-centered performance-medium">0.660</td> | |
| <td class="has-text-centered">0.515</td> | |
| <td class="has-text-centered">0.542</td> | |
| <td class="has-text-centered">0.515</td> | |
| <td class="has-text-centered">0.918</td> | |
| <td class="has-text-centered">0.917</td> | |
| <td class="has-text-centered">0.918</td> | |
| <td class="has-text-centered">0.917</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <div class="content is-small mt-4"> | |
| <p><strong>Note:</strong> Color highlighting indicates performance ranking: | |
| <span class="performance-best"> Best </span>, | |
| <span class="performance-medium"> Strong </span>, | |
| <span class="performance-low"> Good </span> | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Text Classification tab content --> | |
| <div id="text-classification" class="tab-content"> | |
| <h2 class="title is-4">Text Classification Task Results</h2> | |
| <div class="results-table"> | |
| <table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth"> | |
| <thead> | |
| <tr> | |
| <th rowspan="2">Model</th> | |
| <th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="Banking77" data-tooltip="Banking77 is a fine-grained dataset comprising 13,083 customer service queries annotated with 77 unique intents from the banking domain. The task involves accurately classifying each customer query into the correct intent category to improve automated banking support systems.">Banking77</th> | |
| <th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="FinBench" data-tooltip="FinBench is a comprehensive evaluation dataset containing 333,000 labeled instances that combines tabular data and profile text for financial risk prediction. The task requires models to predict financial outcomes across three key risk categories: default, fraud, and customer churn.">FinBench</th> | |
| <th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="FOMC" data-tooltip="FOMC is a dataset containing Federal Open Market Committee speeches, meeting minutes, and press conference transcripts spanning from 1996 to 2022. The classification task involves determining whether the monetary policy stance expressed in each document is hawkish (tighter monetary policy) or dovish (looser monetary policy).">FOMC</th> | |
| <th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="NumClaim" data-tooltip="NumClaim is an expert-annotated dataset for detecting and analyzing fine-grained investor claims within financial narratives that contain numerical information. The task requires identifying and categorizing claims containing numerals in analyst reports and earnings call transcripts for investment decision making.">NumClaim</th> | |
| <th colspan="1" class="has-text-centered tooltip-trigger tooltip-right" data-title="Headlines" data-tooltip="Headlines is a dataset containing 11,412 human-annotated financial news headlines focused on commodities, particularly gold, spanning from 2000 to 2019. The classification task involves identifying binary indicators for price mentions and directional price movements in these concise financial texts.">Headlines</th> | |
| </tr> | |
| <!-- Metric sub-headers; column order must match the data cells of every body row --> | |
| <tr> | |
| <!-- Banking77 (4 columns) --> | |
| <th class="has-text-centered">Accuracy</th> | |
| <th class="has-text-centered">Precision</th> | |
| <th class="has-text-centered">Recall</th> | |
| <th class="has-text-centered">F1</th> | |
| <!-- FinBench (4 columns) --> | |
| <th class="has-text-centered">Accuracy</th> | |
| <th class="has-text-centered">Precision</th> | |
| <th class="has-text-centered">Recall</th> | |
| <th class="has-text-centered">F1</th> | |
| <!-- FOMC (4 columns) --> | |
| <th class="has-text-centered">Accuracy</th> | |
| <th class="has-text-centered">Precision</th> | |
| <th class="has-text-centered">Recall</th> | |
| <th class="has-text-centered">F1</th> | |
| <!-- NumClaim (4 columns): in the body rows the 4th value is the harmonic mean of the 2nd and 3rd, so the data order is Accuracy, Precision, Recall, F1 --> | |
| <th class="has-text-centered">Accuracy</th> | |
| <th class="has-text-centered">Precision</th> | |
| <th class="has-text-centered">Recall</th> | |
| <th class="has-text-centered">F1</th> | |
| <!-- Headlines (1 column) --> | |
| <th class="has-text-centered">Accuracy</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>Llama 3 70B Instruct</td> | |
| <td class="has-text-centered">0.660</td> | |
| <td class="has-text-centered">0.748</td> | |
| <td class="has-text-centered">0.660</td> | |
| <td class="has-text-centered">0.645</td> | |
| <td class="has-text-centered">0.222</td> | |
| <td class="has-text-centered">0.826</td> | |
| <td class="has-text-centered">0.222</td> | |
| <td class="has-text-centered">0.309</td> | |
| <td class="has-text-centered">0.661</td> | |
| <td class="has-text-centered">0.662</td> | |
| <td class="has-text-centered">0.661</td> | |
| <td class="has-text-centered">0.652</td> | |
| <td class="has-text-centered">0.430</td> | |
| <td class="has-text-centered">0.240</td> | |
| <td class="has-text-centered performance-medium">0.980</td> | |
| <td class="has-text-centered">0.386</td> | |
| <td class="has-text-centered">0.811</td> | |
| </tr> | |
| <tr> | |
| <td>Llama 3 8B Instruct</td> | |
| <td class="has-text-centered">0.534</td> | |
| <td class="has-text-centered">0.672</td> | |
| <td class="has-text-centered">0.534</td> | |
| <td class="has-text-centered">0.512</td> | |
| <td class="has-text-centered">0.543</td> | |
| <td class="has-text-centered">0.857</td> | |
| <td class="has-text-centered">0.543</td> | |
| <td class="has-text-centered">0.659</td> | |
| <td class="has-text-centered">0.565</td> | |
| <td class="has-text-centered">0.618</td> | |
| <td class="has-text-centered">0.565</td> | |
| <td class="has-text-centered">0.497</td> | |
| <td class="has-text-centered">0.801</td> | |
| <td class="has-text-centered">0.463</td> | |
| <td class="has-text-centered">0.571</td> | |
| <td class="has-text-centered">0.511</td> | |
| <td class="has-text-centered">0.763</td> | |
| </tr> | |
| <tr> | |
| <td>DBRX Instruct</td> | |
| <td class="has-text-centered">0.578</td> | |
| <td class="has-text-centered">0.706</td> | |
| <td class="has-text-centered">0.578</td> | |
| <td class="has-text-centered">0.574</td> | |
| <td class="has-text-centered">0.359</td> | |
| <td class="has-text-centered">0.851</td> | |
| <td class="has-text-centered">0.359</td> | |
| <td class="has-text-centered">0.483</td> | |
| <td class="has-text-centered">0.285</td> | |
| <td class="has-text-centered">0.572</td> | |
| <td class="has-text-centered">0.285</td> | |
| <td class="has-text-centered">0.193</td> | |
| <td class="has-text-centered">0.222</td> | |
| <td class="has-text-centered">0.190</td> | |
| <td class="has-text-centered performance-best">1.000</td> | |
| <td class="has-text-centered">0.319</td> | |
| <td class="has-text-centered">0.746</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek LLM (67B)</td> | |
| <td class="has-text-centered">0.596</td> | |
| <td class="has-text-centered">0.711</td> | |
| <td class="has-text-centered">0.596</td> | |
| <td class="has-text-centered">0.578</td> | |
| <td class="has-text-centered">0.369</td> | |
| <td class="has-text-centered">0.856</td> | |
| <td class="has-text-centered">0.369</td> | |
| <td class="has-text-centered">0.492</td> | |
| <td class="has-text-centered">0.532</td> | |
| <td class="has-text-centered">0.678</td> | |
| <td class="has-text-centered">0.532</td> | |
| <td class="has-text-centered">0.407</td> | |
| <td class="has-text-centered">0.832</td> | |
| <td class="has-text-centered performance-best">1.000</td> | |
| <td class="has-text-centered">0.082</td> | |
| <td class="has-text-centered">0.151</td> | |
| <td class="has-text-centered">0.778</td> | |
| </tr> | |
| <tr> | |
| <td>Gemma 2 27B</td> | |
| <td class="has-text-centered">0.639</td> | |
| <td class="has-text-centered">0.730</td> | |
| <td class="has-text-centered">0.639</td> | |
| <td class="has-text-centered">0.621</td> | |
| <td class="has-text-centered">0.410</td> | |
| <td class="has-text-centered">0.849</td> | |
| <td class="has-text-centered">0.410</td> | |
| <td class="has-text-centered">0.538</td> | |
| <td class="has-text-centered">0.651</td> | |
| <td class="has-text-centered">0.704</td> | |
| <td class="has-text-centered">0.651</td> | |
| <td class="has-text-centered">0.620</td> | |
| <td class="has-text-centered">0.471</td> | |
| <td class="has-text-centered">0.257</td> | |
| <td class="has-text-centered performance-best">1.000</td> | |
| <td class="has-text-centered">0.408</td> | |
| <td class="has-text-centered">0.808</td> | |
| </tr> | |
| <tr> | |
| <td>Gemma 2 9B</td> | |
| <td class="has-text-centered">0.630</td> | |
| <td class="has-text-centered">0.710</td> | |
| <td class="has-text-centered">0.630</td> | |
| <td class="has-text-centered">0.609</td> | |
| <td class="has-text-centered">0.412</td> | |
| <td class="has-text-centered">0.848</td> | |
| <td class="has-text-centered">0.412</td> | |
| <td class="has-text-centered">0.541</td> | |
| <td class="has-text-centered">0.595</td> | |
| <td class="has-text-centered">0.694</td> | |
| <td class="has-text-centered">0.595</td> | |
| <td class="has-text-centered">0.519</td> | |
| <td class="has-text-centered">0.371</td> | |
| <td class="has-text-centered">0.224</td> | |
| <td class="has-text-centered performance-strong">0.990</td> | |
| <td class="has-text-centered">0.365</td> | |
| <td class="has-text-centered performance-best">0.856</td> | |
| </tr> | |
| <tr> | |
| <td>Mistral (7B) Instruct v0.3</td> | |
| <td class="has-text-centered">0.547</td> | |
| <td class="has-text-centered">0.677</td> | |
| <td class="has-text-centered">0.547</td> | |
| <td class="has-text-centered">0.528</td> | |
| <td class="has-text-centered">0.375</td> | |
| <td class="has-text-centered">0.839</td> | |
| <td class="has-text-centered">0.375</td> | |
| <td class="has-text-centered">0.503</td> | |
| <td class="has-text-centered">0.587</td> | |
| <td class="has-text-centered">0.598</td> | |
| <td class="has-text-centered">0.587</td> | |
| <td class="has-text-centered">0.542</td> | |
| <td class="has-text-centered">0.521</td> | |
| <td class="has-text-centered">0.266</td> | |
| <td class="has-text-centered">0.918</td> | |
| <td class="has-text-centered">0.412</td> | |
| <td class="has-text-centered">0.779</td> | |
| </tr> | |
| <tr> | |
| <td>Mixtral-8x22B Instruct</td> | |
| <td class="has-text-centered">0.622</td> | |
| <td class="has-text-centered">0.718</td> | |
| <td class="has-text-centered">0.622</td> | |
| <td class="has-text-centered">0.602</td> | |
| <td class="has-text-centered">0.166</td> | |
| <td class="has-text-centered">0.811</td> | |
| <td class="has-text-centered">0.166</td> | |
| <td class="has-text-centered">0.221</td> | |
| <td class="has-text-centered">0.562</td> | |
| <td class="has-text-centered">0.709</td> | |
| <td class="has-text-centered">0.562</td> | |
| <td class="has-text-centered">0.465</td> | |
| <td class="has-text-centered">0.732</td> | |
| <td class="has-text-centered">0.384</td> | |
| <td class="has-text-centered">0.775</td> | |
| <td class="has-text-centered">0.513</td> | |
| <td class="has-text-centered performance-medium">0.835</td> | |
| </tr> | |
| <tr> | |
| <td>Mixtral-8x7B Instruct</td> | |
| <td class="has-text-centered">0.567</td> | |
| <td class="has-text-centered">0.693</td> | |
| <td class="has-text-centered">0.567</td> | |
| <td class="has-text-centered">0.547</td> | |
| <td class="has-text-centered">0.285</td> | |
| <td class="has-text-centered">0.838</td> | |
| <td class="has-text-centered">0.285</td> | |
| <td class="has-text-centered">0.396</td> | |
| <td class="has-text-centered">0.623</td> | |
| <td class="has-text-centered">0.636</td> | |
| <td class="has-text-centered">0.623</td> | |
| <td class="has-text-centered">0.603</td> | |
| <td class="has-text-centered">0.765</td> | |
| <td class="has-text-centered">0.431</td> | |
| <td class="has-text-centered">0.898</td> | |
| <td class="has-text-centered">0.583</td> | |
| <td class="has-text-centered">0.805</td> | |
| </tr> | |
| <tr> | |
| <td>Qwen 2 Instruct (72B)</td> | |
| <td class="has-text-centered">0.644</td> | |
| <td class="has-text-centered">0.730</td> | |
| <td class="has-text-centered">0.644</td> | |
| <td class="has-text-centered">0.627</td> | |
| <td class="has-text-centered">0.370</td> | |
| <td class="has-text-centered">0.848</td> | |
| <td class="has-text-centered">0.370</td> | |
| <td class="has-text-centered">0.495</td> | |
| <td class="has-text-centered">0.623</td> | |
| <td class="has-text-centered">0.639</td> | |
| <td class="has-text-centered">0.623</td> | |
| <td class="has-text-centered">0.605</td> | |
| <td class="has-text-centered">0.821</td> | |
| <td class="has-text-centered">0.506</td> | |
| <td class="has-text-centered">0.867</td> | |
| <td class="has-text-centered">0.639</td> | |
| <td class="has-text-centered">0.830</td> | |
| </tr> | |
| <tr> | |
| <td>WizardLM-2 8x22B</td> | |
| <td class="has-text-centered">0.664</td> | |
| <td class="has-text-centered">0.737</td> | |
| <td class="has-text-centered">0.664</td> | |
| <td class="has-text-centered">0.648</td> | |
| <td class="has-text-centered">0.373</td> | |
| <td class="has-text-centered">0.842</td> | |
| <td class="has-text-centered">0.373</td> | |
| <td class="has-text-centered">0.500</td> | |
| <td class="has-text-centered">0.583</td> | |
| <td class="has-text-centered performance-medium">0.710</td> | |
| <td class="has-text-centered">0.583</td> | |
| <td class="has-text-centered">0.505</td> | |
| <td class="has-text-centered">0.831</td> | |
| <td class="has-text-centered">0.630</td> | |
| <td class="has-text-centered">0.173</td> | |
| <td class="has-text-centered">0.272</td> | |
| <td class="has-text-centered">0.797</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek-V3</td> | |
| <td class="has-text-centered performance-strong">0.722</td> | |
| <td class="has-text-centered performance-medium">0.774</td> | |
| <td class="has-text-centered performance-strong">0.722</td> | |
| <td class="has-text-centered performance-strong">0.714</td> | |
| <td class="has-text-centered">0.362</td> | |
| <td class="has-text-centered">0.845</td> | |
| <td class="has-text-centered">0.362</td> | |
| <td class="has-text-centered">0.487</td> | |
| <td class="has-text-centered">0.625</td> | |
| <td class="has-text-centered performance-strong">0.712</td> | |
| <td class="has-text-centered">0.625</td> | |
| <td class="has-text-centered">0.578</td> | |
| <td class="has-text-centered">0.860</td> | |
| <td class="has-text-centered">0.586</td> | |
| <td class="has-text-centered">0.796</td> | |
| <td class="has-text-centered">0.675</td> | |
| <td class="has-text-centered">0.729</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek R1</td> | |
| <td class="has-text-centered performance-best">0.772</td> | |
| <td class="has-text-centered performance-strong">0.789</td> | |
| <td class="has-text-centered performance-best">0.772</td> | |
| <td class="has-text-centered performance-best">0.763</td> | |
| <td class="has-text-centered">0.306</td> | |
| <td class="has-text-centered">0.846</td> | |
| <td class="has-text-centered">0.306</td> | |
| <td class="has-text-centered">0.419</td> | |
| <td class="has-text-centered performance-strong">0.679</td> | |
| <td class="has-text-centered">0.682</td> | |
| <td class="has-text-centered performance-strong">0.679</td> | |
| <td class="has-text-centered performance-strong">0.670</td> | |
| <td class="has-text-centered">0.851</td> | |
| <td class="has-text-centered">0.557</td> | |
| <td class="has-text-centered">0.898</td> | |
| <td class="has-text-centered">0.688</td> | |
| <td class="has-text-centered">0.769</td> | |
| </tr> | |
| <tr> | |
| <td>QwQ-32B-Preview</td> | |
| <td class="has-text-centered">0.577</td> | |
| <td class="has-text-centered">0.747</td> | |
| <td class="has-text-centered">0.577</td> | |
| <td class="has-text-centered">0.613</td> | |
| <td class="has-text-centered performance-strong">0.716</td> | |
| <td class="has-text-centered performance-strong">0.871</td> | |
| <td class="has-text-centered performance-strong">0.716</td> | |
| <td class="has-text-centered performance-strong">0.784</td> | |
| <td class="has-text-centered">0.591</td> | |
| <td class="has-text-centered">0.630</td> | |
| <td class="has-text-centered">0.591</td> | |
| <td class="has-text-centered">0.555</td> | |
| <td class="has-text-centered">0.819</td> | |
| <td class="has-text-centered performance-best">1.000</td> | |
| <td class="has-text-centered">0.010</td> | |
| <td class="has-text-centered">0.020</td> | |
| <td class="has-text-centered">0.744</td> | |
| </tr> | |
| <tr> | |
| <td>Jamba 1.5 Mini</td> | |
| <td class="has-text-centered">0.528</td> | |
| <td class="has-text-centered">0.630</td> | |
| <td class="has-text-centered">0.528</td> | |
| <td class="has-text-centered">0.508</td> | |
| <td class="has-text-centered performance-best">0.913</td> | |
| <td class="has-text-centered performance-best">0.883</td> | |
| <td class="has-text-centered performance-best">0.913</td> | |
| <td class="has-text-centered performance-best">0.898</td> | |
| <td class="has-text-centered">0.572</td> | |
| <td class="has-text-centered">0.678</td> | |
| <td class="has-text-centered">0.572</td> | |
| <td class="has-text-centered">0.499</td> | |
| <td class="has-text-centered">0.812</td> | |
| <td class="has-text-centered">0.429</td> | |
| <td class="has-text-centered">0.092</td> | |
| <td class="has-text-centered">0.151</td> | |
| <td class="has-text-centered">0.682</td> | |
| </tr> | |
| <tr> | |
| <td>Jamba 1.5 Large</td> | |
| <td class="has-text-centered">0.642</td> | |
| <td class="has-text-centered">0.746</td> | |
| <td class="has-text-centered">0.642</td> | |
| <td class="has-text-centered">0.628</td> | |
| <td class="has-text-centered">0.494</td> | |
| <td class="has-text-centered">0.851</td> | |
| <td class="has-text-centered">0.494</td> | |
| <td class="has-text-centered">0.618</td> | |
| <td class="has-text-centered">0.597</td> | |
| <td class="has-text-centered">0.650</td> | |
| <td class="has-text-centered">0.597</td> | |
| <td class="has-text-centered">0.550</td> | |
| <td class="has-text-centered">0.855</td> | |
| <td class="has-text-centered">0.639</td> | |
| <td class="has-text-centered">0.469</td> | |
| <td class="has-text-centered">0.541</td> | |
| <td class="has-text-centered">0.782</td> | |
| </tr> | |
| <tr> | |
| <td>Claude 3.5 Sonnet</td> | |
| <td class="has-text-centered">0.682</td> | |
| <td class="has-text-centered">0.755</td> | |
| <td class="has-text-centered">0.682</td> | |
| <td class="has-text-centered">0.668</td> | |
| <td class="has-text-centered">0.513</td> | |
| <td class="has-text-centered">0.854</td> | |
| <td class="has-text-centered">0.513</td> | |
| <td class="has-text-centered">0.634</td> | |
| <td class="has-text-centered performance-medium">0.675</td> | |
| <td class="has-text-centered">0.677</td> | |
| <td class="has-text-centered performance-medium">0.675</td> | |
| <td class="has-text-centered performance-best">0.674</td> | |
| <td class="has-text-centered performance-medium">0.879</td> | |
| <td class="has-text-centered">0.646</td> | |
| <td class="has-text-centered">0.745</td> | |
| <td class="has-text-centered performance-medium">0.692</td> | |
| <td class="has-text-centered">0.827</td> | |
| </tr> | |
| <tr> | |
| <td>Claude 3 Haiku</td> | |
| <td class="has-text-centered">0.639</td> | |
| <td class="has-text-centered">0.735</td> | |
| <td class="has-text-centered">0.639</td> | |
| <td class="has-text-centered">0.622</td> | |
| <td class="has-text-centered">0.067</td> | |
| <td class="has-text-centered">0.674</td> | |
| <td class="has-text-centered">0.067</td> | |
| <td class="has-text-centered">0.022</td> | |
| <td class="has-text-centered">0.633</td> | |
| <td class="has-text-centered">0.634</td> | |
| <td class="has-text-centered">0.633</td> | |
| <td class="has-text-centered">0.631</td> | |
| <td class="has-text-centered">0.838</td> | |
| <td class="has-text-centered">0.556</td> | |
| <td class="has-text-centered">0.561</td> | |
| <td class="has-text-centered">0.558</td> | |
| <td class="has-text-centered">0.781</td> | |
| </tr> | |
| <tr> | |
| <td>Cohere Command R 7B</td> | |
| <td class="has-text-centered">0.530</td> | |
| <td class="has-text-centered">0.650</td> | |
| <td class="has-text-centered">0.530</td> | |
| <td class="has-text-centered">0.516</td> | |
| <td class="has-text-centered performance-medium">0.682</td> | |
| <td class="has-text-centered performance-medium">0.868</td> | |
| <td class="has-text-centered performance-medium">0.682</td> | |
| <td class="has-text-centered performance-medium">0.762</td> | |
| <td class="has-text-centered">0.536</td> | |
| <td class="has-text-centered">0.505</td> | |
| <td class="has-text-centered">0.536</td> | |
| <td class="has-text-centered">0.459</td> | |
| <td class="has-text-centered">0.797</td> | |
| <td class="has-text-centered">0.210</td> | |
| <td class="has-text-centered">0.041</td> | |
| <td class="has-text-centered">0.068</td> | |
| <td class="has-text-centered">0.770</td> | |
| </tr> | |
| <tr> | |
| <td>Cohere Command R +</td> | |
| <td class="has-text-centered">0.660</td> | |
| <td class="has-text-centered">0.747</td> | |
| <td class="has-text-centered">0.660</td> | |
| <td class="has-text-centered">0.651</td> | |
| <td class="has-text-centered">0.575</td> | |
| <td class="has-text-centered">0.859</td> | |
| <td class="has-text-centered">0.575</td> | |
| <td class="has-text-centered">0.684</td> | |
| <td class="has-text-centered">0.526</td> | |
| <td class="has-text-centered">0.655</td> | |
| <td class="has-text-centered">0.526</td> | |
| <td class="has-text-centered">0.393</td> | |
| <td class="has-text-centered">0.804</td> | |
| <td class="has-text-centered">0.333</td> | |
| <td class="has-text-centered">0.071</td> | |
| <td class="has-text-centered">0.118</td> | |
| <td class="has-text-centered">0.812</td> | |
| </tr> | |
| <tr> | |
| <td>Google Gemini 1.5 Pro</td> | |
| <td class="has-text-centered">0.483</td> | |
| <td class="has-text-centered">0.487</td> | |
| <td class="has-text-centered">0.483</td> | |
| <td class="has-text-centered">0.418</td> | |
| <td class="has-text-centered">0.240</td> | |
| <td class="has-text-centered">0.823</td> | |
| <td class="has-text-centered">0.240</td> | |
| <td class="has-text-centered">0.336</td> | |
| <td class="has-text-centered">0.619</td> | |
| <td class="has-text-centered">0.667</td> | |
| <td class="has-text-centered">0.619</td> | |
| <td class="has-text-centered">0.579</td> | |
| <td class="has-text-centered">0.700</td> | |
| <td class="has-text-centered">0.369</td> | |
| <td class="has-text-centered">0.908</td> | |
| <td class="has-text-centered">0.525</td> | |
| <td class="has-text-centered performance-strong">0.837</td> | |
| </tr> | |
| <tr> | |
| <td>OpenAI gpt-4o</td> | |
| <td class="has-text-centered performance-medium">0.704</td> | |
| <td class="has-text-centered performance-best">0.792</td> | |
| <td class="has-text-centered performance-medium">0.704</td> | |
| <td class="has-text-centered performance-medium">0.710</td> | |
| <td class="has-text-centered">0.396</td> | |
| <td class="has-text-centered">0.846</td> | |
| <td class="has-text-centered">0.396</td> | |
| <td class="has-text-centered">0.524</td> | |
| <td class="has-text-centered performance-best">0.681</td> | |
| <td class="has-text-centered performance-best">0.719</td> | |
| <td class="has-text-centered performance-best">0.681</td> | |
| <td class="has-text-centered performance-medium">0.664</td> | |
| <td class="has-text-centered performance-best">0.896</td> | |
| <td class="has-text-centered performance-medium">0.667</td> | |
| <td class="has-text-centered">0.857</td> | |
| <td class="has-text-centered performance-best">0.750</td> | |
| <td class="has-text-centered">0.824</td> | |
| </tr> | |
| <tr> | |
| <td>OpenAI o1-mini</td> | |
| <td class="has-text-centered">0.681</td> | |
| <td class="has-text-centered">0.760</td> | |
| <td class="has-text-centered">0.681</td> | |
| <td class="has-text-centered">0.670</td> | |
| <td class="has-text-centered">0.487</td> | |
| <td class="has-text-centered">0.851</td> | |
| <td class="has-text-centered">0.487</td> | |
| <td class="has-text-centered">0.612</td> | |
| <td class="has-text-centered">0.651</td> | |
| <td class="has-text-centered">0.670</td> | |
| <td class="has-text-centered">0.651</td> | |
| <td class="has-text-centered">0.635</td> | |
| <td class="has-text-centered performance-strong">0.888</td> | |
| <td class="has-text-centered performance-medium">0.664</td> | |
| <td class="has-text-centered">0.786</td> | |
| <td class="has-text-centered performance-strong">0.720</td> | |
| <td class="has-text-centered">0.769</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <div class="content is-small mt-4"> | |
| <!-- Legend: each swatch uses the same tier class as the highlighted table cells (best, strong, medium). --> | |
| <p><strong>Note:</strong> Color highlighting indicates performance ranking: | |
| <span class="performance-best"> Best </span>, | |
| <span class="performance-strong"> Strong </span>, | |
| <span class="performance-medium"> Good </span> | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Text Summarization tab content --> | |
| <div id="text-summarization" class="tab-content"> | |
| <h2 class="title is-4">Text Summarization Task Results</h2> | |
| <div class="results-table"> | |
| <table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth"> | |
| <thead> | |
| <!-- Two-tier header: row 1 names the model column and the two datasets (each spanning three columns); row 2 names the BERTScore metric reported in each of those columns. Every header cell carries an explicit column scope so assistive technology can associate data cells with both tiers. --> | |
| <tr> | |
| <th rowspan="2" scope="col">Model</th> | |
| <th colspan="3" scope="col" class="has-text-centered tooltip-trigger tooltip-right" data-title="ECTSum" data-tooltip="ECTSum contains 2,425 document-summary pairs featuring earnings call transcripts paired with concise bullet-point summaries extracted from Reuters articles. The summarization task requires extracting and condensing key financial information from lengthy corporate communications into brief, informative points.">ECTSum</th> | |
| <th colspan="3" scope="col" class="has-text-centered tooltip-trigger tooltip-right" data-title="EDTSum" data-tooltip="EDTSum consists of 2,000 financial news articles paired with their headlines as ground-truth summaries for evaluating text summarization. The task challenges models to condense complex financial news articles into concise, informative headlines that capture the essential information.">EDTSum</th> | |
| </tr> | |
| <tr> | |
| <th scope="col" class="has-text-centered">BERTScore Precision</th> | |
| <th scope="col" class="has-text-centered">BERTScore Recall</th> | |
| <th scope="col" class="has-text-centered">BERTScore F1</th> | |
| <th scope="col" class="has-text-centered">BERTScore Precision</th> | |
| <th scope="col" class="has-text-centered">BERTScore Recall</th> | |
| <th scope="col" class="has-text-centered">BERTScore F1</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>Llama 3 70B Instruct</td> | |
| <td class="has-text-centered">0.715</td> | |
| <td class="has-text-centered">0.801</td> | |
| <td class="has-text-centered">0.754</td> | |
| <td class="has-text-centered">0.793</td> | |
| <td class="has-text-centered performance-medium">0.844</td> | |
| <td class="has-text-centered performance-strong">0.817</td> | |
| </tr> | |
| <tr> | |
| <td>Llama 3 8B Instruct</td> | |
| <td class="has-text-centered">0.724</td> | |
| <td class="has-text-centered">0.796</td> | |
| <td class="has-text-centered">0.757</td> | |
| <td class="has-text-centered">0.785</td> | |
| <td class="has-text-centered">0.841</td> | |
| <td class="has-text-centered">0.811</td> | |
| </tr> | |
| <tr> | |
| <td>DBRX Instruct</td> | |
| <td class="has-text-centered">0.680</td> | |
| <td class="has-text-centered">0.786</td> | |
| <td class="has-text-centered">0.729</td> | |
| <td class="has-text-centered">0.774</td> | |
| <td class="has-text-centered">0.843</td> | |
| <td class="has-text-centered">0.806</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek LLM (67B)</td> | |
| <td class="has-text-centered">0.692</td> | |
| <td class="has-text-centered">0.678</td> | |
| <td class="has-text-centered">0.681</td> | |
| <td class="has-text-centered">0.779</td> | |
| <td class="has-text-centered">0.840</td> | |
| <td class="has-text-centered">0.807</td> | |
| </tr> | |
| <tr> | |
| <td>Gemma 2 27B</td> | |
| <td class="has-text-centered">0.680</td> | |
| <td class="has-text-centered">0.777</td> | |
| <td class="has-text-centered">0.723</td> | |
| <td class="has-text-centered performance-strong">0.801</td> | |
| <td class="has-text-centered">0.829</td> | |
| <td class="has-text-centered">0.814</td> | |
| </tr> | |
| <tr> | |
| <td>Gemma 2 9B</td> | |
| <td class="has-text-centered">0.651</td> | |
| <td class="has-text-centered">0.531</td> | |
| <td class="has-text-centered">0.585</td> | |
| <td class="has-text-centered performance-best">0.803</td> | |
| <td class="has-text-centered">0.833</td> | |
| <td class="has-text-centered performance-strong">0.817</td> | |
| </tr> | |
| <tr> | |
| <td>Mistral (7B) Instruct v0.3</td> | |
| <td class="has-text-centered">0.702</td> | |
| <td class="has-text-centered performance-strong">0.806</td> | |
| <td class="has-text-centered">0.750</td> | |
| <td class="has-text-centered">0.783</td> | |
| <td class="has-text-centered">0.842</td> | |
| <td class="has-text-centered">0.811</td> | |
| </tr> | |
| <tr> | |
| <td>Mixtral-8x22B Instruct</td> | |
| <td class="has-text-centered">0.713</td> | |
| <td class="has-text-centered performance-best">0.812</td> | |
| <td class="has-text-centered">0.758</td> | |
| <td class="has-text-centered">0.790</td> | |
| <td class="has-text-centered">0.843</td> | |
| <td class="has-text-centered">0.815</td> | |
| </tr> | |
| <tr> | |
| <td>Mixtral-8x7B Instruct</td> | |
| <td class="has-text-centered">0.727</td> | |
| <td class="has-text-centered">0.773</td> | |
| <td class="has-text-centered">0.747</td> | |
| <td class="has-text-centered">0.785</td> | |
| <td class="has-text-centered">0.839</td> | |
| <td class="has-text-centered">0.810</td> | |
| </tr> | |
| <tr> | |
| <td>Qwen 2 Instruct (72B)</td> | |
| <td class="has-text-centered">0.709</td> | |
| <td class="has-text-centered performance-medium">0.804</td> | |
| <td class="has-text-centered">0.752</td> | |
| <td class="has-text-centered">0.781</td> | |
| <td class="has-text-centered performance-strong">0.846</td> | |
| <td class="has-text-centered">0.811</td> | |
| </tr> | |
| <tr> | |
| <td>WizardLM-2 8x22B</td> | |
| <td class="has-text-centered">0.677</td> | |
| <td class="has-text-centered performance-strong">0.806</td> | |
| <td class="has-text-centered">0.735</td> | |
| <td class="has-text-centered">0.774</td> | |
| <td class="has-text-centered performance-best">0.847</td> | |
| <td class="has-text-centered">0.808</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek-V3</td> | |
| <td class="has-text-centered">0.703</td> | |
| <td class="has-text-centered performance-strong">0.806</td> | |
| <td class="has-text-centered">0.750</td> | |
| <td class="has-text-centered">0.791</td> | |
| <td class="has-text-centered">0.842</td> | |
| <td class="has-text-centered">0.815</td> | |
| </tr> | |
| <tr> | |
| <td>DeepSeek R1</td> | |
| <td class="has-text-centered">0.724</td> | |
| <td class="has-text-centered">0.800</td> | |
| <td class="has-text-centered">0.759</td> | |
| <td class="has-text-centered">0.770</td> | |
| <td class="has-text-centered">0.843</td> | |
| <td class="has-text-centered">0.804</td> | |
| </tr> | |
| <tr> | |
| <td>QwQ-32B-Preview</td> | |
| <td class="has-text-centered">0.653</td> | |
| <td class="has-text-centered">0.751</td> | |
| <td class="has-text-centered">0.696</td> | |
| <td class="has-text-centered">0.797</td> | |
| <td class="has-text-centered">0.841</td> | |
| <td class="has-text-centered performance-strong">0.817</td> | |
| </tr> | |
| <tr> | |
| <td>Jamba 1.5 Mini</td> | |
| <td class="has-text-centered">0.692</td> | |
| <td class="has-text-centered">0.798</td> | |
| <td class="has-text-centered">0.741</td> | |
| <td class="has-text-centered">0.798</td> | |
| <td class="has-text-centered">0.838</td> | |
| <td class="has-text-centered performance-medium">0.816</td> | |
| </tr> | |
| <tr> | |
| <td>Jamba 1.5 Large</td> | |
| <td class="has-text-centered">0.679</td> | |
| <td class="has-text-centered">0.800</td> | |
| <td class="has-text-centered">0.734</td> | |
| <td class="has-text-centered">0.799</td> | |
| <td class="has-text-centered">0.841</td> | |
| <td class="has-text-centered performance-best">0.818</td> | |
| </tr> | |
| <tr> | |
| <td>Claude 3.5 Sonnet</td> | |
| <td class="has-text-centered performance-medium">0.737</td> | |
| <td class="has-text-centered">0.802</td> | |
| <td class="has-text-centered performance-medium">0.767</td> | |
| <td class="has-text-centered">0.786</td> | |
| <td class="has-text-centered">0.843</td> | |
| <td class="has-text-centered">0.813</td> | |
| </tr> | |
| <tr> | |
| <td>Claude 3 Haiku</td> | |
| <td class="has-text-centered">0.683</td> | |
| <td class="has-text-centered">0.617</td> | |
| <td class="has-text-centered">0.646</td> | |
| <td class="has-text-centered">0.778</td> | |
| <td class="has-text-centered performance-medium">0.844</td> | |
| <td class="has-text-centered">0.808</td> | |
| </tr> | |
| <tr> | |
| <td>Cohere Command R 7B</td> | |
| <td class="has-text-centered">0.724</td> | |
| <td class="has-text-centered">0.781</td> | |
| <td class="has-text-centered">0.750</td> | |
| <td class="has-text-centered">0.790</td> | |
| <td class="has-text-centered performance-medium">0.844</td> | |
| <td class="has-text-centered">0.815</td> | |
| </tr> | |
| <tr> | |
| <td>Cohere Command R +</td> | |
| <td class="has-text-centered">0.724</td> | |
| <td class="has-text-centered">0.782</td> | |
| <td class="has-text-centered">0.751</td> | |
| <td class="has-text-centered">0.789</td> | |
| <td class="has-text-centered">0.834</td> | |
| <td class="has-text-centered">0.810</td> | |
| </tr> | |
| <tr> | |
| <td>Google Gemini 1.5 Pro</td> | |
| <td class="has-text-centered performance-best">0.757</td> | |
| <td class="has-text-centered">0.800</td> | |
| <td class="has-text-centered performance-best">0.777</td> | |
| <td class="has-text-centered performance-medium">0.800</td> | |
| <td class="has-text-centered">0.836</td> | |
| <td class="has-text-centered performance-strong">0.817</td> | |
| </tr> | |
| <tr> | |
| <td>OpenAI gpt-4o</td> | |
| <td class="has-text-centered performance-strong">0.755</td> | |
| <td class="has-text-centered">0.793</td> | |
| <td class="has-text-centered performance-strong">0.773</td> | |
| <td class="has-text-centered">0.795</td> | |
| <td class="has-text-centered">0.840</td> | |
| <td class="has-text-centered performance-medium">0.816</td> | |
| </tr> | |
| <tr> | |
| <td>OpenAI o1-mini</td> | |
| <td class="has-text-centered">0.731</td> | |
| <td class="has-text-centered">0.801</td> | |
| <td class="has-text-centered">0.763</td> | |
| <td class="has-text-centered">0.795</td> | |
| <td class="has-text-centered">0.840</td> | |
| <td class="has-text-centered performance-medium">0.816</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <div class="content is-small mt-4"> | |
| <!-- Legend: each swatch uses the same tier class as the highlighted table cells (best, strong, medium). --> | |
| <p><strong>Note:</strong> Color highlighting indicates performance ranking: | |
| <span class="performance-best"> Best </span>, | |
| <span class="performance-strong"> Strong </span>, | |
| <span class="performance-medium"> Good </span> | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Cost Analysis Tab --> | |
| <div id="cost-analysis" class="tab-content"> | |
| <h2 class="title is-4">Model Cost Analysis</h2> | |
| <div class="results-table"> | |
| <table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth"> | |
| <thead> | |
| <tr> | |
| <th>Model</th> | |
| <th class="has-text-centered tooltip-trigger column-border-left" data-title="FOMC" data-tooltip="FOMC is a dataset containing Federal Open Market Committee speeches, meeting minutes, and press conference transcripts spanning from 1996 to 2022. The classification task involves determining whether the monetary policy stance expressed in each document is hawkish (tighter monetary policy) or dovish (looser monetary policy).">FOMC</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="Financial Phrase Bank" data-tooltip="Financial Phrase Bank (FPB) contains 4,840 sentences from financial news articles categorized as positive, negative, or neutral by 16 finance experts using majority voting. The sentiment classification task requires understanding how these statements might influence investor perception of stock prices.">FPB</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FinQA" data-tooltip="FinQA contains 8,281 question-answer pairs derived from financial reports that require numerical reasoning over tabular financial data. The question-answering task features multi-step reasoning challenges with full annotation of reasoning programs to solve complex financial queries.">FinQA</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FiQA Task 1" data-tooltip="FiQA Task 1 focuses on aspect-based financial sentiment analysis in microblog posts and news headlines using a continuous scale from -1 (negative) to 1 (positive). The regression task requires models to accurately predict the sentiment score that reflects investor perception of financial texts.">FiQA-1</th> | |
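| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FiQA Task 2" data-tooltip="FiQA Task 2 is an opinion-based question answering task over financial data, in which models must answer natural-language financial questions using content drawn from sources such as microblogs, reports, and news.">FiQA-2</th> | |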
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="Headlines" data-tooltip="Headlines is a dataset containing 11,412 human-annotated financial news headlines focused on commodities, particularly gold, spanning from 2000 to 2019. The classification task involves identifying binary indicators for price mentions and directional price movements in these concise financial texts.">HL</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FinBench" data-tooltip="FinBench is a comprehensive evaluation dataset containing 333,000 labeled instances that combines tabular data and profile text for financial risk prediction. The task requires models to predict financial outcomes across three key risk categories: default, fraud, and customer churn.">FB</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FinRED" data-tooltip="FinRED is a specialized relation extraction dataset created from financial news and earnings call transcripts using distance supervision based on Wikidata triplets. The task involves identifying and extracting financial relationships between entities to understand connections in financial contexts.">FR</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="ReFiND" data-tooltip="ReFiND is a comprehensive relation extraction dataset containing approximately 29,000 annotated instances with 22 distinct relation types across 8 entity pair categories from various financial documents. The task requires identifying specific relationships between financial entities in complex documents like SEC filings.">RD</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="EDTSum" data-tooltip="EDTSum consists of 2,000 financial news articles paired with their headlines as ground-truth summaries for evaluating text summarization. The task challenges models to condense complex financial news articles into concise, informative headlines that capture the essential information.">EDTSum</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="Banking77" data-tooltip="Banking77 is a fine-grained dataset comprising 13,083 customer service queries annotated with 77 unique intents from the banking domain. The task involves accurately classifying each customer query into the correct intent category to improve automated banking support systems.">B77</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FinCausal (CD)" data-tooltip="FinCausal Causal Discovery (CD) contains 29,444 text sections from financial news, with 2,136 annotated as expressing causal relationships. The task involves extracting precise cause and effect spans from financial texts that contain causal relationships.">CD</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FinCausal (CC)" data-tooltip="FinCausal Causality Classification (CC) consists of 29,444 text sections from financial news with binary annotations indicating causal relationships. The classification task requires determining whether a given financial text section contains a causal relationship (1) or not (0).">CC</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="ECTSum" data-tooltip="ECTSum contains 2,425 document-summary pairs featuring earnings call transcripts paired with concise bullet-point summaries extracted from Reuters articles. The summarization task requires extracting and condensing key financial information from lengthy corporate communications into brief, informative points.">ECTSum</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FinEntity" data-tooltip="FinEntity consists of 979 financial news paragraphs containing 2,131 manually-annotated financial entities with sentiment classifications. The task involves identifying companies and asset classes in financial texts while determining the associated sentiment expressed toward each entity.">FE</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FiNER-ORD" data-tooltip="FiNER-ORD is a manually annotated named entity recognition dataset comprising financial news articles with detailed entity annotations. The task requires identifying and correctly classifying person, location, and organization entities in financial contexts.">FiNER</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="FNXL" data-tooltip="FNXL contains 79,088 sentences with 142,922 annotated numerals extracted from SEC 10-K reports and categorized under 2,794 distinct numerical labels. The information extraction task requires identifying, categorizing and understanding the financial significance of numerical entities in regulatory filings.">FNXL</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="NumClaim" data-tooltip="NumClaim is an expert-annotated dataset for detecting and analyzing fine-grained investor claims within financial narratives that contain numerical information. The task requires identifying and categorizing claims containing numerals in analyst reports and earnings call transcripts for investment decision making.">NC</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="TATQA" data-tooltip="TATQA is a large-scale question answering dataset for hybrid data sources that combines tables and text from financial reports. The task emphasizes numerical reasoning operations across multiple formats, requiring models to integrate information from structured and unstructured sources to answer financial questions.">TQA</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="ConvFinQA" data-tooltip="ConvFinQA is a multi-turn question answering dataset with 3,892 conversations containing 14,115 questions that explore chains of numerical reasoning in financial contexts. The conversational task requires maintaining context while performing sequential numerical operations to answer increasingly complex financial questions.">CFQA</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right" data-title="SubjECTive-QA" data-tooltip="SubjECTive-QA contains 49,446 annotations across 2,747 question-answer pairs extracted from 120 earnings call transcripts. The multi-label classification task involves analyzing six subjective features in financial discourse: assertiveness, cautiousness, optimism, specificity, clarity, and relevance.">SQA</th> | |
| <th class="has-text-centered tooltip-trigger tooltip-right column-border-left" data-title="Total Cost" data-tooltip="Total cost in USD to run inference on all datasets combined">Total</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td> | |
| <td class="column-border-left has-text-centered">0.10</td> | |
| <td class="has-text-centered">0.11</td> | |
| <td class="has-text-centered">1.14</td> | |
| <td class="has-text-centered">0.06</td> | |
| <td class="has-text-centered">0.72</td> | |
| <td class="has-text-centered">1.00</td> | |
| <td class="has-text-centered">0.40</td> | |
| <td class="has-text-centered">0.38</td> | |
| <td class="has-text-centered">1.34</td> | |
| <td class="has-text-centered">1.94</td> | |
| <td class="has-text-centered">1.64</td> | |
| <td class="has-text-centered">0.07</td> | |
| <td class="has-text-centered">0.05</td> | |
| <td class="has-text-centered">1.56</td> | |
| <td class="has-text-centered">0.12</td> | |
| <td class="has-text-centered">0.33</td> | |
| <td class="has-text-centered">0.25</td> | |
| <td class="has-text-centered">0.09</td> | |
| <td class="has-text-centered">1.11</td> | |
| <td class="has-text-centered">2.96</td> | |
| <td class="has-text-centered">1.17</td> | |
| <td class="column-border-left has-text-centered performance-medium">16.54</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="Llama 3 8B Instruct" data-tooltip="Meta's compact 8 billion parameter language model, offering an efficient alternative to larger variants. Optimized for practical instruction-following tasks with reduced computational requirements.">Llama 3 8B Instruct</td> | |
| <td class="column-border-left has-text-centered">0.02</td> | |
| <td class="has-text-centered">0.03</td> | |
| <td class="has-text-centered">0.25</td> | |
| <td class="has-text-centered">0.01</td> | |
| <td class="has-text-centered">0.16</td> | |
| <td class="has-text-centered">0.22</td> | |
| <td class="has-text-centered">0.09</td> | |
| <td class="has-text-centered">0.09</td> | |
| <td class="has-text-centered">0.32</td> | |
| <td class="has-text-centered">0.43</td> | |
| <td class="has-text-centered">0.37</td> | |
| <td class="has-text-centered">0.02</td> | |
| <td class="has-text-centered">0.01</td> | |
| <td class="has-text-centered">0.36</td> | |
| <td class="has-text-centered">0.03</td> | |
| <td class="has-text-centered">0.08</td> | |
| <td class="has-text-centered">0.06</td> | |
| <td class="has-text-centered">0.02</td> | |
| <td class="has-text-centered">0.26</td> | |
| <td class="has-text-centered">0.69</td> | |
| <td class="has-text-centered">0.26</td> | |
| <td class="column-border-left has-text-centered performance-low">3.79</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="DBRX Instruct" data-tooltip="A 132 billion parameter Mixture of Experts (MoE) model developed by Databricks. Fine-tuned for instruction-following on the Mosaic AI platform with strong performance across diverse tasks.">DBRX Instruct</td> | |
| <td class="column-border-left has-text-centered">0.14</td> | |
| <td class="has-text-centered">0.17</td> | |
| <td class="has-text-centered">1.50</td> | |
| <td class="has-text-centered">0.06</td> | |
| <td class="has-text-centered">0.95</td> | |
| <td class="has-text-centered">1.29</td> | |
| <td class="has-text-centered">0.56</td> | |
| <td class="has-text-centered">0.57</td> | |
| <td class="has-text-centered">2.05</td> | |
| <td class="has-text-centered">2.93</td> | |
| <td class="has-text-centered">2.14</td> | |
| <td class="has-text-centered">0.11</td> | |
| <td class="has-text-centered">0.10</td> | |
| <td class="has-text-centered">2.45</td> | |
| <td class="has-text-centered">0.17</td> | |
| <td class="has-text-centered">0.47</td> | |
| <td class="has-text-centered">0.34</td> | |
| <td class="has-text-centered">0.13</td> | |
| <td class="has-text-centered">1.47</td> | |
| <td class="has-text-centered">4.19</td> | |
| <td class="has-text-centered">1.55</td> | |
| <td class="column-border-left has-text-centered performance-medium">23.35</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter dense language model specialized for complex reasoning and instruction following. Developed by the DeepSeek AI team to compete with other frontier models.">DeepSeek LLM (67B)</td> | |
| <td class="column-border-left has-text-centered">0.10</td> | |
| <td class="has-text-centered">0.12</td> | |
| <td class="has-text-centered">1.25</td> | |
| <td class="has-text-centered">0.05</td> | |
| <td class="has-text-centered">0.76</td> | |
| <td class="has-text-centered">0.87</td> | |
| <td class="has-text-centered">0.42</td> | |
| <td class="has-text-centered">0.37</td> | |
| <td class="has-text-centered">1.45</td> | |
| <td class="has-text-centered">1.85</td> | |
| <td class="has-text-centered">2.03</td> | |
| <td class="has-text-centered">0.08</td> | |
| <td class="has-text-centered">0.05</td> | |
| <td class="has-text-centered">0.83</td> | |
| <td class="has-text-centered">0.13</td> | |
| <td class="has-text-centered">0.34</td> | |
| <td class="has-text-centered">0.24</td> | |
| <td class="has-text-centered">0.09</td> | |
| <td class="has-text-centered">1.20</td> | |
| <td class="has-text-centered">3.17</td> | |
| <td class="has-text-centered">1.17</td> | |
| <td class="column-border-left has-text-centered performance-medium">16.57</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="Gemma 2 27B" data-tooltip="Google's 27 billion parameter language model released as an open-weight alternative to proprietary models. Designed for general NLP applications with strong performance-to-size ratio.">Gemma 2 27B</td> | |
| <td class="column-border-left has-text-centered">0.08</td> | |
| <td class="has-text-centered">0.09</td> | |
| <td class="has-text-centered">1.05</td> | |
| <td class="has-text-centered">0.05</td> | |
| <td class="has-text-centered">0.66</td> | |
| <td class="has-text-centered">0.91</td> | |
| <td class="has-text-centered">0.30</td> | |
| <td class="has-text-centered">0.34</td> | |
| <td class="has-text-centered">1.37</td> | |
| <td class="has-text-centered">1.75</td> | |
| <td class="has-text-centered">1.77</td> | |
| <td class="has-text-centered">0.07</td> | |
| <td class="has-text-centered">0.04</td> | |
| <td class="has-text-centered">1.46</td> | |
| <td class="has-text-centered">0.11</td> | |
| <td class="has-text-centered">0.30</td> | |
| <td class="has-text-centered">0.21</td> | |
| <td class="has-text-centered">0.08</td> | |
| <td class="has-text-centered">1.00</td> | |
| <td class="has-text-centered">2.84</td> | |
| <td class="has-text-centered">1.04</td> | |
| <td class="column-border-left has-text-centered performance-medium">15.50</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="Gemma 2 9B" data-tooltip="Google's compact 9 billion parameter language model offering an efficient alternative to larger models. Features strong performance-to-size ratio for general NLP tasks with reduced computing requirements.">Gemma 2 9B</td> | |
| <td class="column-border-left has-text-centered">0.03</td> | |
| <td class="has-text-centered">0.03</td> | |
| <td class="has-text-centered">0.40</td> | |
| <td class="has-text-centered">0.02</td> | |
| <td class="has-text-centered">0.24</td> | |
| <td class="has-text-centered">0.33</td> | |
| <td class="has-text-centered">0.12</td> | |
| <td class="has-text-centered">0.14</td> | |
| <td class="has-text-centered">0.51</td> | |
| <td class="has-text-centered">0.66</td> | |
| <td class="has-text-centered">0.66</td> | |
| <td class="has-text-centered">0.03</td> | |
| <td class="has-text-centered">0.02</td> | |
| <td class="has-text-centered">0.00</td> | |
| <td class="has-text-centered">0.04</td> | |
| <td class="has-text-centered">0.11</td> | |
| <td class="has-text-centered">0.08</td> | |
| <td class="has-text-centered">0.03</td> | |
| <td class="has-text-centered">0.37</td> | |
| <td class="has-text-centered">1.08</td> | |
| <td class="has-text-centered">0.39</td> | |
| <td class="column-border-left has-text-centered performance-low">5.29</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter dense language model designed for instruction following. Notable for its efficiency and strong performance at a smaller size, making it suitable for cost-sensitive applications.">Mistral (7B) Instruct v0.3</td> | |
| <td class="column-border-left has-text-centered">0.03</td> | |
| <td class="has-text-centered">0.03</td> | |
| <td class="has-text-centered">0.28</td> | |
| <td class="has-text-centered">0.01</td> | |
| <td class="has-text-centered">0.18</td> | |
| <td class="has-text-centered">0.24</td> | |
| <td class="has-text-centered">0.10</td> | |
| <td class="has-text-centered">0.09</td> | |
| <td class="has-text-centered">0.36</td> | |
| <td class="has-text-centered">0.57</td> | |
| <td class="has-text-centered">0.48</td> | |
| <td class="has-text-centered">0.02</td> | |
| <td class="has-text-centered">0.01</td> | |
| <td class="has-text-centered">0.45</td> | |
| <td class="has-text-centered">0.03</td> | |
| <td class="has-text-centered">0.08</td> | |
| <td class="has-text-centered">0.06</td> | |
| <td class="has-text-centered">0.02</td> | |
| <td class="has-text-centered">0.27</td> | |
| <td class="has-text-centered">0.78</td> | |
| <td class="has-text-centered">0.26</td> | |
| <td class="column-border-left has-text-centered performance-low">4.36</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter Mixture of Experts (MoE) model that combines eight 22B expert networks. Advanced architecture enables state-of-the-art performance while maintaining inference efficiency.">Mixtral-8x22B Instruct</td> | |
| <td class="column-border-left has-text-centered">0.14</td> | |
| <td class="has-text-centered">0.17</td> | |
| <td class="has-text-centered">1.80</td> | |
| <td class="has-text-centered">0.07</td> | |
| <td class="has-text-centered">1.05</td> | |
| <td class="has-text-centered">1.44</td> | |
| <td class="has-text-centered">0.58</td> | |
| <td class="has-text-centered">0.56</td> | |
| <td class="has-text-centered">2.04</td> | |
| <td class="has-text-centered">3.42</td> | |
| <td class="has-text-centered">2.89</td> | |
| <td class="has-text-centered">0.11</td> | |
| <td class="has-text-centered">0.07</td> | |
| <td class="has-text-centered">2.66</td> | |
| <td class="has-text-centered">0.18</td> | |
| <td class="has-text-centered">0.48</td> | |
| <td class="has-text-centered">0.35</td> | |
| <td class="has-text-centered">0.14</td> | |
| <td class="has-text-centered">1.73</td> | |
| <td class="has-text-centered">4.90</td> | |
| <td class="has-text-centered">1.55</td> | |
| <td class="column-border-left has-text-centered performance-medium">26.35</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 46.7 billion parameter Mixture of Experts (MoE) model combining eight 7B expert networks. Provides a balanced approach between performance and efficiency for instruction-following tasks.">Mixtral-8x7B Instruct</td> | |
| <td class="column-border-left has-text-centered">0.08</td> | |
| <td class="has-text-centered">0.09</td> | |
| <td class="has-text-centered">0.88</td> | |
| <td class="has-text-centered">0.04</td> | |
| <td class="has-text-centered">0.53</td> | |
| <td class="has-text-centered">0.70</td> | |
| <td class="has-text-centered">0.30</td> | |
| <td class="has-text-centered">0.30</td> | |
| <td class="has-text-centered">1.07</td> | |
| <td class="has-text-centered">1.72</td> | |
| <td class="has-text-centered">1.50</td> | |
| <td class="has-text-centered">0.06</td> | |
| <td class="has-text-centered">0.05</td> | |
| <td class="has-text-centered">1.30</td> | |
| <td class="has-text-centered">0.09</td> | |
| <td class="has-text-centered">0.24</td> | |
| <td class="has-text-centered">0.20</td> | |
| <td class="has-text-centered">0.07</td> | |
| <td class="has-text-centered">0.87</td> | |
| <td class="has-text-centered">2.55</td> | |
| <td class="has-text-centered">0.78</td> | |
| <td class="column-border-left has-text-centered performance-medium">13.41</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba Cloud's 72 billion parameter dense language model designed for instruction following. Features strong multilingual capabilities and optimization for diverse NLP tasks.">Qwen 2 Instruct (72B)</td> | |
| <td class="column-border-left has-text-centered">0.10</td> | |
| <td class="has-text-centered">0.12</td> | |
| <td class="has-text-centered">1.29</td> | |
| <td class="has-text-centered">0.05</td> | |
| <td class="has-text-centered">0.74</td> | |
| <td class="has-text-centered">0.96</td> | |
| <td class="has-text-centered">0.43</td> | |
| <td class="has-text-centered">0.43</td> | |
| <td class="has-text-centered">1.44</td> | |
| <td class="has-text-centered">2.36</td> | |
| <td class="has-text-centered">1.61</td> | |
| <td class="has-text-centered">0.08</td> | |
| <td class="has-text-centered">0.05</td> | |
| <td class="has-text-centered">1.80</td> | |
| <td class="has-text-centered">0.12</td> | |
| <td class="has-text-centered">0.34</td> | |
| <td class="has-text-centered">0.24</td> | |
| <td class="has-text-centered">0.10</td> | |
| <td class="has-text-centered">1.18</td> | |
| <td class="has-text-centered">3.41</td> | |
| <td class="has-text-centered">1.17</td> | |
| <td class="column-border-left has-text-centered performance-medium">18.02</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="WizardLM-2 8x22B" data-tooltip="Microsoft's 141 billion parameter Mixture of Experts (MoE) model specifically optimized for instruction-following tasks. Combines eight 22B expert networks with advanced routing for high performance.">WizardLM-2 8x22B</td> | |
| <td class="column-border-left has-text-centered">0.16</td> | |
| <td class="has-text-centered">0.19</td> | |
| <td class="has-text-centered">1.94</td> | |
| <td class="has-text-centered">0.08</td> | |
| <td class="has-text-centered">1.07</td> | |
| <td class="has-text-centered">1.47</td> | |
| <td class="has-text-centered">0.61</td> | |
| <td class="has-text-centered">0.61</td> | |
| <td class="has-text-centered">2.24</td> | |
| <td class="has-text-centered">3.47</td> | |
| <td class="has-text-centered">3.00</td> | |
| <td class="has-text-centered">0.11</td> | |
| <td class="has-text-centered">0.10</td> | |
| <td class="has-text-centered">2.85</td> | |
| <td class="has-text-centered">0.18</td> | |
| <td class="has-text-centered">0.49</td> | |
| <td class="has-text-centered">0.34</td> | |
| <td class="has-text-centered">0.14</td> | |
| <td class="has-text-centered">1.94</td> | |
| <td class="has-text-centered">5.31</td> | |
| <td class="has-text-centered">1.55</td> | |
| <td class="column-border-left has-text-centered performance-medium">27.87</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) language model designed for advanced reasoning and instruction following. Features strong performance across diverse tasks with efficient architecture.">DeepSeek-V3</td> | |
| <td class="column-border-left has-text-centered">0.13</td> | |
| <td class="has-text-centered">0.15</td> | |
| <td class="has-text-centered">1.57</td> | |
| <td class="has-text-centered">0.07</td> | |
| <td class="has-text-centered">0.98</td> | |
| <td class="has-text-centered">1.36</td> | |
| <td class="has-text-centered">0.52</td> | |
| <td class="has-text-centered">0.54</td> | |
| <td class="has-text-centered">2.10</td> | |
| <td class="has-text-centered">2.99</td> | |
| <td class="has-text-centered">2.55</td> | |
| <td class="has-text-centered">0.11</td> | |
| <td class="has-text-centered">0.06</td> | |
| <td class="has-text-centered">2.33</td> | |
| <td class="has-text-centered">0.16</td> | |
| <td class="has-text-centered">0.55</td> | |
| <td class="has-text-centered">0.28</td> | |
| <td class="has-text-centered">0.12</td> | |
| <td class="has-text-centered">1.56</td> | |
| <td class="has-text-centered">4.28</td> | |
| <td class="has-text-centered">1.62</td> | |
| <td class="column-border-left has-text-centered performance-medium">24.03</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td> | |
| <td class="column-border-left has-text-centered">1.99</td> | |
| <td class="has-text-centered">2.10</td> | |
| <td class="has-text-centered">14.18</td> | |
| <td class="has-text-centered">1.48</td> | |
| <td class="has-text-centered">17.82</td> | |
| <td class="has-text-centered">20.11</td> | |
| <td class="has-text-centered">6.63</td> | |
| <td class="has-text-centered">12.65</td> | |
| <td class="has-text-centered">31.00</td> | |
| <td class="has-text-centered">21.15</td> | |
| <td class="has-text-centered">23.28</td> | |
| <td class="has-text-centered">3.75</td> | |
| <td class="has-text-centered">1.06</td> | |
| <td class="has-text-centered">15.02</td> | |
| <td class="has-text-centered">7.31</td> | |
| <td class="has-text-centered">8.34</td> | |
| <td class="has-text-centered">11.21</td> | |
| <td class="has-text-centered">1.88</td> | |
| <td class="has-text-centered">13.72</td> | |
| <td class="has-text-centered">39.42</td> | |
| <td class="has-text-centered">9.07</td> | |
| <td class="column-border-left has-text-centered performance-strong">263.16</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="QwQ-32B-Preview" data-tooltip="Alibaba Cloud's 32 billion parameter QwenNext variant with optimizations for both general and specialized tasks. Features strong performance across various financial benchmarks with smaller parameter count.">QwQ-32B-Preview</td> | |
| <td class="column-border-left has-text-centered">0.15</td> | |
| <td class="has-text-centered">0.18</td> | |
| <td class="has-text-centered">2.38</td> | |
| <td class="has-text-centered">0.08</td> | |
| <td class="has-text-centered">0.93</td> | |
| <td class="has-text-centered">1.37</td> | |
| <td class="has-text-centered">0.60</td> | |
| <td class="has-text-centered">0.68</td> | |
| <td class="has-text-centered">2.18</td> | |
| <td class="has-text-centered">3.12</td> | |
| <td class="has-text-centered">2.36</td> | |
| <td class="has-text-centered">0.11</td> | |
| <td class="has-text-centered">0.07</td> | |
| <td class="has-text-centered">2.76</td> | |
| <td class="has-text-centered">0.14</td> | |
| <td class="has-text-centered">0.65</td> | |
| <td class="has-text-centered">0.54</td> | |
| <td class="has-text-centered">0.14</td> | |
| <td class="has-text-centered">2.61</td> | |
| <td class="has-text-centered">7.83</td> | |
| <td class="has-text-centered">1.55</td> | |
| <td class="column-border-left has-text-centered performance-medium">30.43</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="Jamba 1.5 Mini" data-tooltip="AI21 Labs' 52 billion parameter Mixture of Experts (MoE) model designed for efficient instruction following. Optimized for practical deployment with reduced computational requirements compared to larger variants.">Jamba 1.5 Mini</td> | |
| <td class="column-border-left has-text-centered">0.02</td> | |
| <td class="has-text-centered">0.03</td> | |
| <td class="has-text-centered">0.30</td> | |
| <td class="has-text-centered">0.02</td> | |
| <td class="has-text-centered">0.23</td> | |
| <td class="has-text-centered">0.22</td> | |
| <td class="has-text-centered">0.10</td> | |
| <td class="has-text-centered">0.08</td> | |
| <td class="has-text-centered">0.44</td> | |
| <td class="has-text-centered">0.55</td> | |
| <td class="has-text-centered">0.51</td> | |
| <td class="has-text-centered">0.02</td> | |
| <td class="has-text-centered">0.01</td> | |
| <td class="has-text-centered">0.49</td> | |
| <td class="has-text-centered">0.05</td> | |
| <td class="has-text-centered">0.10</td> | |
| <td class="has-text-centered">0.07</td> | |
| <td class="has-text-centered">0.02</td> | |
| <td class="has-text-centered">0.25</td> | |
| <td class="has-text-centered">0.72</td> | |
| <td class="has-text-centered">0.26</td> | |
| <td class="column-border-left has-text-centered performance-low">4.47</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="Jamba 1.5 Large" data-tooltip="AI21 Labs' premium 398 billion parameter Mixture of Experts (MoE) model designed for advanced reasoning. Features strong performance across complex tasks including financial domain problems.">Jamba 1.5 Large</td> | |
| <td class="column-border-left has-text-centered">0.31</td> | |
| <td class="has-text-centered">0.36</td> | |
| <td class="has-text-centered">4.42</td> | |
| <td class="has-text-centered">0.30</td> | |
| <td class="has-text-centered">3.47</td> | |
| <td class="has-text-centered">4.81</td> | |
| <td class="has-text-centered">1.78</td> | |
| <td class="has-text-centered">0.94</td> | |
| <td class="has-text-centered">4.97</td> | |
| <td class="has-text-centered">5.80</td> | |
| <td class="has-text-centered">5.51</td> | |
| <td class="has-text-centered">0.35</td> | |
| <td class="has-text-centered">0.13</td> | |
| <td class="has-text-centered">7.07</td> | |
| <td class="has-text-centered">0.56</td> | |
| <td class="has-text-centered">1.67</td> | |
| <td class="has-text-centered">0.77</td> | |
| <td class="has-text-centered">0.30</td> | |
| <td class="has-text-centered">2.87</td> | |
| <td class="has-text-centered">7.45</td> | |
| <td class="has-text-centered">2.59</td> | |
| <td class="column-border-left has-text-centered performance-strong">56.42</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model designed for complex reasoning and instruction-following tasks. Features strong performance across financial tasks with sophisticated reasoning capabilities.">Claude 3.5 Sonnet</td> | |
| <td class="column-border-left has-text-centered">0.62</td> | |
| <td class="has-text-centered">0.72</td> | |
| <td class="has-text-centered">6.98</td> | |
| <td class="has-text-centered">0.55</td> | |
| <td class="has-text-centered">6.50</td> | |
| <td class="has-text-centered">8.81</td> | |
| <td class="has-text-centered">3.44</td> | |
| <td class="has-text-centered">3.21</td> | |
| <td class="has-text-centered">12.32</td> | |
| <td class="has-text-centered">9.50</td> | |
| <td class="has-text-centered">11.11</td> | |
| <td class="has-text-centered">0.61</td> | |
| <td class="has-text-centered">0.22</td> | |
| <td class="has-text-centered">7.09</td> | |
| <td class="has-text-centered">0.90</td> | |
| <td class="has-text-centered">3.01</td> | |
| <td class="has-text-centered">1.79</td> | |
| <td class="has-text-centered">0.57</td> | |
| <td class="has-text-centered">9.18</td> | |
| <td class="has-text-centered">16.86</td> | |
| <td class="has-text-centered">3.89</td> | |
| <td class="column-border-left has-text-centered performance-strong">107.87</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="Claude 3 Haiku" data-tooltip="Anthropic's compact proprietary language model optimized for efficiency while maintaining strong capabilities. Designed for cost-effective deployment with reduced computational requirements compared to larger variants.">Claude 3 Haiku</td> | |
| <td class="column-border-left has-text-centered">0.06</td> | |
| <td class="has-text-centered">0.07</td> | |
| <td class="has-text-centered">0.56</td> | |
| <td class="has-text-centered">0.05</td> | |
| <td class="has-text-centered">0.54</td> | |
| <td class="has-text-centered">0.73</td> | |
| <td class="has-text-centered">0.28</td> | |
| <td class="has-text-centered">0.25</td> | |
| <td class="has-text-centered">0.82</td> | |
| <td class="has-text-centered">0.81</td> | |
| <td class="has-text-centered">0.90</td> | |
| <td class="has-text-centered">0.05</td> | |
| <td class="has-text-centered">0.02</td> | |
| <td class="has-text-centered">0.21</td> | |
| <td class="has-text-centered">0.06</td> | |
| <td class="has-text-centered">0.23</td> | |
| <td class="has-text-centered">0.14</td> | |
| <td class="has-text-centered">0.05</td> | |
| <td class="has-text-centered">0.64</td> | |
| <td class="has-text-centered">1.28</td> | |
| <td class="has-text-centered">0.32</td> | |
| <td class="column-border-left has-text-centered performance-low">8.07</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="Cohere Command R 7B" data-tooltip="Cohere's compact 7 billion parameter instruction-tuned model optimized for practical applications. Focused on efficiency with minimal computational requirements while maintaining reasonable capabilities.">Cohere Command R 7B</td> | |
| <td class="column-border-left has-text-centered">0.01</td> | |
| <td class="has-text-centered">0.01</td> | |
| <td class="has-text-centered">0.08</td> | |
| <td class="has-text-centered">0.00</td> | |
| <td class="has-text-centered">0.07</td> | |
| <td class="has-text-centered">0.09</td> | |
| <td class="has-text-centered">0.04</td> | |
| <td class="has-text-centered">0.03</td> | |
| <td class="has-text-centered">0.11</td> | |
| <td class="has-text-centered">0.11</td> | |
| <td class="has-text-centered">0.10</td> | |
| <td class="has-text-centered">0.01</td> | |
| <td class="has-text-centered">0.00</td> | |
| <td class="has-text-centered">0.08</td> | |
| <td class="has-text-centered">0.01</td> | |
| <td class="has-text-centered">0.03</td> | |
| <td class="has-text-centered">0.01</td> | |
| <td class="has-text-centered">0.01</td> | |
| <td class="has-text-centered">0.08</td> | |
| <td class="has-text-centered">0.19</td> | |
| <td class="has-text-centered">0.05</td> | |
| <td class="column-border-left has-text-centered performance-low">1.09</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="Cohere Command R +" data-tooltip="Cohere's premium 104 billion parameter dense language model optimized for advanced tasks. Features strong instruction-following capabilities with comprehensive knowledge across domains including finance.">Cohere Command R +</td> | |
| <td class="column-border-left has-text-centered">0.41</td> | |
| <td class="has-text-centered">0.45</td> | |
| <td class="has-text-centered">5.40</td> | |
| <td class="has-text-centered">0.35</td> | |
| <td class="has-text-centered">4.41</td> | |
| <td class="has-text-centered">4.00</td> | |
| <td class="has-text-centered">2.30</td> | |
| <td class="has-text-centered">0.93</td> | |
| <td class="has-text-centered">3.87</td> | |
| <td class="has-text-centered">7.03</td> | |
| <td class="has-text-centered">7.21</td> | |
| <td class="has-text-centered">0.43</td> | |
| <td class="has-text-centered">0.12</td> | |
| <td class="has-text-centered">5.55</td> | |
| <td class="has-text-centered">0.48</td> | |
| <td class="has-text-centered">1.69</td> | |
| <td class="has-text-centered">0.97</td> | |
| <td class="has-text-centered">0.42</td> | |
| <td class="has-text-centered">4.59</td> | |
| <td class="has-text-centered">10.09</td> | |
| <td class="has-text-centered">3.24</td> | |
| <td class="column-border-left has-text-centered performance-strong">63.95</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td> | |
| <td class="column-border-left has-text-centered">0.23</td> | |
| <td class="has-text-centered">0.21</td> | |
| <td class="has-text-centered">2.26</td> | |
| <td class="has-text-centered">0.18</td> | |
| <td class="has-text-centered">2.20</td> | |
| <td class="has-text-centered">2.78</td> | |
| <td class="has-text-centered">1.02</td> | |
| <td class="has-text-centered">0.49</td> | |
| <td class="has-text-centered">2.27</td> | |
| <td class="has-text-centered">3.45</td> | |
| <td class="has-text-centered">2.70</td> | |
| <td class="has-text-centered">0.21</td> | |
| <td class="has-text-centered">0.07</td> | |
| <td class="has-text-centered">2.65</td> | |
| <td class="has-text-centered">0.25</td> | |
| <td class="has-text-centered">0.87</td> | |
| <td class="has-text-centered">0.58</td> | |
| <td class="has-text-centered">0.21</td> | |
| <td class="has-text-centered">2.13</td> | |
| <td class="has-text-centered">5.78</td> | |
| <td class="has-text-centered">1.62</td> | |
| <td class="column-border-left has-text-centered performance-medium">32.16</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="OpenAI GPT-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td> | |
| <td class="column-border-left has-text-centered">0.35</td> | |
| <td class="has-text-centered">0.41</td> | |
| <td class="has-text-centered">4.99</td> | |
| <td class="has-text-centered">0.32</td> | |
| <td class="has-text-centered">4.45</td> | |
| <td class="has-text-centered">5.33</td> | |
| <td class="has-text-centered">1.55</td> | |
| <td class="has-text-centered">1.21</td> | |
| <td class="has-text-centered">5.77</td> | |
| <td class="has-text-centered">6.57</td> | |
| <td class="has-text-centered">5.00</td> | |
| <td class="has-text-centered">0.35</td> | |
| <td class="has-text-centered">0.14</td> | |
| <td class="has-text-centered">4.85</td> | |
| <td class="has-text-centered">0.44</td> | |
| <td class="has-text-centered">1.94</td> | |
| <td class="has-text-centered">0.96</td> | |
| <td class="has-text-centered">0.34</td> | |
| <td class="has-text-centered">4.95</td> | |
| <td class="has-text-centered">10.36</td> | |
| <td class="has-text-centered">3.24</td> | |
| <td class="column-border-left has-text-centered performance-strong">63.52</td> | |
| </tr> | |
| <tr> | |
| <td class="tooltip-trigger tooltip-right" data-title="OpenAI o1-mini" data-tooltip="OpenAI's compact model from the o1 series, designed as a more efficient alternative to larger models. Features strong performance relative to its size with capabilities for complex financial tasks.">OpenAI o1-mini</td> | |
| <td class="column-border-left has-text-centered">0.90</td> | |
| <td class="has-text-centered">0.90</td> | |
| <td class="has-text-centered">5.25</td> | |
| <td class="has-text-centered">0.73</td> | |
| <td class="has-text-centered">9.70</td> | |
| <td class="has-text-centered">12.20</td> | |
| <td class="has-text-centered">3.27</td> | |
| <td class="has-text-centered">4.89</td> | |
| <td class="has-text-centered">13.60</td> | |
| <td class="has-text-centered">1.29</td> | |
| <td class="has-text-centered">9.29</td> | |
| <td class="has-text-centered">2.56</td> | |
| <td class="has-text-centered">0.75</td> | |
| <td class="has-text-centered">3.18</td> | |
| <td class="has-text-centered">2.92</td> | |
| <td class="has-text-centered">1.91</td> | |
| <td class="has-text-centered">6.39</td> | |
| <td class="has-text-centered">0.92</td> | |
| <td class="has-text-centered">6.97</td> | |
| <td class="has-text-centered">15.71</td> | |
| <td class="has-text-centered">1.42</td> | |
| <td class="column-border-left has-text-centered performance-strong">104.73</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <style> | |
| /* Positioning overrides for hover tooltips. The tooltip body is drawn with ::after | |
| and its arrow with ::before; rules scoped to #cost-analysis apply only inside that element. */ | |
| /* Ensure model tooltips appear on the right with arrow pointing left */ | |
| /* Puts the tooltip body just past the trigger's right edge with a 10px gap. */ | |
| .tooltip-trigger.tooltip-right:hover::after { | |
| left: 100% ; | |
| right: auto ; | |
| margin-left: 10px ; | |
| margin-right: 0 ; | |
| } | |
| /* Fix for cost analysis tab model tooltips */ | |
| /* Tooltip body: anchored to the cell's right edge and vertically centred | |
| (top: 50% combined with translateY(-50%)). */ | |
| #cost-analysis td.tooltip-trigger:hover::after { | |
| position: absolute ; | |
| left: 100% ; | |
| right: auto ; | |
| top: 50% ; | |
| transform: translateY(-50%) ; | |
| margin-left: 10px ; | |
| margin-right: 0 ; | |
| margin-top: 0 ; | |
| z-index: 99 ; | |
| } | |
| /* Tooltip arrow: a border triangle at the cell's right edge, stacked above the body (z-index 100 vs 99). | |
| NOTE(review): a coloured left border with transparent top/bottom borders normally draws a | |
| triangle whose tip points right, which seems to contradict "arrow pointing left" above - confirm. */ | |
| #cost-analysis td.tooltip-trigger:hover::before { | |
| position: absolute ; | |
| left: 100% ; | |
| right: auto ; | |
| top: 50% ; | |
| transform: translateY(-50%) ; | |
| border-left: 6px solid rgba(0, 0, 0, 0.8) ; | |
| border-right: none ; | |
| border-top: 6px solid transparent ; | |
| border-bottom: 6px solid transparent ; | |
| margin-top: 0 ; | |
| margin-left: 0 ; | |
| z-index: 100 ; | |
| } | |
| /* Ensure the model tooltips are properly centered in the cost analysis tab */ | |
| /* Applies to every .model-tooltip element (the selector is not scoped to #cost-analysis). | |
| Presumably these elements are created by one of the tooltip scripts - verify. | |
| Height is capped at 400px and overflowing content scrolls vertically. */ | |
| .model-tooltip { | |
| position: fixed ; | |
| transform: translateY(-50%) ; | |
| margin-top: 0 ; | |
| z-index: 1000 ; | |
| max-height: 400px ; | |
| overflow-y: auto ; | |
| } | |
| /* Make sure arrows are properly centered */ | |
| /* Targets direct child divs of .model-tooltip; presumably the arrow element - verify. */ | |
| .model-tooltip > div { | |
| transform: translateY(-50%) ; | |
| z-index: 1001 ; | |
| } | |
| /* Ensure arrows are visible and properly sized */ | |
| /* NOTE(review): the later rule in this block with the same #cost-analysis selector sets 10px | |
| borders, so this 8px width appears to be overridden for that selector - confirm. */ | |
| .model-tooltip > div, | |
| #cost-analysis td.tooltip-trigger:first-child:hover::before { | |
| border-width: 8px ; | |
| } | |
| /* Force all model tooltips in cost analysis to have tooltip-right behavior */ | |
| /* position: relative makes the first cell the containing block for its | |
| absolutely positioned ::before/::after pseudo-elements. */ | |
| #cost-analysis td:first-child { | |
| position: relative; | |
| } | |
| /* More specific selectors to ensure model tooltips in cost analysis tab are positioned correctly */ | |
| /* First-column tooltip body: same anchoring as the generic rule above, but with a 15px gap. */ | |
| #cost-analysis td.tooltip-trigger:first-child:hover::after { | |
| left: 100% ; | |
| top: 50% ; | |
| bottom: auto ; | |
| transform: translateY(-50%) ; | |
| margin-top: 0 ; | |
| margin-left: 15px ; | |
| display: block ; | |
| } | |
| /* First-column tooltip arrow: zero-size box whose borders form a 10px triangle. | |
| pointer-events: none keeps the arrow from intercepting the mouse. */ | |
| #cost-analysis td.tooltip-trigger:first-child:hover::before { | |
| content: '' ; | |
| display: block ; | |
| position: absolute ; | |
| top: 50% ; | |
| left: 100% ; | |
| right: auto ; | |
| bottom: auto ; | |
| transform: translateY(-50%) ; | |
| margin-left: 0 ; | |
| margin-top: 0 ; | |
| width: 0 ; | |
| height: 0 ; | |
| border-top: 10px solid transparent ; | |
| border-bottom: 10px solid transparent ; | |
| border-right: none ; | |
| border-left: 10px solid black ; | |
| z-index: 9999 ; | |
| pointer-events: none ; | |
| } | |
| /* Add an extra arrow to each model cell specifically for visual clarity. | |
| The glyph is written as a CSS escape so it no longer depends on how the | |
| file's bytes are decoded. | |
| NOTE(review): the original character was mis-encoded; U+203A (single | |
| right-pointing angle quotation mark) is the assumed glyph - confirm. */ | |
| #cost-analysis td:first-child.tooltip-trigger::after { | |
| content: "\203A" ; | |
| position: absolute ; | |
| right: 10px ; | |
| color: gray ; | |
| font-size: 12px ; | |
| } | |
| </style> | |
| <div class="performance-legend"> | |
| <div class="performance-legend-item performance-low">Low Cost ($0-$10)</div> | |
| <div class="performance-legend-item performance-medium">Medium Cost ($10-$35)</div> | |
| <div class="performance-legend-item performance-strong">High Cost ($35-$70)</div> | |
| <div class="performance-legend-item" style="background-color: #1f93ff; color: white;">Very High Cost ($70+)</div> | |
| </div> | |
| <div class="content is-small mt-4"> | |
| <p><strong>Note:</strong> All costs are in USD and represent the expense to run the model on each specific dataset. Colors indicate cost tiers on the total cost, with darker blue representing higher costs. For cost-efficiency analysis, consider comparing these costs with the corresponding performance metrics in other tabs.</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- </section> | |
| </div> --> | |
| <section class="section"> | |
| <div class="container"> | |
| <!-- Model Performance Highlights --> | |
| <div class="card mb-5"> | |
| <div class="card-header"> | |
| <p class="card-header-title"> | |
| <span class="icon mr-2"><i class="fas fa-medal"></i></span> | |
| Model Performance Highlights | |
| </p> | |
| </div> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <p class="has-text-weight-bold mb-4 has-text-centered">🏆 Top Performing Models</p> | |
| <div class="columns is-centered is-multiline"> | |
| <!-- DeepSeek R1 --> | |
| <div class="column is-4"> | |
| <div class="is-flex is-flex-direction-column is-align-items-center"> | |
| <figure class="image is-128x128 mb-3"> | |
| <img src="static/images/deepseek_logo.png" alt="DeepSeek R1 Logo"> | |
| </figure> | |
| <p class="is-size-4 has-text-weight-semibold mb-2">DeepSeek R1</p> | |
| <span class="icon is-large has-text-warning"><i class="fas fa-trophy fa-2x"></i></span> | |
| </div> | |
| </div> | |
| <!-- OpenAI o1-mini --> | |
| <div class="column is-4"> | |
| <div class="is-flex is-flex-direction-column is-align-items-center"> | |
| <figure class="image is-128x128 mb-3"> | |
| <img src="static/images/openai_logo.png" alt="OpenAI Logo"> | |
| </figure> | |
| <p class="is-size-4 has-text-weight-semibold mb-2">OpenAI o1-mini</p> | |
| <span class="icon is-large has-text-grey"><i class="fas fa-trophy fa-2x"></i></span> | |
| </div> | |
| </div> | |
| <!-- Claude 3.5 Sonnet --> | |
| <div class="column is-4"> | |
| <div class="is-flex is-flex-direction-column is-align-items-center"> | |
| <figure class="image is-128x128 mb-3"> | |
| <img src="static/images/claude_logo.png" alt="Claude 3.5 Sonnet Logo"> | |
| </figure> | |
| <p class="is-size-4 has-text-weight-semibold mb-2">Claude 3.5 Sonnet</p> | |
| <span class="icon is-large has-text-bronze"><i class="fas fa-trophy fa-2x"></i></span> | |
| </div> | |
| </div> | |
| </div> | |
| <hr> | |
| <p class="has-text-weight-bold mb-3"><span class="icon has-text-primary"><i class="fa-solid fa-magnifying-glass"></i></span> Key Insights from Model Analysis</p> | |
| <div class="notification is-info is-light py-3 px-4"> | |
| <p><strong>🏆 No single dominant model:</strong> DeepSeek R1 leads in complex multi-step QA, while Claude 3.5 excels in sentiment tasks. GPT-4o is strong in classification and summarization.</p> | |
| <p><strong>⚖️ Inconsistent scaling:</strong> Larger models don’t always outperform smaller ones—DeepSeek R1 trails in summarization despite excelling in QA.</p> | |
| <p><strong>🛠️ Open-weight models:</strong> Many open-weight models like DeepSeek-V3 and Llama 3.1 70B offer competitive performance while being cost-effective.</p> | |
| <p><strong>💰 Cost-performance disparities:</strong> Running DeepSeek R1 can cost up to <strong>$260</strong> per million tokens, while Claude 3.5 Sonnet and o1-mini cost around <strong>$105</strong>, and Meta’s Llama 3.1 8B only <strong>$4</strong>.</p> | |
| <p><strong>📉 Numeric reasoning challenges:</strong> Even the best models struggle with financial numeric reasoning tasks, achieving low F1 scores (<strong>≤ 0.06</strong>).</p> | |
| <p><strong>🔢 Step-by-step deductions:</strong> Multi-turn financial QA (e.g., ConvFinQA) significantly reduces model accuracy due to complex dependencies.</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Error Analysis & Key Findings --> | |
| <div class="card"> | |
| <div class="card-header"> | |
| <p class="card-header-title"> | |
| <span class="icon mr-2"><i class="fas fa-exclamation-triangle"></i></span> | |
| Error Analysis & Key Findings | |
| </p> | |
| </div> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <p class="mb-4">Common challenges and limitations identified in our evaluations:</p> | |
| <!-- Individual Error Categories --> | |
| <div class="error-category mb-4"> | |
| <p class="has-text-weight-bold mb-1">Concerns regarding outdated models</p> | |
| <div class="notification is-danger is-light py-2 px-3"> | |
| <p class="is-size-7 mb-0"><strong>Llama 2 13B Chat</strong> produces trivial or empty responses, possibly due to misalignment during fine-tuning.</p> | |
| </div> | |
| </div> | |
| <div class="error-category mb-4"> | |
| <p class="has-text-weight-bold mb-1">Numeric Regression Issues</p> | |
| <div class="notification is-danger is-light py-2 px-3"> | |
| <p class="is-size-7 mb-0">LMs struggle with precision and rounding in continuous-valued regressions (e.g., financial percentages). Post-hoc normalization is needed.</p> | |
| </div> | |
| </div> | |
| <div class="error-category mb-4"> | |
| <p class="has-text-weight-bold mb-1">Data Contamination</p> | |
| <div class="notification is-danger is-light py-2 px-3"> | |
| <p class="is-size-7 mb-0">Overlap between public financial datasets and pretraining corpora can inflate zero-shot performance, requiring time-split test sets.</p> | |
| </div> | |
| </div> | |
| <div class="error-category mb-4"> | |
| <p class="has-text-weight-bold mb-1">Challenges in Causal Classification</p> | |
| <div class="notification is-danger is-light py-2 px-3"> | |
| <p class="is-size-7 mb-0">Most models struggle with financial causal reasoning, requiring structured knowledge bases or explicit symbolic reasoning.</p> | |
| </div> | |
| </div> | |
| <div class="error-category mb-4"> | |
| <p class="has-text-weight-bold mb-1">Language Drift</p> | |
| <div class="notification is-warning is-light py-2 px-3"> | |
| <p class="is-size-7 mb-0"><strong>Qwen 2 72B</strong> exhibits unintended shifts to Chinese output in English summarization tasks, indicating strong pretraining priors.</p> | |
| </div> | |
| </div> | |
| <div class="error-category mb-4"> | |
| <p class="has-text-weight-bold mb-1">Summarization Nuances</p> | |
| <div class="notification is-warning is-light py-2 px-3"> | |
| <p class="is-size-7 mb-0">Models achieve high BERTScores (~80-82%) on extractive summarization but suffer on abstractive tasks, especially in finance-specific jargon.</p> | |
| </div> | |
| </div> | |
| <div class="error-category mb-4"> | |
| <p class="has-text-weight-bold mb-1">Prompt Design Limitations</p> | |
| <div class="notification is-warning is-light py-2 px-3"> | |
| <p class="is-size-7 mb-0">Prompts tuned on <strong>Llama 3 8B</strong> may not generalize across models, leading to inconsistencies in label generation (e.g., minor syntactic variations).</p> | |
| </div> | |
| </div> | |
| <div class="error-category mb-4"> | |
| <p class="has-text-weight-bold mb-1">Differences in QA Datasets</p> | |
| <div class="notification is-warning is-light py-2 px-3"> | |
| <p class="is-size-7 mb-0"><strong>ConvFinQA</strong> consistently underperforms compared to <strong>FinQA</strong> due to its multi-turn dialogue complexity.</p> | |
| </div> | |
| </div> | |
| <div class="error-category mb-4"> | |
| <p class="has-text-weight-bold mb-1">Efficiency and Cost Considerations</p> | |
| <div class="notification is-warning is-light py-2 px-3"> | |
| <p class="is-size-7 mb-0">Inference costs vary by up to <strong>2×</strong> among similarly sized models, requiring a balance between performance and resource usage.</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <section id="contributions" class="section content-section"> | |
| <div class="container is-max-desktop"> | |
| <!-- Contributions --> | |
| <div class="columns is-centered"> | |
| <div class="column is-full"> | |
| <h2 class="title is-3 section-title has-text-centered">Contributions & Future Work</h2> | |
| <div class="content"> | |
| <!-- Contributions Overview --> | |
| <div class="notification is-info is-light has-text-centered mb-5"> | |
| <p class="is-size-5 has-text-weight-semibold"> | |
| Our work introduces a standardized, large-scale, and holistic evaluation framework for financial language models. | |
| </p> | |
| </div> | |
| <!-- Contributions --> | |
| <div class="box has-background-white-ter mb-5"> | |
| <h4 class="title is-4 has-text-centered mb-4">Key Contributions</h4> | |
| <div class="columns is-multiline is-centered"> | |
| <!-- Contribution 1 --> | |
| <div class="column is-6"> | |
| <div class="media"> | |
| <div class="media-left"> | |
| <span class="icon has-text-primary is-large"> | |
| <i class="fas fa-cogs fa-lg"></i> | |
| </span> | |
| </div> | |
| <div class="media-content"> | |
| <p class="has-text-weight-semibold">Standardized Evaluation Framework</p> | |
| <p class="is-size-7">We introduce an open-source, modular benchmarking suite for systematic LM evaluations on core financial NLP tasks.</p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Contribution 2 --> | |
| <div class="column is-6"> | |
| <div class="media"> | |
| <div class="media-left"> | |
| <span class="icon has-text-primary is-large"> | |
| <i class="fas fa-chart-line fa-lg"></i> | |
| </span> | |
| </div> | |
| <div class="media-content"> | |
| <p class="has-text-weight-semibold">Large-Scale Model Assessment</p> | |
| <p class="is-size-7">We benchmark 23 foundation LMs—open-weight and proprietary—across 20 financial tasks, revealing performance-cost trade-offs.</p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Contribution 3 --> | |
| <div class="column is-6"> | |
| <div class="media"> | |
| <div class="media-left"> | |
| <span class="icon has-text-primary is-large"> | |
| <i class="fas fa-database fa-lg"></i> | |
| </span> | |
| </div> | |
| <div class="media-content"> | |
| <p class="has-text-weight-semibold">Holistic Dataset Taxonomy</p> | |
| <p class="is-size-7">We establish a structured dataset taxonomy, categorizing financial NLP tasks based on domain, data format, and linguistic complexity.</p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Contribution 4 --> | |
| <div class="column is-6"> | |
| <div class="media"> | |
| <div class="media-left"> | |
| <span class="icon has-text-primary is-large"> | |
| <i class="fas fa-users fa-lg"></i> | |
| </span> | |
| </div> | |
| <div class="media-content"> | |
| <p class="has-text-weight-semibold">Living Benchmark & Open Collaboration</p> | |
| <p class="is-size-7">We introduce a continuously updated leaderboard, inviting researchers to contribute new datasets and evaluation results.</p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Contribution 5 --> | |
| <div class="column is-6"> | |
| <div class="media"> | |
| <div class="media-left"> | |
| <span class="icon has-text-primary is-large"> | |
| <i class="fas fa-balance-scale fa-lg"></i> | |
| </span> | |
| </div> | |
| <div class="media-content"> | |
| <p class="has-text-weight-semibold">Error Analysis & Cost-Performance Insights</p> | |
| <p class="is-size-7">We analyze systematic model errors and quantify cost-performance trade-offs for informed deployment in real-world applications.</p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Contribution 6 --> | |
| <div class="column is-6"> | |
| <div class="media"> | |
| <div class="media-left"> | |
| <span class="icon has-text-primary is-large"> | |
| <i class="fas fa-code-branch fa-lg"></i> | |
| </span> | |
| </div> | |
| <div class="media-content"> | |
| <p class="has-text-weight-semibold">Open-Source Implementation</p> | |
| <p class="is-size-7">We release a fully open-source framework, enabling the research community to extend and refine financial LM evaluation methodologies.</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Limitations & Future Work --> | |
| <div class="columns is-multiline"> | |
| <!-- Limitations --> | |
| <div class="column is-6"> | |
| <div class="card h-100"> | |
| <div class="card-header"> | |
| <p class="card-header-title"> | |
| <span class="icon mr-2"><i class="fas fa-exclamation-circle"></i></span> | |
| Limitations | |
| </p> | |
| </div> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <p class="mb-3"> | |
| While our benchmark provides valuable insights, several limitations must be acknowledged: | |
| </p> | |
| <div class="notification is-danger is-light py-2 px-3 mb-3"> | |
| <p class="has-text-weight-bold mb-1">❌ Data Contamination Risks</p> | |
| <p class="is-size-7 mb-0">Benchmark testing data may overlap with model pretraining corpora, leading to artificially inflated performance. We actively work on novel datasets to mitigate these risks.</p> | |
| </div> | |
| <div class="notification is-warning is-light py-2 px-3 mb-3"> | |
| <p class="has-text-weight-bold mb-1">⚠️ Dataset Size & Diversity</p> | |
| <p class="is-size-7 mb-0">Our dataset scope is limited, affecting model generalization across diverse financial domains and languages.</p> | |
| </div> | |
| <div class="notification is-warning is-light py-2 px-3 mb-3"> | |
| <p class="has-text-weight-bold mb-1">⚠️ Zero-Shot Focus</p> | |
| <p class="is-size-7 mb-0">Due to budget constraints, our evaluations rely on zero-shot learning only, without fine-tuning or few-shot prompting.</p> | |
| </div> | |
| <div class="notification is-warning is-light py-2 px-3 mb-3"> | |
| <p class="has-text-weight-bold mb-1">⚠️ Limited Adaptation Strategies</p> | |
| <p class="is-size-7 mb-0">We do not explore chain-of-thought reasoning or advanced prompting, though these techniques are known to improve model performance.</p> | |
| </div> | |
| <div class="notification is-info is-light py-2 px-3 mb-3"> | |
| <p class="has-text-weight-bold mb-1">ℹ️ English Language Bias</p> | |
| <p class="is-size-7 mb-0">The benchmark primarily focuses on English due to the availability of financial datasets, limiting insights into multilingual model performance.</p> | |
| </div> | |
| <div class="notification is-info is-light py-2 px-3 mb-3"> | |
| <p class="has-text-weight-bold mb-1">ℹ️ Real-World Complex Tasks</p> | |
| <p class="is-size-7 mb-0">Existing tasks do not fully capture the dynamic and evolving nature of financial markets, requiring ongoing dataset expansion.</p> | |
| </div> | |
| <p class="is-italic is-size-7 mt-4"> | |
| Recognizing these limitations is essential for improving future financial NLP benchmarks. Our ongoing work aims to address these challenges through dataset refinement, broader task coverage, and multilingual support. | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Future Work --> | |
| <div class="column is-6"> | |
| <div class="card h-100"> | |
| <div class="card-header"> | |
| <p class="card-header-title"> | |
| <span class="icon mr-2"><i class="fas fa-lightbulb"></i></span> | |
| Future Work | |
| </p> | |
| </div> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <p class="mb-3"> | |
| To strengthen the robustness and adaptability of our framework, we advocate for open collaboration within the research community | |
| and propose the following future directions to expand its capabilities: | |
| </p> | |
| <div class="notification is-info is-light py-2 px-3 mb-3"> | |
| <p class="has-text-weight-bold mb-1">🌍 Multilingual Expansion</p> | |
| <p class="is-size-7 mb-0">Extending benchmarks beyond English to include multilingual financial datasets and evaluations.</p> | |
| </div> | |
| <div class="notification is-info is-light py-2 px-3 mb-3"> | |
| <p class="has-text-weight-bold mb-1">🧠 Few-Shot & Chain-of-Thought</p> | |
| <p class="is-size-7 mb-0">Investigating in-context learning techniques such as few-shot, chain-of-thought, and retrieval-augmented generation (RAG).</p> | |
| </div> | |
| <div class="notification is-info is-light py-2 px-3 mb-3"> | |
| <p class="has-text-weight-bold mb-1">⚙️ Domain-Adaptive Training</p> | |
| <p class="is-size-7 mb-0">Evaluating fine-tuning strategies to enhance model understanding of financial-specific terminology and reasoning.</p> | |
| </div> | |
| <div class="notification is-info is-light py-2 px-3 mb-3"> | |
| <p class="has-text-weight-bold mb-1">📊 Expanded Dataset Coverage</p> | |
| <p class="is-size-7 mb-0">Curating datasets from underrepresented financial sectors such as insurance, derivatives, and central banking.</p> | |
| </div> | |
| <div class="notification is-info is-light py-2 px-3 mb-3"> | |
| <p class="has-text-weight-bold mb-1">⚖️ Efficiency & Cost Benchmarking</p> | |
| <p class="is-size-7 mb-0">Developing detailed trade-off analyses between accuracy, latency, and cost to optimize real-world usability.</p> | |
| </div> | |
| <div class="notification is-info is-light py-2 px-3 mb-3"> | |
| <p class="has-text-weight-bold mb-1">📈 Advanced Evaluation Metrics</p> | |
| <p class="is-size-7 mb-0">Moving beyond traditional accuracy metrics by incorporating trustworthiness, robustness, and interpretability measures.</p> | |
| </div> | |
| <p class="is-italic is-size-7 mt-4"> | |
| These improvements will enable more accurate and fair comparisons of financial language models, | |
| fostering greater transparency, reproducibility, and real-world applicability. | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!--/ Contributions --> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <section id="framework" class="section content-section"> | |
| <div class="container is-max-desktop"> | |
| <!-- Framework Overview --> | |
| <div class="columns is-centered"> | |
| <div class="column is-full"> | |
| <h2 class="title is-3 section-title has-text-centered">Framework & Resources</h2> | |
| <div class="box has-background-white-ter has-text-centered p-5 mb-5"> | |
| <span class="icon is-large mb-4"> | |
| <i class="fas fa-fire flame-icon fa-3x"></i> | |
| </span> | |
| <h3 class="title is-4"><span class="flame">FLaME</span> Framework</h3> | |
| <p class="subtitle">An open-source, modular benchmarking suite for evaluating financial language models.</p> | |
| <div class="buttons is-centered mt-4"> | |
| <a href="https://github.com/gtfintechlab/FLaME" target="_blank" class="button is-primary"> | |
| <span class="icon"><i class="fab fa-github"></i></span> | |
| <span>GitHub Repository</span> | |
| </a> | |
| <a href="https://huggingface.co/gtfintechlab/" target="_blank" class="button is-info"> | |
| <span class="icon"><img src="./static/images/huggingface_logo.svg" alt="HuggingFace" width="20" height="20"></span> | |
| <span>Hugging Face Space</span> | |
| </a> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Features & Getting Started --> | |
| <div class="columns is-multiline"> | |
| <!-- π Framework Features --> | |
| <div class="column is-6"> | |
| <div class="card h-100"> | |
| <div class="card-header"> | |
| <p class="card-header-title"> | |
| <span class="icon mr-2"><i class="fas fa-cogs"></i></span> | |
| Framework Features | |
| </p> | |
| </div> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <div class="feature-list"> | |
| <div class="feature-item mb-3"> | |
| <p class="has-text-weight-bold mb-1"> | |
| <span class="icon has-text-primary"><i class="fas fa-check"></i></span> 🔧 Standardized Pipelines | |
| </p> | |
| <p class="is-size-7 ml-4">Pre-built evaluation pipelines for key financial NLP tasks.</p> | |
| </div> | |
| <div class="feature-item mb-3"> | |
| <p class="has-text-weight-bold mb-1"> | |
| <span class="icon has-text-primary"><i class="fas fa-check"></i></span> ⚙️ Customizable Assessments | |
| </p> | |
| <p class="is-size-7 ml-4">Supports easy model-to-model and dataset-to-dataset comparisons.</p> | |
| </div> | |
| <div class="feature-item mb-3"> | |
| <p class="has-text-weight-bold mb-1"> | |
| <span class="icon has-text-primary"><i class="fas fa-check"></i></span> Reproducible Benchmarking | |
| </p> | |
| <p class="is-size-7 ml-4">Ensures consistent evaluation metrics and transparent methodology.</p> | |
| </div> | |
| <div class="feature-item"> | |
| <p class="has-text-weight-bold mb-1"> | |
| <span class="icon has-text-primary"><i class="fas fa-check"></i></span> 🔌 Extensible Architecture | |
| </p> | |
| <p class="is-size-7 ml-4">Easily integrates new tasks, datasets, and evaluation modules.</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- π Getting Started --> | |
| <div class="column is-6"> | |
| <div class="card h-100"> | |
| <div class="card-header"> | |
| <p class="card-header-title"> | |
| <span class="icon mr-2"><i class="fas fa-rocket"></i></span> | |
| Getting Started | |
| </p> | |
| </div> | |
| <div class="card-content"> | |
| <div class="content"> | |
| <div class="steps"> | |
| <div class="step-item"> | |
| <div class="step-marker">1</div> | |
| <div class="step-details"> | |
| <p class="step-title">Clone Repository</p> | |
| <p class="is-size-7 has-text-grey"> | |
| <code>git clone https://github.com/gtfintechlab/FLaME.git</code> | |
| </p> | |
| </div> | |
| </div> | |
| <div class="step-item"> | |
| <div class="step-marker">2</div> | |
| <div class="step-details"> | |
| <p class="step-title">Install Dependencies</p> | |
| <p class="is-size-7 has-text-grey"> | |
| <code>pip install -r requirements.txt</code> | |
| </p> | |
| </div> | |
| </div> | |
| <div class="step-item"> | |
| <div class="step-marker">3</div> | |
| <div class="step-details"> | |
| <p class="step-title">Configure Models</p> | |
| <p class="is-size-7 has-text-grey">Edit <code>config.yaml</code> with API keys and model settings.</p> | |
| </div> | |
| </div> | |
| <div class="step-item"> | |
| <div class="step-marker">4</div> | |
| <div class="step-details"> | |
| <p class="step-title">Run Evaluations</p> | |
| <p class="is-size-7 has-text-grey"> | |
| <code>python -m flame.run --task all</code> | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="notification is-info is-light mt-4"> | |
| <p class="mb-0">For detailed documentation, visit our <a href="https://github.com/gtfintechlab/FLaME" target="_blank">GitHub repository</a>.</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> <!-- End Columns --> | |
| </div> | |
| </section> | |
| <section id="datasets" class="section content-section"> | |
| <div class="container is-max-desktop"> | |
| <h2 class="title is-3 section-title has-text-centered">Datasets & Domains</h2> | |
| <div class="columns is-multiline"> | |
| <!-- 📊 Numerical Reasoning & Question Answering --> | |
| <div class="column is-6"> | |
| <div class="dataset-category box"> | |
| <p class="has-text-weight-bold"> | |
| 📊 Numerical Reasoning & Question Answering | |
| </p> | |
| <ul> | |
| <li><strong>FinQA</strong> – Multi-step financial numerical reasoning.</li> | |
| <li><strong>ConvFinQA</strong> – Conversational numerical reasoning.</li> | |
| <li><strong>TaT-QA</strong> – Hybrid table-text question answering.</li> | |
| </ul> | |
| </div> | |
| </div> | |
| <!-- 📝 Text Summarization --> | |
| <div class="column is-6"> | |
| <div class="dataset-category box"> | |
| <p class="has-text-weight-bold"> | |
| 📝 Text Summarization | |
| </p> | |
| <ul> | |
| <li><strong>ECTSum</strong> – Earnings call transcript summarization.</li> | |
| <li><strong>EDTSum</strong> – Financial news summarization.</li> | |
| </ul> | |
| </div> | |
| </div> | |
| <!-- 🔍 Information Retrieval --> | |
| <div class="column is-6"> | |
| <div class="dataset-category box"> | |
| <p class="has-text-weight-bold"> | |
| 🔍 Information Retrieval | |
| </p> | |
| <ul> | |
| <li><strong>FiNER-ORD</strong> – Named entity recognition for financial documents.</li> | |
| <li><strong>FinEntity</strong> – Entity-based sentiment classification.</li> | |
| <li><strong>Financial Numeric Extreme Labeling (FNXL)</strong> – Automated numeral annotation in financial reports.</li> | |
| <li><strong>FinRED</strong> – Relation extraction in finance.</li> | |
| <li><strong>REFinD</strong> – Relation extraction from SEC filings.</li> | |
| </ul> | |
| </div> | |
| </div> | |
| <!-- 😊 Sentiment Analysis --> | |
| <div class="column is-6"> | |
| <div class="dataset-category box"> | |
| <p class="has-text-weight-bold"> | |
| 😊 Sentiment Analysis | |
| </p> | |
| <ul> | |
| <li><strong>FiQA (Task 1)</strong> – Aspect-based sentiment analysis.</li> | |
| <li><strong>FiQA (Task 2)</strong> – Opinion-based financial QA.</li> | |
| <li><strong>Financial Phrase Bank (FPB)</strong> – Market sentiment classification.</li> | |
| <li><strong>SubjECTive-QA</strong> – Subjectivity detection in earnings call Q&A.</li> | |
| </ul> | |
| </div> | |
| </div> | |
| <!-- 🏷️ Text Classification --> | |
| <div class="column is-6"> | |
| <div class="dataset-category box"> | |
| <p class="has-text-weight-bold"> | |
| 🏷️ Text Classification | |
| </p> | |
| <ul> | |
| <li><strong>Numerical Claim Detection</strong> – Fine-grained investor claim detection.</li> | |
| <li><strong>News Headline Classification</strong> – Market sentiment and trend detection.</li> | |
| <li><strong>FOMC Dataset</strong> – Hawkish-Dovish stance classification.</li> | |
| <li><strong>Banking77</strong> – Fine-grained intent detection.</li> | |
| <li><strong>FinBench</strong> – Financial risk classification (default, fraud, churn).</li> | |
| </ul> | |
| </div> | |
| </div> | |
| <!-- 🧠 Causal Analysis --> | |
| <div class="column is-6"> | |
| <div class="dataset-category box"> | |
| <p class="has-text-weight-bold"> | |
| 🧠 Causal Analysis | |
| </p> | |
| <ul> | |
| <li><strong>FinCausal</strong> – Causal reasoning in financial news.</li> | |
| <li><strong>FinCausal-SC</strong> – Cause-effect span extraction.</li> | |
| </ul> | |
| </div> | |
| </div> | |
| </div> <!-- End Columns --> | |
| </div> | |
| </section> | |
| <section id="BibTeX" class="section content-section"> | |
| <div class="container is-max-desktop"> | |
| <h2 class="title is-3 section-title has-text-centered">Citation</h2> | |
| <div class="bibtex-container"> | |
| <div class="notification is-primary is-light mb-4"> | |
| <p class="has-text-centered mb-0"> | |
| If you use <span class="flame">FLaME</span> in your research, please cite our paper: | |
| </p> | |
| </div> | |
| <div class="box"> | |
| <pre><code>@article{flame2025, | |
| author = {Glenn Matlin and Mika Okamoto and Huzaifa Pardawala and Yang Yang and Sudheer Chava}, | |
| title = {FLaME: Holistic Financial Language Model Evaluation}, | |
| year = {2025}, | |
| month = {February}, | |
| }</code></pre> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <footer class="footer"> | |
| <div class="container"> | |
| <div class="columns is-centered"> | |
| <div class="column is-8"> | |
| <div class="content has-text-centered"> | |
| <h4 class="has-text-white mb-4"><span class="flame">FLaME</span>: Financial Language Model Evaluation</h4> | |
| <div class="footer-links mb-5"> | |
| <a class="icon-link mr-3" target="_blank" href="FLaME/FLaME.pdf" title="Download PDF"> | |
| <i class="fas fa-file-pdf fa-lg"></i> | |
| </a> | |
| <!-- <a class="icon-link mr-3" href="https://arxiv.org/abs/" target="_blank" title="View on arXiv"> | |
| <i class="ai ai-arxiv fa-lg"></i> | |
| </a> --> | |
| <a class="icon-link mr-3" href="https://github.com/gtfintechlab/FLaME" target="_blank" title="GitHub Repository"> | |
| <i class="fab fa-github fa-lg"></i> | |
| </a> | |
| <a class="icon-link" href="https://huggingface.co/gtfintechlab/" target="_blank" title="HuggingFace Space"> | |
| <img src="./static/images/huggingface_logo.svg" alt="HuggingFace" width="24" height="24"> | |
| </a> | |
| </div> | |
| <div class="institution-info mb-4"> | |
| <p class="has-text-white-ter">Georgia Institute of Technology</p> | |
| </div> | |
| <p class="has-text-white-ter is-size-7"> | |
| This website is licensed under a <a rel="license" target="_blank" class="has-text-weight-bold" href="https://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>. | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </footer> | |
| <!-- JavaScript for mobile menu toggle --> | |
| <script> | |
| document.addEventListener('DOMContentLoaded', () => { | |
| // Get all "navbar-burger" elements | |
| const $navbarBurgers = Array.prototype.slice.call(document.querySelectorAll('.navbar-burger'), 0); | |
| // Add a click event on each of them | |
| $navbarBurgers.forEach( el => { | |
| el.addEventListener('click', () => { | |
| // Get the target from the "data-target" attribute | |
| const target = el.dataset.target; | |
| const $target = document.getElementById(target); | |
| // Toggle the "is-active" class on both the "navbar-burger" and the "navbar-menu" | |
| el.classList.toggle('is-active'); | |
| $target.classList.toggle('is-active'); | |
| }); | |
| }); | |
| }); | |
| // Add padding to top of page for fixed navbar | |
| document.addEventListener('DOMContentLoaded', () => { | |
| const navbar = document.querySelector('.navbar'); | |
| const navbarHeight = navbar.offsetHeight; | |
| document.body.style.paddingTop = navbarHeight + 'px'; | |
| }); | |
| </script> | |
| </body> | |
| </html> |