"""Task definitions and display text for the Agentic Document AI leaderboard.

NOTE(review): this module was recovered from a whitespace-mangled copy in
which all newlines had been collapsed to spaces. The code structure and the
markdown line breaks inside the display strings were reconstructed from the
visible markdown markers (headings, list bullets, numbered steps, code
fences) — confirm the exact rendering against the original file.
"""

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    """One leaderboard column: a benchmark split, its metric, and a display name."""

    benchmark: str  # task_key in the results json file
    metric: str  # metric_key in the results json file
    col_name: str  # column header to display in the leaderboard


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("overall", "anls", "ANLS (Overall)")
    task1 = Task("single_evidence", "anls", "ANLS (Single Evidence)")
    task2 = Task("multi_evidence_same_doc", "anls", "ANLS (Multi-Evidence, Same Doc)")
    task3 = Task("multi_evidence_multi_doc", "anls", "ANLS (Multi-Evidence, Multi Doc)")


# Your leaderboard name
# Static files are served relative to the static path set in app.py
# NOTE(review): the original assignment was mangled ('TITLE = "" # """' with
# orphaned text below it); reconstructed as the triple-quoted string the
# orphaned lines and closing quotes imply — verify against the live app.
TITLE = """

Agentic Document AI Benchmark

"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
Welcome to the **Agentic Document AI Leaderboard**! This benchmark evaluates the performance of AI agents on complex document understanding tasks that require multi-step reasoning and evidence gathering across documents.

The benchmark uses **ANLS (Average Normalized Levenshtein Similarity)** as the primary evaluation metric, measuring how well models can extract and synthesize information from documents. We evaluate performance on:

- **Overall accuracy** across the entire dataset
- **Single evidence** questions (information from one source)
- **Multi-evidence, same document** questions (combining information within a document)
- **Multi-evidence, multi-document** questions (synthesizing across multiple documents)

We also track **inference costs** in terms of agent steps and USD to help understand the efficiency of different approaches.
"""

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """
## How it works

The Agentic Document AI benchmark evaluates AI systems on their ability to:

1. Navigate and understand complex document structures
2. Extract relevant evidence to answer questions
3. Synthesize information from multiple sources
4. Perform multi-step reasoning with document context

## Metrics

### Performance Metrics (ANLS-based)

- **ANLS (Overall)**: Main score - Average Normalized Levenshtein Similarity across the entire dataset
- **ANLS (Single Evidence)**: Performance on questions requiring single evidence extraction
- **ANLS (Multi-Evidence, Same Doc)**: Performance when combining evidence within one document
- **ANLS (Multi-Evidence, Multi Doc)**: Performance when synthesizing across multiple documents

### Efficiency Metrics

- **Agent Steps**: Total number of reasoning/action steps taken by the agent
- **Cost (USD)**: Estimated inference cost in US dollars

## Reproducibility

To submit your results:

1. Run your model/agent on the benchmark dataset
2. Generate a JSONL file where each line contains one prediction:

```json
{"question": "What is Dr. McElhaney's position?", "answer": ["Senior Scientist"], "citations": [{"file": "1307326.pdf", "page": 1}], "iterations": 1, "id": "q_4"}
{"question": "Who is the CEO?", "answer": ["John Smith"], "citations": [{"file": "report.pdf", "page": 3}], "iterations": 2, "id": "q_5"}
```

**Required fields:**

- `question`: The question text (string)
- `answer`: List of answer strings
- `citations`: List of citation dicts with "file" and "page"
- `iterations`: Number of agent iterations/steps (integer)
- `id`: Unique question identifier (string)

3. Submit your JSONL file through the submission tab
4. The system will evaluate your predictions against the gold standard and compute ANLS scores

See `submission_template.jsonl` for a complete example.
"""