# Display title for the leaderboard page (rendered as markdown by the UI).
TITLE = """
AMA-Bench Leaderboard
"""
# Markdown shown above the leaderboard table: describes the four cognitive
# dimensions the benchmark evaluates and scopes the table to the
# multiple-choice subset.
INTRODUCTION_TEXT = """
AMA-Bench evaluates the memory capabilities of LLMs and memory-augmented agents across four cognitive dimensions:
**Recall** (retrieving stored information), **Causal Inference** (cause-and-effect reasoning), **State Updating** (tracking evolving states), and **State Abstraction** (forming higher-level representations).
## Leaderboard
Our leaderboard presents results for the multiple-choice subset, which provides objective and easier-to-score evaluation.
See below for submission details.
"""
# Markdown describing the expected submission format: a JSON Lines file with
# one record per episode, containing parallel uuid/answer/score lists.
SUBMISSION_TEXT = """
## Submissions
Results can be submitted for evaluation. Each submission should contain answers for all questions in the benchmark.
We expect submissions to be JSON Lines files with the following format:
```
{"episode_id": "trajectory_id", "question_uuid_list": ["uuid-1", "uuid-2", "uuid-3"], "answer_list": ["The agent moved right.", "..."], "llm_as_judge_score_list": [true, false, true]}
```
**Required fields:**
- `episode_id`: The episode identifier
- `question_uuid_list`: List of question UUIDs corresponding to each answer (e.g., `["uuid-1", "uuid-2"]`)
- `answer_list`: Your model's answers, in the same order as `question_uuid_list`
- `llm_as_judge_score_list`: Boolean scores for each answer (e.g., `[true, false, true]`)
- `reasoning_trace`: (Optional) The reasoning process or explanation for the answers
"""
# Label shown on the "copy citation" button in the UI.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# BibTeX snippet copied to the clipboard; raw string so backslashes (if any
# are added later) are preserved verbatim.
CITATION_BUTTON_TEXT = r"""@misc{ama-bench,
title={AMA-Bench: Agent Memory Assessment Benchmark},
author={AMA-Bench Team},
year={2024}
}"""
def format_error(msg):
    """Wrap ``msg`` in a red, centered HTML paragraph for error display.

    Args:
        msg: The error text to display (interpolated into the HTML as-is).

    Returns:
        str: An HTML ``<p>`` element styled red, suitable for a markdown/HTML
        output component.
    """
    # The original body was a bare broken f-string (the HTML wrapper had been
    # stripped, leaving invalid syntax); restore the styling the docstring
    # promises, matching the warning/log helpers below.
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
def format_warning(msg):
    """Wrap ``msg`` in an orange, centered HTML paragraph for warning display.

    Args:
        msg: The warning text to display (interpolated into the HTML as-is).

    Returns:
        str: An HTML ``<p>`` element styled orange, suitable for a
        markdown/HTML output component.
    """
    # Original body was a broken f-string with no markup (HTML stripped);
    # restore the orange styling the docstring promises, mirroring
    # format_error/format_log.
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
def format_log(msg):
    """Wrap ``msg`` in a green, centered HTML paragraph for success display.

    Args:
        msg: The success/log text to display (interpolated into the HTML
            as-is).

    Returns:
        str: An HTML ``<p>`` element styled green, suitable for a
        markdown/HTML output component.
    """
    # Original body was a broken f-string with no markup (HTML stripped);
    # restore the green styling the docstring promises, mirroring
    # format_error/format_warning.
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
def model_hyperlink(link, model_name):
    """Render ``model_name`` as an HTML hyperlink to ``link``.

    Args:
        link: URL for the model's info page. May be ``None`` or
            empty/whitespace, in which case no link is rendered.
        model_name: Display text for the model.

    Returns:
        str: An ``<a>`` element opening in a new tab when a usable link is
        given; otherwise the plain ``model_name`` string.
    """
    # Fall back to plain text when there is nothing to link to.
    if not link or link.strip() == "":
        return model_name
    # Original body returned f'{model_name}' — the anchor markup had been
    # stripped, making the function a no-op. Restore the hyperlink the
    # docstring promises (dotted underline, theme-aware link color).
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'