Spaces:
Running
Running
# Centered <h1> HTML heading carrying the leaderboard title.
TITLE = """<h1 align="center" id="space-title">AMA-Bench Leaderboard</h1>"""

# Markdown introduction: names the four memory dimensions AMA-Bench evaluates
# and opens the "Leaderboard" section.
# NOTE(review): this text says the leaderboard covers a multiple-choice subset,
# while SUBMISSION_TEXT describes free-form answers with LLM-as-judge scores --
# confirm which wording is current.
INTRODUCTION_TEXT = """
AMA-Bench evaluates the memory capabilities of LLMs and memory-augmented agents across four cognitive dimensions:
**Recall** (retrieving stored information), **Causal Inference** (cause-and-effect reasoning), **State Updating** (tracking evolving states), and **State Abstraction** (forming higher-level representations).
## Leaderboard
Our leaderboard presents results for the multiple-choice subset, which provides objective and easier-to-score evaluation.
See below for submission details.
"""
# Markdown instructions describing the JSON Lines submission format.
# The example record keeps its three lists the same length because the field
# descriptions define them as parallel (one answer and one score per UUID).
# The optional field is listed under its own heading so that it is not
# presented as required.
SUBMISSION_TEXT = """
## Submissions
Results can be submitted for evaluation. Each submission should contain answers for all questions in the benchmark.
We expect submissions to be JSON Lines files with the following format:
```
{"episode_id": "trajectory_id", "question_uuid_list": ["uuid-1", "uuid-2", "uuid-3"], "answer_list": ["The agent moved right.", "...", "..."], "llm_as_judge_score_list": [true, false, true]}
```
**Required fields:**
- `episode_id`: The episode identifier
- `question_uuid_list`: List of question UUIDs corresponding to each answer (e.g., `["uuid-1", "uuid-2"]`)
- `answer_list`: Your model's answers, in the same order as `question_uuid_list`
- `llm_as_judge_score_list`: Boolean scores for each answer (e.g., `[true, false, true]`)

**Optional fields:**
- `reasoning_trace`: The reasoning process or explanation for the answers
"""
# Caption text inviting the reader to copy the citation snippet below.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

# BibTeX @misc entry for AMA-Bench. Written as a raw string, so any backslash
# added to the entry later is kept literally instead of being read as an escape.
CITATION_BUTTON_TEXT = r"""@misc{ama-bench,
title={AMA-Bench: Agent Memory Assessment Benchmark},
author={AMA-Bench Team},
year={2024}
}"""
def format_error(msg):
    """Return *msg* wrapped in a centered, red, 20px HTML paragraph.

    The message is inserted as-is; no HTML escaping is applied.
    """
    css = "color: red; font-size: 20px; text-align: center;"
    return "<p style='{}'>{}</p>".format(css, msg)
def format_warning(msg):
    """Return *msg* wrapped in a centered, orange, 20px HTML paragraph.

    The message is inserted as-is; no HTML escaping is applied.
    """
    css = "color: orange; font-size: 20px; text-align: center;"
    return "<p style='{}'>{}</p>".format(css, msg)
def format_log(msg):
    """Return *msg* wrapped in a centered, green, 20px HTML paragraph.

    Intended for success messages. The message is inserted as-is; no HTML
    escaping is applied.
    """
    css = "color: green; font-size: 20px; text-align: center;"
    return "<p style='{}'>{}</p>".format(css, msg)
def model_hyperlink(link, model_name):
    """Return an HTML anchor that shows *model_name* and points at *link*.

    Args:
        link: URL of the model's information page. ``None``, an empty or
            whitespace-only string, or any non-string value (for example a
            float NaN) is treated as "no link".
        model_name: Text displayed for the model.

    Returns:
        ``model_name`` unchanged when there is no usable link; otherwise an
        ``<a target="_blank">`` element wrapping ``model_name``.
    """
    # Non-string values have no .strip(); treat them as a missing link
    # instead of letting the call raise AttributeError.
    if not isinstance(link, str) or not link.strip():
        return model_name
    # NOTE(review): link and model_name are interpolated without HTML
    # escaping -- confirm callers only pass trusted values.
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'