Spaces:

AMA-bench
/

AMA-bench-Leaderboard

Running

App Files Files Community

AMA-bench-Leaderboard / content.py

uuuhjb

update submission

bc450de 4 days ago

raw

history blame contribute delete

2.49 kB

	# HTML heading (centered <h1> with id "space-title") holding the leaderboard title.
	TITLE = """<h1 align="center" id="space-title">AMA-Bench Leaderboard</h1>"""

	# Markdown introduction: names the four dimensions the benchmark evaluates and
	# states that the leaderboard reports the multiple-choice subset.
	INTRODUCTION_TEXT = """
	AMA-Bench evaluates the memory capabilities of LLMs and memory-augmented agents across four cognitive dimensions:
	Recall (retrieving stored information), Causal Inference (cause-and-effect reasoning), State Updating (tracking evolving states), and State Abstraction (forming higher-level representations).


	## Leaderboard
	Our leaderboard presents results for the multiple-choice subset, which provides objective and easier-to-score evaluation.
	See below for submission details.
	"""

	# Markdown submission guide: shows one example JSON Lines record and lists the
	# fields a record carries.
	# NOTE(review): the list headed "Required fields:" also contains
	# `reasoning_trace`, which is marked "(Optional)" — consider retitling the list.
	SUBMISSION_TEXT = """
	## Submissions
	Results can be submitted for evaluation. Each submission should contain answers for all questions in the benchmark.

	We expect submissions to be JSON Lines files with the following format:
	```
	{"episode_id": "trajectory_id", "question_uuid_list": ["uuid-1", "uuid-2", "uuid-3"], "answer_list": ["The agent moved right.", "..."], "llm_as_judge_score_list": [true, false, true]}
	```

	Required fields:
	- `episode_id`: The episode identifier
	- `question_uuid_list`: List of question UUIDs corresponding to each answer (e.g., `["uuid-1", "uuid-2"]`)
	- `answer_list`: Your model's answers, in the same order as `question_uuid_list`
	- `llm_as_judge_score_list`: Boolean scores for each answer (e.g., `[true, false, true]`)
	- `reasoning_trace`: (Optional) The reasoning process or explanation for the answers
	"""

	# Label text and BibTeX-style `@misc` entry for citing the results.
	# Presumably wired to a copy-to-clipboard citation widget in the app — confirm against the caller.
	CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
	# Raw string so any backslashes in the citation entry are kept literally.
	CITATION_BUTTON_TEXT = r"""@misc{ama-bench,
	title={AMA-Bench: Agent Memory Assessment Benchmark},
	author={AMA-Bench Team},
	year={2024}
	}"""


	def format_error(msg):
	    """Return ``msg`` wrapped in a centered, red, 20px HTML ``<p>`` element.

	    The message is interpolated as-is; no HTML escaping is applied.
	    """
	    css = "color: red; font-size: 20px; text-align: center;"
	    return "<p style='{}'>{}</p>".format(css, msg)


	def format_warning(msg):
	    """Return ``msg`` wrapped in a centered, orange, 20px HTML ``<p>`` element.

	    The message is interpolated as-is; no HTML escaping is applied.
	    """
	    css = "color: orange; font-size: 20px; text-align: center;"
	    return "<p style='{}'>{}</p>".format(css, msg)


	def format_log(msg):
	    """Return ``msg`` wrapped in a centered, green, 20px HTML ``<p>`` element.

	    The message is interpolated as-is; no HTML escaping is applied.
	    """
	    css = "color: green; font-size: 20px; text-align: center;"
	    return "<p style='{}'>{}</p>".format(css, msg)


	def model_hyperlink(link, model_name):
	    """Return an HTML anchor for ``model_name`` that opens ``link`` in a new tab.

	    When ``link`` is falsy or contains only whitespace, ``model_name`` is
	    returned unchanged. Neither value is HTML-escaped.
	    """
	    # Truthiness is checked first so a falsy link (e.g. None) never reaches .strip().
	    has_link = bool(link) and link.strip() != ""
	    if has_link:
	        anchor = (
	            '<a target="_blank" href="{}" '
	            'style="color: var(--link-text-color); text-decoration: underline;'
	            'text-decoration-style: dotted;">{}</a>'
	        )
	        return anchor.format(link, model_name)
	    return model_name