JustinTX
/

shinka-backup

Model card Files Files and versions

shinka-backup / examples_deprecated /adas_aime /initial.py

JustinTX's picture

Add files using upload-large-folder tool

14c9c2b verified 28 days ago

history blame contribute delete

1.85 kB

	"""Agent design evaluation on math tasks."""

	import re
	from typing import Callable, List, Optional, Tuple, Dict
	from collections import Counter, defaultdict
	from math_eval import agent_evaluation


	# EVOLVE-BLOCK-START
	class Agent:
	def __init__(
	self,
	query_llm: Callable,
	temperature=0.0,
	):
	self.output_format_instructions = "On the final line output only the digits of the answer (0‑999). Provide your final answer enclosed in a LaTeX \\boxed{{...}} command."
	self.query_llm = query_llm
	self.temperature = temperature

	def forward(self, problem: str) -> tuple[str, float]:
	"""Queries the LLM with a math problem."""
	system_prompt, task_prompt = self.get_prompt_for_task(problem)
	response, cost = self.query_llm(
	prompt=task_prompt,
	system=system_prompt,
	temperature=self.temperature,
	)
	return response, cost

	def get_prompt_for_task(self, problem: str) -> tuple[str, str]:
	system_prompt = "You are a skilled mathematician."
	task_prompt = f"{self.output_format_instructions}:\n\n{problem}\n\n"
	return system_prompt, task_prompt


	# EVOLVE-BLOCK-END


	def run_experiment(**kwargs):
	from utils import query_llm, create_call_limited_query_llm
	from functools import partial

	# Create base query_llm function
	base_query_llm = partial(query_llm, model_name=kwargs["model_name"])

	# Wrap it with call limiting (max 10 calls per forward pass)
	limited_query_llm = create_call_limited_query_llm(
	base_query_llm,
	max_calls=kwargs["max_calls"],
	)

	accuracy, cost_total, processed, num_llm_calls, df = agent_evaluation(
	Agent, limited_query_llm, year=kwargs["year"]
	)
	return accuracy, cost_total, processed, num_llm_calls, df