| """Agent design evaluation on math tasks.""" |
|
|
| import re |
| from typing import Callable, List, Optional, Tuple, Dict |
| from collections import Counter, defaultdict |
| from math_eval import agent_evaluation |
|
|
|
|
| |
class Agent:
    """Single-shot agent that forwards one math problem to an LLM.

    The agent builds a (system prompt, task prompt) pair for the problem and
    passes it to the injected ``query_llm`` callable in a single call.
    """

    def __init__(
        self,
        query_llm: Callable,
        temperature: float = 0.0,
    ):
        """Store the LLM callable and the sampling temperature.

        Args:
            query_llm: Callable invoked as
                ``query_llm(prompt=..., system=..., temperature=...)`` and
                expected to return a ``(response, cost)`` pair.
            temperature: Value passed through unchanged to ``query_llm``.
        """
        # This is a plain string, not an f-string or ``str.format`` template,
        # so braces must be written singly. Doubling them would leak a literal
        # ``\boxed{{...}}`` into the prompt shown to the model.
        self.output_format_instructions = (
            "On the final line output only the digits of the answer (0‑999). "
            "Provide your final answer enclosed in a LaTeX \\boxed{...} command."
        )
        self.query_llm = query_llm
        self.temperature = temperature

    def forward(self, problem: str) -> tuple[str, float]:
        """Query the LLM with a math problem.

        Args:
            problem: The problem statement to solve.

        Returns:
            The ``(response, cost)`` pair produced by ``query_llm``.
        """
        system_prompt, task_prompt = self.get_prompt_for_task(problem)
        response, cost = self.query_llm(
            prompt=task_prompt,
            system=system_prompt,
            temperature=self.temperature,
        )
        return response, cost

    def get_prompt_for_task(self, problem: str) -> tuple[str, str]:
        """Build the prompts for one problem.

        Args:
            problem: The problem statement to embed in the task prompt.

        Returns:
            ``(system_prompt, task_prompt)``. The task prompt is the output
            format instructions, a colon, a blank line, the problem text and a
            trailing blank line.
        """
        system_prompt = "You are a skilled mathematician."
        # Interpolated values are inserted verbatim by the f-string; any braces
        # inside the instructions or the problem are not re-interpreted.
        task_prompt = f"{self.output_format_instructions}:\n\n{problem}\n\n"
        return system_prompt, task_prompt
|
|
|
|
| |
|
|
|
|
def run_experiment(**kwargs):
    """Evaluate ``Agent`` through a call-limited LLM query function.

    Required keyword arguments (read directly from ``kwargs``):
        model_name: Bound as ``model_name`` on ``utils.query_llm``.
        max_calls: Passed as ``max_calls`` to
            ``utils.create_call_limited_query_llm``.
        year: Passed as ``year`` to ``agent_evaluation``.

    Returns:
        The five values produced by ``agent_evaluation``, as a tuple:
        ``(accuracy, cost_total, processed, num_llm_calls, df)``.

    Raises:
        KeyError: If any of the required keyword arguments is missing.
    """
    # Imported lazily, inside the function, exactly as the module does it.
    from functools import partial
    from utils import create_call_limited_query_llm, query_llm

    # Pin the model so downstream callers need not pass it on every call.
    model_bound_llm = partial(query_llm, model_name=kwargs["model_name"])

    # NOTE(review): presumably caps the number of LLM calls at ``max_calls``;
    # confirm against ``utils.create_call_limited_query_llm``.
    capped_llm = create_call_limited_query_llm(
        model_bound_llm,
        max_calls=kwargs["max_calls"],
    )

    # Unpack explicitly so a result of the wrong arity fails here.
    acc, total_cost, n_processed, n_calls, results_df = agent_evaluation(
        Agent,
        capped_llm,
        year=kwargs["year"],
    )
    return acc, total_cost, n_processed, n_calls, results_df
|
|