from smolagents import LiteLLMModel
from smolagents.tools import Tool

from src.settings import settings
from src.utils import InputTokenRateLimiter


class FinalAnswerTool(Tool):
    """Tool that condenses a draft answer into a short final answer.

    The draft answer is sent, together with the original question, to an
    LLM that is asked to rewrite it as tersely as possible. Calls are
    routed through an ``InputTokenRateLimiter`` so input-token usage is
    tracked across invocations.
    """

    name = "final_answer"
    description = "Provides the exact, final answer to the given question."
    inputs = {
        "question": {
            "type": "string",
            "description": "The original question being asked.",
        },
        "answer": {"type": "string", "description": "The answer to the question."},
    }
    output_type = "string"

    def __init__(self):
        """Create the rewriting model and the token rate limiter."""
        # Low temperature and a small completion budget: the model is only
        # expected to emit a very short rewritten answer.
        self.model = LiteLLMModel(
            model_id=settings.llm_model_id,
            api_key=settings.llm_api_key,
            temperature=0.1,
            max_tokens=20,
        )
        self.token_rate_limiter = InputTokenRateLimiter()
        # Input-token estimate handed to the limiter before each model call.
        self.expected_tokens_per_step = 10000
        # NOTE(review): presumably tells the smolagents Tool machinery that
        # no lazy setup is still pending -- confirm against the Tool base.
        self.is_initialized = True

    def forward(self, question: str, answer: str) -> str:
        """Rewrite ``answer`` into a concise final answer for ``question``.

        Args:
            question: The original question being asked.
            answer: The draft answer to condense.

        Returns:
            The ``content`` of the model response.
        """
        self.token_rate_limiter.maybe_wait(self.expected_tokens_per_step)

        # Adjacent literals are concatenated into a single prompt string;
        # keep the leading/trailing spaces so the text sent is unchanged.
        prompt = (
            " Rewrite the following ANSWER to be concise and use as few tokens as "
            "possible to answer the QUESTION directly. "
            "If there's ambiguity in the ANSWER, make a clear cut decision to give "
            "a concise result. "
            "Final result should not be in sentence format. "
            "If the answer is an error, return 'N/A' instead. "
            f"QUESTION: {question} ANSWER: {answer} "
        )
        response = self.model.generate(
            [
                {
                    "role": "user",
                    "content": [{"type": "text", "text": prompt}],
                }
            ]
        )

        # Report the real input-token count to the limiter. The check must be
        # on the usage object itself: testing the zero-initialised counter
        # would never be true and the limiter would always be told 0 tokens.
        token_usage_info = getattr(response, "token_usage", None)
        tokens_used = 0
        if token_usage_info is not None:
            # Fall back to 0 if the count is absent or None.
            tokens_used = getattr(token_usage_info, "input_tokens", 0) or 0
        self.token_rate_limiter.add_tokens(tokens_used)

        return response.content