import base64
import hashlib
import math
import os
from typing import Literal, Optional, Tuple

import joblib
from openai import OpenAI
from smolagents import tool


@tool
def download_and_get_path_for_provided_file(path: str) -> str:
    """
    Download and cache a GAIA benchmark attachment, returning its local path.

    The file is looked up in both the "test" and "validation" splits of the
    gaia-benchmark/GAIA dataset; the first split that has it wins.

    Args:
        path (str): File name of the attachment (e.g. "abc123.mp3").

    Returns:
        str: Local filesystem path of the cached file, or None when the file
            is not present in either split.
    """
    from huggingface_hub import hf_hub_download

    for split in ("test", "validation"):
        try:
            file_path = hf_hub_download(
                repo_id="gaia-benchmark/GAIA",
                filename=f"2023/{split}/{path}",
                repo_type="dataset",
                token=os.environ["HF_TOKEN"],
            )
            if file_path:
                return file_path
        except Exception as exc:
            # File absent from this split (or transient hub error): log and
            # try the next split rather than failing the whole lookup.
            print(exc)
    return None


@tool
def extract_text_from_audio(file_path: str) -> str:
    """
    Extract and return text transcription from an audio file given its path.

    The transcription is cached in a sidecar file ("<file_path>.transcript")
    so repeated calls for the same audio do not re-hit the OpenAI API.

    Args:
        file_path (str): Path to the audio file to be transcribed.

    Returns:
        str: The extracted text content from the audio file.

    Examples:
        >>> extract_text_from_audio("meeting_recording.wav")
        "Hello team, welcome to our weekly meeting..."
        >>> extract_text_from_audio("/path/to/audio/interview.mp3")
        "Could you please introduce yourself and your background?"
    """
    # NOTE(review): the previous version dumped the cache onto file_path
    # itself, destroying the original audio; a sidecar path avoids that.
    cache_path = f"{file_path}.transcript"
    try:
        return joblib.load(cache_path)
    except Exception:
        pass  # no cache yet (or unreadable) -> transcribe below

    client = OpenAI()
    # Context manager guarantees the audio handle is closed after the call.
    with open(file_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="gpt-4o-transcribe",
            file=audio_file,
            response_format="text",
        )
    joblib.dump(transcription, cache_path)
    return transcription


@tool
def describe_image(request: str, file_path: str) -> str:
    """
    Extract and return the requested information from an image given its path.

    Answers are cached per (image, request) pair in a sidecar file so the
    same question about the same image is only sent to the API once.

    Args:
        request: The information to retrieve from the image. The request must
            be simple, short and precise.
        file_path (str): Path to the image file to analyse.

    Returns:
        str: The extracted information from the image.

    Examples:
        >>> describe_image("how many birds are in the picture", "underwater_picture.jpg")
        "There are 2 birds depicted in an frame placed underwater"
        >>> describe_image("what is the position of the black queen?","chess_board.png")
        "Qd3"
    """
    # Key the cache on the request too: the previous version cached only per
    # file, so a second, different question returned the first answer (and it
    # also overwrote the image file with the pickled cache).
    request_digest = hashlib.md5(request.encode("utf-8")).hexdigest()
    cache_path = f"{file_path}.{request_digest}.cache"
    try:
        return joblib.load(cache_path)
    except Exception:
        pass  # no cached answer for this (image, request) pair

    client = OpenAI()

    # Encode the image as a Base64 data URL for the vision request.
    with open(file_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode("utf-8")

    response = client.responses.create(
        model="gpt-4.1",
        input=[
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": request},
                    {
                        "type": "input_image",
                        "image_url": f"data:image/jpeg;base64,{base64_image}",
                    },
                ],
            }
        ],
    )
    joblib.dump(response.output_text, cache_path)
    return response.output_text


@tool
def get_transcript_from_youtube_file_id(file_id: str) -> str:
    """
    Retrieve the transcript for a YouTube video given its id.

    Args:
        file_id (str): The YouTube video ID (the alphanumeric string that
            appears after 'v=' in a YouTube URL, e.g., 'dQw4w9WgXcQ').

    Returns:
        str: The transcript content for the specified video: transcript
            segments with timestamps.
    """
    from youtube_transcript_api import YouTubeTranscriptApi

    ytt_api = YouTubeTranscriptApi()
    return ytt_api.fetch(file_id)


@tool
def parse_python_file(path: str) -> str:
    """
    Read and return the contents of a Python file from its path.

    Args:
        path (str): The file path to the Python file to be read.

    Returns:
        str: The complete contents of the Python file as a string.
    """
    with open(path, "r") as py_file:
        return py_file.read()


@tool
def parse_pdf_file(path: str) -> str:
    """
    Read and return the text contents of a PDF file from its path.

    Args:
        path (str): The file path to the PDF file to be read.

    Returns:
        str: The extracted text of every page, each followed by a newline, or
            an error message when the path does not end with ".pdf".
    """
    from pypdf import PdfReader

    if not path.endswith(".pdf"):
        return "file does not end with .pdf"
    reader = PdfReader(path)
    # Bug fix: the previous loop indexed reader.pages[0] on every iteration,
    # so only the first page was ever extracted.
    return "".join(page.extract_text() + "\n" for page in reader.pages)


class TestAgent:
    """smolagents CodeAgent wrapper configured with the GAIA tool set."""

    def __init__(self):
        # Code agent, model backend and basic tools from smolagents.
        from smolagents import (
            CodeAgent,
            DuckDuckGoSearchTool,
            FinalAnswerTool,
            OpenAIServerModel,
            Tool,
            VisitWebpageTool,
        )
        # Additional tool from langchain
        # @ https://docs.langchain.com/oss/python/integrations/tools
        from langchain_community.agent_toolkits.load_tools import load_tools

        wikipedia_tool = Tool.from_langchain(load_tools(["wikipedia"])[0])
        wikipedia_tool.top_k_results = 3

        model = OpenAIServerModel(model_id="gpt-4.1-mini")

        # Instantiate the agent with the local @tool functions plus the
        # built-in search/browse tools and the langchain wikipedia tool.
        self.agent = CodeAgent(
            tools=[
                download_and_get_path_for_provided_file,  # fetch attached GAIA file
                DuckDuckGoSearchTool(),
                VisitWebpageTool(),
                wikipedia_tool,
                get_transcript_from_youtube_file_id,
                parse_python_file,
                describe_image,
                extract_text_from_audio,
                parse_pdf_file,
                FinalAnswerTool(),
            ],
            additional_authorized_imports=["pandas", "markdownify", "requests", "chess", "os"],
            model=model,
            max_steps=6,
            planning_interval=3,
            verbosity_level=0,
            use_structured_outputs_internally=True,
        )

        # Guidance appended to the system prompt (wording taken from the GAIA
        # paper); kept byte-identical to preserve agent behavior.
        prompt = """\n\n It is very important to remember the foillowing: You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don’t use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don’t use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. \n\n Now it's your turn. """
        self.agent.prompt_templates['system_prompt'] = self.agent.prompt_templates['system_prompt'] + prompt

    def __call__(self, question: str) -> str:
        """Run the agent on *question* and return its final answer."""
        print(f"Agent received question (first 50 chars): {question[:50]}...")
        answer = self.agent.run(question)
        print(f"Agent returning his answer: {answer}")
        return answer