Spaces:
Sleeping
Sleeping
| import math | |
| from typing import Optional, Tuple, Literal | |
| from smolagents import tool | |
| import base64 | |
| from openai import OpenAI | |
| import joblib | |
| import os | |
def download_and_get_path_for_provided_file(path: str) -> str:
    """
    Download and cache a GAIA benchmark attachment. Returns the local path
    of the cached file.

    Args:
        path (str): Intended file name of the attachment (e.g. "abc123.xlsx").

    Returns:
        str: Local filesystem path of the downloaded/cached file, or None
        when the file exists in neither the test nor the validation split.
    """
    from huggingface_hub import hf_hub_download

    # The split a given attachment lives in is unknown up front, so probe
    # both 2023/test and 2023/validation in order.
    for dataset in ("test", "validation"):
        try:
            file_path = hf_hub_download(
                repo_id="gaia-benchmark/GAIA",
                filename=f"2023/{dataset}/{path}",
                repo_type="dataset",
                token=os.environ['HF_TOKEN'],
            )
            if file_path:
                return file_path
        except Exception as e:
            # Best-effort: a miss in one split is expected; log and try the next.
            print(e)
    # Not found in any split — make the original implicit None explicit.
    return None
def extract_text_from_audio(file_path: str) -> str:
    """
    Extract and return text transcription from an audio file given its path.

    The transcription is cached on disk in a sidecar file
    ("<file_path>.transcript") so repeated calls do not re-invoke the API.

    Args:
        file_path (str): Path to the audio file to be transcribed.

    Returns:
        str: The extracted text content from the audio file.

    Examples:
        >>> extract_text_from_audio("meeting_recording.wav")
        "Hello team, welcome to our weekly meeting..."
        >>> extract_text_from_audio("/path/to/audio/interview.mp3")
        "Could you please introduce yourself and your background?"
    """
    # BUG FIX: the original dumped the cached transcription onto file_path
    # itself, overwriting (destroying) the source audio. Use a sidecar file.
    cache_path = f"{file_path}.transcript"
    try:
        # Cache hit: a previous transcription was stored for this audio file.
        return joblib.load(cache_path)
    except Exception:
        # Cache miss (or unreadable cache) — transcribe via the OpenAI API.
        client = OpenAI()
        # Context manager closes the handle the original left open.
        with open(file_path, "rb") as audio_file:
            transcription = client.audio.transcriptions.create(
                model="gpt-4o-transcribe",
                file=audio_file,
                response_format="text",
            )
        joblib.dump(transcription, cache_path)
        return transcription
def describe_image(request: str, file_path: str) -> str:
    """
    Extract and return the requested information from an image given its path.

    Answers are cached on disk in a sidecar file keyed by both the image path
    and the request, so repeated identical calls do not re-invoke the API.

    Args:
        request: The information to retrieve from the image. The request must
            be simple, short and precise.
        file_path (str): Path to the image file to analyze.

    Returns:
        str: The extracted information from the image.

    Examples:
        >>> describe_image("how many birds are in the picture", "underwater_picture.jpg")
        "There are 2 birds depicted in an frame placed underwater"
        >>> describe_image("what is the position of the black queen?","chess_board.png")
        "Qd3"
    """
    import hashlib

    # BUG FIX (1): the original dumped the cached answer onto file_path itself,
    # overwriting (destroying) the source image. Use a sidecar file instead.
    # BUG FIX (2): the original cache was keyed only by file_path, so a second,
    # different question about the same image returned the stale first answer.
    # Key the cache on (image, request) via a stable digest of the request.
    digest = hashlib.md5(request.encode("utf-8")).hexdigest()[:12]
    cache_path = f"{file_path}.{digest}.description"
    try:
        return joblib.load(cache_path)
    except Exception:
        client = OpenAI()

        def encode_image(image_path):
            # Base64-encode the raw image bytes for inline transmission.
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode("utf-8")

        base64_image = encode_image(file_path)
        response = client.responses.create(
            model="gpt-4.1",
            input=[
                {
                    "role": "user",
                    "content": [
                        {"type": "input_text", "text": request},
                        {
                            "type": "input_image",
                            "image_url": f"data:image/jpeg;base64,{base64_image}",
                        },
                    ],
                }
            ],
        )
        joblib.dump(response.output_text, cache_path)
        return response.output_text
def get_transcript_from_youtube_file_id(file_id: str) -> str:
    """
    Retrieve the transcript for a YouTube video given its id.

    Args:
        file_id (str): The YouTube video ID (the alphanumeric string that appears after
            'v=' in a YouTube URL, e.g., 'dQw4w9WgXcQ').

    Returns:
        str: The transcript text of the video, one snippet per line.
    """
    from youtube_transcript_api import YouTubeTranscriptApi

    ytt_api = YouTubeTranscriptApi()
    transcript = ytt_api.fetch(file_id)
    # BUG FIX: the original returned the FetchedTranscript object while the
    # signature and docstring promise str. FetchedTranscript is iterable over
    # snippets exposing .text — join them into a single plain string.
    return "\n".join(snippet.text for snippet in transcript)
def parse_python_file(path: str) -> str:
    """
    Read and return the contents of a Python file from its path.

    Args:
        path (str): The file path to the Python file to be read.

    Returns:
        str: The complete contents of the Python file as a string.
    """
    # Context manager guarantees the handle is closed even if read() raises.
    with open(path, "r") as source:
        contents = source.read()
    return contents
def parse_pdf_file(path: str) -> str:
    """
    Read and return the text contents of a pdf file from its path.

    Args:
        path (str): The file path to the pdf file to be read.

    Returns:
        str: The extracted text of every page, one page per line, or an
        error message string when the path does not end with ".pdf".
    """
    from pypdf import PdfReader

    if not path.endswith(".pdf"):
        return "file does not end with .pdf"
    reader = PdfReader(path)
    out = ""
    # BUG FIX: the original loop indexed reader.pages[0] on every iteration,
    # so a multi-page PDF returned page 1 repeated N times. Iterate all pages.
    for page in reader.pages:
        out += page.extract_text() + "\n"
    return out
class TestAgent:
    """
    smolagents CodeAgent wrapper for the GAIA benchmark.

    Wires up web search/browse and wikipedia tools plus the local file/media
    tools defined in this module, then appends GAIA answer-formatting guidance
    to the agent's system prompt. Call the instance with a question string to
    get the agent's answer.
    """

    def __init__(self):
        # import code agent and basic tool from smolagent
        from smolagents import CodeAgent, OpenAIServerModel, DuckDuckGoSearchTool, FinalAnswerTool, VisitWebpageTool, MCPClient
        # import additional tool from langchain @ https://docs.langchain.com/oss/python/integrations/tools
        #from langchain_community.agent_toolkits import load_tools
        from langchain_community.agent_toolkits.load_tools import load_tools
        from smolagents import Tool
        # Wrap langchain's wikipedia tool so smolagents can use it.
        wikipedia_tool = Tool.from_langchain(load_tools(["wikipedia"])[0])
        # Widen retrieval: keep the 3 best-matching articles per query.
        wikipedia_tool.top_k_results=3
        # import tools from MCP servers @ https://github.com/mcp
        #from mcp import StdioServerParameters
        #server_parameters = StdioServerParameters(command="uvx",
        #                                           args=["--quiet", "youtubeqa@0.2.1"],
        #                                           env={"UV_PYTHON": "3.12", **os.environ},
        #                                           )
        #youtube_tools = MCPServerTool(server_params=server_parameters)
        model = OpenAIServerModel(model_id="gpt-4.1-mini")
        #model = InferenceClientModel("Qwen/Qwen2.5-Coder-32B-Instruct")
        # Instantiate the agent
        self.agent = CodeAgent(
            tools=[download_and_get_path_for_provided_file,  # V4. get attached file
                   DuckDuckGoSearchTool(),  # basic tools from smolagent
                   VisitWebpageTool(),
                   wikipedia_tool,  # tool from langchain with extra parameters
                   #youtube_tools,  # tool from MCP server
                   get_transcript_from_youtube_file_id,  # V4
                   parse_python_file,  # V4
                   describe_image,  # V4
                   extract_text_from_audio,  # V4
                   parse_pdf_file,  # V5
                   FinalAnswerTool()],
            additional_authorized_imports=["pandas","markdownify","requests","chess","os"],  # V2 add markdownify & requests V5 add chess and os
            model=model,
            max_steps=6,  # V3 increase steps
            planning_interval=3,  # V3 add structure
            verbosity_level=0,
            use_structured_outputs_internally=True  # V3. Adds structure
        )
        # V3. add Guidance
        #prompt_for_guidance = "\n10. Provide the answer axactly as it is asked, be concise and precise\n\nNow Begin!"
        #self.agent.prompt_templates['system_prompt'] = self.agent.prompt_templates['system_prompt'] + prompt_for_guidance
        # V4. use prompt from the paper as guidance
        # NOTE(review): the prompt below is a runtime string sent to the LLM;
        # its typos ("foillowing") are preserved verbatim from the original.
        prompt = """\n\n
        It is very important to remember the foillowing: You are a general AI assistant. I will ask you a question. Report your thoughts, and
        finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
        YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated
        list of numbers and/or strings.
        If you are asked for a number, don’t use comma to write your number neither use units such as $ or
        percent sign unless specified otherwise.
        If you are asked for a string, don’t use articles, neither abbreviations (e.g. for cities), and write the
        digits in plain text unless specified otherwise.
        If you are asked for a comma separated list, apply the above rules depending of whether the element
        to be put in the list is a number or a string.
        \n\n
        Now it's your turn.
        """
        # Append the GAIA formatting guidance to the stock system prompt.
        self.agent.prompt_templates['system_prompt'] = self.agent.prompt_templates['system_prompt'] + prompt

    def __call__(self, question: str) -> str:
        """Run the agent on a question and return its final answer string."""
        print(f"Agent received question (first 50 chars): {question[:50]}...")
        answer = self.agent.run(question)
        print(f"Agent returning his answer: {answer}")
        return answer