import math
from typing import Optional, Tuple, Literal

from smolagents import tool


@tool
def extract_text_from_audio(file_path: str) -> str:
    """Transcribe the speech contained in an audio file and return it as text.

    The file is loaded with the SpeechRecognition library and transcribed
    through Google's web speech recognition API. Only formats readable by
    speech_recognition.AudioFile are supported (WAV, AIFF and FLAC); other
    formats such as MP3 must be converted beforehand. When the file cannot be
    read or transcribed, no exception is raised: a message starting with
    "Error extracting text from audio" is returned instead of a transcript.

    Args:
        file_path: Path to the audio file to transcribe (WAV, AIFF or FLAC).

    Returns:
        The transcribed text, or an error message describing why the
        transcription failed.
        Example, extract_text_from_audio("meeting_recording.wav") may return
        "Hello team, welcome to our weekly meeting..."
    """
    # Imported lazily so the module can be loaded without the optional
    # speech_recognition dependency until the tool is actually used.
    import speech_recognition as sr

    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(file_path) as source:
            # Load the whole audio file into memory.
            audio_data = recognizer.record(source)
        # Convert speech to text (network call to Google's API).
        return recognizer.recognize_google(audio_data)
    except Exception as e:
        # Tool boundary: report the failure to the agent as text (the declared
        # return type is str) rather than returning the exception object.
        # The type name is included because some speech_recognition errors
        # carry an empty message.
        return f"Error extracting text from audio: {type(e).__name__}: {e}"


class TestAgent:
    """Callable wrapper around a smolagents ``CodeAgent``.

    The agent is given a home-made audio transcription tool, web search,
    web page visiting, a Wikipedia tool loaded from LangChain and the final
    answer tool. Instances are called with a question and return whatever
    the agent's run produces.
    """

    def __init__(self):
        """Create the model, collect the tools and instantiate the agent."""
        # Code agent and basic tools from smolagents.
        from smolagents import (
            CodeAgent,
            DuckDuckGoSearchTool,
            FinalAnswerTool,
            OpenAIServerModel,
            Tool,
            VisitWebpageTool,
        )

        # Additional tool from langchain
        # @ https://docs.langchain.com/oss/python/integrations/tools
        # from langchain_community.agent_toolkits import load_tools
        from langchain_community.agent_toolkits.load_tools import load_tools

        wikipedia_tool = Tool.from_langchain(load_tools(["wikipedia"])[0])
        # NOTE(review): this sets an attribute on the smolagents wrapper;
        # confirm it actually reaches the underlying LangChain Wikipedia
        # wrapper's top_k_results setting.
        wikipedia_tool.top_k_results = 3

        # Tools from MCP servers @ https://github.com/mcp (currently disabled)
        # from mcp import StdioServerParameters
        # server_parameters = StdioServerParameters(command="uvx",
        #                                           args=["--quiet", "youtubeqa@0.2.1"],
        #                                           env={"UV_PYTHON": "3.12", **os.environ},
        #                                           )
        # youtube_tools = MCPServerTool(server_params=server_parameters)

        model = OpenAIServerModel(model_id="gpt-4o")
        # model = InferenceClientModel("Qwen/Qwen2.5-Coder-32B-Instruct")

        # Instantiate the agent.
        self.agent = CodeAgent(
            tools=[
                extract_text_from_audio,  # homemade tool
                DuckDuckGoSearchTool(),  # basic tools from smolagents
                VisitWebpageTool(),
                wikipedia_tool,  # tool from langchain with extra parameters
                # youtube_tools,  # tool from MCP server
                FinalAnswerTool(),
            ],
            # V2: add markdownify & requests
            additional_authorized_imports=["pandas", "markdownify", "requests"],
            model=model,
            max_steps=4,  # V3: increase steps
            planning_interval=2,  # V3: add structure
            verbosity_level=2,
            use_structured_outputs_internally=True,  # V3: adds structure
        )

        # V3: append an extra guidance rule to the default system prompt.
        prompt_for_guidance = (
            "\n10. Provide the answer exactly as it is asked, "
            "be concise and precise\n\nNow Begin!"
        )
        self.agent.prompt_templates["system_prompt"] += prompt_for_guidance

    def __call__(self, question: str) -> str:
        """Run the agent on ``question`` and return its answer.

        Args:
            question: The question to submit to the agent.

        Returns:
            The value returned by ``self.agent.run(question)``, passed through
            unchanged.
        """
        print(f"Agent received question (first 50 chars): {question[:50]}...")
        answer = self.agent.run(question)
        print(f"Agent returning his answer: {answer}")
        return answer