RCaz's picture
Update agent.py
2bcf72e verified
raw
history blame
4.56 kB
import math
from typing import Optional, Tuple, Literal
from smolagents import tool
@tool
def extract_text_from_audio(file_path: str) -> str:
    """Transcribe the speech contained in an audio file and return it as text.

    Args:
        file_path: Path to the audio file whose speech should be transcribed.

    Returns:
        The text recognized in the audio file.
    """
    # Local import keeps the tool function self-contained.
    import speech_recognition as sr

    # NOTE: a second `extract_text_from_audio` is defined further down in this
    # file and rebinds the name, so that later definition is the one the agent
    # actually receives.
    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as source:
        # Load the audio data into memory.
        audio_data = recognizer.record(source)
        # Convert speech to text; any failure propagates to the caller.
        return recognizer.recognize_google(audio_data)
@tool
def extract_text_from_audio(file_path: str) -> str:
    """
    Extract and return the text transcription of an audio file using speech recognition.

    This tool uses Google's speech recognition API to convert spoken audio content
    into text. It supports the audio formats readable by the SpeechRecognition
    library (WAV, AIFF and FLAC); other formats such as MP3 must be converted first.

    Args:
        file_path: Path to the audio file to be transcribed. The file should
            be in a format compatible with the SpeechRecognition library.

    Returns:
        The extracted text content from the audio file. If the file cannot be
        read or the speech cannot be recognized, no exception is raised;
        instead a message starting with "Error while transcribing" that names
        the failure is returned.

    Examples:
        >>> extract_text_from_audio("meeting_recording.wav")
        "Hello team, welcome to our weekly meeting..."
        >>> extract_text_from_audio("/path/to/audio/interview.flac")
        "Could you please introduce yourself and your background?"
    """
    # Local import keeps the tool function self-contained.
    import speech_recognition as sr

    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(file_path) as source:
            # Load the audio data into memory.
            audio_data = recognizer.record(source)
            # Convert speech to text.
            return recognizer.recognize_google(audio_data)
    except Exception as e:
        # Best-effort tool: report the failure as text so the declared `str`
        # return type holds. The exception type is included because some
        # recognition errors carry an empty message.
        return f"Error while transcribing '{file_path}': {type(e).__name__}: {e}"
class TestAgent:
    """Callable wrapper around a smolagents ``CodeAgent``.

    The agent is given an audio transcription tool, web search, web page
    visiting, a Wikipedia tool loaded through LangChain, and the final-answer
    tool. Calling an instance with a question runs the agent on it and returns
    the agent's answer.
    """

    def __init__(self):
        """Create the model, assemble the tools and build the agent."""
        # import code agent and basic tools from smolagents
        from smolagents import (
            CodeAgent,
            DuckDuckGoSearchTool,
            FinalAnswerTool,
            OpenAIServerModel,
            Tool,
            VisitWebpageTool,
        )

        # import additional tool from langchain @ https://docs.langchain.com/oss/python/integrations/tools
        #from langchain_community.agent_toolkits import load_tools
        from langchain_community.agent_toolkits.load_tools import load_tools

        wikipedia_tool = Tool.from_langchain(load_tools(["wikipedia"])[0])
        # NOTE(review): this sets the attribute on the smolagents wrapper object;
        # confirm that it actually reaches the underlying LangChain Wikipedia wrapper.
        wikipedia_tool.top_k_results = 3

        # import tools from MCP servers @ https://github.com/mcp
        #from mcp import StdioServerParameters
        #server_parameters = StdioServerParameters(command="uvx",
        #                                          args=["--quiet", "youtubeqa@0.2.1"],
        #                                          env={"UV_PYTHON": "3.12", **os.environ},
        #                                          )
        #youtube_tools = MCPServerTool(server_params=server_parameters)

        model = OpenAIServerModel(model_id="gpt-4o")
        #model = InferenceClientModel("Qwen/Qwen2.5-Coder-32B-Instruct")

        # Instantiate the agent
        self.agent = CodeAgent(
            tools=[
                extract_text_from_audio,  # homemade tool
                DuckDuckGoSearchTool(),  # basic tools from smolagents
                VisitWebpageTool(),
                wikipedia_tool,  # tool from langchain with extra parameters
                #youtube_tools,  # tool from MCP server
                FinalAnswerTool(),
            ],
            additional_authorized_imports=["pandas", "markdownify", "requests"],  # V2 add markdownify & requests
            model=model,
            max_steps=4,  # V3 increase steps
            planning_interval=2,  # V3 add structure
            verbosity_level=2,
            use_structured_outputs_internally=True,  # V3. Adds structure
        )

        # V3. add guidance: append one extra rule to the agent's system prompt template.
        prompt_for_guidance = "\n10. Provide the answer exactly as it is asked, be concise and precise\n\nNow Begin!"
        self.agent.prompt_templates['system_prompt'] = (
            self.agent.prompt_templates['system_prompt'] + prompt_for_guidance
        )

    def __call__(self, question: str) -> str:
        """Run the agent on ``question`` and return its answer.

        Args:
            question: The question to hand to the agent.

        Returns:
            Whatever ``self.agent.run(question)`` returns, passed through unchanged.
        """
        print(f"Agent received question (first 50 chars): {question[:50]}...")
        answer = self.agent.run(question)
        print(f"Agent returning his answer: {answer}")
        return answer