# --- Hugging Face Hub file-viewer residue (kept for provenance, commented out) ---
# RCaz's picture
# 2 more llm for audio and video processing
# 982aacc verified
# raw | history | blame
# 5.73 kB
import math
from typing import Optional, Tuple, Literal
from smolagents import tool
import base64
from openai import OpenAI
@tool
def download_and_get_path_for_provided_file(path: str):
    """
    Download and cache a file from the GAIA benchmark dataset.

    Args:
        path (str): Path of the file inside the GAIA dataset repository
            (e.g. "2023/test/063800f6-8832-4856-972b-17b877612533.png").

    Returns:
        str: Local filesystem path of the cached file.

    Raises:
        KeyError: If the HF_TOKEN environment variable is not set.
        Exception: Propagated from huggingface_hub on download failure.
    """
    # Local imports keep the tool self-contained; these names were previously
    # used without being imported anywhere in the file.
    import os
    from huggingface_hub import hf_hub_download

    file_path = hf_hub_download(
        repo_id="gaia-benchmark/GAIA",
        # Previously hard-coded to a single PNG; now downloads the requested file.
        filename=path,
        repo_type="dataset",
        token=os.environ['HF_TOKEN'],
    )
    return file_path
@tool
def extract_text_from_audio(file_path: str) -> str:
    """
    Extract and return text transcription from an audio file.

    Args:
        file_path (str): Path to the audio file to be transcribed.

    Returns:
        str: The extracted text content from the audio file.

    Raises:
        OSError: If the audio file cannot be opened.
        Exception: Propagated from the OpenAI API on transcription failure.

    Examples:
        >>> extract_text_from_audio("meeting_recording.wav")
        "Hello team, welcome to our weekly meeting..."
        >>> extract_text_from_audio("/path/to/audio/interview.mp3")
        "Could you please introduce yourself and your background?"
    """
    client = OpenAI()
    # Context manager guarantees the handle is closed even if the API call
    # raises (the original leaked the open file object).
    with open(file_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="gpt-4o-transcribe",
            file=audio_file,
            response_format="text",
        )
    return transcription
def describe_image(request: str, file_path: str) -> str:
    """
    Extract and return the requested information from an image.

    Args:
        request (str): The information to retrieve from the image.
        file_path (str): Path to the image file to analyze.

    Returns:
        str: The model's textual answer about the image.

    Raises:
        OSError: If the image file cannot be opened.
        Exception: Propagated from the OpenAI API on failure.

    Examples:
        >>> describe_image("how many birds are in the picture", "underwater_picture.jpg")
        "There are 2 birds depicted in an frame placed underwater"
        >>> describe_image("what is the position of the black queen?","chess_board.png")
        "Qd3"
    """
    import mimetypes

    client = OpenAI()

    def encode_image(image_path: str) -> str:
        # Read the raw bytes and return their Base64 text encoding.
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    # Guess the real MIME type from the file extension instead of always
    # claiming JPEG (the doctest above passes a .png); fall back to JPEG
    # for unknown extensions to keep the original behavior.
    mime_type = mimetypes.guess_type(file_path)[0] or "image/jpeg"

    # Getting the Base64 string
    base64_image = encode_image(file_path)
    response = client.responses.create(
        model="gpt-4.1",
        input=[
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": request},
                    {
                        "type": "input_image",
                        "image_url": f"data:{mime_type};base64,{base64_image}",
                    },
                ],
            }
        ],
    )
    return response.output_text
class TestAgent:
    """Wraps a smolagents CodeAgent wired with search, webpage, wiki and audio tools."""

    def __init__(self):
        # Code agent and built-in tools from smolagents.
        from smolagents import (
            CodeAgent,
            DuckDuckGoSearchTool,
            FinalAnswerTool,
            MCPClient,
            OpenAIServerModel,
            Tool,
            VisitWebpageTool,
        )
        # Extra tool from langchain @ https://docs.langchain.com/oss/python/integrations/tools
        # (was: from langchain_community.agent_toolkits import load_tools)
        from langchain_community.agent_toolkits.load_tools import load_tools

        # Wikipedia lookup adapted from langchain, tuned to return 3 results.
        wikipedia_tool = Tool.from_langchain(load_tools(["wikipedia"])[0])
        wikipedia_tool.top_k_results = 3

        # Tools from MCP servers @ https://github.com/mcp — currently disabled:
        # from mcp import StdioServerParameters
        # server_parameters = StdioServerParameters(
        #     command="uvx",
        #     args=["--quiet", "youtubeqa@0.2.1"],
        #     env={"UV_PYTHON": "3.12", **os.environ},
        # )
        # youtube_tools = MCPServerTool(server_params=server_parameters)

        llm = OpenAIServerModel(model_id="gpt-4o")
        # llm = InferenceClientModel("Qwen/Qwen2.5-Coder-32B-Instruct")

        # Instantiate the agent.
        self.agent = CodeAgent(
            tools=[
                extract_text_from_audio,  # homemade tool
                DuckDuckGoSearchTool(),   # basic tools from smolagents
                VisitWebpageTool(),
                wikipedia_tool,           # langchain tool with extra parameters
                # youtube_tools,          # tool from MCP server
                FinalAnswerTool(),
            ],
            additional_authorized_imports=["pandas", "markdownify", "requests"],  # V2 add markdownify & requests
            model=llm,
            max_steps=4,          # V3 increase steps
            planning_interval=2,  # V3 add structure
            verbosity_level=2,
            use_structured_outputs_internally=True,  # V3. Adds structure
        )

        # V3. Append extra guidance to the default system prompt.
        guidance_suffix = "\n10. Provide the answer axactly as it is asked, be concise and precise\n\nNow Begin!"
        self.agent.prompt_templates['system_prompt'] = self.agent.prompt_templates['system_prompt'] + guidance_suffix
        # V4. use prompt from the paper ?

    def __call__(self, question: str) -> str:
        """Run the agent on a question and return its final answer."""
        print(f"Agent received question (first 50 chars): {question[:50]}...")
        answer = self.agent.run(question)
        print(f"Agent returning his answer: {answer}")
        return answer