# --- Hugging Face Hub file-viewer residue (kept for provenance, commented out) ---
# RCaz's picture
# 2 more llm for audio and video processing
# 982aacc verified
# raw | history | blame
# 5.73 kB
import math
from typing import Optional, Tuple, Literal
from smolagents import tool
import base64
from openai import OpenAI
@tool
def download_and_get_path_for_provided_file(path: str):
    """
    Download and cache a file from the GAIA benchmark dataset.

    Args:
        path (str): Path of the file inside the GAIA dataset repository
            (e.g. "2023/test/063800f6-8832-4856-972b-17b877612533.png").

    Returns:
        str: Local filesystem path of the cached file.

    Raises:
        KeyError: If the HF_TOKEN environment variable is not set.
        Exception: Propagated from huggingface_hub on download failure.
    """
    # Local imports keep the tool self-contained; these names were previously
    # used without being imported anywhere in the file.
    import os
    from huggingface_hub import hf_hub_download

    file_path = hf_hub_download(
        repo_id="gaia-benchmark/GAIA",
        # Previously hard-coded to a single PNG; now downloads the requested file.
        filename=path,
        repo_type="dataset",
        token=os.environ['HF_TOKEN'],
    )
    return file_path
@tool
def extract_text_from_audio(file_path: str) -> str:
    """
    Extract and return text transcription from an audio file.

    Args:
        file_path (str): Path to the audio file to be transcribed.

    Returns:
        str: The extracted text content from the audio file.

    Raises:
        OSError: If the audio file cannot be opened.
        Exception: Propagated from the OpenAI API on transcription failure.

    Examples:
        >>> extract_text_from_audio("meeting_recording.wav")
        "Hello team, welcome to our weekly meeting..."
        >>> extract_text_from_audio("/path/to/audio/interview.mp3")
        "Could you please introduce yourself and your background?"
    """
    client = OpenAI()
    # Context manager guarantees the handle is closed even if the API call
    # raises (the original leaked the open file object).
    with open(file_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="gpt-4o-transcribe",
            file=audio_file,
            response_format="text",
        )
    return transcription
def describe_image(request: str, file_path: str) -> str:
    """
    Extract and return the requested information from an image.

    Args:
        request (str): The information to retrieve from the image.
        file_path (str): Path to the image file to analyze.

    Returns:
        str: The model's textual answer about the image.

    Raises:
        OSError: If the image file cannot be opened.
        Exception: Propagated from the OpenAI API on failure.

    Examples:
        >>> describe_image("how many birds are in the picture", "underwater_picture.jpg")
        "There are 2 birds depicted in an frame placed underwater"
        >>> describe_image("what is the position of the black queen?","chess_board.png")
        "Qd3"
    """
    import mimetypes

    client = OpenAI()

    def encode_image(image_path: str) -> str:
        # Read the raw bytes and return their Base64 text encoding.
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    # Guess the real MIME type from the file extension instead of always
    # claiming JPEG (the doctest above passes a .png); fall back to JPEG
    # for unknown extensions to keep the original behavior.
    mime_type = mimetypes.guess_type(file_path)[0] or "image/jpeg"

    # Getting the Base64 string
    base64_image = encode_image(file_path)
    response = client.responses.create(
        model="gpt-4.1",
        input=[
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": request},
                    {
                        "type": "input_image",
                        "image_url": f"data:{mime_type};base64,{base64_image}",
                    },
                ],
            }
        ],
    )
    return response.output_text
class TestAgent:
    """Wraps a smolagents CodeAgent wired with search, webpage, wiki and audio tools."""

    def __init__(self):
        # Code agent and built-in tools from smolagents.
        from smolagents import (
            CodeAgent,
            DuckDuckGoSearchTool,
            FinalAnswerTool,
            MCPClient,
            OpenAIServerModel,
            Tool,
            VisitWebpageTool,
        )
        # Extra tool from langchain @ https://docs.langchain.com/oss/python/integrations/tools
        # (was: from langchain_community.agent_toolkits import load_tools)
        from langchain_community.agent_toolkits.load_tools import load_tools

        # Wikipedia lookup adapted from langchain, tuned to return 3 results.
        wikipedia_tool = Tool.from_langchain(load_tools(["wikipedia"])[0])
        wikipedia_tool.top_k_results = 3

        # Tools from MCP servers @ https://github.com/mcp — currently disabled:
        # from mcp import StdioServerParameters
        # server_parameters = StdioServerParameters(
        #     command="uvx",
        #     args=["--quiet", "youtubeqa@0.2.1"],
        #     env={"UV_PYTHON": "3.12", **os.environ},
        # )
        # youtube_tools = MCPServerTool(server_params=server_parameters)

        llm = OpenAIServerModel(model_id="gpt-4o")
        # llm = InferenceClientModel("Qwen/Qwen2.5-Coder-32B-Instruct")

        # Instantiate the agent.
        self.agent = CodeAgent(
            tools=[
                extract_text_from_audio,  # homemade tool
                DuckDuckGoSearchTool(),   # basic tools from smolagents
                VisitWebpageTool(),
                wikipedia_tool,           # langchain tool with extra parameters
                # youtube_tools,          # tool from MCP server
                FinalAnswerTool(),
            ],
            additional_authorized_imports=["pandas", "markdownify", "requests"],  # V2 add markdownify & requests
            model=llm,
            max_steps=4,          # V3 increase steps
            planning_interval=2,  # V3 add structure
            verbosity_level=2,
            use_structured_outputs_internally=True,  # V3. Adds structure
        )

        # V3. Append extra guidance to the default system prompt.
        guidance_suffix = "\n10. Provide the answer axactly as it is asked, be concise and precise\n\nNow Begin!"
        self.agent.prompt_templates['system_prompt'] = self.agent.prompt_templates['system_prompt'] + guidance_suffix
        # V4. use prompt from the paper ?

    def __call__(self, question: str) -> str:
        """Run the agent on a question and return its final answer."""
        print(f"Agent received question (first 50 chars): {question[:50]}...")
        answer = self.agent.run(question)
        print(f"Agent returning his answer: {answer}")
        return answer