import base64
import mimetypes

from langchain.tools import tool
from langchain_core.messages import AnyMessage, HumanMessage, AIMessage
| |
|
| |
|
@tool
def extract_text(img_path: str) -> str:
    """
    Extract text from an image file using a multimodal model.

    Args:
        img_path: Path to a local image file (e.g., PNG, JPEG).
            (The previous docstring said "url", but the code opens it
            as a local file.)

    Returns:
        A single string containing the text extracted from the image,
        or an empty string if extraction fails.
    """
    try:
        # Read the raw image bytes from disk.
        with open(img_path, "rb") as image_file:
            image_bytes = image_file.read()

        # Base64-encode so the image can be embedded in a data URL.
        image_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # Fix: the original hard-coded image/png even for JPEGs; guess the
        # real MIME type from the file name and fall back to PNG.
        mime_type, _ = mimetypes.guess_type(img_path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/png"

        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": (
                            "Extract all the text from this image. "
                            "Return only the extracted text, no explanations."
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # NOTE(review): vision_llm is expected to be a vision-capable chat
        # model defined elsewhere in this module — confirm it is in scope.
        response = vision_llm.invoke(message)

        # A single image yields a single response; the original's unused
        # "all_text" accumulator ("" += content + "\n\n", then strip) is
        # dropped as dead weight with identical output.
        return response.content.strip()
    except Exception as e:
        # Best-effort tool: log and return an empty string rather than
        # propagate the error into the agent loop.
        print(f"Error extracting text: {str(e)}")
        return ""
| |
|
| |
|
@tool
def describe_image(img_path: str, query: str) -> str:
    """
    Generate a detailed description of an image using a multimodal model.

    Reads an image from a local file path, base64-encodes it, and sends it
    to a vision-capable language model to obtain a comprehensive, natural
    language description of the image's content (objects, actions, context),
    guided by a specific query.

    Args:
        img_path: Path to a local image file (e.g., PNG, JPEG).
            (The previous docstring said "url", but the code opens it
            as a local file.)
        query: Information to extract from the image.

    Returns:
        A single string containing a detailed description of the image,
        or an empty string if the call fails.
    """
    try:
        # Read the raw image bytes from disk.
        with open(img_path, "rb") as image_file:
            image_bytes = image_file.read()

        # Base64-encode so the image can be embedded in a data URL.
        image_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # Fix: the original hard-coded image/png even for JPEGs; guess the
        # real MIME type from the file name and fall back to PNG.
        mime_type, _ = mimetypes.guess_type(img_path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/png"

        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": (
                            f"Describe this image in rich detail. Include objects, people, setting, background elements, and any inferred actions or context. Avoid technical jargon. In particular, extract the following information: {query}"
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # NOTE(review): vision_llm is expected to be a vision-capable chat
        # model defined elsewhere in this module — confirm it is in scope.
        response = vision_llm.invoke(message)

        return response.content.strip()
    except Exception as e:
        # Best-effort tool: log and return an empty string rather than
        # propagate the error into the agent loop.
        print(f"Error describing image: {str(e)}")
        return ""