Spaces:

Tsitsi19
/

agent-zero-ai

Paused

App Files Files Community

agent-zero-ai / python /tools /vision_load.py

Tsitsi19

Upload folder using huggingface_hub

8d1819a verified 15 days ago

raw

history blame contribute delete

3.85 kB

	import base64
	from python.helpers.print_style import PrintStyle
	from python.helpers.tool import Tool, Response
	from python.helpers import runtime, files, images
	from mimetypes import guess_type
	from python.helpers import history

	# Image optimization and token estimation for the context window.
	# MAX_PIXELS is passed to images.compress_image() as `max_pixels`.
	MAX_PIXELS = 768_000
	# QUALITY is passed to images.compress_image() as `quality`.
	QUALITY = 75
	# Per-content-item token estimate; multiplied by the number of content
	# items when the image message is added to the agent history.
	TOKENS_ESTIMATE = 1500


	class VisionLoad(Tool):
	    """Tool that loads image files and adds them to the agent history.

	    ``execute`` reads, compresses and base64-encodes every existing image
	    path into ``self.images_dict``; ``after_execution`` turns that mapping
	    into a raw history message (or a "No images processed" tool result).
	    """

	    async def execute(self, paths: "list[str] | None" = None, **kwargs) -> Response:
	        """Load and compress the images found at ``paths``.

	        Args:
	            paths: File paths to load. ``None`` is treated as an empty list
	                and a single string as a one-element list. Paths that do
	                not exist, or whose guessed MIME type is not ``image/*``,
	                are skipped silently.
	            **kwargs: Ignored; accepted for interface compatibility.

	        Returns:
	            A placeholder ``Response`` that does not break the loop. The
	            actual result is published by ``after_execution``.
	        """
	        # Maps path -> base64 payload, or None when processing failed.
	        self.images_dict = {}

	        if paths is None:
	            paths = []
	        elif isinstance(paths, str):
	            # A bare string would otherwise be iterated character by character.
	            paths = [paths]

	        for path in paths:
	            # Skip duplicates before doing any file-system round trip.
	            if path in self.images_dict:
	                continue
	            if not await runtime.call_development_function(files.exists, str(path)):
	                continue

	            mime_type, _ = guess_type(str(path))
	            if not (mime_type and mime_type.startswith("image/")):
	                continue

	            try:
	                self.images_dict[path] = await self._load_image_b64(str(path))
	            except Exception as e:
	                # Best effort: record the failure so after_execution can
	                # report it, and keep processing the remaining paths.
	                self.images_dict[path] = None
	                PrintStyle().error(f"Error processing image {path}: {e}")
	                self.agent.context.log.log("warning", f"Error processing image {path}: {e}")

	        return Response(message="dummy", break_loop=False)

	    async def _load_image_b64(self, path: str) -> str:
	        """Read ``path``, compress it and return the result base64-encoded."""
	        raw_b64 = await runtime.call_development_function(
	            files.read_file_base64, path
	        )
	        # compress_image is expected to emit JPEG bytes; after_execution
	        # labels the payload as image/jpeg accordingly.
	        compressed = images.compress_image(
	            base64.b64decode(raw_b64), max_pixels=MAX_PIXELS, quality=QUALITY
	        )
	        return base64.b64encode(compressed).decode("utf-8")

	    async def after_execution(self, response: Response, **kwargs):
	        """Publish the images loaded by ``execute`` and log a short summary.

	        Successfully loaded images become ``image_url`` content items
	        (JPEG data URLs); failed ones become ``text`` error items. When
	        nothing was collected, a "No images processed" tool result is
	        added instead.

	        Args:
	            response: The response returned by ``execute`` (unused).
	            **kwargs: Ignored; accepted for interface compatibility.
	        """
	        # build image data messages for LLMs, or error message
	        if self.images_dict:
	            content = []
	            for path, image in self.images_dict.items():
	                if image:
	                    content.append(
	                        {
	                            "type": "image_url",
	                            "image_url": {"url": f"data:image/jpeg;base64,{image}"},
	                        }
	                    )
	                else:
	                    content.append(
	                        {
	                            "type": "text",
	                            "text": f"Error processing image {path}",
	                        }
	                    )
	            # append as raw message content for LLMs with vision tokens estimate
	            msg = history.RawMessage(
	                raw_content=content, preview="<Base64 encoded image data>"
	            )
	            self.agent.hist_add_message(
	                False, content=msg, tokens=TOKENS_ESTIMATE * len(content)
	            )
	            message = f"{len(self.images_dict)} images processed"
	        else:
	            self.agent.hist_add_tool_result(self.name, "No images processed")
	            message = "No images processed"

	        # print and log short version
	        PrintStyle(
	            font_color="#1B4F72", background_color="white", padding=True, bold=True
	        ).print(f"{self.agent.agent_name}: Response from tool '{self.name}'")
	        PrintStyle(font_color="#85C1E9").print(message)
	        self.log.update(result=message)