Spaces:
Paused
Paused
File size: 3,854 Bytes
8d1819a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import base64
from python.helpers.print_style import PrintStyle
from python.helpers.tool import Tool, Response
from python.helpers import runtime, files, images
from mimetypes import guess_type
from python.helpers import history
# image optimization and token estimation for context window
# Passed as `max_pixels` to images.compress_image when loading each image.
MAX_PIXELS = 768_000
# Passed as `quality` to images.compress_image (presumably the JPEG quality
# setting, since the result is later labelled image/jpeg - confirm in helper).
QUALITY = 75
# Token count charged per content item when the images are added to history.
TOKENS_ESTIMATE = 1500
class VisionLoad(Tool):
    """Tool that loads image files and adds them to the agent's history.

    ``execute`` reads and compresses every requested image into
    ``self.images_dict`` (path -> base64 string, or ``None`` when processing
    failed). ``after_execution`` then turns that mapping into raw message
    content for the LLM, or records that nothing was processed.
    """

    async def execute(self, paths: "list[str] | None" = None, **kwargs) -> Response:
        """Load, compress and base64-encode the images found at ``paths``.

        Args:
            paths: Paths of the files to load. ``None`` is treated as an empty
                list and a single bare string as a one-element list.
            **kwargs: Ignored; accepted for interface compatibility.

        Returns:
            A placeholder ``Response`` that does not break the loop; the
            actual result is produced in ``after_execution``.
        """
        # Reset on every call so results of a previous run are not reused.
        self.images_dict = {}

        if paths is None:
            paths = []
        elif isinstance(paths, str):
            # A bare string would otherwise be iterated character by character.
            paths = [paths]

        for path in paths:
            if not await runtime.call_development_function(files.exists, str(path)):
                continue
            if path in self.images_dict:
                # Same path requested more than once; keep the first result.
                continue

            mime_type, _ = guess_type(str(path))
            if not (mime_type and mime_type.startswith("image/")):
                # Files whose guessed type is not an image are skipped silently.
                continue

            try:
                # Read binary file
                file_content = await runtime.call_development_function(
                    files.read_file_base64, str(path)
                )
                file_content = base64.b64decode(file_content)
                # Compress and convert to JPEG
                compressed = images.compress_image(
                    file_content, max_pixels=MAX_PIXELS, quality=QUALITY
                )
                # Encode as base64
                file_content_b64 = base64.b64encode(compressed).decode("utf-8")
                # DEBUG: Save compressed image
                # await runtime.call_development_function(
                #     files.write_file_base64, str(path), file_content_b64
                # )
                # Store the payload only; the data URL is built in after_execution.
                self.images_dict[path] = file_content_b64
            except Exception as e:
                # Best effort: remember the failure so it is reported to the
                # LLM instead of aborting the remaining images.
                self.images_dict[path] = None
                PrintStyle().error(f"Error processing image {path}: {e}")
                self.agent.context.log.log(
                    "warning", f"Error processing image {path}: {e}"
                )

        return Response(message="dummy", break_loop=False)

    async def after_execution(self, response: Response, **kwargs):
        """Add the loaded images (or an error note) to history and log a summary.

        Args:
            response: The response returned by ``execute``; not used here.
            **kwargs: Ignored; accepted for interface compatibility.
        """
        # Tolerate being called without a preceding execute().
        images_dict = getattr(self, "images_dict", None) or {}

        # build image data messages for LLMs, or error message
        if images_dict:
            content = []
            for path, image in images_dict.items():
                if image:
                    content.append(
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{image}"},
                        }
                    )
                else:
                    # f-string so a non-str path cannot raise TypeError here.
                    content.append(
                        {
                            "type": "text",
                            "text": f"Error processing image {path}",
                        }
                    )
            # append as raw message content for LLMs with vision tokens estimate
            msg = history.RawMessage(
                raw_content=content, preview="<Base64 encoded image data>"
            )
            self.agent.hist_add_message(
                False, content=msg, tokens=TOKENS_ESTIMATE * len(content)
            )
            # The count includes images that failed to process.
            message = f"{len(images_dict)} images processed"
        else:
            self.agent.hist_add_tool_result(self.name, "No images processed")
            message = "No images processed"

        # print and log short version
        PrintStyle(
            font_color="#1B4F72", background_color="white", padding=True, bold=True
        ).print(f"{self.agent.agent_name}: Response from tool '{self.name}'")
        PrintStyle(font_color="#85C1E9").print(message)
        self.log.update(result=message)
|