"""This module contains tools for processing images or videos.""" import os import base64 import mimetypes from langchain_core.tools import tool from langchain_google_genai import ChatGoogleGenerativeAI @tool def query_image(image_path: str, query: str) -> str: """Uses a multimodal LLM to answer a query for a given image. Args: image_path (str): The path to the image to process query (str): The query to be answered based on the image Returns: str: Answer of the query based on the image """ llm = ChatGoogleGenerativeAI( model="gemini-2.0-flash-001", temperature=0.8, max_tokens=None, timeout=None, max_retries=2, google_api_key=os.getenv("GOOGLE_API_KEY") # Get API key from environment variable ) with open(image_path, "rb") as f: image_bytes = f.read() mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg" image_b64 = base64.b64encode(image_bytes).decode("utf-8") image_dict = { "mime_type": mime_type, "data": image_b64 } response = llm.invoke( input=query, images=[image_dict] ) return response.content