# GAIA_Agent_Final/tools/analyze_image.py
# Author: pkduongsu — commit 737d955
# Commit note: "eval 45/100, still cannot access files for questions"
import base64
import mimetypes

from dotenv import load_dotenv
from langchain_core.messages import HumanMessage
from langchain_core.tools import tool
from langchain_google_genai import ChatGoogleGenerativeAI
load_dotenv()
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
@tool
def analyze_image(img_path: str, question: str) -> str:
    """
    Answer a question about an image file using a multimodal model.

    Args:
        img_path: Path to the image file on disk.
        question: Natural-language question to ask about the image.

    Returns:
        The model's answer as a string, or an error message if the image
        could not be read or the model call failed (so a calling agent can
        see what went wrong instead of receiving an empty string).
    """
    try:
        # Read the image and encode it as base64 for inline transmission.
        with open(img_path, "rb") as image_file:
            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

        # Derive the MIME type from the file extension instead of
        # hard-coding PNG; fall back to PNG for unknown extensions.
        mime_type, _ = mimetypes.guess_type(img_path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/png"

        # Multimodal prompt: the question as text plus the image as a data URL.
        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": (
                            "Analyze the image and answer the following question: " + question
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # Call the vision-capable model with the prepared message list.
        response = llm.invoke(message)
        return response.content.strip()
    except Exception as e:
        # Surface the failure to the caller rather than silently swallowing
        # it: an agent reading this tool's output can react to the error,
        # whereas an empty string gives it nothing to work with.
        error_msg = f"Error extracting text: {str(e)}"
        print(error_msg)
        return error_msg
if __name__ == "__main__":
    # Manual smoke test: ask the tool a question about a local screenshot.
    sample_path = r"C:\Users\pkduo\OneDrive\Máy tính\HF Agent Course Final\Final_Assignment_Template\Screenshot 2025-05-02 144021.png"
    sample_question = "Review the chess position provided in the image. It is white's turn. Provide the correct next move for white which guarantees a win. Please provide your response in algebraic notation.?"
    # LangChain tools are called via .invoke() with a dict of argument names.
    print(analyze_image.invoke({"img_path": sample_path, "question": sample_question}))