File size: 3,642 Bytes
9b2bab8
 
4339f99
9b2bab8
 
4339f99
9b2bab8
 
 
d1ebe54
 
797ded5
d1ebe54
 
797ded5
9b2bab8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3fae792
4339f99
07b040d
3fae792
 
d1ebe54
3fae792
 
 
d1ebe54
3fae792
797ded5
 
d1ebe54
3fae792
797ded5
3fae792
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b2bab8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import base64
import mimetypes

from langchain_core.messages import AnyMessage, HumanMessage, AIMessage
from langchain.tools import tool


@tool
def extract_text(img_path: str) -> str:
    """
    Extract text from an image file using a multimodal model.

    Reads a local image file, base64-encodes it, and asks a vision-capable
    chat model to transcribe any text it contains.

    Args:
        img_path: Path to a local image file (e.g., PNG, JPEG).

    Returns:
        A single string containing the text extracted from the image,
        or an empty string if extraction fails.
    """
    try:
        # Read image and encode as base64
        with open(img_path, "rb") as image_file:
            image_bytes = image_file.read()

        image_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # Use the file's actual MIME type in the data URI instead of always
        # claiming PNG; fall back to PNG when the type can't be guessed.
        mime_type, _ = mimetypes.guess_type(img_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/png"

        # Prepare the prompt including the base64 image data
        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": (
                            "Extract all the text from this image. "
                            "Return only the extracted text, no explanations."
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # Call the vision-capable model (vision_llm is expected to be
        # instantiated elsewhere in this module — TODO confirm).
        response = vision_llm.invoke(message)

        return response.content.strip()
    except Exception as e:
        # Best-effort tool: log the failure and return an empty string so
        # the calling agent can continue rather than crash.
        print(f"Error extracting text: {str(e)}")
        return ""


@tool
def describe_image(img_path: str, query: str) -> str:
    """
    Generate a detailed description of an image using a multimodal model.

    Reads a local image file, base64-encodes it, and sends it to a
    vision-capable language model to obtain a comprehensive, natural-language
    description of the image's content — objects, actions, and context —
    focused on a specific query.

    Args:
        img_path: Path to a local image file (e.g., PNG, JPEG).
        query: Information to extract from the image.

    Returns:
        A single string containing a detailed description of the image,
        or an empty string if the call fails.
    """
    try:
        # Read image and encode as base64
        with open(img_path, "rb") as image_file:
            image_bytes = image_file.read()

        image_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # Use the file's actual MIME type in the data URI instead of always
        # claiming PNG; fall back to PNG when the type can't be guessed.
        mime_type, _ = mimetypes.guess_type(img_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/png"

        # Prepare message payload
        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": (
                            "Describe this image in rich detail. Include "
                            "objects, people, setting, background elements, "
                            "and any inferred actions or context. Avoid "
                            "technical jargon. In particular, extract the "
                            f"following information: {query}"
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # Call the vision model (vision_llm is expected to be instantiated
        # elsewhere in this module — TODO confirm).
        response = vision_llm.invoke(message)

        return response.content.strip()

    except Exception as e:
        # Best-effort tool: log the failure and return an empty string so
        # the calling agent can continue rather than crash.
        print(f"Error describing image: {str(e)}")
        return ""