import base64
from langchain_core.messages import AnyMessage, HumanMessage, AIMessage
from langchain.tools import tool
@tool
def extract_text(img_path: str) -> str:
    """
    Extract text from an image file using a multimodal model.

    Args:
        img_path: Path of an image file (e.g., PNG, JPEG).

    Returns:
        A single string containing the text extracted from the image,
        or an empty string if extraction fails.
    """
    try:
        # Read the image and encode it as base64 for the data URL payload.
        with open(img_path, "rb") as image_file:
            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

        # Guess the real MIME type instead of hard-coding PNG, so JPEG and
        # other formats are labelled correctly in the data URL.
        import mimetypes
        mime_type, _ = mimetypes.guess_type(img_path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/png"  # safe fallback for unknown extensions

        # One human message combining the instruction and the inline image.
        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": (
                            "Extract all the text from this image. "
                            "Return only the extracted text, no explanations."
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # Call the vision-capable model (assumes vision_llm is instantiated
        # at module level — not visible in this chunk; confirm).
        response = vision_llm.invoke(message)
        return response.content.strip()
    except Exception as e:
        # Best-effort tool: report the failure and return an empty string
        # rather than crashing the agent loop.
        print(f"Error extracting text: {str(e)}")
        return ""
@tool
def describe_image(img_path: str, query: str) -> str:
    """
    Generate a detailed description of an image using a multimodal model.

    Reads an image from a path, encodes it, and sends it to a vision-capable
    language model to obtain a comprehensive, natural-language description of
    the image's content — objects, actions, and context — following a
    specific query.

    Args:
        img_path: Path of an image file (e.g., PNG, JPEG).
        query: Information to extract from the image.

    Returns:
        A single string containing a detailed description of the image,
        or an empty string if the call fails.
    """
    try:
        # Read the image and encode it as base64 for the data URL payload.
        with open(img_path, "rb") as image_file:
            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

        # Guess the real MIME type instead of hard-coding PNG, so JPEG and
        # other formats are labelled correctly in the data URL.
        import mimetypes
        mime_type, _ = mimetypes.guess_type(img_path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/png"  # safe fallback for unknown extensions

        # One human message combining the query-driven instruction and the image.
        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": (
                            f"Describe this image in rich detail. Include objects, people, setting, background elements, and any inferred actions or context. Avoid technical jargon. In particular, extract the following information: {query}" ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # Call the vision model (assumes vision_llm is instantiated at
        # module level — not visible in this chunk; confirm).
        response = vision_llm.invoke(message)
        return response.content.strip()
    except Exception as e:
        # Best-effort tool: report the failure and return an empty string
        # rather than crashing the agent loop.
        print(f"Error describing image: {str(e)}")
        return ""