import base64
import mimetypes

from langchain.tools import tool
from langchain_core.messages import AnyMessage, HumanMessage, AIMessage
| |
|
| |
|
@tool
def extract_text(img_path: str) -> str:
    """
    Extract text from an image file using a multimodal model.

    Args:
        img_path: Path to a local image file (e.g., PNG, JPEG).
            (The previous docstring said "url", but the code opens it
            as a local file.)

    Returns:
        A single string containing the text extracted from the image,
        or an empty string if extraction fails.
    """
    try:
        # Read the raw image bytes from disk.
        with open(img_path, "rb") as image_file:
            image_bytes = image_file.read()

        # Base64-encode so the image can be embedded in a data URL.
        image_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # Fix: the original hard-coded image/png even for JPEGs; guess the
        # real MIME type from the file name and fall back to PNG.
        mime_type, _ = mimetypes.guess_type(img_path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/png"

        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": (
                            "Extract all the text from this image. "
                            "Return only the extracted text, no explanations."
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # NOTE(review): vision_llm is expected to be a vision-capable chat
        # model defined elsewhere in this module — confirm it is in scope.
        response = vision_llm.invoke(message)

        # A single image yields a single response; the original's unused
        # "all_text" accumulator ("" += content + "\n\n", then strip) is
        # dropped as dead weight with identical output.
        return response.content.strip()
    except Exception as e:
        # Best-effort tool: log and return an empty string rather than
        # propagate the error into the agent loop.
        print(f"Error extracting text: {str(e)}")
        return ""
| |
|
| |
|
@tool
def describe_image(img_path: str, query: str) -> str:
    """
    Generate a detailed description of an image using a multimodal model.

    Reads an image from a local file path, base64-encodes it, and sends it
    to a vision-capable language model to obtain a comprehensive, natural
    language description of the image's content (objects, actions, context),
    guided by a specific query.

    Args:
        img_path: Path to a local image file (e.g., PNG, JPEG).
            (The previous docstring said "url", but the code opens it
            as a local file.)
        query: Information to extract from the image.

    Returns:
        A single string containing a detailed description of the image,
        or an empty string if the call fails.
    """
    try:
        # Read the raw image bytes from disk.
        with open(img_path, "rb") as image_file:
            image_bytes = image_file.read()

        # Base64-encode so the image can be embedded in a data URL.
        image_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # Fix: the original hard-coded image/png even for JPEGs; guess the
        # real MIME type from the file name and fall back to PNG.
        mime_type, _ = mimetypes.guess_type(img_path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/png"

        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": (
                            f"Describe this image in rich detail. Include objects, people, setting, background elements, and any inferred actions or context. Avoid technical jargon. In particular, extract the following information: {query}"
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # NOTE(review): vision_llm is expected to be a vision-capable chat
        # model defined elsewhere in this module — confirm it is in scope.
        response = vision_llm.invoke(message)

        return response.content.strip()
    except Exception as e:
        # Best-effort tool: log and return an empty string rather than
        # propagate the error into the agent loop.
        print(f"Error describing image: {str(e)}")
        return ""