Update agent.py
Browse files
agent.py
CHANGED
|
@@ -57,7 +57,7 @@ class BasicAgent():
|
|
| 57 |
Extract text from an image file using a multimodal model.
|
| 58 |
|
| 59 |
Args:
|
| 60 |
-
img_path: A
|
| 61 |
|
| 62 |
Returns:
|
| 63 |
A single string containing the concatenated text extracted from each image.
|
|
@@ -73,13 +73,13 @@ class BasicAgent():
|
|
| 73 |
|
| 74 |
describe_image(img_path: str, query: str) -> str:
|
| 75 |
Generate a detailed description of an image using a multimodal model.
|
| 76 |
-
This function reads a
|
| 77 |
vision-capable language model to obtain a comprehensive, natural language
|
| 78 |
description of the image's content, including its objects, actions, and context,
|
| 79 |
following a specific query.
|
| 80 |
|
| 81 |
Args:
|
| 82 |
-
img_path: A
|
| 83 |
query: Information to extract from the image
|
| 84 |
|
| 85 |
Returns:
|
|
|
|
| 57 |
Extract text from an image file using a multimodal model.
|
| 58 |
|
| 59 |
Args:
|
| 60 |
+
img_path: A URL pointing to an image (e.g., PNG, JPEG).
|
| 61 |
|
| 62 |
Returns:
|
| 63 |
A single string containing the concatenated text extracted from each image.
|
|
|
|
| 73 |
|
| 74 |
describe_image(img_path: str, query: str) -> str:
|
| 75 |
Generate a detailed description of an image using a multimodal model.
|
| 76 |
+
This function reads an image from a URL, encodes it, and sends it to a
|
| 77 |
vision-capable language model to obtain a comprehensive, natural language
|
| 78 |
description of the image's content, including its objects, actions, and context,
|
| 79 |
following a specific query.
|
| 80 |
|
| 81 |
Args:
|
| 82 |
+
img_path: A URL pointing to an image (e.g., PNG, JPEG).
|
| 83 |
query: Information to extract from the image
|
| 84 |
|
| 85 |
Returns:
|