Agent_Course_Final_Assignment

Sleeping

App Files Files Community

giulia-fontanella committed on Jun 4, 2025

Commit

3fae792

verified ·

1 Parent(s): a8e3583

Update tools.py

Browse files

Files changed (1) hide show

tools.py +52 -1

tools.py CHANGED Viewed

@@ -47,5 +47,56 @@ def extract_text(img_path: str) -> str:
         print(error_msg)
         return ""

         print(error_msg)
         return ""
+def describe_image(img_path: str, query: str) -> str:
+    """
+    Describe a local image with a vision-capable model, guided by a query.
+
+    The file is read as raw bytes, base64-encoded, and embedded in a data URL
+    that is sent to ``vision_llm`` together with a text prompt asking for a
+    rich description plus the specific information named in ``query``.
+
+    Args:
+        img_path: A string path to a local image file (e.g., PNG, JPEG).
+        query: Information to extract from the image.
+
+    Returns:
+        The model's reply with surrounding whitespace stripped, or an empty
+        string if reading the file or calling the model raised an exception
+        (the error is printed, not re-raised).
+    """
+    # Local import: only this function needs it, and the module's import
+    # block is not visible in this hunk.
+    import mimetypes
+
+    try:
+        # Read image and encode as base64
+        with open(img_path, "rb") as image_file:
+            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
+
+        # Label the data URL with the type implied by the file extension; a
+        # hard-coded "image/png" mislabels JPEG and other formats. Fall back
+        # to PNG when the extension is unknown or not an image type.
+        mime_type, _ = mimetypes.guess_type(img_path)
+        if not mime_type or not mime_type.startswith("image/"):
+            mime_type = "image/png"
+
+        # Prepare message payload
+        prompt = (
+            "Describe this image in rich detail. Include objects, people, "
+            "setting, background elements, and any inferred actions or "
+            "context. Avoid technical jargon. In particular, extract the "
+            f"following information: {query}"
+        )
+        message = [
+            HumanMessage(
+                content=[
+                    {"type": "text", "text": prompt},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:{mime_type};base64,{image_base64}"
+                        },
+                    },
+                ]
+            )
+        ]
+
+        # Call the vision model (assumes vision_llm is previously instantiated)
+        response = vision_llm.invoke(message)
+        content = response.content
+        if not isinstance(content, str):
+            # NOTE(review): LangChain message content may be a list of parts
+            # (strings or dicts with a "text" field) rather than one string;
+            # keep only the text so a valid answer is not lost to .strip().
+            content = "".join(
+                part if isinstance(part, str) else part.get("text", "")
+                for part in content
+            )
+        return content.strip()
+    except Exception as e:
+        # Best-effort tool: report the failure and return an empty result,
+        # the same print-and-return-"" pattern visible at the end of
+        # extract_text just above.
+        error_msg = f"Error describing image: {str(e)}"
+        print(error_msg)
+        return ""