Agent_Course_Final_Assignment

Sleeping

App Files Files Community

giulia-fontanella committed on Jun 5, 2025

Commit

9e53814

verified ·

1 Parent(s): b6f6740

Update tools.py

Browse files

Files changed (1) hide show

tools.py +100 -99

tools.py CHANGED Viewed

@@ -46,109 +46,110 @@ def read_python(file_path: str) -> str:
     except Exception as e:
         return f"Error reading Python file: {str(e)}"
-@tool
-def extract_text_from_image(img_path: str) -> str:
-    """
-    Extract text from an image file using a multimodal model.
-    Args:
-        img_path: A string representing the url of an image (e.g., PNG, JPEG).
-    Returns:
-        A single string containing the concatenated text extracted from the image.
-    """
-    all_text = ""
-    try:
-        # Read image and encode as base64
-        with open(img_path, "rb") as image_file:
-            image_bytes = image_file.read()
-        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
-        # Prepare the prompt including the base64 image data
-        message = [
-            HumanMessage(
-                content=[
-                    {
-                        "type": "text",
-                        "text": (
-                            "Extract all the text from this image. "
-                            "Return only the extracted text, no explanations."
-                        ),
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/png;base64,{image_base64}"
                         },
-                    },
-                ]
-            )
-        ]
-        # Call the vision-capable model
-        response = vision_llm.invoke(message)
-        # Append extracted text
-        all_text += response.content + "\n\n"
-        return all_text.strip()
-    except Exception as e:
-        error_msg = f"Error extracting text: {str(e)}"
-        print(error_msg)
-        return ""
-@tool
-def describe_image(img_path: str, query: str) -> str:
-    """
-    Generate a detailed description of an image using a multimodal model.
-    This function reads a image from an url, encodes it, and sends it to a
-    vision-capable language model to obtain a comprehensive, natural language
-    description of the image's content, including its objects, actions, and context,
-    following a specific query.
-    Args:
-        img_path: A string representing the url of an image (e.g., PNG, JPEG).
-        query: Information to extract from the image.
-    Returns:
-        A single string containing a detailed description of the image.
-    """
-    try:
-        # Read image and encode as base64
-        with open(img_path, "rb") as image_file:
-            image_bytes = image_file.read()
-        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
-        # Prepare message payload
-        message = [
-            HumanMessage(
-                content=[
-                    {
-                        "type": "text",
-                        "text": (
-                            f"Describe this image in rich detail. Include objects, people, setting, background elements, and any inferred actions or context. Avoid technical jargon. In particular, extract the following information: {query}"                        ),
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/png;base64,{image_base64}"
                         },
-                    },
-                ]
-            )
-        ]
-        response = vision_llm.invoke(message)
-        return response.content.strip()
-    except Exception as e:
-        error_msg = f"Error describing image: {str(e)}"
-        print(error_msg)
-        return ""
 @tool
 def wiki_search(query: str) -> str:

     except Exception as e:
         return f"Error reading Python file: {str(e)}"
+def make_text_from_image_tool(vision_llm):
+    @tool
+    def extract_text_from_image(img_path: str) -> str:
+        """
+        Extract text from an image file using a multimodal model.
+        Args:
+            img_path: A string representing the url of an image (e.g., PNG, JPEG).
+        Returns:
+            A single string containing the concatenated text extracted from the image.
+        """
+        all_text = ""
+        try:
+            # Read image and encode as base64
+            with open(img_path, "rb") as image_file:
+                image_bytes = image_file.read()
+            image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+            # Prepare the prompt including the base64 image data
+            message = [
+                HumanMessage(
+                    content=[
+                        {
+                            "type": "text",
+                            "text": (
+                                "Extract all the text from this image. "
+                                "Return only the extracted text, no explanations."
+                            ),
                         },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{image_base64}"
+                            },
+                        },
+                    ]
+                )
+            ]
+            # Call the vision-capable model
+            response = vision_llm.invoke(message)
+            # Append extracted text
+            all_text += response.content + "\n\n"
+            return all_text.strip()
+        except Exception as e:
+            error_msg = f"Error extracting text: {str(e)}"
+            print(error_msg)
+            return ""
+def make_describe_image_tool(vision_llm):
+    @tool
+    def describe_image(img_path: str, query: str) -> str:
+        """
+        Generate a detailed description of an image using a multimodal model.
+        This function reads a image from an url, encodes it, and sends it to a
+        vision-capable language model to obtain a comprehensive, natural language
+        description of the image's content, including its objects, actions, and context,
+        following a specific query.
+        Args:
+            img_path: A string representing the url of an image (e.g., PNG, JPEG).
+            query: Information to extract from the image.
+        Returns:
+            A single string containing a detailed description of the image.
+        """
+        try:
+            # Read image and encode as base64
+            with open(img_path, "rb") as image_file:
+                image_bytes = image_file.read()
+            image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+            # Prepare message payload
+            message = [
+                HumanMessage(
+                    content=[
+                        {
+                            "type": "text",
+                            "text": (
+                                f"Describe this image in rich detail. Include objects, people, setting, background elements, and any inferred actions or context. Avoid technical jargon. In particular, extract the following information: {query}"                        ),
                         },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{image_base64}"
+                            },
+                        },
+                    ]
+                )
+            ]
+            response = vision_llm.invoke(message)
+            return response.content.strip()
+        except Exception as e:
+            error_msg = f"Error describing image: {str(e)}"
+            print(error_msg)
+            return ""
 @tool
 def wiki_search(query: str) -> str: