Final_Assignment_Template

Sleeping

App Files Files Community

wishmi1234 committed on Jul 29, 2025

Commit

f6c578a

verified ·

1 Parent(s): f499570

Update app.py

Browse files

Made changes so that the agent uses the ImageCaptioningTool only when it is needed and otherwise uses the other tools. Added a real pretrained ImageCaptioningTool using transformers.

Files changed (1) hide show

app.py +128 -87

app.py CHANGED Viewed

@@ -12,6 +12,8 @@ from io import BytesIO
 import base64
 from typing import Any
 class DuckDuckGoSearchTool(Tool):
     name = "web_search"
     description = "Performs a DuckDuckGo web search."
@@ -31,95 +33,68 @@ class DuckDuckGoSearchTool(Tool):
             f"[{r['title']}]({r['href']})\n{r['body']}" for r in results
         )
 model = InferenceClientModel("qwen/Qwen2.5-0.5B-Instruct",
-                            max_tokens=512,
-                            system_message="""
-You are a highly capable AI assistant designed to solve real-world, multi-step reasoning tasks in the GAIA benchmark.
-Your job is to:
-- Search the web or Wikipedia if needed
-- Perform Python calculations or date arithmetic
-Instructions:
-1. Think step-by-step and use tools wisely.
-2. Always return a short, direct answer — no explanation or formatting.
-Examples:
-- Q: What is the capital of France?
-- A: Paris
-Your output must be: a single clean answer string only.
-"""
 )
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 from smolagents.tools import Tool
 class ImageCaptioningTool(Tool):
-    name = "image_captioner"
-    description = "Generate a caption for an image using a prompt or question."
-    inputs = {
-        "image": {
-            "type": "image",
-            "description": "An image file input."
-        },
-        "question": {
-            "type": "string",
-            "description": "A prompt or question about the image."
-        }
-    }
-    output_type = "string"
-    def forward(self, image, question):
-        # You can now use image and question directly
-        return f"Caption for the image based on: '{question}'"
-# class ImageCaptioningTool(Tool):
-#     name = "image_captioner"
-#     description = "Generate a caption for an image."
-#     inputs = {"image": Any, "question": "str"}
-#     output_type = "text"
-#     def run(self, inputs: dict) -> str:
-#         image = inputs.get("image")
-#         if not image:
-#             return "No image provided."
-#         # You could run your model here instead
-#         return "This is a placeholder caption for the uploaded image."
-# ---------------------- TOOL CONFIGURATION ---------------------- #
-# tools=[
-#     DuckDuckGoSearchTool(max_results=5, rate_limit=2.0),
-#     WikipediaSearchTool(user_agent="my-agent", language="en"),
-#     PythonInterpreterTool(),
-#     UserInputTool(),
-#     ImageCaptioningTool(),
-# ]
 tools = [
-    ImageCaptioningTool(
-        name="image-captioning",
-        description="Generates a caption for an input image."
-    ),
-    DuckDuckGoSearchTool(max_results=5),
     WikipediaSearchTool(),
     PythonInterpreterTool(),
     UserInputTool(),
-    # load_tool("duckduckgo-search", trust_remote_code=True),
-    # DuckDuckGoSearchTool(),
-    # load_tool("wikipedia", trust_remote_code=True),
-    # load_tool("python", trust_remote_code=True),
-    # load_tool("user-input", trust_remote_code=True),
 ]
-# # ---------------------- AGENT SETUP ---------------------- #
-# agent = CodeAgent(
-#     model = model,
-#     tools = tools,
-# )
 # ---------------------- MAIN LOGIC ---------------------- #
 class BasicAgent:
@@ -146,20 +121,91 @@ class BasicAgent:
-def run_agent_on_image(image, agent):
     try:
-        # Wrap image as expected by the agent tool
-        response = agent("Describe this image", inputs={"image": image})
-        return response
     except Exception as e:
         return f"Error: {e}"
-# iface = gr.Interface(fn=run_agent_on_image, inputs=gr.Image(type="pil"), outputs="text")
-# iface.launch()
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the BasicAgent on them, submits all answers,
@@ -211,11 +257,6 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         return f"An unexpected error occurred fetching questions: {e}", None
-#     question_text = item.get("question")
-# question_input = {"question": question_text}
-# if "image" in item:
-#     question_input["image"] = item["image"]
-# submitted_answer = agent(question_input)
     # 3. Run your Agent
     results_log = []
     answers_payload = []

 import base64
 from typing import Any
 class DuckDuckGoSearchTool(Tool):
     name = "web_search"
     description = "Performs a DuckDuckGo web search."
             f"[{r['title']}]({r['href']})\n{r['body']}" for r in results
         )
 model = InferenceClientModel("qwen/Qwen2.5-0.5B-Instruct",
+                            max_tokens=512
 )
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 from smolagents.tools import Tool
+from transformers import pipeline
+from PIL import Image
+import torch
 class ImageCaptioningTool(Tool):
+    name = "image-captioning"
+    description = "Generates a caption for an input image."
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Load the captioning model only once
+        self.captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base", device=0 if torch.cuda.is_available() else -1)
+    def use(self, image, question):
+        if not isinstance(image, Image.Image):
+            image = Image.open(BytesIO(image))  # Handles raw bytes
+        captions = self.captioner(image)
+# class ImageCaptioningTool(Tool):
+#     name = "image-captioning"
+#     description = "Generate a caption for an image using a prompt or question."
+#     inputs = {
+#         "image": {
+#             "type": "image",
+#             "description": "An image file input."
+#         },
+#         "question": {
+#             "type": "string",
+#             "description": "A prompt or question about the image."
+#         }
+#     }
+#     output_type = "string"
+#     def forward(self, image, question):
+#         # You can now use image and question directly
+#         return f"Caption for the image based on: '{question}'"
+image_captioner = ImageCaptioningTool(
+    name="image-captioning",
+    description="Generates a caption for an input image."
+)
+web_search = DuckDuckGoSearchTool(max_results=5)
 tools = [
+    image_captioner,
+    web_search,
     WikipediaSearchTool(),
     PythonInterpreterTool(),
     UserInputTool(),
 ]
 # ---------------------- MAIN LOGIC ---------------------- #
 class BasicAgent:
+system_prompt = """
+You are a highly capable AI assistant designed to solve real-world, multi-step reasoning tasks in the GAIA benchmark.
+Your job is to:
+- Search the web or Wikipedia if needed
+- Perform Python calculations or date arithmetic
+- Automatically search for and describe images if the question mentions or refers to one
+Instructions:
+1. Think step-by-step and use tools wisely.
+2. If the question references an image (e.g. "What’s in this image of..."), search for a relevant image online and generate a caption to assist your reasoning.
+3. Use the image caption internally to help answer the question, but do not include it in your response.
+4. Always return a single, short, direct answer — no explanation, formatting, or extra information.
+Examples:
+- Q: What is the capital of France?
+- A: Paris
+- Q: What date is 30 days after January 1, 2023?
+- A: January 31, 2023
+- Q: What is 17 times 4?
+- A: 68
+- Q: What is the tallest building shown in the image of Dubai’s skyline?
+- A: Burj Khalifa
+- Q: What fruit is in the image of a bowl on the kitchen table?
+- A: Bananas
+- Q: What is shown in the picture of the moon landing?
+- A: Astronaut on the Moon
+Your output must be: a single clean answer string only.
+"""
+def find_image_online(query):
+    """Use DuckDuckGo to find an image related to the query."""
+    with DDGS() as ddgs:
+        results = ddgs.images(query)
+        for result in results:
+            if result.get("image"):
+                return result["image"]
+    return None
+def download_image(url):
+    """Download an image form a URL and return a PIL image."""
     try:
+        response = requests.get(url)
+        response.raise_for_status()
+        return Image.open(BytesIO(response.content))
+    except Exception:
+        return None
+def ask_agent(question):
+    try:
+        prompt = system_prompt + "\n\nUser: " + question.strip()
+        image = None
+        image_caption = ""
+        # Only try to get an image if the question mentions or implies one
+        keywords = ["image", "picture","photo","painting", "what's in this picture", "describe this picture"]
+        question_lower = question.lower()
+        if any(word in question_lower for word in keywords):
+            image_url = find_image_online(question)
+            if image_url:
+                image = download_image(image_url)
+                if image:
+                    # Use the ImageCaptioningTool to get a caption
+                    image_captioner = [tool for tool in tools if tool.name == "image-captioning"][0]
+                    image_caption = image_captioner(image=image, question=question)
+                    #Append the caption to the user's original question
+                    prompt +=f"\n\nThe image contains: {image_caption}"
+                #Run the agent (image is passed only if present; prompt always includes the caption if available)
+                inputs = {"image":image} if image else{}
+                return agent.run(prompt, inputs=inputs).strip()
     except Exception as e:
         return f"Error: {e}"
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the BasicAgent on them, submits all answers,
         return f"An unexpected error occurred fetching questions: {e}", None
     # 3. Run your Agent
     results_log = []
     answers_payload = []