wishmi1234 committed on
Commit
f6c578a
·
verified ·
1 Parent(s): f499570

Update app.py

Browse files

Made changes so that the agent uses the ImageCaptioningTool only when it is needed and otherwise uses other tools. Added a real pretrained ImageCaptioningTool using transformers.

Files changed (1) hide show
  1. app.py +128 -87
app.py CHANGED
@@ -12,6 +12,8 @@ from io import BytesIO
12
  import base64
13
  from typing import Any
14
 
 
 
15
  class DuckDuckGoSearchTool(Tool):
16
  name = "web_search"
17
  description = "Performs a DuckDuckGo web search."
@@ -31,95 +33,68 @@ class DuckDuckGoSearchTool(Tool):
31
  f"[{r['title']}]({r['href']})\n{r['body']}" for r in results
32
  )
33
  model = InferenceClientModel("qwen/Qwen2.5-0.5B-Instruct",
34
- max_tokens=512,
35
- system_message="""
36
- You are a highly capable AI assistant designed to solve real-world, multi-step reasoning tasks in the GAIA benchmark.
37
- Your job is to:
38
- - Search the web or Wikipedia if needed
39
- - Perform Python calculations or date arithmetic
40
-
41
- Instructions:
42
- 1. Think step-by-step and use tools wisely.
43
- 2. Always return a short, direct answer — no explanation or formatting.
44
-
45
- Examples:
46
- - Q: What is the capital of France?
47
- - A: Paris
48
-
49
- Your output must be: a single clean answer string only.
50
-
51
- """
52
  )
53
 
54
 
55
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
56
 
57
  from smolagents.tools import Tool
 
 
 
58
 
59
  class ImageCaptioningTool(Tool):
60
- name = "image_captioner"
61
- description = "Generate a caption for an image using a prompt or question."
62
-
63
- inputs = {
64
- "image": {
65
- "type": "image",
66
- "description": "An image file input."
67
- },
68
- "question": {
69
- "type": "string",
70
- "description": "A prompt or question about the image."
71
- }
72
- }
73
 
74
- output_type = "string"
 
 
 
 
 
 
 
 
75
 
76
- def forward(self, image, question):
77
- # You can now use image and question directly
78
- return f"Caption for the image based on: '{question}'"
79
-
80
- # class ImageCaptioningTool(Tool):
81
- # name = "image_captioner"
82
- # description = "Generate a caption for an image."
83
- # inputs = {"image": Any, "question": "str"}
84
- # output_type = "text"
85
 
86
- # def run(self, inputs: dict) -> str:
87
- # image = inputs.get("image")
88
- # if not image:
89
- # return "No image provided."
90
- # # You could run your model here instead
91
- # return "This is a placeholder caption for the uploaded image."
92
-
93
-
94
- # ---------------------- TOOL CONFIGURATION ---------------------- #
95
- # tools=[
96
- # DuckDuckGoSearchTool(max_results=5, rate_limit=2.0),
97
- # WikipediaSearchTool(user_agent="my-agent", language="en"),
98
- # PythonInterpreterTool(),
99
- # UserInputTool(),
100
- # ImageCaptioningTool(),
101
- # ]
 
 
 
 
 
 
 
 
 
 
 
 
102
  tools = [
103
- ImageCaptioningTool(
104
- name="image-captioning",
105
- description="Generates a caption for an input image."
106
- ),
107
- DuckDuckGoSearchTool(max_results=5),
108
  WikipediaSearchTool(),
109
  PythonInterpreterTool(),
110
  UserInputTool(),
111
- # load_tool("duckduckgo-search", trust_remote_code=True),
112
- # DuckDuckGoSearchTool(),
113
- # load_tool("wikipedia", trust_remote_code=True),
114
- # load_tool("python", trust_remote_code=True),
115
- # load_tool("user-input", trust_remote_code=True),
116
  ]
117
- # # ---------------------- AGENT SETUP ---------------------- #
118
- # agent = CodeAgent(
119
- # model = model,
120
- # tools = tools,
121
- # )
122
-
123
  # ---------------------- MAIN LOGIC ---------------------- #
124
 
125
  class BasicAgent:
@@ -146,20 +121,91 @@ class BasicAgent:
146
 
147
 
148
 
149
-
150
- def run_agent_on_image(image, agent):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  try:
152
- # Wrap image as expected by the agent tool
153
- response = agent("Describe this image", inputs={"image": image})
154
- return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  except Exception as e:
156
  return f"Error: {e}"
 
157
 
158
- # iface = gr.Interface(fn=run_agent_on_image, inputs=gr.Image(type="pil"), outputs="text")
159
- # iface.launch()
160
-
161
 
162
-
163
  def run_and_submit_all( profile: gr.OAuthProfile | None):
164
  """
165
  Fetches all questions, runs the BasicAgent on them, submits all answers,
@@ -211,11 +257,6 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
211
  return f"An unexpected error occurred fetching questions: {e}", None
212
 
213
 
214
- # question_text = item.get("question")
215
- # question_input = {"question": question_text}
216
- # if "image" in item:
217
- # question_input["image"] = item["image"]
218
- # submitted_answer = agent(question_input)
219
  # 3. Run your Agent
220
  results_log = []
221
  answers_payload = []
 
12
  import base64
13
  from typing import Any
14
 
15
+
16
+
17
  class DuckDuckGoSearchTool(Tool):
18
  name = "web_search"
19
  description = "Performs a DuckDuckGo web search."
 
33
  f"[{r['title']}]({r['href']})\n{r['body']}" for r in results
34
  )
35
  model = InferenceClientModel("qwen/Qwen2.5-0.5B-Instruct",
36
+ max_tokens=512
37
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  )
39
 
40
 
41
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
42
 
43
  from smolagents.tools import Tool
44
+ from transformers import pipeline
45
+ from PIL import Image
46
+ import torch
47
 
48
  class ImageCaptioningTool(Tool):
49
+ name = "image-captioning"
50
+ description = "Generates a caption for an input image."
 
 
 
 
 
 
 
 
 
 
 
51
 
52
+ def __init__(self, **kwargs):
53
+ super().__init__(**kwargs)
54
+ # Load the captioning model only once
55
+ self.captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base", device=0 if torch.cuda.is_available() else -1)
56
+
57
+ def use(self, image, question):
58
+ if not isinstance(image, Image.Image):
59
+ image = Image.open(BytesIO(image)) # Handles raw bytes
60
+ captions = self.captioner(image)
61
 
 
 
 
 
 
 
 
 
 
62
 
63
+ # class ImageCaptioningTool(Tool):
64
+ # name = "image-captioning"
65
+ # description = "Generate a caption for an image using a prompt or question."
66
+
67
+ # inputs = {
68
+ # "image": {
69
+ # "type": "image",
70
+ # "description": "An image file input."
71
+ # },
72
+ # "question": {
73
+ # "type": "string",
74
+ # "description": "A prompt or question about the image."
75
+ # }
76
+ # }
77
+
78
+ # output_type = "string"
79
+
80
+ # def forward(self, image, question):
81
+ # # You can now use image and question directly
82
+ # return f"Caption for the image based on: '{question}'"
83
+
84
+ image_captioner = ImageCaptioningTool(
85
+ name="image-captioning",
86
+ description="Generates a caption for an input image."
87
+ )
88
+
89
+ web_search = DuckDuckGoSearchTool(max_results=5)
90
+
91
  tools = [
92
+ image_captioner,
93
+ web_search,
 
 
 
94
  WikipediaSearchTool(),
95
  PythonInterpreterTool(),
96
  UserInputTool(),
 
 
 
 
 
97
  ]
 
 
 
 
 
 
98
  # ---------------------- MAIN LOGIC ---------------------- #
99
 
100
  class BasicAgent:
 
121
 
122
 
123
 
124
+
125
+ system_prompt = """
126
+ You are a highly capable AI assistant designed to solve real-world, multi-step reasoning tasks in the GAIA benchmark.
127
+ Your job is to:
128
+ - Search the web or Wikipedia if needed
129
+ - Perform Python calculations or date arithmetic
130
+ - Automatically search for and describe images if the question mentions or refers to one
131
+
132
+ Instructions:
133
+ 1. Think step-by-step and use tools wisely.
134
+ 2. If the question references an image (e.g. "What’s in this image of..."), search for a relevant image online and generate a caption to assist your reasoning.
135
+ 3. Use the image caption internally to help answer the question, but do not include it in your response.
136
+ 4. Always return a single, short, direct answer — no explanation, formatting, or extra information.
137
+
138
+ Examples:
139
+ - Q: What is the capital of France?
140
+ - A: Paris
141
+
142
+ - Q: What date is 30 days after January 1, 2023?
143
+ - A: January 31, 2023
144
+
145
+ - Q: What is 17 times 4?
146
+ - A: 68
147
+
148
+ - Q: What is the tallest building shown in the image of Dubai’s skyline?
149
+ - A: Burj Khalifa
150
+
151
+ - Q: What fruit is in the image of a bowl on the kitchen table?
152
+ - A: Bananas
153
+
154
+ - Q: What is shown in the picture of the moon landing?
155
+ - A: Astronaut on the Moon
156
+
157
+ Your output must be: a single clean answer string only.
158
+ """
159
+
160
+
161
+ def find_image_online(query):
162
+ """Use DuckDuckGo to find an image related to the query."""
163
+ with DDGS() as ddgs:
164
+ results = ddgs.images(query)
165
+ for result in results:
166
+ if result.get("image"):
167
+ return result["image"]
168
+ return None
169
+
170
+ def download_image(url):
171
+ """Download an image form a URL and return a PIL image."""
172
  try:
173
+ response = requests.get(url)
174
+ response.raise_for_status()
175
+ return Image.open(BytesIO(response.content))
176
+ except Exception:
177
+ return None
178
+
179
+ def ask_agent(question):
180
+ try:
181
+ prompt = system_prompt + "\n\nUser: " + question.strip()
182
+
183
+ image = None
184
+ image_caption = ""
185
+ # Only try to get an image if the question mentions or implies one
186
+ keywords = ["image", "picture","photo","painting", "what's in this picture", "describe this picture"]
187
+ question_lower = question.lower()
188
+ if any(word in question_lower for word in keywords):
189
+ image_url = find_image_online(question)
190
+ if image_url:
191
+ image = download_image(image_url)
192
+ if image:
193
+ # Use the ImageCaptioningTool to get a caption
194
+ image_captioner = [tool for tool in tools if tool.name == "image-captioning"][0]
195
+ image_caption = image_captioner(image=image, question=question)
196
+ #Append the caption to the user's original question
197
+ prompt +=f"\n\nThe image contains: {image_caption}"
198
+
199
+ #Run the agent (image is passed only if present; prompt always includes the caption if available)
200
+ inputs = {"image":image} if image else{}
201
+ return agent.run(prompt, inputs=inputs).strip()
202
+
203
  except Exception as e:
204
  return f"Error: {e}"
205
+
206
 
207
+
 
 
208
 
 
209
  def run_and_submit_all( profile: gr.OAuthProfile | None):
210
  """
211
  Fetches all questions, runs the BasicAgent on them, submits all answers,
 
257
  return f"An unexpected error occurred fetching questions: {e}", None
258
 
259
 
 
 
 
 
 
260
  # 3. Run your Agent
261
  results_log = []
262
  answers_payload = []