Ricardo Teixeira committed on
Commit
ed3a95c
·
1 Parent(s): a87a417

Final submission version

Browse files
Files changed (6) hide show
  1. agent.py +3 -2
  2. code_interpreter.py +16 -2
  3. image_tools.py +0 -310
  4. multimodal_tools.py +17 -13
  5. system_prompt.txt +4 -5
  6. tools.py +3 -4
agent.py CHANGED
@@ -31,12 +31,13 @@ class Agent():
31
  llm = ChatOllama(model=model, temperature=0)
32
  elif provider == 'google':
33
  if not model:
34
- model = "gemini-2.0-flash"
35
  gemini_api_key = os.getenv("GEMINI_API_KEY")
36
  llm = ChatGoogleGenerativeAI(model=model, temperature=0,google_api_key=gemini_api_key)
37
  elif provider == 'groq':
38
  if not model:
39
  model = "meta-llama/llama-4-scout-17b-16e-instruct"
 
40
  groq_api_key = os.getenv("GROQ_API_KEY")
41
  llm = ChatGroq(model=model, temperature=0, groq_api_key=groq_api_key)
42
  else:
@@ -91,7 +92,7 @@ if __name__ == "__main__":
91
  def main():
92
  agent = Agent()
93
  question = "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name."
94
- model = "meta-llama/llama-4-scout-17b-16e-instruct"
95
  graph = agent.build_graph('google', model)
96
  messages = [HumanMessage(content=question)]
97
  messages = graph.invoke({"messages": messages})
 
31
  llm = ChatOllama(model=model, temperature=0)
32
  elif provider == 'google':
33
  if not model:
34
+ model = "gemini-2.5-flash"
35
  gemini_api_key = os.getenv("GEMINI_API_KEY")
36
  llm = ChatGoogleGenerativeAI(model=model, temperature=0,google_api_key=gemini_api_key)
37
  elif provider == 'groq':
38
  if not model:
39
  model = "meta-llama/llama-4-scout-17b-16e-instruct"
40
+ #model = "meta-llama/llama-4-maverick-17b-128e-instruct"
41
  groq_api_key = os.getenv("GROQ_API_KEY")
42
  llm = ChatGroq(model=model, temperature=0, groq_api_key=groq_api_key)
43
  else:
 
92
  def main():
93
  agent = Agent()
94
  question = "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name."
95
+ model = "gemini-2.5-flash"
96
  graph = agent.build_graph('google', model)
97
  messages = [HumanMessage(content=question)]
98
  messages = graph.invoke({"messages": messages})
code_interpreter.py CHANGED
@@ -288,7 +288,7 @@ interpreter_instance = CodeInterpreter()
288
  def execute_code_multilang(code: str, language: str = "python") -> str:
289
  """Execute code in multiple languages (Python, Bash, SQL, C, Java) and return results.
290
  Args:
291
- code (str): The source code to execute.
292
  language (str): The language of the code. Supported: "python", "bash", "sql", "c", "java".
293
  Returns:
294
  A string summarizing the execution results (stdout, stderr, errors, plots, dataframes if any).
@@ -345,4 +345,18 @@ def execute_code_multilang(code: str, language: str = "python") -> str:
345
  "\n**Error Log:**\n```\n" + result["stderr"].strip() + "\n```"
346
  )
347
 
348
- return "\n".join(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  def execute_code_multilang(code: str, language: str = "python") -> str:
289
  """Execute code in multiple languages (Python, Bash, SQL, C, Java) and return results.
290
  Args:
291
+ code (str): The source code to execute as a string.
292
  language (str): The language of the code. Supported: "python", "bash", "sql", "c", "java".
293
  Returns:
294
  A string summarizing the execution results (stdout, stderr, errors, plots, dataframes if any).
 
345
  "\n**Error Log:**\n```\n" + result["stderr"].strip() + "\n```"
346
  )
347
 
348
+ return "\n".join(response)
349
+
350
+ @tool
351
+ def load_code_file(file_path: str):
352
+ """
353
+ Loads the content of a code file to be executed.
354
+ Args:
355
+ file_path (str): the path to the code file.
356
+ Returns:
357
+ str: the code in the file as a string.
358
+ """
359
+ with open(file_path,'r') as f:
360
+ code = f.read()
361
+
362
+ return code
image_tools.py DELETED
@@ -1,310 +0,0 @@
1
- import os
2
- import io
3
- import base64
4
- import uuid
5
- from PIL import Image
6
- from typing import List, Dict, Any, Optional
7
- from PIL import Image, ImageDraw, ImageFont, ImageEnhance, ImageFilter
8
- import numpy as np
9
- from langchain_core.tools import tool
10
-
11
- # Helper functions for image processing
12
- def encode_image(image_path: str) -> str:
13
- """Convert an image file to base64 string."""
14
- with open(image_path, "rb") as image_file:
15
- return base64.b64encode(image_file.read()).decode("utf-8")
16
-
17
-
18
- def decode_image(base64_string: str) -> Image.Image:
19
- """Convert a base64 string to a PIL Image."""
20
- image_data = base64.b64decode(base64_string)
21
- return Image.open(io.BytesIO(image_data))
22
-
23
-
24
- def save_image(image: Image.Image, directory: str = "image_outputs") -> str:
25
- """Save a PIL Image to disk and return the path."""
26
- os.makedirs(directory, exist_ok=True)
27
- image_id = str(uuid.uuid4())
28
- image_path = os.path.join(directory, f"{image_id}.png")
29
- image.save(image_path)
30
- return image_path
31
-
32
- @tool
33
- def analyze_image(image_base64: str) -> Dict[str, Any]:
34
- """
35
- Analyze basic properties of an image (size, mode, color analysis, thumbnail preview).
36
- Args:
37
- image_base64 (str): Base64 encoded image string
38
- Returns:
39
- Dictionary with analysis result
40
- """
41
- try:
42
- img = decode_image(image_base64)
43
- width, height = img.size
44
- mode = img.mode
45
-
46
- if mode in ("RGB", "RGBA"):
47
- arr = np.array(img)
48
- avg_colors = arr.mean(axis=(0, 1))
49
- dominant = ["Red", "Green", "Blue"][np.argmax(avg_colors[:3])]
50
- brightness = avg_colors.mean()
51
- color_analysis = {
52
- "average_rgb": avg_colors.tolist(),
53
- "brightness": brightness,
54
- "dominant_color": dominant,
55
- }
56
- else:
57
- color_analysis = {"note": f"No color analysis for mode {mode}"}
58
-
59
- thumbnail = img.copy()
60
- thumbnail.thumbnail((100, 100))
61
- thumb_path = save_image(thumbnail, "thumbnails")
62
- thumbnail_base64 = encode_image(thumb_path)
63
-
64
- return {
65
- "dimensions": (width, height),
66
- "mode": mode,
67
- "color_analysis": color_analysis,
68
- "thumbnail": thumbnail_base64,
69
- }
70
- except Exception as e:
71
- return {"error": str(e)}
72
-
73
-
74
- @tool
75
- def transform_image(
76
- image_base64: str, operation: str, params: Optional[Dict[str, Any]] = None
77
- ) -> Dict[str, Any]:
78
- """
79
- Apply transformations: resize, rotate, crop, flip, brightness, contrast, blur, sharpen, grayscale.
80
- Args:
81
- image_base64 (str): Base64 encoded input image
82
- operation (str): Transformation operation
83
- params (Dict[str, Any], optional): Parameters for the operation
84
- Returns:
85
- Dictionary with transformed image (base64)
86
- """
87
- try:
88
- img = decode_image(image_base64)
89
- params = params or {}
90
-
91
- if operation == "resize":
92
- img = img.resize(
93
- (
94
- params.get("width", img.width // 2),
95
- params.get("height", img.height // 2),
96
- )
97
- )
98
- elif operation == "rotate":
99
- img = img.rotate(params.get("angle", 90), expand=True)
100
- elif operation == "crop":
101
- img = img.crop(
102
- (
103
- params.get("left", 0),
104
- params.get("top", 0),
105
- params.get("right", img.width),
106
- params.get("bottom", img.height),
107
- )
108
- )
109
- elif operation == "flip":
110
- if params.get("direction", "horizontal") == "horizontal":
111
- img = img.transpose(Image.FLIP_LEFT_RIGHT)
112
- else:
113
- img = img.transpose(Image.FLIP_TOP_BOTTOM)
114
- elif operation == "adjust_brightness":
115
- img = ImageEnhance.Brightness(img).enhance(params.get("factor", 1.5))
116
- elif operation == "adjust_contrast":
117
- img = ImageEnhance.Contrast(img).enhance(params.get("factor", 1.5))
118
- elif operation == "blur":
119
- img = img.filter(ImageFilter.GaussianBlur(params.get("radius", 2)))
120
- elif operation == "sharpen":
121
- img = img.filter(ImageFilter.SHARPEN)
122
- elif operation == "grayscale":
123
- img = img.convert("L")
124
- else:
125
- return {"error": f"Unknown operation: {operation}"}
126
-
127
- result_path = save_image(img)
128
- result_base64 = encode_image(result_path)
129
- return {"transformed_image": result_base64}
130
-
131
- except Exception as e:
132
- return {"error": str(e)}
133
-
134
-
135
- @tool
136
- def draw_on_image(
137
- image_base64: str, drawing_type: str, params: Dict[str, Any]
138
- ) -> Dict[str, Any]:
139
- """
140
- Draw shapes (rectangle, circle, line) or text onto an image.
141
- Args:
142
- image_base64 (str): Base64 encoded input image
143
- drawing_type (str): Drawing type
144
- params (Dict[str, Any]): Drawing parameters
145
- Returns:
146
- Dictionary with result image (base64)
147
- """
148
- try:
149
- img = decode_image(image_base64)
150
- draw = ImageDraw.Draw(img)
151
- color = params.get("color", "red")
152
-
153
- if drawing_type == "rectangle":
154
- draw.rectangle(
155
- [params["left"], params["top"], params["right"], params["bottom"]],
156
- outline=color,
157
- width=params.get("width", 2),
158
- )
159
- elif drawing_type == "circle":
160
- x, y, r = params["x"], params["y"], params["radius"]
161
- draw.ellipse(
162
- (x - r, y - r, x + r, y + r),
163
- outline=color,
164
- width=params.get("width", 2),
165
- )
166
- elif drawing_type == "line":
167
- draw.line(
168
- (
169
- params["start_x"],
170
- params["start_y"],
171
- params["end_x"],
172
- params["end_y"],
173
- ),
174
- fill=color,
175
- width=params.get("width", 2),
176
- )
177
- elif drawing_type == "text":
178
- font_size = params.get("font_size", 20)
179
- try:
180
- font = ImageFont.truetype("arial.ttf", font_size)
181
- except IOError:
182
- font = ImageFont.load_default()
183
- draw.text(
184
- (params["x"], params["y"]),
185
- params.get("text", "Text"),
186
- fill=color,
187
- font=font,
188
- )
189
- else:
190
- return {"error": f"Unknown drawing type: {drawing_type}"}
191
-
192
- result_path = save_image(img)
193
- result_base64 = encode_image(result_path)
194
- return {"result_image": result_base64}
195
-
196
- except Exception as e:
197
- return {"error": str(e)}
198
-
199
-
200
- @tool
201
- def generate_simple_image(
202
- image_type: str,
203
- width: int = 500,
204
- height: int = 500,
205
- params: Optional[Dict[str, Any]] = None,
206
- ) -> Dict[str, Any]:
207
- """
208
- Generate a simple image (gradient, noise, pattern, chart).
209
- Args:
210
- image_type (str): Type of image
211
- width (int), height (int)
212
- params (Dict[str, Any], optional): Specific parameters
213
- Returns:
214
- Dictionary with generated image (base64)
215
- """
216
- try:
217
- params = params or {}
218
-
219
- if image_type == "gradient":
220
- direction = params.get("direction", "horizontal")
221
- start_color = params.get("start_color", (255, 0, 0))
222
- end_color = params.get("end_color", (0, 0, 255))
223
-
224
- img = Image.new("RGB", (width, height))
225
- draw = ImageDraw.Draw(img)
226
-
227
- if direction == "horizontal":
228
- for x in range(width):
229
- r = int(
230
- start_color[0] + (end_color[0] - start_color[0]) * x / width
231
- )
232
- g = int(
233
- start_color[1] + (end_color[1] - start_color[1]) * x / width
234
- )
235
- b = int(
236
- start_color[2] + (end_color[2] - start_color[2]) * x / width
237
- )
238
- draw.line([(x, 0), (x, height)], fill=(r, g, b))
239
- else:
240
- for y in range(height):
241
- r = int(
242
- start_color[0] + (end_color[0] - start_color[0]) * y / height
243
- )
244
- g = int(
245
- start_color[1] + (end_color[1] - start_color[1]) * y / height
246
- )
247
- b = int(
248
- start_color[2] + (end_color[2] - start_color[2]) * y / height
249
- )
250
- draw.line([(0, y), (width, y)], fill=(r, g, b))
251
-
252
- elif image_type == "noise":
253
- noise_array = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
254
- img = Image.fromarray(noise_array, "RGB")
255
-
256
- else:
257
- return {"error": f"Unsupported image_type {image_type}"}
258
-
259
- result_path = save_image(img)
260
- result_base64 = encode_image(result_path)
261
- return {"generated_image": result_base64}
262
-
263
- except Exception as e:
264
- return {"error": str(e)}
265
-
266
-
267
- @tool
268
- def combine_images(
269
- images_base64: List[str], operation: str, params: Optional[Dict[str, Any]] = None
270
- ) -> Dict[str, Any]:
271
- """
272
- Combine multiple images (collage, stack, blend).
273
- Args:
274
- images_base64 (List[str]): List of base64 images
275
- operation (str): Combination type
276
- params (Dict[str, Any], optional)
277
- Returns:
278
- Dictionary with combined image (base64)
279
- """
280
- try:
281
- images = [decode_image(b64) for b64 in images_base64]
282
- params = params or {}
283
-
284
- if operation == "stack":
285
- direction = params.get("direction", "horizontal")
286
- if direction == "horizontal":
287
- total_width = sum(img.width for img in images)
288
- max_height = max(img.height for img in images)
289
- new_img = Image.new("RGB", (total_width, max_height))
290
- x = 0
291
- for img in images:
292
- new_img.paste(img, (x, 0))
293
- x += img.width
294
- else:
295
- max_width = max(img.width for img in images)
296
- total_height = sum(img.height for img in images)
297
- new_img = Image.new("RGB", (max_width, total_height))
298
- y = 0
299
- for img in images:
300
- new_img.paste(img, (0, y))
301
- y += img.height
302
- else:
303
- return {"error": f"Unsupported combination operation {operation}"}
304
-
305
- result_path = save_image(new_img)
306
- result_base64 = encode_image(result_path)
307
- return {"combined_image": result_base64}
308
-
309
- except Exception as e:
310
- return {"error": str(e)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
multimodal_tools.py CHANGED
@@ -6,20 +6,20 @@ from dotenv import load_dotenv
6
  from langchain_google_genai import ChatGoogleGenerativeAI
7
  import os
8
  from langchain_core.messages import HumanMessage
 
9
 
10
  load_dotenv()
11
 
12
  @tool
13
- def analyse_image(img_path: str, query: str) -> str:
14
  """
15
  Analyses and extracts information from an image file using a multimodal model.
16
  Args:
17
- img_path: The local path for the image to be analysed.
18
- query: Information to be extrated from the image by the multimodal model
19
  """
20
  all_text = ""
21
  gemini_api_key = os.getenv("GEMINI_API_KEY")
22
- vision_llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash', temperature=0,google_api_key=gemini_api_key)
23
 
24
  try:
25
  # Read image and encode as base64
@@ -33,11 +33,13 @@ def analyse_image(img_path: str, query: str) -> str:
33
  content=[
34
  {
35
  "type": "text",
36
- "text": f'{query}',
37
  },
38
  {
39
- "type": "image_url",
40
- "image_url": {'data': image_base64,'format': 'png'},
 
 
41
  },
42
  ]
43
  )
@@ -60,14 +62,14 @@ def analyse_audio(audio_path: str) -> str:
60
  """
61
  Transcribes voice inputs from an audio file using a multimodal model to text.
62
  Args:
63
- audio_path: The local path for the audio to be transcribed.
64
  """
65
  all_text = ""
66
  gemini_api_key = os.getenv("GEMINI_API_KEY")
67
- audio_llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash', temperature=0,google_api_key=gemini_api_key)
68
 
69
  try:
70
- with open("audio_input.wav", "rb") as f:
71
  audio = f.read()
72
  audio_b64 = base64.b64encode(audio).decode()
73
 
@@ -75,10 +77,12 @@ def analyse_audio(audio_path: str) -> str:
75
  [
76
  HumanMessage(
77
  content=[
78
- {"type": "text", "text": "Transcribe the following:"},
79
  {
80
- "type": "input_audio",
81
- "input_audio": {"data": audio_b64, "format": "wav"},
 
 
82
  },
83
  ],
84
  ),
 
6
  from langchain_google_genai import ChatGoogleGenerativeAI
7
  import os
8
  from langchain_core.messages import HumanMessage
9
+ from langchain_groq import ChatGroq
10
 
11
  load_dotenv()
12
 
13
  @tool
14
+ def analyse_image(img_path: str) -> str:
15
  """
16
  Analyses and extracts information from an image file using a multimodal model.
17
  Args:
18
+ img_path: The local path of the image to be analysed.
 
19
  """
20
  all_text = ""
21
  gemini_api_key = os.getenv("GEMINI_API_KEY")
22
+ vision_llm = ChatGoogleGenerativeAI(model='gemini-2.5-flash', temperature=0,google_api_key=gemini_api_key)
23
 
24
  try:
25
  # Read image and encode as base64
 
33
  content=[
34
  {
35
  "type": "text",
36
+ "text": 'Extract information from this image with as much detail as possible:',
37
  },
38
  {
39
+ "type": "image",
40
+ "source_type": "base64",
41
+ "data": image_base64,
42
+ "mime_type": "image/png",
43
  },
44
  ]
45
  )
 
62
  """
63
  Transcribes voice inputs from an audio file using a multimodal model to text.
64
  Args:
65
+ audio_path: The local path of the audio to be transcribed.
66
  """
67
  all_text = ""
68
  gemini_api_key = os.getenv("GEMINI_API_KEY")
69
+ audio_llm = ChatGoogleGenerativeAI(model='gemini-2.5-flash', temperature=0,google_api_key=gemini_api_key)
70
 
71
  try:
72
+ with open(audio_path, "rb") as f:
73
  audio = f.read()
74
  audio_b64 = base64.b64encode(audio).decode()
75
 
 
77
  [
78
  HumanMessage(
79
  content=[
80
+ {"type": "text", "text": "Transcribe the following audio:"},
81
  {
82
+ "type": "audio",
83
+ "source_type": "base64",
84
+ "data": audio_b64,
85
+ "mime_type": "audio/mp3"
86
  },
87
  ],
88
  ),
system_prompt.txt CHANGED
@@ -1,11 +1,10 @@
1
  You are a general AI assistant.
2
  I will ask you a question.
3
- Do not provide any explanations, reasoning, or context. Only respond with the final answer in the following strict format:
4
- FINAL ANSWER: [YOUR FINAL ANSWER]
5
  YOUR FINAL ANSWER must adhere to the following rules:
6
  - It must be a single number, a few words, or a comma-separated list of numbers and/or strings.
7
  - If the answer is a number, DO NOT use commas, units, currency symbols (e.g., $, %), or any other special characters unless explicitly specified.
8
- - If the answer is a string, DO NOT use articles (e.g., "the", "a") or abbreviations (e.g., "NYC" for "New York City"). Write the digits in plain text (e.g., "five" instead of "5") unless otherwise instructed.
9
- - If you are asked for a comma-separated list, apply the above rules depending on the type of element (number or string).
10
- - No extra explanation, no elaboration, no additional context - just the final answer.
11
  Make sure the format is followed precisely with no deviations.
 
1
  You are a general AI assistant.
2
  I will ask you a question.
3
+ Respond with the final answer in the following strict format: FINAL ANSWER: [YOUR FINAL ANSWER]
 
4
  YOUR FINAL ANSWER must adhere to the following rules:
5
  - It must be a single number, a few words, or a comma-separated list of numbers and/or strings.
6
  - If the answer is a number, DO NOT use commas, units, currency symbols (e.g., $, %), or any other special characters unless explicitly specified.
7
+ - If the answer is a string, DO NOT use articles (e.g., "the", "a") or abbreviations (e.g., "NYC" for "New York City").
8
+ - If you are asked for a comma-separated list, apply the above rules depending on the type of element (number or string) and add a space after each comma.
9
+ Give priority to extracting information from tools before you arrive at your FINAL ANSWER, instead of trying to guess the result.
10
  Make sure the format is followed precisely with no deviations.
tools.py CHANGED
@@ -16,8 +16,7 @@ import os
16
  import uuid
17
  import requests
18
  from PIL import Image
19
- import pytesseract
20
- from code_interpreter import execute_code_multilang
21
  from multimodal_tools import analyse_image, analyse_audio
22
 
23
  ########################## Search Tools ##########################
@@ -39,7 +38,7 @@ def wiki_search(query: str) -> str:
39
 
40
  @tool
41
  def web_search(query: str) -> str:
42
- """Search Tavily for a query and return maximum 3 results.
43
  Args:
44
  query: The search query."""
45
  search_docs = TavilySearch(max_results=3).invoke(input=query)
@@ -210,7 +209,7 @@ doc_tools = [analyze_csv_file,analyze_excel_file]
210
 
211
  ######################### Code tools #########################
212
 
213
- code_tools = [execute_code_multilang]
214
 
215
  ######################### Image tools #########################
216
 
 
16
  import uuid
17
  import requests
18
  from PIL import Image
19
+ from code_interpreter import execute_code_multilang, load_code_file
 
20
  from multimodal_tools import analyse_image, analyse_audio
21
 
22
  ########################## Search Tools ##########################
 
38
 
39
  @tool
40
  def web_search(query: str) -> str:
41
+ """Search the web for a query using Tavily search engine and return maximum 3 results.
42
  Args:
43
  query: The search query."""
44
  search_docs = TavilySearch(max_results=3).invoke(input=query)
 
209
 
210
  ######################### Code tools #########################
211
 
212
+ code_tools = [execute_code_multilang,load_code_file]
213
 
214
  ######################### Image tools #########################
215