Ricardo Teixeira committed on
Commit
ed3a95c
·
1 Parent(s): a87a417

Final submission version

Browse files
Files changed (6) hide show
  1. agent.py +3 -2
  2. code_interpreter.py +16 -2
  3. image_tools.py +0 -310
  4. multimodal_tools.py +17 -13
  5. system_prompt.txt +4 -5
  6. tools.py +3 -4
agent.py CHANGED
@@ -31,12 +31,13 @@ class Agent():
31
  llm = ChatOllama(model=model, temperature=0)
32
  elif provider == 'google':
33
  if not model:
34
- model = "gemini-2.0-flash"
35
  gemini_api_key = os.getenv("GEMINI_API_KEY")
36
  llm = ChatGoogleGenerativeAI(model=model, temperature=0,google_api_key=gemini_api_key)
37
  elif provider == 'groq':
38
  if not model:
39
  model = "meta-llama/llama-4-scout-17b-16e-instruct"
 
40
  groq_api_key = os.getenv("GROQ_API_KEY")
41
  llm = ChatGroq(model=model, temperature=0, groq_api_key=groq_api_key)
42
  else:
@@ -91,7 +92,7 @@ if __name__ == "__main__":
91
  def main():
92
  agent = Agent()
93
  question = "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name."
94
- model = "meta-llama/llama-4-scout-17b-16e-instruct"
95
  graph = agent.build_graph('google', model)
96
  messages = [HumanMessage(content=question)]
97
  messages = graph.invoke({"messages": messages})
 
31
  llm = ChatOllama(model=model, temperature=0)
32
  elif provider == 'google':
33
  if not model:
34
+ model = "gemini-2.5-flash"
35
  gemini_api_key = os.getenv("GEMINI_API_KEY")
36
  llm = ChatGoogleGenerativeAI(model=model, temperature=0,google_api_key=gemini_api_key)
37
  elif provider == 'groq':
38
  if not model:
39
  model = "meta-llama/llama-4-scout-17b-16e-instruct"
40
+ #model = "meta-llama/llama-4-maverick-17b-128e-instruct"
41
  groq_api_key = os.getenv("GROQ_API_KEY")
42
  llm = ChatGroq(model=model, temperature=0, groq_api_key=groq_api_key)
43
  else:
 
92
  def main():
93
  agent = Agent()
94
  question = "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name."
95
+ model = "gemini-2.5-flash"
96
  graph = agent.build_graph('google', model)
97
  messages = [HumanMessage(content=question)]
98
  messages = graph.invoke({"messages": messages})
code_interpreter.py CHANGED
@@ -288,7 +288,7 @@ interpreter_instance = CodeInterpreter()
288
  def execute_code_multilang(code: str, language: str = "python") -> str:
289
  """Execute code in multiple languages (Python, Bash, SQL, C, Java) and return results.
290
  Args:
291
- code (str): The source code to execute.
292
  language (str): The language of the code. Supported: "python", "bash", "sql", "c", "java".
293
  Returns:
294
  A string summarizing the execution results (stdout, stderr, errors, plots, dataframes if any).
@@ -345,4 +345,18 @@ def execute_code_multilang(code: str, language: str = "python") -> str:
345
  "\n**Error Log:**\n```\n" + result["stderr"].strip() + "\n```"
346
  )
347
 
348
- return "\n".join(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  def execute_code_multilang(code: str, language: str = "python") -> str:
289
  """Execute code in multiple languages (Python, Bash, SQL, C, Java) and return results.
290
  Args:
291
+ code (str): The source code to execute as a string.
292
  language (str): The language of the code. Supported: "python", "bash", "sql", "c", "java".
293
  Returns:
294
  A string summarizing the execution results (stdout, stderr, errors, plots, dataframes if any).
 
345
  "\n**Error Log:**\n```\n" + result["stderr"].strip() + "\n```"
346
  )
347
 
348
+ return "\n".join(response)
349
+
350
+ @tool
351
+ def load_code_file(file_path: str):
352
+ """
353
+ Loads the content of a code file to be executed.
354
+ Args:
355
+ file_path (str): the path to the code file.
356
+ Returns:
357
+ str: the code in the file as a string.
358
+ """
359
+ with open(file_path,'r') as f:
360
+ code = f.read()
361
+
362
+ return code
image_tools.py DELETED
@@ -1,310 +0,0 @@
1
- import os
2
- import io
3
- import base64
4
- import uuid
5
- from PIL import Image
6
- from typing import List, Dict, Any, Optional
7
- from PIL import Image, ImageDraw, ImageFont, ImageEnhance, ImageFilter
8
- import numpy as np
9
- from langchain_core.tools import tool
10
-
11
- # Helper functions for image processing
12
- def encode_image(image_path: str) -> str:
13
- """Convert an image file to base64 string."""
14
- with open(image_path, "rb") as image_file:
15
- return base64.b64encode(image_file.read()).decode("utf-8")
16
-
17
-
18
- def decode_image(base64_string: str) -> Image.Image:
19
- """Convert a base64 string to a PIL Image."""
20
- image_data = base64.b64decode(base64_string)
21
- return Image.open(io.BytesIO(image_data))
22
-
23
-
24
- def save_image(image: Image.Image, directory: str = "image_outputs") -> str:
25
- """Save a PIL Image to disk and return the path."""
26
- os.makedirs(directory, exist_ok=True)
27
- image_id = str(uuid.uuid4())
28
- image_path = os.path.join(directory, f"{image_id}.png")
29
- image.save(image_path)
30
- return image_path
31
-
32
- @tool
33
- def analyze_image(image_base64: str) -> Dict[str, Any]:
34
- """
35
- Analyze basic properties of an image (size, mode, color analysis, thumbnail preview).
36
- Args:
37
- image_base64 (str): Base64 encoded image string
38
- Returns:
39
- Dictionary with analysis result
40
- """
41
- try:
42
- img = decode_image(image_base64)
43
- width, height = img.size
44
- mode = img.mode
45
-
46
- if mode in ("RGB", "RGBA"):
47
- arr = np.array(img)
48
- avg_colors = arr.mean(axis=(0, 1))
49
- dominant = ["Red", "Green", "Blue"][np.argmax(avg_colors[:3])]
50
- brightness = avg_colors.mean()
51
- color_analysis = {
52
- "average_rgb": avg_colors.tolist(),
53
- "brightness": brightness,
54
- "dominant_color": dominant,
55
- }
56
- else:
57
- color_analysis = {"note": f"No color analysis for mode {mode}"}
58
-
59
- thumbnail = img.copy()
60
- thumbnail.thumbnail((100, 100))
61
- thumb_path = save_image(thumbnail, "thumbnails")
62
- thumbnail_base64 = encode_image(thumb_path)
63
-
64
- return {
65
- "dimensions": (width, height),
66
- "mode": mode,
67
- "color_analysis": color_analysis,
68
- "thumbnail": thumbnail_base64,
69
- }
70
- except Exception as e:
71
- return {"error": str(e)}
72
-
73
-
74
- @tool
75
- def transform_image(
76
- image_base64: str, operation: str, params: Optional[Dict[str, Any]] = None
77
- ) -> Dict[str, Any]:
78
- """
79
- Apply transformations: resize, rotate, crop, flip, brightness, contrast, blur, sharpen, grayscale.
80
- Args:
81
- image_base64 (str): Base64 encoded input image
82
- operation (str): Transformation operation
83
- params (Dict[str, Any], optional): Parameters for the operation
84
- Returns:
85
- Dictionary with transformed image (base64)
86
- """
87
- try:
88
- img = decode_image(image_base64)
89
- params = params or {}
90
-
91
- if operation == "resize":
92
- img = img.resize(
93
- (
94
- params.get("width", img.width // 2),
95
- params.get("height", img.height // 2),
96
- )
97
- )
98
- elif operation == "rotate":
99
- img = img.rotate(params.get("angle", 90), expand=True)
100
- elif operation == "crop":
101
- img = img.crop(
102
- (
103
- params.get("left", 0),
104
- params.get("top", 0),
105
- params.get("right", img.width),
106
- params.get("bottom", img.height),
107
- )
108
- )
109
- elif operation == "flip":
110
- if params.get("direction", "horizontal") == "horizontal":
111
- img = img.transpose(Image.FLIP_LEFT_RIGHT)
112
- else:
113
- img = img.transpose(Image.FLIP_TOP_BOTTOM)
114
- elif operation == "adjust_brightness":
115
- img = ImageEnhance.Brightness(img).enhance(params.get("factor", 1.5))
116
- elif operation == "adjust_contrast":
117
- img = ImageEnhance.Contrast(img).enhance(params.get("factor", 1.5))
118
- elif operation == "blur":
119
- img = img.filter(ImageFilter.GaussianBlur(params.get("radius", 2)))
120
- elif operation == "sharpen":
121
- img = img.filter(ImageFilter.SHARPEN)
122
- elif operation == "grayscale":
123
- img = img.convert("L")
124
- else:
125
- return {"error": f"Unknown operation: {operation}"}
126
-
127
- result_path = save_image(img)
128
- result_base64 = encode_image(result_path)
129
- return {"transformed_image": result_base64}
130
-
131
- except Exception as e:
132
- return {"error": str(e)}
133
-
134
-
135
- @tool
136
- def draw_on_image(
137
- image_base64: str, drawing_type: str, params: Dict[str, Any]
138
- ) -> Dict[str, Any]:
139
- """
140
- Draw shapes (rectangle, circle, line) or text onto an image.
141
- Args:
142
- image_base64 (str): Base64 encoded input image
143
- drawing_type (str): Drawing type
144
- params (Dict[str, Any]): Drawing parameters
145
- Returns:
146
- Dictionary with result image (base64)
147
- """
148
- try:
149
- img = decode_image(image_base64)
150
- draw = ImageDraw.Draw(img)
151
- color = params.get("color", "red")
152
-
153
- if drawing_type == "rectangle":
154
- draw.rectangle(
155
- [params["left"], params["top"], params["right"], params["bottom"]],
156
- outline=color,
157
- width=params.get("width", 2),
158
- )
159
- elif drawing_type == "circle":
160
- x, y, r = params["x"], params["y"], params["radius"]
161
- draw.ellipse(
162
- (x - r, y - r, x + r, y + r),
163
- outline=color,
164
- width=params.get("width", 2),
165
- )
166
- elif drawing_type == "line":
167
- draw.line(
168
- (
169
- params["start_x"],
170
- params["start_y"],
171
- params["end_x"],
172
- params["end_y"],
173
- ),
174
- fill=color,
175
- width=params.get("width", 2),
176
- )
177
- elif drawing_type == "text":
178
- font_size = params.get("font_size", 20)
179
- try:
180
- font = ImageFont.truetype("arial.ttf", font_size)
181
- except IOError:
182
- font = ImageFont.load_default()
183
- draw.text(
184
- (params["x"], params["y"]),
185
- params.get("text", "Text"),
186
- fill=color,
187
- font=font,
188
- )
189
- else:
190
- return {"error": f"Unknown drawing type: {drawing_type}"}
191
-
192
- result_path = save_image(img)
193
- result_base64 = encode_image(result_path)
194
- return {"result_image": result_base64}
195
-
196
- except Exception as e:
197
- return {"error": str(e)}
198
-
199
-
200
- @tool
201
- def generate_simple_image(
202
- image_type: str,
203
- width: int = 500,
204
- height: int = 500,
205
- params: Optional[Dict[str, Any]] = None,
206
- ) -> Dict[str, Any]:
207
- """
208
- Generate a simple image (gradient, noise, pattern, chart).
209
- Args:
210
- image_type (str): Type of image
211
- width (int), height (int)
212
- params (Dict[str, Any], optional): Specific parameters
213
- Returns:
214
- Dictionary with generated image (base64)
215
- """
216
- try:
217
- params = params or {}
218
-
219
- if image_type == "gradient":
220
- direction = params.get("direction", "horizontal")
221
- start_color = params.get("start_color", (255, 0, 0))
222
- end_color = params.get("end_color", (0, 0, 255))
223
-
224
- img = Image.new("RGB", (width, height))
225
- draw = ImageDraw.Draw(img)
226
-
227
- if direction == "horizontal":
228
- for x in range(width):
229
- r = int(
230
- start_color[0] + (end_color[0] - start_color[0]) * x / width
231
- )
232
- g = int(
233
- start_color[1] + (end_color[1] - start_color[1]) * x / width
234
- )
235
- b = int(
236
- start_color[2] + (end_color[2] - start_color[2]) * x / width
237
- )
238
- draw.line([(x, 0), (x, height)], fill=(r, g, b))
239
- else:
240
- for y in range(height):
241
- r = int(
242
- start_color[0] + (end_color[0] - start_color[0]) * y / height
243
- )
244
- g = int(
245
- start_color[1] + (end_color[1] - start_color[1]) * y / height
246
- )
247
- b = int(
248
- start_color[2] + (end_color[2] - start_color[2]) * y / height
249
- )
250
- draw.line([(0, y), (width, y)], fill=(r, g, b))
251
-
252
- elif image_type == "noise":
253
- noise_array = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
254
- img = Image.fromarray(noise_array, "RGB")
255
-
256
- else:
257
- return {"error": f"Unsupported image_type {image_type}"}
258
-
259
- result_path = save_image(img)
260
- result_base64 = encode_image(result_path)
261
- return {"generated_image": result_base64}
262
-
263
- except Exception as e:
264
- return {"error": str(e)}
265
-
266
-
267
- @tool
268
- def combine_images(
269
- images_base64: List[str], operation: str, params: Optional[Dict[str, Any]] = None
270
- ) -> Dict[str, Any]:
271
- """
272
- Combine multiple images (collage, stack, blend).
273
- Args:
274
- images_base64 (List[str]): List of base64 images
275
- operation (str): Combination type
276
- params (Dict[str, Any], optional)
277
- Returns:
278
- Dictionary with combined image (base64)
279
- """
280
- try:
281
- images = [decode_image(b64) for b64 in images_base64]
282
- params = params or {}
283
-
284
- if operation == "stack":
285
- direction = params.get("direction", "horizontal")
286
- if direction == "horizontal":
287
- total_width = sum(img.width for img in images)
288
- max_height = max(img.height for img in images)
289
- new_img = Image.new("RGB", (total_width, max_height))
290
- x = 0
291
- for img in images:
292
- new_img.paste(img, (x, 0))
293
- x += img.width
294
- else:
295
- max_width = max(img.width for img in images)
296
- total_height = sum(img.height for img in images)
297
- new_img = Image.new("RGB", (max_width, total_height))
298
- y = 0
299
- for img in images:
300
- new_img.paste(img, (0, y))
301
- y += img.height
302
- else:
303
- return {"error": f"Unsupported combination operation {operation}"}
304
-
305
- result_path = save_image(new_img)
306
- result_base64 = encode_image(result_path)
307
- return {"combined_image": result_base64}
308
-
309
- except Exception as e:
310
- return {"error": str(e)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
multimodal_tools.py CHANGED
@@ -6,20 +6,20 @@ from dotenv import load_dotenv
6
  from langchain_google_genai import ChatGoogleGenerativeAI
7
  import os
8
  from langchain_core.messages import HumanMessage
 
9
 
10
  load_dotenv()
11
 
12
  @tool
13
- def analyse_image(img_path: str, query: str) -> str:
14
  """
15
  Analyses and extracts information from an image file using a multimodal model.
16
  Args:
17
- img_path: The local path for the image to be analysed.
18
- query: Information to be extrated from the image by the multimodal model
19
  """
20
  all_text = ""
21
  gemini_api_key = os.getenv("GEMINI_API_KEY")
22
- vision_llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash', temperature=0,google_api_key=gemini_api_key)
23
 
24
  try:
25
  # Read image and encode as base64
@@ -33,11 +33,13 @@ def analyse_image(img_path: str, query: str) -> str:
33
  content=[
34
  {
35
  "type": "text",
36
- "text": f'{query}',
37
  },
38
  {
39
- "type": "image_url",
40
- "image_url": {'data': image_base64,'format': 'png'},
 
 
41
  },
42
  ]
43
  )
@@ -60,14 +62,14 @@ def analyse_audio(audio_path: str) -> str:
60
  """
61
  Transcribes voice inputs from an audio file using a multimodal model to text.
62
  Args:
63
- audio_path: The local path for the audio to be transcribed.
64
  """
65
  all_text = ""
66
  gemini_api_key = os.getenv("GEMINI_API_KEY")
67
- audio_llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash', temperature=0,google_api_key=gemini_api_key)
68
 
69
  try:
70
- with open("audio_input.wav", "rb") as f:
71
  audio = f.read()
72
  audio_b64 = base64.b64encode(audio).decode()
73
 
@@ -75,10 +77,12 @@ def analyse_audio(audio_path: str) -> str:
75
  [
76
  HumanMessage(
77
  content=[
78
- {"type": "text", "text": "Transcribe the following:"},
79
  {
80
- "type": "input_audio",
81
- "input_audio": {"data": audio_b64, "format": "wav"},
 
 
82
  },
83
  ],
84
  ),
 
6
  from langchain_google_genai import ChatGoogleGenerativeAI
7
  import os
8
  from langchain_core.messages import HumanMessage
9
+ from langchain_groq import ChatGroq
10
 
11
  load_dotenv()
12
 
13
  @tool
14
+ def analyse_image(img_path: str) -> str:
15
  """
16
  Analyses and extracts information from an image file using a multimodal model.
17
  Args:
18
+ img_path: The local path of the image to be analysed.
 
19
  """
20
  all_text = ""
21
  gemini_api_key = os.getenv("GEMINI_API_KEY")
22
+ vision_llm = ChatGoogleGenerativeAI(model='gemini-2.5-flash', temperature=0,google_api_key=gemini_api_key)
23
 
24
  try:
25
  # Read image and encode as base64
 
33
  content=[
34
  {
35
  "type": "text",
36
+ "text": 'Extract information from this image with as much detail as possible:',
37
  },
38
  {
39
+ "type": "image",
40
+ "source_type": "base64",
41
+ "data": image_base64,
42
+ "mime_type": "image/png",
43
  },
44
  ]
45
  )
 
62
  """
63
  Transcribes voice inputs from an audio file using a multimodal model to text.
64
  Args:
65
+ audio_path: The local path of the audio to be transcribed.
66
  """
67
  all_text = ""
68
  gemini_api_key = os.getenv("GEMINI_API_KEY")
69
+ audio_llm = ChatGoogleGenerativeAI(model='gemini-2.5-flash', temperature=0,google_api_key=gemini_api_key)
70
 
71
  try:
72
+ with open(audio_path, "rb") as f:
73
  audio = f.read()
74
  audio_b64 = base64.b64encode(audio).decode()
75
 
 
77
  [
78
  HumanMessage(
79
  content=[
80
+ {"type": "text", "text": "Transcribe the following audio:"},
81
  {
82
+ "type": "audio",
83
+ "source_type": "base64",
84
+ "data": audio_b64,
85
+ "mime_type": "audio/mp3"
86
  },
87
  ],
88
  ),
system_prompt.txt CHANGED
@@ -1,11 +1,10 @@
1
  You are a general AI assistant.
2
  I will ask you a question.
3
- Do not provide any explanations, reasoning, or context. Only respond with the final answer in the following strict format:
4
- FINAL ANSWER: [YOUR FINAL ANSWER]
5
  YOUR FINAL ANSWER must adhere to the following rules:
6
  - It must be a single number, a few words, or a comma-separated list of numbers and/or strings.
7
  - If the answer is a number, DO NOT use commas, units, currency symbols (e.g., $, %), or any other special characters unless explicitly specified.
8
- - If the answer is a string, DO NOT use articles (e.g., "the", "a") or abbreviations (e.g., "NYC" for "New York City"). Write the digits in plain text (e.g., "five" instead of "5") unless otherwise instructed.
9
- - If you are asked for a comma-separated list, apply the above rules depending on the type of element (number or string).
10
- - No extra explanation, no elaboration, no additional context - just the final answer.
11
  Make sure the format is followed precisely with no deviations.
 
1
  You are a general AI assistant.
2
  I will ask you a question.
3
+ Respond with the final answer in the following strict format: FINAL ANSWER: [YOUR FINAL ANSWER]
 
4
  YOUR FINAL ANSWER must adhere to the following rules:
5
  - It must be a single number, a few words, or a comma-separated list of numbers and/or strings.
6
  - If the answer is a number, DO NOT use commas, units, currency symbols (e.g., $, %), or any other special characters unless explicitly specified.
7
+ - If the answer is a string, DO NOT use articles (e.g., "the", "a") or abbreviations (e.g., "NYC" for "New York City").
8
+ - If you are asked for a comma-separated list, apply the above rules depending on the type of element (number or string) and add a space after each comma.
9
+ Give priority to extracting information from tools before you arrive at your FINAL ANSWER, instead of trying to guess the result.
10
  Make sure the format is followed precisely with no deviations.
tools.py CHANGED
@@ -16,8 +16,7 @@ import os
16
  import uuid
17
  import requests
18
  from PIL import Image
19
- import pytesseract
20
- from code_interpreter import execute_code_multilang
21
  from multimodal_tools import analyse_image, analyse_audio
22
 
23
  ########################## Search Tools ##########################
@@ -39,7 +38,7 @@ def wiki_search(query: str) -> str:
39
 
40
  @tool
41
  def web_search(query: str) -> str:
42
- """Search Tavily for a query and return maximum 3 results.
43
  Args:
44
  query: The search query."""
45
  search_docs = TavilySearch(max_results=3).invoke(input=query)
@@ -210,7 +209,7 @@ doc_tools = [analyze_csv_file,analyze_excel_file]
210
 
211
  ######################### Code tools #########################
212
 
213
- code_tools = [execute_code_multilang]
214
 
215
  ######################### Image tools #########################
216
 
 
16
  import uuid
17
  import requests
18
  from PIL import Image
19
+ from code_interpreter import execute_code_multilang, load_code_file
 
20
  from multimodal_tools import analyse_image, analyse_audio
21
 
22
  ########################## Search Tools ##########################
 
38
 
39
  @tool
40
  def web_search(query: str) -> str:
41
+ """Search the web for a query using Tavily search engine and return maximum 3 results.
42
  Args:
43
  query: The search query."""
44
  search_docs = TavilySearch(max_results=3).invoke(input=query)
 
209
 
210
  ######################### Code tools #########################
211
 
212
+ code_tools = [execute_code_multilang,load_code_file]
213
 
214
  ######################### Image tools #########################
215