Final_Assignment_Template

Sleeping

zeerafle committed on Jun 27, 2025

Commit

16c8f15

1 Parent(s): e0922af

Add YouTube and multimodal file input support

The additions enable handling videos, images, PDFs and other files as
input, plus improved error handling and helper methods for MIME type
detection and file downloads.

Files changed (2) hide show

agents/base_agent.py +5 -2
app.py +169 -10

agents/base_agent.py CHANGED Viewed

@@ -8,20 +8,23 @@ from tools.code_execution import CodeExecutionTool
 # from tools.google_search import GoogleSearchTool
 from tools.web_search import tavily_search_tool
 from tools.wikipedia_search import wikipedia_search_tool
 import os
-SYSTEM_PROMPT = "You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."
 model = get_default_model()
 code_execution_tool = CodeExecutionTool(api_key=os.environ['GOOGLE_API_KEY'])
 # google_search_tool = GoogleSearchTool(api_key=os.environ['GOOGLE_API_KEY'])
 tools = [
     arxiv_tool,
     wikipedia_search_tool,
     calculator_tool,
     code_execution_tool,
     # google_search_tool,
-    tavily_search_tool
 ]
 agent_executor = create_react_agent(model, tools, prompt=SYSTEM_PROMPT)

 # from tools.google_search import GoogleSearchTool
 from tools.web_search import tavily_search_tool
 from tools.wikipedia_search import wikipedia_search_tool
+from tools.youtube_understanding import YoutubeUnderstandingTool
 import os
+SYSTEM_PROMPT = "You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with only the answer to the question, nothing else. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."
 # SYSTEM_PROMPT above is passed as the agent's prompt; it asks for the bare answer only
 # (number, short phrase, or comma-separated list) with no trailing template text.
 model = get_default_model()
 # NOTE: os.environ['GOOGLE_API_KEY'] raises KeyError while this module is being imported
 # if the variable is not set; both Google-backed tools below read the same key.
 code_execution_tool = CodeExecutionTool(api_key=os.environ['GOOGLE_API_KEY'])
 # google_search_tool = GoogleSearchTool(api_key=os.environ['GOOGLE_API_KEY'])
+youtube_understanding_tool = YoutubeUnderstandingTool(api_key=os.environ['GOOGLE_API_KEY'])
 # Tools handed to the agent; the Google search tool stays disabled (commented out).
 tools = [
     arxiv_tool,
     wikipedia_search_tool,
     calculator_tool,
     code_execution_tool,
     # google_search_tool,
+    tavily_search_tool,
+    youtube_understanding_tool
 ]
 # Module-level agent built by create_react_agent from the model, the tool list and the prompt.
 agent_executor = create_react_agent(model, tools, prompt=SYSTEM_PROMPT)

app.py CHANGED Viewed

@@ -1,9 +1,12 @@
 import os
 import gradio as gr
 from langchain_core.messages import HumanMessage
 import requests
 import pandas as pd
 from agents.base_agent import agent_executor
 # (Keep Constants as is)
 # --- Constants ---
@@ -12,14 +15,160 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # --- Basic Agent Definition ---
 # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
 class BasicAgent:
-    def __init__(self):
         print("BasicAgent initialized.")
-    def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
-        messages = [HumanMessage(content=question)]
-        response = agent_executor.invoke({"messages": messages})
-        answer = response['messages'][-1].content
-        return answer
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
@@ -74,7 +223,8 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
     # 3. Run your Agent
     results_log = []
     answers_payload = []
-    print(f"Running agent on {len(questions_data)} questions...")
     for item in questions_data:
         task_id = item.get("task_id")
         question_text = item.get("question")
@@ -82,12 +232,21 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
-            submitted_answer = agent(question_text)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
-            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:
              print(f"Error running agent on task {task_id}: {e}")
-             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
     if not answers_payload:
         print("Agent did not produce any answers to submit.")

 import os
+from typing import Optional, List, Dict, Any
 import gradio as gr
 from langchain_core.messages import HumanMessage
 import requests
 import pandas as pd
 from agents.base_agent import agent_executor
+import mimetypes
+import base64
 # (Keep Constants as is)
 # --- Constants ---
 # --- Basic Agent Definition ---
 # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
 class BasicAgent:
+    def __init__(self, api_url: str = DEFAULT_API_URL):
         # api_url is the base URL that _download_file prefixes to "/files/<task_id>".
+        self.api_url = api_url
         print("BasicAgent initialized.")
+    def _get_mime_type(self, file_content: bytes, filename: str) -> str:
+        """Return a MIME type for the file, preferring the filename over the bytes.
+
+        The extension-based guess from ``mimetypes`` wins whenever it yields a
+        result. Otherwise a handful of well-known leading-byte signatures are
+        checked, and ``application/octet-stream`` is returned when none match.
+        """
+        guessed, _encoding = mimetypes.guess_type(filename)
+        if guessed:
+            return guessed
+        # Leading-byte signatures, checked in order.
+        magic_numbers = (
+            (b'\xff\xd8\xff', 'image/jpeg'),
+            (b'\x89PNG\r\n\x1a\n', 'image/png'),
+            (b'GIF8', 'image/gif'),
+            (b'%PDF', 'application/pdf'),
+        )
+        for signature, sniffed in magic_numbers:
+            if file_content.startswith(signature):
+                return sniffed
+        # WebP: a RIFF header with a "WEBP" tag somewhere in the first 12 bytes.
+        if file_content.startswith(b'RIFF') and b'WEBP' in file_content[:12]:
+            return 'image/webp'
+        return 'application/octet-stream'
+    def _download_file(self, task_id: str) -> Optional[tuple]:
+        """Download the file attached to a task, if there is one.
+
+        Args:
+            task_id: Task identifier appended to ``{api_url}/files/``.
+
+        Returns:
+            ``(file_content, filename, mime_type)`` on success, or ``None`` when
+            the server answers 404 or the download fails for any other reason
+            (failures are printed, never raised).
+        """
+        # Function-scope import: only this method needs the header parser.
+        from email.message import Message
+        try:
+            files_url = f"{self.api_url}/files/{task_id}"
+            print(f"Attempting to download file from {files_url}")
+            response = requests.get(files_url, timeout=30)
+            if response.status_code == 404:
+                print('File not found for task ID:', task_id)
+                return None
+            response.raise_for_status()
+            # try to get filename from Content-Disposition header
+            filename = "file"
+            content_disposition = response.headers.get('content-disposition')
+            if content_disposition:
+                # Let the stdlib header parser deal with quoting, trailing
+                # parameters and RFC 2231 "filename*=" forms. Splitting on
+                # 'filename=' would leave e.g. 'a.png"; size=1' as the name
+                # and defeat the extension-based MIME guess below.
+                header = Message()
+                header['content-disposition'] = content_disposition
+                filename = header.get_filename() or filename
+            file_content = response.content
+            mime_type = self._get_mime_type(file_content, filename)
+            print(f"Downloaded file: {filename} ({len(file_content)} bytes, {mime_type})")
+            return file_content, filename, mime_type
+        except requests.exceptions.RequestException as e:
+            print(f"Error downloading file for task {task_id}: {e}")
+            return None
+        except Exception as e:
+            # Best-effort by design: a missing attachment must not abort the question.
+            print(f"Unexpected error downloading file for task {task_id}: {e}")
+            return None
+    def _create_multimodal_content(self, question: str, task_id: str) -> List[Dict[str, Any]]:
+        """Build the message content list: the question text plus an optional attachment.
+
+        The first element is always a text block holding the question. When
+        ``_download_file`` returns a file, a second base64-encoded block is
+        appended and a note naming the file is added to the question text.
+        """
+        text_block = {"type": "text", "text": question}
+        blocks = [text_block]
+        downloaded = self._download_file(task_id)
+        if not downloaded:
+            return blocks
+        raw_bytes, filename, mime_type = downloaded
+        encoded = base64.b64encode(raw_bytes).decode('utf-8')
+        # Every attachment block has the same layout; only the block type and
+        # the log line depend on the MIME type.
+        if mime_type.startswith('image/'):
+            block_type, log_line = "image", f"Added image content block: {filename}"
+        elif mime_type == 'application/pdf':
+            block_type, log_line = "file", f"Added PDF content block: {filename}"
+        elif mime_type.startswith('audio/'):
+            block_type, log_line = "audio", f"Added audio content block: {filename}"
+        elif mime_type.startswith('video/'):
+            block_type, log_line = "video", f"Added video content block: {filename}"
+        else:
+            # Anything else is attached as a generic file.
+            block_type, log_line = "file", f"Added generic file content block: {filename} ({mime_type})"
+        blocks.append({
+            "type": block_type,
+            "source_type": "base64",
+            "data": encoded,
+            "mime_type": mime_type
+        })
+        print(log_line)
+        # Tell the model about the attachment in the text prompt as well.
+        text_block["text"] += f"\n\nNote: I have attached a file named '{filename}' of type '{mime_type}'. Please analyze this file in the context of the question above."
+        return blocks
+    def __call__(self, question: str, task_id: str = "") -> str:
+        """Answer one question, attaching the task's file when a task_id is given.
+
+        Args:
+            question: The question text.
+            task_id: Optional task identifier. When non-empty, the associated
+                file (if any) is downloaded and sent along with the question.
+
+        Returns:
+            The agent's final message as text, or a string starting with
+            "Error processing question:" when every attempt failed.
+        """
         print(f"Agent received question (first 50 chars): {question[:50]}...")
+        if task_id:
+            print(f"Processing task_id: {task_id}")
+        try:
+            # Create multimodal content if task_id is provided
+            if task_id:
+                content = self._create_multimodal_content(question, task_id)
+                return self._run_agent(content)
+            # No task_id: text-only
+            return self._run_agent(question)
+        except Exception as e:
+            print(f"Error in agent execution: {e}")
+            if not task_id:
+                # The failed attempt was already text-only; the "fallback"
+                # would just repeat the identical call.
+                return f"Error processing question: {e}"
+            # Fallback to text-only if multimodal fails
+            try:
+                return self._run_agent(question)
+            except Exception as fallback_error:
+                print(f"Fallback also failed: {fallback_error}")
+                return f"Error processing question: {e}"
+    def _run_agent(self, content) -> str:
+        """Send one human message to the agent executor and return the final reply as text."""
+        response = agent_executor.invoke({"messages": [HumanMessage(content=content)]})
+        answer = response['messages'][-1].content
+        if isinstance(answer, str):
+            return answer
+        if isinstance(answer, list):
+            # Message content can be a list of strings / content-block dicts
+            # rather than a plain string; keep only the textual parts so the
+            # declared ``str`` return type holds.
+            parts = []
+            for part in answer:
+                if isinstance(part, str):
+                    parts.append(part)
+                elif isinstance(part, dict) and part.get("type") == "text":
+                    parts.append(part.get("text", ""))
+            return "".join(parts)
+        return str(answer)
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
     # 3. Run your Agent
     results_log = []
     answers_payload = []
+    print(f"Running agent with multimodal support on {len(questions_data)} questions...")
     for item in questions_data:
         task_id = item.get("task_id")
         question_text = item.get("question")
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
+            # Pass both question and task_id to enable multimodal processing
+            submitted_answer = agent(question_text, task_id)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
+            results_log.append({
+                "Task ID": task_id,
+                "Question": question_text,
+                "Submitted Answer": submitted_answer
+            })
         except Exception as e:
              print(f"Error running agent on task {task_id}: {e}")
+             results_log.append({
+                 "Task ID": task_id,
+                 "Question": question_text,
+                 "Submitted Answer": f"AGENT ERROR: {e}"
+             })
     if not answers_payload:
         print("Agent did not produce any answers to submit.")