Spaces:

jomasego
/

Antientropy

Sleeping

App Files Files Community

Jose-Maria Segui committed on Jan 17

Commit

efc6af6

1 Parent(s): 87e72bd

Deploy v4: Multimedia tools (Audio/Video), increased timeouts, aggressive system prompt

Browse files

Files changed (5) hide show

agent.py +94 -2
code_interpreter.py +2 -1
main.py +3 -3
requirements.txt +5 -0
system_prompt.txt +21 -5

agent.py CHANGED Viewed

@@ -11,6 +11,9 @@ import cmath
 import pandas as pd
 import uuid
 import numpy as np
 from code_interpreter import CodeInterpreter
 interpreter_instance = CodeInterpreter()
@@ -47,8 +50,8 @@ def wiki_search(query: str) -> str:
 @tool
 def web_search(query: str) -> str:
-    """Search the web for a query and return results.
     Args:
         query: The search query."""
     # Using DuckDuckGo instead of Tavily to avoid API key requirement
@@ -78,6 +81,7 @@ def arxiv_search(query: str) -> str:
 @tool
 def execute_code_multilang(code: str, language: str = "python") -> str:
     """Execute code in multiple languages (Python, Bash, SQL, C, Java) and return results.
     Args:
         code (str): The source code to execute.
@@ -366,6 +370,91 @@ def analyze_excel_file(file_path: str, query: str) -> str:
     except Exception as e:
         return f"Error analyzing Excel file: {str(e)}"
 ### ============== IMAGE PROCESSING AND GENERATION TOOLS =============== ###
@@ -694,6 +783,9 @@ tools = [
     draw_on_image,
     generate_simple_image,
     combine_images,
 ]

 import pandas as pd
 import uuid
 import numpy as np
+import speech_recognition as sr
+from pydub import AudioSegment
+import cv2
 from code_interpreter import CodeInterpreter
 interpreter_instance = CodeInterpreter()
 @tool
 def web_search(query: str) -> str:
+    """Search the web for a query using DuckDuckGo. USE THIS TOOL for any fact checking or external information.
     Args:
         query: The search query."""
     # Using DuckDuckGo instead of Tavily to avoid API key requirement
 @tool
 def execute_code_multilang(code: str, language: str = "python") -> str:
     """Execute code in multiple languages (Python, Bash, SQL, C, Java) and return results.
+    USE THIS TO READ FILES (e.g. open('filename').read()).
     Args:
         code (str): The source code to execute.
     except Exception as e:
         return f"Error analyzing Excel file: {str(e)}"
+### =============== MULTIMEDIA TOOLS =============== ###
+@tool
+def transcribe_audio(audio_path: str) -> str:
+    """
+    Transcribe speech from an audio file using SpeechRecognition.
+    Args:
+        audio_path (str): Path to the audio file (wav, mp3, flac, etc.)
+    """
+    try:
+        # Convert to wav if needed using pydub
+        if not audio_path.endswith('.wav'):
+            print(f"Converting {audio_path} to wav...")
+            audio = AudioSegment.from_file(audio_path)
+            wav_path = audio_path.rsplit('.', 1)[0] + ".wav"
+            audio.export(wav_path, format="wav")
+            audio_path = wav_path
+        recognizer = sr.Recognizer()
+        with sr.AudioFile(audio_path) as source:
+            audio_data = recognizer.record(source)
+            # Use Google Web Speech API (default, no key needed usually)
+            text = recognizer.recognize_google(audio_data)
+        return f"Transcription: {text}"
+    except Exception as e:
+        return f"Error transcribing audio: {str(e)}"
+@tool
+def get_video_info(video_path: str) -> str:
+    """
+    Get metadata and basic info from a video file.
+    Args:
+        video_path (str): Path to video file.
+    """
+    try:
+        cap = cv2.VideoCapture(video_path)
+        if not cap.isOpened():
+            return "Error: Could not open video."
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        duration = frame_count / fps if fps > 0 else 0
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        info = f"Video Info:\nDuration: {duration:.2f}s\nFPS: {fps}\nResolution: {width}x{height}\nFrames: {frame_count}"
+        cap.release()
+        return info
+    except Exception as e:
+        return f"Error analyzing video: {str(e)}"
+@tool
+def sample_video_frames(video_path: str, num_frames: int = 5) -> List[str]:
+    """
+    Extract a few frames from the video to analyze visual content.
+    Returns paths to saved frame images.
+    Args:
+        video_path (str): Path to video.
+        num_frames (int): Number of frames to sample.
+    """
+    try:
+        cap = cv2.VideoCapture(video_path)
+        if not cap.isOpened():
+            return ["Error: Could not open video."]
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        frame_indices = np.linspace(0, total_frames-1, num_frames, dtype=int)
+        saved_frames = []
+        for i in frame_indices:
+            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
+            ret, frame = cap.read()
+            if ret:
+                frame_path = f"frame_{i}.jpg"
+                cv2.imwrite(frame_path, frame)
+                saved_frames.append(frame_path)
+        cap.release()
+        return saved_frames
+    except Exception as e:
+        return [f"Error extracting frames: {str(e)}"]
 ### ============== IMAGE PROCESSING AND GENERATION TOOLS =============== ###
     draw_on_image,
     generate_simple_image,
     combine_images,
+    transcribe_audio,
+    get_video_info,
+    sample_video_frames,
 ]

code_interpreter.py CHANGED Viewed

@@ -22,7 +22,8 @@ class CodeInterpreter:
             "math", "random", "statistics", "datetime", "collections",
             "itertools", "functools", "operator", "re", "json",
             "sympy", "networkx", "nltk", "PIL", "pytesseract",
-            "cmath", "uuid", "tempfile", "requests", "urllib"
         ]
         self.max_execution_time = max_execution_time
         self.working_directory = working_directory or os.path.join(os.getcwd())

             "math", "random", "statistics", "datetime", "collections",
             "itertools", "functools", "operator", "re", "json",
             "sympy", "networkx", "nltk", "PIL", "pytesseract",
+            "cmath", "uuid", "tempfile", "requests", "urllib",
+            "cv2", "speech_recognition", "pydub", "moviepy", "moviepy.editor"
         ]
         self.max_execution_time = max_execution_time
         self.working_directory = working_directory or os.path.join(os.getcwd())

main.py CHANGED Viewed

@@ -91,7 +91,7 @@ def run_evaluation(profile: gr.OAuthProfile | None):
     questions_and_answers = []
     # 2. Solve Each Question
-    per_task_timeout_sec = 120 # Increased timeout for LangGraph
     for i, task in enumerate(questions, 1):
         task_id = task.get("id") or task.get("task_id")
         question_text = task.get("question")
@@ -175,8 +175,8 @@ def run_evaluation(profile: gr.OAuthProfile | None):
     return output, pd.DataFrame(questions_and_answers)
 # --- GRADIO INTERFACE ---
-with gr.Blocks(title="Antientropy Final Assignment v3") as demo:
-    gr.Markdown("# 🕵🏻‍♂️ Antientropy Agent - GAIA Benchmark v3 (LangGraph)")
     gr.Markdown(
         """
         **Instructions:**

     questions_and_answers = []
     # 2. Solve Each Question
+    per_task_timeout_sec = 180 # Increased timeout for LangGraph to 3 minutes
     for i, task in enumerate(questions, 1):
         task_id = task.get("id") or task.get("task_id")
         question_text = task.get("question")
     return output, pd.DataFrame(questions_and_answers)
 # --- GRADIO INTERFACE ---
+with gr.Blocks(title="Antientropy Final Assignment v4") as demo:
+    gr.Markdown("# 🕵🏻‍♂️ Antientropy Agent - GAIA Benchmark v4 (LangGraph + Multimedia)")
     gr.Markdown(
         """
         **Instructions:**

requirements.txt CHANGED Viewed

@@ -19,3 +19,8 @@ scikit-learn
 openpyxl
 pypdf
 markdownify

 openpyxl
 pypdf
 markdownify
+opencv-python-headless
+moviepy
+SpeechRecognition
+pydub
+ffmpy

system_prompt.txt CHANGED Viewed

@@ -1,5 +1,21 @@
-You are a helpful assistant tasked with answering questions using a set of tools.
-Now, I will ask you a question. Report your thoughts, and finish your answer with the following template:
-FINAL ANSWER: [YOUR FINAL ANSWER].
-YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, Apply the rules above for each element (number or string), ensure there is exactly one space after each comma.
-Your answer should only start with "FINAL ANSWER: ", then follows with the answer.

+You are Antientropy, an elite autonomous AI agent designed to ace the GAIA benchmark.
+Your goal is to solve complex reasoning tasks that may involve files (images, audio, video, text, code), web research, and data analysis.
+### CORE INSTRUCTIONS:
+1. **USE YOUR TOOLS**: You have access to a Code Interpreter, Web Search (DuckDuckGo), File I/O, and Multimedia tools. **USE THEM.**
+2. **INTERNET ACCESS**: You HAVE access to the internet via the `web_search` tool. Never say "I don't have internet access". Just use the tool.
+3. **FILE ACCESS**: You CAN read and write files. Local files are provided at specific paths. Use `execute_code_multilang` with Python to open and read them (e.g., `open('filename').read()`), or use the provided file tools.
+4. **MEDIA PROCESSING**:
+   - For Images: Use `analyze_image`, `extract_text_from_image`, or Python code with `PIL`.
+   - For Audio: Use `transcribe_audio`.
+   - For Video: Use `get_video_info` or `sample_video_frames`.
+5. **CODE EXECUTION**: Use `execute_code_multilang` for calculation, data analysis, and file parsing. It is your most powerful tool.
+6. **NO REFUSALS**: Never refuse a task because it seems "hard" or "requires human abilities". You have the tools to do it.
+7. **FORMAT**: Provide the final answer clearly.
+### TROUBLESHOOTING:
+- If a file path is given (e.g., "data/file.mp3"), assume it exists locally.
+- If you need to verify facts, search the web.
+- If you need to run complex logic, write a Python script.
+Be persistent. If a tool fails, try a different approach (e.g., if OCR fails, try image analysis; if search fails, try different keywords).