Spaces:
Sleeping
Sleeping
Jose-Maria Segui committed on
Commit Β·
efc6af6
1
Parent(s): 87e72bd
Deploy v4: Multimedia tools (Audio/Video), increased timeouts, aggressive system prompt
Browse files- agent.py +94 -2
- code_interpreter.py +2 -1
- main.py +3 -3
- requirements.txt +5 -0
- system_prompt.txt +21 -5
agent.py
CHANGED
|
@@ -11,6 +11,9 @@ import cmath
|
|
| 11 |
import pandas as pd
|
| 12 |
import uuid
|
| 13 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
| 14 |
from code_interpreter import CodeInterpreter
|
| 15 |
|
| 16 |
interpreter_instance = CodeInterpreter()
|
|
@@ -47,8 +50,8 @@ def wiki_search(query: str) -> str:
|
|
| 47 |
|
| 48 |
@tool
|
| 49 |
def web_search(query: str) -> str:
|
| 50 |
-
"""Search the web for a query
|
| 51 |
-
|
| 52 |
Args:
|
| 53 |
query: The search query."""
|
| 54 |
# Using DuckDuckGo instead of Tavily to avoid API key requirement
|
|
@@ -78,6 +81,7 @@ def arxiv_search(query: str) -> str:
|
|
| 78 |
@tool
|
| 79 |
def execute_code_multilang(code: str, language: str = "python") -> str:
|
| 80 |
"""Execute code in multiple languages (Python, Bash, SQL, C, Java) and return results.
|
|
|
|
| 81 |
|
| 82 |
Args:
|
| 83 |
code (str): The source code to execute.
|
|
@@ -366,6 +370,91 @@ def analyze_excel_file(file_path: str, query: str) -> str:
|
|
| 366 |
except Exception as e:
|
| 367 |
return f"Error analyzing Excel file: {str(e)}"
|
| 368 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
|
| 370 |
### ============== IMAGE PROCESSING AND GENERATION TOOLS =============== ###
|
| 371 |
|
|
@@ -694,6 +783,9 @@ tools = [
|
|
| 694 |
draw_on_image,
|
| 695 |
generate_simple_image,
|
| 696 |
combine_images,
|
|
|
|
|
|
|
|
|
|
| 697 |
]
|
| 698 |
|
| 699 |
|
|
|
|
| 11 |
import pandas as pd
|
| 12 |
import uuid
|
| 13 |
import numpy as np
|
| 14 |
+
import speech_recognition as sr
|
| 15 |
+
from pydub import AudioSegment
|
| 16 |
+
import cv2
|
| 17 |
from code_interpreter import CodeInterpreter
|
| 18 |
|
| 19 |
interpreter_instance = CodeInterpreter()
|
|
|
|
| 50 |
|
| 51 |
@tool
|
| 52 |
def web_search(query: str) -> str:
|
| 53 |
+
"""Search the web for a query using DuckDuckGo. USE THIS TOOL for any fact checking or external information.
|
| 54 |
+
|
| 55 |
Args:
|
| 56 |
query: The search query."""
|
| 57 |
# Using DuckDuckGo instead of Tavily to avoid API key requirement
|
|
|
|
| 81 |
@tool
|
| 82 |
def execute_code_multilang(code: str, language: str = "python") -> str:
|
| 83 |
"""Execute code in multiple languages (Python, Bash, SQL, C, Java) and return results.
|
| 84 |
+
USE THIS TO READ FILES (e.g. open('filename').read()).
|
| 85 |
|
| 86 |
Args:
|
| 87 |
code (str): The source code to execute.
|
|
|
|
| 370 |
except Exception as e:
|
| 371 |
return f"Error analyzing Excel file: {str(e)}"
|
| 372 |
|
| 373 |
+
### =============== MULTIMEDIA TOOLS =============== ###
|
| 374 |
+
|
| 375 |
+
@tool
def transcribe_audio(audio_path: str) -> str:
    """
    Transcribe speech from an audio file using SpeechRecognition.

    Non-WAV inputs are first converted to WAV with pydub, because
    sr.AudioFile only reads WAV/AIFF/FLAC reliably. The converted file is
    written next to the original (same stem, ".wav" suffix).

    Args:
        audio_path (str): Path to the audio file (wav, mp3, flac, etc.)

    Returns:
        str: "Transcription: <text>" on success, or an error message
        string (this tool never raises).
    """
    try:
        # Convert to wav if needed using pydub.
        # Fix: the extension check is case-insensitive, so "FILE.WAV" is
        # no longer needlessly re-encoded through pydub/ffmpeg.
        if not audio_path.lower().endswith(".wav"):
            print(f"Converting {audio_path} to wav...")
            audio = AudioSegment.from_file(audio_path)
            wav_path = audio_path.rsplit(".", 1)[0] + ".wav"
            audio.export(wav_path, format="wav")
            audio_path = wav_path

        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_path) as source:
            audio_data = recognizer.record(source)
        # Use Google Web Speech API (default, no key needed usually)
        text = recognizer.recognize_google(audio_data)
        return f"Transcription: {text}"
    except Exception as e:
        # Best-effort tool: surface the failure as text for the agent.
        return f"Error transcribing audio: {str(e)}"
|
| 400 |
+
|
| 401 |
+
@tool
def get_video_info(video_path: str) -> str:
    """
    Get metadata and basic info from a video file.

    Args:
        video_path (str): Path to video file.

    Returns:
        str: Human-readable summary (duration, FPS, resolution, frame
        count), or an error message string (this tool never raises).
    """
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return "Error: Could not open video."

        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Some containers report FPS as 0; avoid a ZeroDivisionError.
        duration = frame_count / fps if fps > 0 else 0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        return (
            f"Video Info:\nDuration: {duration:.2f}s\nFPS: {fps}\n"
            f"Resolution: {width}x{height}\nFrames: {frame_count}"
        )
    except Exception as e:
        return f"Error analyzing video: {str(e)}"
    finally:
        # Fix: always release the capture handle — the original leaked it
        # when an exception was raised after a successful open.
        if cap is not None:
            cap.release()
|
| 425 |
+
|
| 426 |
+
@tool
def sample_video_frames(video_path: str, num_frames: int = 5) -> List[str]:
    """
    Extract a few evenly spaced frames from the video to analyze visual
    content. Returns paths to saved frame images (frame_<index>.jpg in
    the current working directory).

    Args:
        video_path (str): Path to video.
        num_frames (int): Number of frames to sample.

    Returns:
        List[str]: Saved frame paths, or a one-element error message list
        (this tool never raises).
    """
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return ["Error: Could not open video."]

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Evenly spaced indices over the clip. Fixes: clamp to >= 0 so an
        # empty/unreadable clip cannot produce negative seek positions, and
        # dedupe so num_frames > total_frames does not rewrite the same
        # frame file repeatedly.
        frame_indices = np.unique(
            np.linspace(0, max(total_frames - 1, 0), num_frames, dtype=int)
        )

        saved_frames = []
        for i in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
            ret, frame = cap.read()
            if ret:
                frame_path = f"frame_{i}.jpg"
                cv2.imwrite(frame_path, frame)
                saved_frames.append(frame_path)

        return saved_frames
    except Exception as e:
        return [f"Error extracting frames: {str(e)}"]
    finally:
        # Fix: release the capture handle even on the exception path
        # (the original leaked it when an error occurred mid-loop).
        if cap is not None:
            cap.release()
|
| 457 |
+
|
| 458 |
|
| 459 |
### ============== IMAGE PROCESSING AND GENERATION TOOLS =============== ###
|
| 460 |
|
|
|
|
| 783 |
draw_on_image,
|
| 784 |
generate_simple_image,
|
| 785 |
combine_images,
|
| 786 |
+
transcribe_audio,
|
| 787 |
+
get_video_info,
|
| 788 |
+
sample_video_frames,
|
| 789 |
]
|
| 790 |
|
| 791 |
|
code_interpreter.py
CHANGED
|
@@ -22,7 +22,8 @@ class CodeInterpreter:
|
|
| 22 |
"math", "random", "statistics", "datetime", "collections",
|
| 23 |
"itertools", "functools", "operator", "re", "json",
|
| 24 |
"sympy", "networkx", "nltk", "PIL", "pytesseract",
|
| 25 |
-
"cmath", "uuid", "tempfile", "requests", "urllib"
|
|
|
|
| 26 |
]
|
| 27 |
self.max_execution_time = max_execution_time
|
| 28 |
self.working_directory = working_directory or os.path.join(os.getcwd())
|
|
|
|
| 22 |
"math", "random", "statistics", "datetime", "collections",
|
| 23 |
"itertools", "functools", "operator", "re", "json",
|
| 24 |
"sympy", "networkx", "nltk", "PIL", "pytesseract",
|
| 25 |
+
"cmath", "uuid", "tempfile", "requests", "urllib",
|
| 26 |
+
"cv2", "speech_recognition", "pydub", "moviepy", "moviepy.editor"
|
| 27 |
]
|
| 28 |
self.max_execution_time = max_execution_time
|
| 29 |
self.working_directory = working_directory or os.path.join(os.getcwd())
|
main.py
CHANGED
|
@@ -91,7 +91,7 @@ def run_evaluation(profile: gr.OAuthProfile | None):
|
|
| 91 |
questions_and_answers = []
|
| 92 |
|
| 93 |
# 2. Solve Each Question
|
| 94 |
-
per_task_timeout_sec =
|
| 95 |
for i, task in enumerate(questions, 1):
|
| 96 |
task_id = task.get("id") or task.get("task_id")
|
| 97 |
question_text = task.get("question")
|
|
@@ -175,8 +175,8 @@ def run_evaluation(profile: gr.OAuthProfile | None):
|
|
| 175 |
return output, pd.DataFrame(questions_and_answers)
|
| 176 |
|
| 177 |
# --- GRADIO INTERFACE ---
|
| 178 |
-
with gr.Blocks(title="Antientropy Final Assignment
|
| 179 |
-
gr.Markdown("# π΅π»ββοΈ Antientropy Agent - GAIA Benchmark
|
| 180 |
gr.Markdown(
|
| 181 |
"""
|
| 182 |
**Instructions:**
|
|
|
|
| 91 |
questions_and_answers = []
|
| 92 |
|
| 93 |
# 2. Solve Each Question
|
| 94 |
+
per_task_timeout_sec = 180 # Increased timeout for LangGraph to 3 minutes
|
| 95 |
for i, task in enumerate(questions, 1):
|
| 96 |
task_id = task.get("id") or task.get("task_id")
|
| 97 |
question_text = task.get("question")
|
|
|
|
| 175 |
return output, pd.DataFrame(questions_and_answers)
|
| 176 |
|
| 177 |
# --- GRADIO INTERFACE ---
|
| 178 |
+
with gr.Blocks(title="Antientropy Final Assignment v4") as demo:
|
| 179 |
+
gr.Markdown("# π΅π»ββοΈ Antientropy Agent - GAIA Benchmark v4 (LangGraph + Multimedia)")
|
| 180 |
gr.Markdown(
|
| 181 |
"""
|
| 182 |
**Instructions:**
|
requirements.txt
CHANGED
|
@@ -19,3 +19,8 @@ scikit-learn
|
|
| 19 |
openpyxl
|
| 20 |
pypdf
|
| 21 |
markdownify
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
openpyxl
|
| 20 |
pypdf
|
| 21 |
markdownify
|
| 22 |
+
opencv-python-headless
|
| 23 |
+
moviepy
|
| 24 |
+
SpeechRecognition
|
| 25 |
+
pydub
|
| 26 |
+
ffmpy
|
system_prompt.txt
CHANGED
|
@@ -1,5 +1,21 @@
|
|
| 1 |
-
You are
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are Antientropy, an elite autonomous AI agent designed to ace the GAIA benchmark.
|
| 2 |
+
Your goal is to solve complex reasoning tasks that may involve files (images, audio, video, text, code), web research, and data analysis.
|
| 3 |
+
|
| 4 |
+
### CORE INSTRUCTIONS:
|
| 5 |
+
1. **USE YOUR TOOLS**: You have access to a Code Interpreter, Web Search (DuckDuckGo), File I/O, and Multimedia tools. **USE THEM.**
|
| 6 |
+
2. **INTERNET ACCESS**: You HAVE access to the internet via the `web_search` tool. Never say "I don't have internet access". Just use the tool.
|
| 7 |
+
3. **FILE ACCESS**: You CAN read and write files. Local files are provided at specific paths. Use `execute_code_multilang` with Python to open and read them (e.g., `open('filename').read()`), or use the provided file tools.
|
| 8 |
+
4. **MEDIA PROCESSING**:
|
| 9 |
+
- For Images: Use `analyze_image`, `extract_text_from_image`, or Python code with `PIL`.
|
| 10 |
+
- For Audio: Use `transcribe_audio`.
|
| 11 |
+
- For Video: Use `get_video_info` or `sample_video_frames`.
|
| 12 |
+
5. **CODE EXECUTION**: Use `execute_code_multilang` for calculation, data analysis, and file parsing. It is your most powerful tool.
|
| 13 |
+
6. **NO REFUSALS**: Never refuse a task because it seems "hard" or "requires human abilities". You have the tools to do it.
|
| 14 |
+
7. **FORMAT**: Provide the final answer clearly.
|
| 15 |
+
|
| 16 |
+
### TROUBLESHOOTING:
|
| 17 |
+
- If a file path is given (e.g., "data/file.mp3"), assume it exists locally.
|
| 18 |
+
- If you need to verify facts, search the web.
|
| 19 |
+
- If you need to run complex logic, write a Python script.
|
| 20 |
+
|
| 21 |
+
Be persistent. If a tool fails, try a different approach (e.g., if OCR fails, try image analysis; if search fails, try different keywords).
|