Jose-Maria Segui committed on
Commit
efc6af6
·
1 Parent(s): 87e72bd

Deploy v4: Multimedia tools (Audio/Video), increased timeouts, aggressive system prompt

Browse files
Files changed (5) hide show
  1. agent.py +94 -2
  2. code_interpreter.py +2 -1
  3. main.py +3 -3
  4. requirements.txt +5 -0
  5. system_prompt.txt +21 -5
agent.py CHANGED
@@ -11,6 +11,9 @@ import cmath
11
  import pandas as pd
12
  import uuid
13
  import numpy as np
 
 
 
14
  from code_interpreter import CodeInterpreter
15
 
16
  interpreter_instance = CodeInterpreter()
@@ -47,8 +50,8 @@ def wiki_search(query: str) -> str:
47
 
48
  @tool
49
  def web_search(query: str) -> str:
50
- """Search the web for a query and return results.
51
-
52
  Args:
53
  query: The search query."""
54
  # Using DuckDuckGo instead of Tavily to avoid API key requirement
@@ -78,6 +81,7 @@ def arxiv_search(query: str) -> str:
78
  @tool
79
  def execute_code_multilang(code: str, language: str = "python") -> str:
80
  """Execute code in multiple languages (Python, Bash, SQL, C, Java) and return results.
 
81
 
82
  Args:
83
  code (str): The source code to execute.
@@ -366,6 +370,91 @@ def analyze_excel_file(file_path: str, query: str) -> str:
366
  except Exception as e:
367
  return f"Error analyzing Excel file: {str(e)}"
368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
 
370
  ### ============== IMAGE PROCESSING AND GENERATION TOOLS =============== ###
371
 
@@ -694,6 +783,9 @@ tools = [
694
  draw_on_image,
695
  generate_simple_image,
696
  combine_images,
 
 
 
697
  ]
698
 
699
 
 
11
  import pandas as pd
12
  import uuid
13
  import numpy as np
14
+ import speech_recognition as sr
15
+ from pydub import AudioSegment
16
+ import cv2
17
  from code_interpreter import CodeInterpreter
18
 
19
  interpreter_instance = CodeInterpreter()
 
50
 
51
  @tool
52
  def web_search(query: str) -> str:
53
+ """Search the web for a query using DuckDuckGo. USE THIS TOOL for any fact checking or external information.
54
+
55
  Args:
56
  query: The search query."""
57
  # Using DuckDuckGo instead of Tavily to avoid API key requirement
 
81
  @tool
82
  def execute_code_multilang(code: str, language: str = "python") -> str:
83
  """Execute code in multiple languages (Python, Bash, SQL, C, Java) and return results.
84
+ USE THIS TO READ FILES (e.g. open('filename').read()).
85
 
86
  Args:
87
  code (str): The source code to execute.
 
370
  except Exception as e:
371
  return f"Error analyzing Excel file: {str(e)}"
372
 
373
+ ### =============== MULTIMEDIA TOOLS =============== ###
374
+
375
@tool
def transcribe_audio(audio_path: str) -> str:
    """
    Transcribe speech from an audio file using SpeechRecognition.

    Args:
        audio_path (str): Path to the audio file (wav, mp3, flac, etc.)

    Returns:
        str: "Transcription: <text>" on success, or an "Error transcribing audio: ..." message.
    """
    # Local imports keep this block self-contained regardless of the
    # module-level import list.
    import os
    import tempfile

    temp_wav_path = None
    try:
        wav_path = audio_path
        # Case-insensitive check so "CLIP.WAV" is not needlessly re-encoded.
        if not audio_path.lower().endswith(".wav"):
            print(f"Converting {audio_path} to wav...")
            audio = AudioSegment.from_file(audio_path)
            # Convert into a private temp file instead of deriving a sibling
            # path from the input name: that could overwrite an unrelated
            # "<name>.wav" and mis-split paths whose directories contain dots.
            fd, temp_wav_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            audio.export(temp_wav_path, format="wav")
            wav_path = temp_wav_path

        recognizer = sr.Recognizer()
        with sr.AudioFile(wav_path) as source:
            audio_data = recognizer.record(source)

        # Use Google Web Speech API (default, no key needed usually)
        text = recognizer.recognize_google(audio_data)
        return f"Transcription: {text}"
    except sr.UnknownValueError:
        # This exception carries no message, so str(e) would yield a blank error.
        return "Error transcribing audio: speech could not be understood"
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"
    finally:
        # Best-effort cleanup of the intermediate WAV created above.
        if temp_wav_path is not None:
            try:
                os.remove(temp_wav_path)
            except OSError:
                pass
400
+
401
@tool
def get_video_info(video_path: str) -> str:
    """
    Get metadata and basic info from a video file.

    Args:
        video_path (str): Path to video file.

    Returns:
        str: Duration, FPS, resolution and frame count, or an error message.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return "Error: Could not open video."

        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Guard against containers that report an FPS of 0.
        duration = frame_count / fps if fps > 0 else 0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        return (
            f"Video Info:\nDuration: {duration:.2f}s\nFPS: {fps}\n"
            f"Resolution: {width}x{height}\nFrames: {frame_count}"
        )
    except Exception as e:
        return f"Error analyzing video: {str(e)}"
    finally:
        # Release the capture handle on every path (success, early return, error).
        if cap is not None:
            cap.release()
425
+
426
@tool
def sample_video_frames(video_path: str, num_frames: int = 5) -> List[str]:
    """
    Extract a few frames from the video to analyze visual content.
    Returns paths to saved frame images.

    Args:
        video_path (str): Path to video.
        num_frames (int): Number of frames to sample.
    """
    if num_frames < 1:
        return ["Error: num_frames must be a positive integer."]

    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return ["Error: Could not open video."]

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            # linspace(0, -1, ...) would otherwise produce negative seek positions.
            return ["Error: Video reports no frames."]

        # Evenly spaced indices; np.unique drops the duplicates that appear
        # when more frames are requested than the video contains.
        frame_indices = np.unique(
            np.linspace(0, total_frames - 1, num_frames, dtype=int)
        )

        saved_frames = []
        for idx in frame_indices:
            frame_no = int(idx)  # plain int for OpenCV and for the file name
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)
            ret, frame = cap.read()
            if not ret:
                continue
            frame_path = f"frame_{frame_no}.jpg"
            # imwrite returns False on failure; only report files actually written.
            if cv2.imwrite(frame_path, frame):
                saved_frames.append(frame_path)

        return saved_frames
    except Exception as e:
        return [f"Error extracting frames: {str(e)}"]
    finally:
        # Release the capture handle on every path (success, early return, error).
        if cap is not None:
            cap.release()
457
+
458
 
459
  ### ============== IMAGE PROCESSING AND GENERATION TOOLS =============== ###
460
 
 
783
  draw_on_image,
784
  generate_simple_image,
785
  combine_images,
786
+ transcribe_audio,
787
+ get_video_info,
788
+ sample_video_frames,
789
  ]
790
 
791
 
code_interpreter.py CHANGED
@@ -22,7 +22,8 @@ class CodeInterpreter:
22
  "math", "random", "statistics", "datetime", "collections",
23
  "itertools", "functools", "operator", "re", "json",
24
  "sympy", "networkx", "nltk", "PIL", "pytesseract",
25
- "cmath", "uuid", "tempfile", "requests", "urllib"
 
26
  ]
27
  self.max_execution_time = max_execution_time
28
  self.working_directory = working_directory or os.path.join(os.getcwd())
 
22
  "math", "random", "statistics", "datetime", "collections",
23
  "itertools", "functools", "operator", "re", "json",
24
  "sympy", "networkx", "nltk", "PIL", "pytesseract",
25
+ "cmath", "uuid", "tempfile", "requests", "urllib",
26
+ "cv2", "speech_recognition", "pydub", "moviepy", "moviepy.editor"
27
  ]
28
  self.max_execution_time = max_execution_time
29
  self.working_directory = working_directory or os.path.join(os.getcwd())
main.py CHANGED
@@ -91,7 +91,7 @@ def run_evaluation(profile: gr.OAuthProfile | None):
91
  questions_and_answers = []
92
 
93
  # 2. Solve Each Question
94
- per_task_timeout_sec = 120 # Increased timeout for LangGraph
95
  for i, task in enumerate(questions, 1):
96
  task_id = task.get("id") or task.get("task_id")
97
  question_text = task.get("question")
@@ -175,8 +175,8 @@ def run_evaluation(profile: gr.OAuthProfile | None):
175
  return output, pd.DataFrame(questions_and_answers)
176
 
177
  # --- GRADIO INTERFACE ---
178
- with gr.Blocks(title="Antientropy Final Assignment v3") as demo:
179
- gr.Markdown("# 🕵🏻‍♂️ Antientropy Agent - GAIA Benchmark v3 (LangGraph)")
180
  gr.Markdown(
181
  """
182
  **Instructions:**
 
91
  questions_and_answers = []
92
 
93
  # 2. Solve Each Question
94
+ per_task_timeout_sec = 180 # Increased timeout for LangGraph to 3 minutes
95
  for i, task in enumerate(questions, 1):
96
  task_id = task.get("id") or task.get("task_id")
97
  question_text = task.get("question")
 
175
  return output, pd.DataFrame(questions_and_answers)
176
 
177
  # --- GRADIO INTERFACE ---
178
+ with gr.Blocks(title="Antientropy Final Assignment v4") as demo:
179
+ gr.Markdown("# 🕵🏻‍♂️ Antientropy Agent - GAIA Benchmark v4 (LangGraph + Multimedia)")
180
  gr.Markdown(
181
  """
182
  **Instructions:**
requirements.txt CHANGED
@@ -19,3 +19,8 @@ scikit-learn
19
  openpyxl
20
  pypdf
21
  markdownify
 
 
 
 
 
 
19
  openpyxl
20
  pypdf
21
  markdownify
22
+ opencv-python-headless
23
+ moviepy
24
+ SpeechRecognition
25
+ pydub
26
+ ffmpy
system_prompt.txt CHANGED
@@ -1,5 +1,21 @@
1
- You are a helpful assistant tasked with answering questions using a set of tools.
2
- Now, I will ask you a question. Report your thoughts, and finish your answer with the following template:
3
- FINAL ANSWER: [YOUR FINAL ANSWER].
4
- YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, Apply the rules above for each element (number or string), ensure there is exactly one space after each comma.
5
- Your answer should only start with "FINAL ANSWER: ", then follows with the answer.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are Antientropy, an elite autonomous AI agent designed to ace the GAIA benchmark.
2
+ Your goal is to solve complex reasoning tasks that may involve files (images, audio, video, text, code), web research, and data analysis.
3
+
4
+ ### CORE INSTRUCTIONS:
5
+ 1. **USE YOUR TOOLS**: You have access to a Code Interpreter, Web Search (DuckDuckGo), File I/O, and Multimedia tools. **USE THEM.**
6
+ 2. **INTERNET ACCESS**: You HAVE access to the internet via the `web_search` tool. Never say "I don't have internet access". Just use the tool.
7
+ 3. **FILE ACCESS**: You CAN read and write files. Local files are provided at specific paths. Use `execute_code_multilang` with Python to open and read them (e.g., `open('filename').read()`), or use the provided file tools.
8
+ 4. **MEDIA PROCESSING**:
9
+ - For Images: Use `analyze_image`, `extract_text_from_image`, or Python code with `PIL`.
10
+ - For Audio: Use `transcribe_audio`.
11
+ - For Video: Use `get_video_info` or `sample_video_frames`.
12
+ 5. **CODE EXECUTION**: Use `execute_code_multilang` for calculation, data analysis, and file parsing. It is your most powerful tool.
13
+ 6. **NO REFUSALS**: Never refuse a task because it seems "hard" or "requires human abilities". You have the tools to do it.
14
+ 7. **FORMAT**: Provide the final answer clearly.
15
+
16
+ ### TROUBLESHOOTING:
17
+ - If a file path is given (e.g., "data/file.mp3"), assume it exists locally.
18
+ - If you need to verify facts, search the web.
19
+ - If you need to run complex logic, write a Python script.
20
+
21
+ Be persistent. If a tool fails, try a different approach (e.g., if OCR fails, try image analysis; if search fails, try different keywords).