Final_Assignment_Template

Sleeping

App Files Files Community

huytofu92 committed on May 19, 2025

Commit

915154c

1 Parent(s): 5b6dc13

Agents' guides and prompts

Browse files

Files changed (2) hide show

mini_agents.py +6 -6
prompts.yaml +3 -1

mini_agents.py CHANGED Viewed

@@ -2,7 +2,7 @@ from smolagents import CodeAgent, InferenceClientModel
 from tools import sort_list, operate_two_numbers, convert_number, load_dataframe_from_csv
 from tools import tavily_search_tool, visit_webpage_tool
 from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
-from vlm_tools import download_image, image_processing, object_detection_tool, ocr_scan_tool, extract_frames_from_video, get_image_from_file
 from audio_tools import transcribe_audio_tool, audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
 from community_tools import community_tools, get_youtube_transcript_from_url
 import os
@@ -41,9 +41,9 @@ audio_agent = CodeAgent(
     tools=[transcribe_audio_tool, audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization],
     max_steps=6,
     # prompt_templates=PROMPT_TEMPLATE["audio_agent"],
-    additional_authorized_imports=["pytube", "pydub", "pyAudioAnalysis", "base64", "io", "sklearn", "scipy", "numpy", "pandas", "json", "os", "logging", "yaml", "pyplot", "matplotlib", 'hmmlearn', 'pickle'],
     name="audio_agent",
-    description="This agent is responsible for rocessing audio, transcribing audio and extracting text from it."
 )
 vlm_model = InferenceClientModel(
@@ -53,12 +53,12 @@ vlm_model = InferenceClientModel(
 vlm_agent = CodeAgent(
     model=vlm_model,
-    tools=[download_image, image_processing, object_detection_tool, ocr_scan_tool, extract_frames_from_video, get_image_from_file],
     max_steps=6,
     # prompt_templates=PROMPT_TEMPLATE["vlm_agent"],
-    additional_authorized_imports=["cv2", "numpy", "pytesseract", "requests", "base64", "onnxruntime", "PIL", "io", "os", "logging", "yaml", "pyplot", "matplotlib", 'hmmlearn', 'pickle', 'youtube_dl', 'bs4'],
     name="vlm_agent",
-    description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."
 )
 arithmetic_model = InferenceClientModel(

 from tools import sort_list, operate_two_numbers, convert_number, load_dataframe_from_csv
 from tools import tavily_search_tool, visit_webpage_tool
 from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
+from vlm_tools import image_processing, object_detection_tool, ocr_scan_tool, extract_images_from_video, get_image_from_file_path, get_video_from_file_path
 from audio_tools import transcribe_audio_tool, audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
 from community_tools import community_tools, get_youtube_transcript_from_url
 import os
     tools=[transcribe_audio_tool, audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization],
     max_steps=6,
     # prompt_templates=PROMPT_TEMPLATE["audio_agent"],
+    additional_authorized_imports=["pytube", "pytube3", "youtube_dl", "pydub", "pyAudioAnalysis", "base64", "io", "sklearn", "scipy", "numpy", "pandas", "json", "os", "logging", "yaml", "pyplot", "matplotlib", 'hmmlearn', 'pickle'],
     name="audio_agent",
+    description="This agent is responsible for processing audio, transcribing audio and extracting text from it. It cannot process videos."
 )
 vlm_model = InferenceClientModel(
 vlm_agent = CodeAgent(
     model=vlm_model,
+    tools=[image_processing, object_detection_tool, ocr_scan_tool, extract_images_from_video, get_image_from_file_path, get_video_from_file_path],
     max_steps=6,
     # prompt_templates=PROMPT_TEMPLATE["vlm_agent"],
+    additional_authorized_imports=["pytube", "pytube3", "youtube_dl", "cv2", "numpy", "pytesseract", "requests", "base64", "onnxruntime", "PIL", "io", "os", "logging", "yaml", "pyplot", "matplotlib", 'hmmlearn', 'pickle', 'youtube_dl', 'bs4'],
     name="vlm_agent",
+    description="This agent is responsible for downloading images or videos, processing images or videos, detecting objects in them and extracting text from them. It cannot process audio."
 )
 arithmetic_model = InferenceClientModel(

prompts.yaml CHANGED Viewed

@@ -192,7 +192,8 @@ system_prompt: |-
   7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
   8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
   9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
-  10. Don't give up! You're in charge of solving the task, not providing directions to solve it.
   Now Begin!
 planning:
@@ -320,6 +321,7 @@ managed_agent:
       {{task}}
       ---
       You're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible to give them a clear understanding of the answer.
       Your final_answer WILL HAVE to contain these parts:
       ### 1. Task outcome (short version):

   7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
   8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
   9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
+  10. Some questions might require you to use audio or video files. The file path is often mentioned at the end of the task description (example: abcxyz.mp3). You must pass this file path to your managed agents for them to use as arguments to their tools.
+  11. Don't give up! You're in charge of solving the task, not providing directions to solve it.
   Now Begin!
 planning:
       {{task}}
       ---
       You're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible to give them a clear understanding of the answer.
+      Your manager may pass you a file path to an audio or video file. You must use this file path as an argument to your tools.
       Your final_answer WILL HAVE to contain these parts:
       ### 1. Task outcome (short version):