Francesco-A committed on
Commit
15a3001
·
1 Parent(s): da5af70

Updated space

Browse files

Added:
- transcriber tools
- GeminiAgent
- LocalAgent (not used in space)

Files changed (4) hide show
  1. agent.py +245 -63
  2. app.py +23 -73
  3. requirements.txt +14 -0
  4. tools/audio_tools.py +78 -0
agent.py CHANGED
@@ -1,10 +1,21 @@
1
- from smolagents import tool
 
 
 
2
  import pandas as pd
 
 
 
 
3
 
 
4
  from smolagents import (
5
  CodeAgent,
6
- InferenceClientModel,
 
 
7
  Tool,
 
8
  DuckDuckGoSearchTool,
9
  VisitWebpageTool,
10
  WikipediaSearchTool,
@@ -15,66 +26,237 @@ from smolagents import (
15
  # Import your custom tools (to be used in app, not in local notebook)
16
  from tools.download_file import download_file_from_url
17
  from tools.files_to_text import image_to_text, pdf_to_text, text_file_to_string
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- def create_agent(
20
- model_path: str = "Qwen/Qwen3-Next-80B-A3B-Thinking"
21
- ):
22
- """
23
- Creates and configures a CodeAgent.
24
-
25
- This function initializes a smolagents CodeAgent equipped with the
26
- recommended default tools (web search, browser, and Python interpreter),
27
- together with any custom tools you may define.
28
-
29
- Args:
30
- model_path (str): The identifier or local path of the Hugging Face
31
- model to be loaded. By default, it uses `Qwen/Qwen3-Next-80B-A3B-Thinking`,
32
- but any compatible model can be substituted.
33
-
34
- Returns:
35
- CodeAgent: A fully initialized agent ready to run code, query tools,
36
- and perform multi-step reasoning using the selected model.
37
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- # Choose a lightweight but reasoning-capable model
40
- model = InferenceClientModel(
41
- model_id=model_path,
42
- temperature = 0.0,
43
- top_p = 1.0, # NEW
44
- )
45
-
46
- # Default smolagents tools (high-level)
47
- default_tools = [
48
- DuckDuckGoSearchTool(), # Internet search
49
- VisitWebpageTool(), # Retrieve webpage content
50
- PythonInterpreterTool(), # Executes agent-generated Python code
51
- FinalAnswerTool(), # Ends agent reasoning and returns final answer
52
- ]
53
-
54
- # Custom tools (critical for GAIA)
55
- custom_tools = [
56
- download_file_from_url, # file downloader
57
- text_file_to_string, # .txt, .md, .json, etc.
58
- pdf_to_text, # PyMuPDF-based safe PDF parser
59
- image_to_text, # OCR for images
60
- ]
61
-
62
- tools = default_tools + custom_tools
63
-
64
- # Create the CodeAgent (best for GAIA because it supports Python)
65
- agent = CodeAgent(
66
- model=model,
67
- tools=tools,
68
- add_base_tools=True, # probably redundant, but it does not hurt
69
- max_steps=7,
70
- additional_authorized_imports = ['numpy','subprocess', 're', 'pandas',
71
- 'json', 'os', 'pathlib', 'tempfile',
72
- # 'matplotlib.pyplot', 'seaborn'
73
- ],
74
- verbosity_level = 1,
75
- max_print_outputs_length=1_000_000
76
- )
77
-
78
- return agent
79
-
80
- # WIP: Agentic RAG Systems
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Generic agent
3
+ import os
4
+ from typing import Optional
5
  import pandas as pd
6
+ import torch
7
+
8
+ # Local agent specific
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
10
 
11
+ # Smolagents imports
12
  from smolagents import (
13
  CodeAgent,
14
+ InferenceClientModel,
15
+ TransformersModel,
16
+ LiteLLMModel,
17
  Tool,
18
+ tool,
19
  DuckDuckGoSearchTool,
20
  VisitWebpageTool,
21
  WikipediaSearchTool,
 
26
  # Import your custom tools (to be used in app, not in local notebook)
27
  from tools.download_file import download_file_from_url
28
  from tools.files_to_text import image_to_text, pdf_to_text, text_file_to_string
29
+ from tools.audio_tools import youtube_to_text, transcribe_audio
30
+
31
+ # Define tools
32
+ AGENT_TOOLS = [
33
+ # Default Tools
34
+ DuckDuckGoSearchTool(), # Internet search
35
+ VisitWebpageTool(), # Retrieve webpage content
36
+ PythonInterpreterTool(), # Executes agent-generated Python code
37
+ FinalAnswerTool(), # Ends agent reasoning and returns final answer
38
+
39
+ # Custom Tools
40
+ download_file_from_url, # file downloader
41
+ text_file_to_string, # .txt, .md, .json, etc.
42
+ pdf_to_text, # PyMuPDF-based safe PDF parser
43
+ image_to_text, # OCR for images
44
+ youtube_to_text, # Youtube audio to text
45
+ transcribe_audio, # Audio file to text
46
+ ]
47
+
48
+ # System prompt
49
+ SYSTEM_PROMPT = """
50
+ You are an expert **General AI Assistant** and **Python Programmer** tasked with solving complex GAIA benchmark problems.
51
+
52
+ ### 1. Reason-Act-Observe
53
+ Follow a **PLAN → ACT → OBSERVE** loop:
54
+ - **PLAN:** Break the task into 1–3 logical steps. Identify tools for each step.
55
+ - **ACT:** Write and run one self-contained Python block per step.
56
+ - **OBSERVE:** Examine outputs or errors before proceeding.
57
+
58
+ ### 2. File Handling
59
+ - When a tool like `download_file_from_url` returns a local file path (e.g., `/tmp/data.csv`), you **MUST** save this path to a descriptive variable (e.g., `filepath`) and **immediately use that variable** as the argument for the next file-reading tool.
60
+
61
+ You must select the reading or transcription method **strictly** based on the file type or source, following the rules below.
62
+
63
+ | File Type / Source | Tool / Method to Use |
64
+ | :--- | :--- |
65
+ | `.csv` | `pd.read_csv(filepath)` |
66
+ | `.xlsx`, `.xls` | `pd.read_excel(filepath)` |
67
+ | `.pdf` | `pdf_to_text(filepath)` |
68
+ | `.txt`, `.md`, `.json` | `text_file_to_string(filepath)` |
69
+ | `.png`, `.jpg`, `.jpeg` | `image_to_text(filepath)` |
70
+ | **YouTube URL** | `youtube_to_text(url)` |
71
+ | `.mp3`, `.wav`, `.m4a`, `.flac`, `.ogg` | `transcribe_audio(filepath)` |
72
+
73
+ **Important rules:**
74
+ - When a tool returns a local file path, you **must** store it in a variable (e.g. `filepath`) and pass that variable directly to the next tool.
75
+ - You must **not** mix methods across file types (e.g. do not use Whisper for CSVs or pandas for audio).
76
+ - For YouTube links, always attempt `youtube_to_text` first; it will automatically fall back to Whisper if captions are unavailable.
77
 
78
+ ### 3. Data Analysis & Answer
79
+ - Inspect loaded datasets first (`.head()`, `.info()`, `.describe()`) before analysis.
80
+ - Write clean, idiomatic Python code. Before that, check if there is any pre-made tool that would work for the task.
81
+ - Use `FinalAnswerTool` **only once the problem is fully solved** to give a concise final answer.
82
+
83
+ ### 4. Additional instructions for the following tasks provided by GAIA team
84
+ - You are a general AI assistant. I will ask you a question. Do not reveal your internal reasoning. Only the content inside FinalAnswerTool will be evaluated.
85
+ - Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
86
+
87
+ ### 5. To provide the final answer, you MUST call the final_answer tool inside a <code> block.
88
+
89
+ - Example of how to end the task:
90
+
91
+ Thought: I have found the answer. I will now provide it.
92
+ <code>
93
+ final_answer("FINAL ANSWER: The capital of France is Paris")
94
+ </code>
95
+
96
+ \n\n
97
+ """
98
+
99
class BasicAgent:
    """CodeAgent backed by a hosted Qwen model via InferenceClientModel.

    Wraps a smolagents CodeAgent configured with the shared AGENT_TOOLS and
    SYSTEM_PROMPT. Call the instance with a question (and an optional file
    path) to run the agent and obtain its final answer string.
    """

    def __init__(self):
        self.system_prompt = SYSTEM_PROMPT
        # Deterministic decoding (temperature 0) for benchmark reproducibility.
        self.model = InferenceClientModel(
            model_id="Qwen/Qwen3-Next-80B-A3B-Thinking",
            temperature=0.0,
            top_p=1.0,
            max_tokens=8196,  # NOTE(review): 8196 looks like a typo for 8192 — confirm
        )
        self.tools = AGENT_TOOLS
        self.basic_agent = CodeAgent(
            name="basic_agent",
            description="Basic smolagents CodeAgent",
            model=self.model,
            tools=self.tools,
            add_base_tools=True,  # probably redundant, but it does not hurt
            max_steps=5,
            additional_authorized_imports=[
                'numpy', 'subprocess', 're', 'pandas',
                'json', 'os', 'datetime', 'tempfile',
            ],
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )

        print("✅ Basic agent initialized")

    def _build_prompt(self, question: str, file_path: Optional[str] = None) -> str:
        """Combine system prompt, question and (optionally) a file-path hint.

        Factored out of __call__: both branches previously duplicated the
        system-prompt/question preamble.
        """
        prompt = f"{self.system_prompt}\n\n" f"Question: {question}\n\n"
        if file_path:
            prompt += (
                f"There is an associated file at path: {file_path}.\n"
                f"Use the appropriate tool to download it (if necessary) and read it before answering"
            )
        return prompt

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Run the agent on *question* and return its final answer string."""
        return self.basic_agent.run(self._build_prompt(question, file_path))
143
 
144
class GeminiAgent:
    """CodeAgent backed by Gemini 2.0 Flash via LiteLLM.

    Requires the GOOGLE_API_KEY environment variable and raises RuntimeError
    at construction time when it is missing, so callers can fall back to
    another agent.
    """

    def __init__(self):
        self.system_prompt = SYSTEM_PROMPT
        google_api_key = os.environ.get("GOOGLE_API_KEY")
        if not google_api_key:
            raise RuntimeError(
                "GOOGLE_API_KEY not found."
            )
        # Deterministic decoding (temperature 0) for benchmark reproducibility.
        self.model = LiteLLMModel(
            model_id="gemini/gemini-2.0-flash",
            api_key=google_api_key,
            temperature=0.0,
            top_p=1.0,
            max_tokens=8196,  # NOTE(review): 8196 looks like a typo for 8192 — confirm
        )
        self.tools = AGENT_TOOLS
        self.gemini_agent = CodeAgent(
            name="gemini_agent",
            description="Gemini CodeAgent",
            model=self.model,
            tools=self.tools,
            add_base_tools=True,  # probably redundant, but it does not hurt
            max_steps=5,
            additional_authorized_imports=[
                'numpy', 'subprocess', 're', 'pandas',
                'json', 'os', 'datetime', 'tempfile',
            ],
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )

        print("✅ Gemini agent initialized")

    def _build_prompt(self, question: str, file_path: Optional[str] = None) -> str:
        """Combine system prompt, question and (optionally) a file-path hint.

        Factored out of __call__: both branches previously duplicated the
        system-prompt/question preamble.
        """
        prompt = f"{self.system_prompt}\n\n" f"Question: {question}\n\n"
        if file_path:
            prompt += (
                f"There is an associated file at path: {file_path}.\n"
                f"Use the appropriate tool to download it (if necessary) and read it before answering"
            )
        return prompt

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Run the agent on *question* and return its final answer string."""
        return self.gemini_agent.run(self._build_prompt(question, file_path))
194
+
195
class LocalAgent:
    """CodeAgent running a locally 4-bit-quantized Qwen2.5-7B via TransformersModel.

    Not used in the Space; intended for local experimentation. Construction
    downloads the checkpoint, quantizes it (NF4), and re-saves it locally so
    TransformersModel can load it.
    """

    def __init__(self):
        checkpoint = "Qwen/Qwen2.5-7B-Instruct"
        quantized_model_dir = "./quantized_model"

        # 4-bit NF4 quantization with bf16 compute and double quantization.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )

        # Load quantized model and tokenizer
        temp_model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            quantization_config=bnb_config,
            device_map="auto",  # use multiple GPUs if available
        )
        temp_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

        # Save the model in a local path (seems to be the only way to make it
        # work with TransformersModel).
        temp_model.save_pretrained(quantized_model_dir)
        temp_tokenizer.save_pretrained(quantized_model_dir)

        self.system_prompt = SYSTEM_PROMPT

        self.model = TransformersModel(
            # NOTE(review): recent smolagents versions take `model_id`, not
            # `model_path` — confirm against the pinned smolagents version.
            model_path=quantized_model_dir,
            temperature=0.1,
            top_p=0.95,
            device_map="auto",
            # https://github.com/huggingface/smolagents/issues/414
            max_new_tokens=8196,  # NOTE(review): 8196 looks like a typo for 8192 — confirm
        )
        self.tools = AGENT_TOOLS

        self.local_agent = CodeAgent(
            model=self.model,
            # Bug fix: original passed bare `tools`, an undefined name that
            # raised NameError at construction time.
            tools=self.tools,
            add_base_tools=True,  # probably redundant, but it does not hurt
            max_steps=5,
            additional_authorized_imports=[
                'numpy', 'subprocess', 're', 'pandas',
                'json', 'os', 'pathlib', 'tempfile',
                # 'matplotlib.pyplot', 'seaborn'
            ],
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )

        print("✅ Local (quantized) agent initialized.")

    def _build_prompt(self, question: str, file_path: Optional[str] = None) -> str:
        """Combine system prompt, question and (optionally) a file-path hint.

        Factored out of __call__: both branches previously duplicated the
        system-prompt/question preamble.
        """
        prompt = f"{self.system_prompt}\n\n" f"Question: {question}\n\n"
        if file_path:
            prompt += (
                f"There is an associated file at path: {file_path}.\n"
                f"Use the appropriate tool to download it (if necessary) and read it before answering"
            )
        return prompt

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Run the agent on *question* and return its final answer string."""
        return self.local_agent.run(self._build_prompt(question, file_path))
app.py CHANGED
@@ -1,80 +1,18 @@
1
  import os
 
2
  import gradio as gr
3
  import requests
4
  import inspect
5
  import pandas as pd
6
- from agent import create_agent
7
  from typing import Optional
8
 
 
 
9
  # (Keep Constants as is)
10
  # --- Constants ---
11
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
 
13
- # --- Basic Agent Definition ---
14
- class BasicAgent:
15
- def __init__(self):
16
- self.agent = create_agent()
17
- self.system_prompt = """
18
- You are an expert **General AI Assistant** and **Python Programmer** tasked with solving complex GAIA benchmark problems.
19
-
20
- ### 1. Reason-Act-Observe
21
- Follow a **PLAN → ACT → OBSERVE** loop:
22
- - **PLAN:** Break the task into 1–3 logical steps. Identify tools for each step.
23
- - **ACT:** Write and run one self-contained Python block per step.
24
- - **OBSERVE:** Examine outputs or errors before proceeding.
25
-
26
- ### 2. File Handling
27
- - When a tool like `download_file_from_url` returns a local file path (e.g., `/tmp/data.csv`), you **MUST** save this path to a descriptive variable (e.g., `filepath`) and **immediately use that variable** as the argument for the next file-reading tool.
28
-
29
- You must select the reading method based strictly on the file extension:
30
- | File Extension | Tool / Method to Use |
31
- | :--- | :--- |
32
- | .csv | `pd.read_csv(filepath)` |
33
- | .xlsx, .xls | `pd.read_excel(filepath)` |
34
- | .pdf | `pdf_to_text(filepath)` |
35
- | .txt, .md, .json | `text_file_to_string(filepath)` |
36
- | .png, .jpg, .jpeg | `image_to_text(filepath)` |
37
-
38
- ### 3. Data Analysis & Answer
39
- - Inspect loaded datasets first (`.head()`, `.info()`, `.describe()`) before analysis.
40
- - Write clean, idiomatic Python code. Before that, check if there is any pre-made tool that would work for the task.
41
- - Use `FinalAnswerTool` **only once the problem is fully solved** to give a concise final answer.
42
-
43
- ### 4. Additional instructions for the following tasks provided by GAIA team
44
- - You are a general AI assistant. I will ask you a question. Do not reveal your internal reasoning. Only the content inside FinalAnswerTool will be evaluated.
45
- - Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
46
-
47
- ### 5. To provide the final answer, you MUST call the final_answer tool inside a <code> block.
48
-
49
- - Example of how to end the task:
50
-
51
- Thought: I have found the answer. I will now provide it.
52
- <code>
53
- final_answer("FINAL ANSWER: The capital of France is Paris")
54
- </code>
55
-
56
- \n\n
57
- """
58
- # print("Agent initialized.")
59
-
60
- def __call__(self, question: str, file_path: Optional[str] = None) -> str:
61
-
62
- if file_path:
63
- # Inject system prompt + question and (optional) file path
64
- prompt = (
65
- f"{self.system_prompt}\n\n"
66
- f"Question: {question}\n\n"
67
- f"There is an associated file at path: {file_path}.\n"
68
- f"Use the appropriate tool to download it (if necessary) and read it before answering"
69
- )
70
- else:
71
- prompt = (
72
- f"{self.system_prompt}\n\n"
73
- f"Question: {question}\n\n"
74
- )
75
-
76
- return self.agent.run(prompt)
77
-
78
  def run_and_submit_all( profile: gr.OAuthProfile | None):
79
  """
80
  Fetches all questions, runs the BasicAgent on them, submits all answers,
@@ -84,7 +22,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
84
  space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
85
 
86
  if profile:
87
- username= f"{profile.username}"
88
  print(f"User logged in: {username}")
89
  else:
90
  print("User not logged in.")
@@ -94,15 +32,25 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
94
  questions_url = f"{api_url}/questions"
95
  submit_url = f"{api_url}/submit"
96
 
97
- # 1. Instantiate Agent ( modify this part to create your agent)
98
  try:
99
- agent = BasicAgent()
100
- except Exception as e:
101
- print(f"Error instantiating agent: {e}")
102
- return f"Error initializing agent: {e}", None
 
 
 
 
 
 
 
 
 
103
  # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
104
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
105
- print(agent_code)
 
106
 
107
  # 2. Fetch Questions
108
  print(f"Fetching questions from: {questions_url}")
@@ -139,6 +87,8 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
139
  submitted_answer = agent(question_text)
140
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
141
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
 
142
  except Exception as e:
143
  print(f"Error running agent on task {task_id}: {e}")
144
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
1
  import os
2
+ import time
3
  import gradio as gr
4
  import requests
5
  import inspect
6
  import pandas as pd
7
+ from agent import BasicAgent, GeminiAgent
8
  from typing import Optional
9
 
10
+ # (ASK ABOUT ALIGNMENT BETWEEN TIMEOUT ARGUMENTS AND TIME.SLEEP)
11
+
12
  # (Keep Constants as is)
13
  # --- Constants ---
14
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def run_and_submit_all( profile: gr.OAuthProfile | None):
17
  """
18
  Fetches all questions, runs the BasicAgent on them, submits all answers,
 
22
  space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
23
 
24
  if profile:
25
+ username = f"{profile.username}"
26
  print(f"User logged in: {username}")
27
  else:
28
  print("User not logged in.")
 
32
  questions_url = f"{api_url}/questions"
33
  submit_url = f"{api_url}/submit"
34
 
35
+ # 1. Instantiate Agent (modify this part to create your agent)
36
  try:
37
+ agent = GeminiAgent()
38
+ agent_type = "GeminiAgent"
39
+ except Exception as main_agent_error:
40
+ print(f"{agent_type} failed to initialize: {main_agent_error}.")
41
+ try:
42
+ agent = BasicAgent()
43
+ agent_type = "BasicAgent"
44
+ print(f"Falling back to {agent_type}.")
45
+ except Exception as secondary_agent_error:
46
+ print(f"{agent_type} failed to initialize: {secondary_agent_error}.")
47
+ agent_type = "None"
48
+ return f"Error initializing agent: {e}", None
49
+
50
  # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
51
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
52
+ print(f"Agent code: {agent_code}")
53
+ print(f"Active agent: {agent_code}")
54
 
55
  # 2. Fetch Questions
56
  print(f"Fetching questions from: {questions_url}")
 
87
  submitted_answer = agent(question_text)
88
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
89
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
90
+
91
+ time.sleep(60) # to not exceed free limits
92
  except Exception as e:
93
  print(f"Error running agent on task {task_id}: {e}")
94
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
requirements.txt CHANGED
@@ -19,5 +19,19 @@ Pillow==11.3.0
19
  pdfplumber==0.11.8
20
  PyMuPDF==1.26.7
21
 
 
 
 
 
 
22
  # OCR (OPTIONAL, disabled)
23
  # pytesseract==0.3.13
 
 
 
 
 
 
 
 
 
 
19
  pdfplumber==0.11.8
20
  PyMuPDF==1.26.7
21
 
22
+ # Audio transcriber
23
+ youtube-transcript-api==1.2.3
24
+ pytubefix==10.3.6
25
+ openai-whisper==20250625
26
+
27
  # OCR (OPTIONAL, disabled)
28
  # pytesseract==0.3.13
29
+
30
# Additional for LocalAgent (optional)
# NOTE(review): requirements.txt takes bare requirement specifiers —
# "!pip install" is notebook shell syntax and makes pip fail on this file.
transformers==4.1.0  # TODO confirm — 4.1.0 (2020) predates Qwen2.5 support; likely meant 4.41.0
bitsandbytes==0.49.0  # TODO confirm this version exists on PyPI
--extra-index-url https://download.pytorch.org/whl/cu124
torch==2.6.0+cu124
torchvision
torchaudio
tools/audio_tools.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import tool
2
+ import tempfile
3
+ import os
4
+
5
@tool
def youtube_to_text(url: str) -> str:
    """
    Transcribe a YouTube video.
    First tries to retrieve official captions.
    Falls back to Whisper transcription if captions are unavailable.

    Args:
        url: Full YouTube video URL (watch?v=... or youtu.be/... form)

    Returns:
        Transcribed text, or an error message string on failure.
    """
    # ---- Step 1: Try official YouTube transcripts ----
    try:
        from youtube_transcript_api import YouTubeTranscriptApi
        from urllib.parse import urlparse, parse_qs

        # Bug fix: `orlparse` was a typo (NameError) that silently sent every
        # call down the Whisper fallback path.
        parsed = urlparse(url)
        video_id = parse_qs(parsed.query).get("v", [None])[0]
        if video_id is None and parsed.hostname == "youtu.be":
            # Also accept short links of the form https://youtu.be/<id>
            video_id = parsed.path.lstrip("/") or None

        if video_id:
            try:
                # youtube-transcript-api >= 1.0 (the pinned 1.2.3) removed the
                # static methods; use the instance API.
                fetched = YouTubeTranscriptApi().fetch(video_id)
                return " ".join(snippet.text for snippet in fetched)
            except AttributeError:
                # Older releases (< 1.0) only offer the static API.
                transcript = YouTubeTranscriptApi.get_transcript(video_id)
                return " ".join(chunk["text"] for chunk in transcript)
    except Exception:
        pass  # Silent fallback to Whisper

    # ---- Step 2: Fallback to Whisper transcription ----
    try:
        import whisper
        from pytubefix import YouTube

        yt = YouTube(url)
        audio_stream = yt.streams.get_audio_only()

        temp_dir = tempfile.gettempdir()
        audio_path = audio_stream.download(output_path=temp_dir)

        model = whisper.load_model("base")
        result = model.transcribe(audio_path)

        return result["text"]

    except Exception as e:
        return f"Error transcribing YouTube video: {str(e)}"
51
+
52
@tool
def transcribe_audio(file_path: str) -> str:
    """
    Convert a local audio file to text with OpenAI's Whisper ("base" model).
    Handles common formats such as .mp3, .wav, .m4a, .flac, and .ogg.

    Args:
        file_path: The local path to the audio file to be transcribed.

    Returns:
        The transcribed text, or an explanatory error message string.
    """
    try:
        import whisper

        # Load the (small, CPU-friendly) base model and run transcription.
        speech_model = whisper.load_model("base")
        transcription = speech_model.transcribe(file_path)
        return transcription["text"]
    except ImportError:
        return (
            "Whisper is not installed. "
            "Install it with `pip install openai-whisper` and ensure ffmpeg is available."
        )
    except Exception as e:
        return f"Error transcribing audio file: {str(e)}"