Thanh Vinh Vo committed on
Commit
f164cc2
·
1 Parent(s): 0f547af
Files changed (2) hide show
  1. app.py +71 -3
  2. requirements.txt +1 -0
app.py CHANGED
@@ -18,11 +18,73 @@ from smolagents import (
18
  ToolCollection,
19
  VisitWebpageTool,
20
  )
 
21
 
22
  # (Keep Constants as is)
23
  # --- Constants ---
24
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  @tool
27
  def get_file(question_id: str, file_name: str) -> str:
28
  """
@@ -82,7 +144,7 @@ class BasicAgent:
82
  def __init__(self):
83
  print("BasicAgent initialized.")
84
  self.multimodal_agent = CodeAgent(
85
- tools=[VisitWebpageTool(), DuckDuckGoSearchTool(), get_file],
86
  model= OpenAIServerModel(model_id="gpt-4o"),
87
  additional_authorized_imports=[
88
  "requests",
@@ -96,6 +158,8 @@ class BasicAgent:
96
  "bytes",
97
  "cv2",
98
  "numpy",
 
 
99
  ],
100
  name="multimodal_agent",
101
  description="""
@@ -105,7 +169,7 @@ class BasicAgent:
105
  )
106
 
107
  self.code_agent = CodeAgent(
108
- tools=[VisitWebpageTool(), DuckDuckGoSearchTool(), get_file],
109
  model=InferenceClientModel(
110
  model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
111
  ),
@@ -125,6 +189,8 @@ class BasicAgent:
125
  "cv2",
126
  "numpy",
127
  "chess.engine",
 
 
128
  ],
129
  name="code_agent",
130
  description="""
@@ -147,7 +213,7 @@ class BasicAgent:
147
  model=InferenceClientModel(
148
  "Qwen/Qwen2.5-32B-Instruct"
149
  ),
150
- tools=[get_file],
151
  managed_agents=[
152
  self.multimodal_agent,
153
  self.code_agent],
@@ -167,6 +233,8 @@ class BasicAgent:
167
  "cv2",
168
  "numpy",
169
  "chess.engine",
 
 
170
  ],
171
  planning_interval=5,
172
  max_steps=15,
 
18
  ToolCollection,
19
  VisitWebpageTool,
20
  )
21
+ import whisper
22
 
23
  # (Keep Constants as is)
24
  # --- Constants ---
25
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
26
 
27
@tool
def audio_to_text(file_path: str) -> str:
    """
    A tool that converts audio files to text using OpenAI's Whisper speech recognition model.

    This function transcribes audio content from a local audio file and returns the transcript
    as a JSON string containing timestamped segments. It uses the Whisper "base" model for
    speech-to-text conversion.

    Args:
        file_path (str): The local file path to the audio file to be transcribed.
            Supports common audio formats like MP3, WAV, M4A, FLAC, etc.

    Returns:
        str: A JSON string containing the transcript data with the following structure:
            {
                "transcript": [
                    {
                        "start": float,  # Start time in seconds
                        "end": float,    # End time in seconds
                        "text": str      # Transcribed text segment
                    },
                    ...
                ]
            }

    Raises:
        FileNotFoundError: If the specified audio file does not exist.
        Exception: If the audio file cannot be processed or transcribed.

    Example:
        >>> result = audio_to_text("path/to/audio.mp3")
        >>> import json
        >>> transcript_data = json.loads(result)
        >>> for segment in transcript_data["transcript"]:
        ...     print(f"{segment['start']:.2f}s - {segment['end']:.2f}s: {segment['text']}")

    Note:
        - Uses OpenAI Whisper "base" model for transcription
        - Processes audio without verbose output or word-level timestamps
        - Returns empty segments list if no speech is detected
        - Processing time depends on audio file length and system performance
    """
    # Imports are kept inside the function body so the tool stays self-contained.
    import json
    import os

    import whisper

    # Fail fast with the exception type the docstring promises, instead of
    # letting a bad path surface as an opaque error from inside transcribe().
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Audio file not found: {file_path}")

    # NOTE: the "base" model is loaded on every call.
    model = whisper.load_model("base")
    result = model.transcribe(file_path, verbose=False, word_timestamps=False)

    # Keep only the timing and text of each segment; strip the padding
    # whitespace around the segment text.
    transcript_data = [
        {
            "start": segment["start"],
            "end": segment["end"],
            "text": segment["text"].strip(),
        }
        for segment in result["segments"]
    ]

    return json.dumps({"transcript": transcript_data})
85
+
86
+
87
+
88
  @tool
89
  def get_file(question_id: str, file_name: str) -> str:
90
  """
 
144
  def __init__(self):
145
  print("BasicAgent initialized.")
146
  self.multimodal_agent = CodeAgent(
147
+ tools=[VisitWebpageTool(), DuckDuckGoSearchTool(), get_file, audio_to_text],
148
  model= OpenAIServerModel(model_id="gpt-4o"),
149
  additional_authorized_imports=[
150
  "requests",
 
158
  "bytes",
159
  "cv2",
160
  "numpy",
161
+ "json",
162
+ "whisper",
163
  ],
164
  name="multimodal_agent",
165
  description="""
 
169
  )
170
 
171
  self.code_agent = CodeAgent(
172
+ tools=[VisitWebpageTool(), DuckDuckGoSearchTool(), get_file, audio_to_text],
173
  model=InferenceClientModel(
174
  model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
175
  ),
 
189
  "cv2",
190
  "numpy",
191
  "chess.engine",
192
+ "json",
193
+ "whisper",
194
  ],
195
  name="code_agent",
196
  description="""
 
213
  model=InferenceClientModel(
214
  "Qwen/Qwen2.5-32B-Instruct"
215
  ),
216
+ tools=[get_file, audio_to_text],
217
  managed_agents=[
218
  self.multimodal_agent,
219
  self.code_agent],
 
233
  "cv2",
234
  "numpy",
235
  "chess.engine",
236
+ "json",
237
+ "whisper",
238
  ],
239
  planning_interval=5,
240
  max_steps=15,
requirements.txt CHANGED
@@ -13,3 +13,4 @@ pillow
13
  opencv-python
14
  numpy
15
  html5lib
 
 
13
  opencv-python
14
  numpy
15
  html5lib
16
+ openai-whisper