Final_Project_Agent_Course

Sleeping

App Files Files Community

Thanh Vinh Vo committed on Jul 9, 2025

Commit

f164cc2

1 Parent(s): 0f547af

update

Browse files

Files changed (2) hide show

app.py +71 -3
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -18,11 +18,73 @@ from smolagents import (
     ToolCollection,
     VisitWebpageTool,
 )
 # (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 @tool
 def get_file(question_id: str, file_name: str) -> str:
     """
@@ -82,7 +144,7 @@ class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
         self.multimodal_agent = CodeAgent(
-            tools=[VisitWebpageTool(), DuckDuckGoSearchTool(), get_file],
             model= OpenAIServerModel(model_id="gpt-4o"),
             additional_authorized_imports=[
                 "requests",
@@ -96,6 +158,8 @@ class BasicAgent:
                 "bytes",
                 "cv2",
                 "numpy",
             ],
             name="multimodal_agent",
             description="""
@@ -105,7 +169,7 @@ class BasicAgent:
         )
         self.code_agent = CodeAgent(
-            tools=[VisitWebpageTool(), DuckDuckGoSearchTool(), get_file],
             model=InferenceClientModel(
                 model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
             ),
@@ -125,6 +189,8 @@ class BasicAgent:
                 "cv2",
                 "numpy",
                 "chess.engine",
             ],
             name="code_agent",
             description="""
@@ -147,7 +213,7 @@ class BasicAgent:
             model=InferenceClientModel(
                 "Qwen/Qwen2.5-32B-Instruct"
             ),
-            tools=[get_file],
             managed_agents=[
                 self.multimodal_agent,
                 self.code_agent],
@@ -167,6 +233,8 @@ class BasicAgent:
                 "cv2",
                 "numpy",
                 "chess.engine",
             ],
             planning_interval=5,
             max_steps=15,

     ToolCollection,
     VisitWebpageTool,
 )
+import whisper
 # (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+@tool
+def audio_to_text(file_path: str) -> str:
+    """
+    A tool that converts audio files to text using OpenAI's Whisper speech recognition model.
+    This function transcribes audio content from a local audio file and returns the transcript
+    as a JSON string containing timestamped segments. It uses the Whisper "base" model for
+    speech-to-text conversion.
+    Args:
+        file_path (str): The local file path to the audio file to be transcribed.
+                        Supports common audio formats like MP3, WAV, M4A, FLAC, etc.
+    Returns:
+        str: A JSON string containing the transcript data with the following structure:
+             {
+                 "transcript": [
+                     {
+                         "start": float,  # Start time in seconds
+                         "end": float,    # End time in seconds
+                         "text": str      # Transcribed text segment
+                     },
+                     ...
+                 ]
+             }
+             Non-ASCII characters in the transcript are emitted as-is (not \\u-escaped).
+    Raises:
+        FileNotFoundError: If the specified audio file does not exist.
+        Exception: If the audio file cannot be processed or transcribed.
+    Example:
+        >>> result = audio_to_text("path/to/audio.mp3")
+        >>> import json
+        >>> transcript_data = json.loads(result)
+        >>> for segment in transcript_data["transcript"]:
+        ...     print(f"{segment['start']:.2f}s - {segment['end']:.2f}s: {segment['text']}")
+    Note:
+        - Uses OpenAI Whisper "base" model for transcription
+        - Processes audio without verbose output or word-level timestamps
+        - Returns empty segments list if no speech is detected
+        - Processing time depends on audio file length and system performance
+    """
+    import json
+    import os
+    import whisper
+    # Fail fast with the documented exception type. Without this check a missing
+    # file only surfaces later, from Whisper's audio loading, as a less specific error.
+    if not os.path.isfile(file_path):
+        raise FileNotFoundError(f"Audio file not found: {file_path}")
+    model = whisper.load_model("base")
+    result = model.transcribe(file_path, verbose=False, word_timestamps=False)
+    # Keep only the timing and text of each segment; the other per-segment
+    # fields Whisper returns are not part of this tool's documented output.
+    transcript_data = [
+        {
+            "start": segment["start"],
+            "end": segment["end"],
+            "text": segment["text"].strip(),
+        }
+        for segment in result["segments"]
+    ]
+    # ensure_ascii=False keeps non-English transcripts readable for the agent
+    # instead of turning every non-ASCII character into a \uXXXX escape.
+    return json.dumps({"transcript": transcript_data}, ensure_ascii=False)
 @tool
 def get_file(question_id: str, file_name: str) -> str:
     """
     def __init__(self):
         print("BasicAgent initialized.")
         self.multimodal_agent = CodeAgent(
+            tools=[VisitWebpageTool(), DuckDuckGoSearchTool(), get_file, audio_to_text],
             model= OpenAIServerModel(model_id="gpt-4o"),
             additional_authorized_imports=[
                 "requests",
                 "bytes",
                 "cv2",
                 "numpy",
+                "json",
+                "whisper",
             ],
             name="multimodal_agent",
             description="""
         )
         self.code_agent = CodeAgent(
+            tools=[VisitWebpageTool(), DuckDuckGoSearchTool(), get_file, audio_to_text],
             model=InferenceClientModel(
                 model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
             ),
                 "cv2",
                 "numpy",
                 "chess.engine",
+                "json",
+                "whisper",
             ],
             name="code_agent",
             description="""
             model=InferenceClientModel(
                 "Qwen/Qwen2.5-32B-Instruct"
             ),
+            tools=[get_file, audio_to_text],
             managed_agents=[
                 self.multimodal_agent,
                 self.code_agent],
                 "cv2",
                 "numpy",
                 "chess.engine",
+                "json",
+                "whisper",
             ],
             planning_interval=5,
             max_steps=15,

requirements.txt CHANGED Viewed

@@ -13,3 +13,4 @@ pillow
 opencv-python
 numpy
 html5lib

 opencv-python
 numpy
 html5lib
+openai-whisper