Spaces:
Sleeping
Sleeping
Fix tools
Browse files- audio_tools.py +40 -0
- tools.py +4 -4
- vlm_tools.py +5 -5
audio_tools.py
CHANGED
|
@@ -1,9 +1,49 @@
|
|
| 1 |
import base64
|
|
|
|
| 2 |
from langchain_core.tools import tool as langchain_tool
|
| 3 |
from smolagents.tools import Tool, tool
|
| 4 |
from pydub import AudioSegment
|
| 5 |
from pyAudioAnalysis import audioSegmentation as aS
|
| 6 |
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
@tool
|
| 9 |
def audio_to_base64(file_path: str) -> str:
|
|
|
|
| 1 |
import base64
|
| 2 |
+
import os
|
| 3 |
from langchain_core.tools import tool as langchain_tool
|
| 4 |
from smolagents.tools import Tool, tool
|
| 5 |
from pydub import AudioSegment
|
| 6 |
from pyAudioAnalysis import audioSegmentation as aS
|
| 7 |
from io import BytesIO
|
| 8 |
+
from huggingface_hub import InferenceClient
|
| 9 |
+
|
| 10 |
+
class TranscribeAudioTool(Tool):
    """smolagents Tool that transcribes base64-encoded audio using the
    Hugging Face Inference API (openai/whisper-large-v3).

    Input:  a base64 string containing an audio file in any container
            pydub can read (wav, mp3, ogg, ...).
    Output: the transcribed text.
    """

    name = "transcribe_audio"
    description = "Transcribe an audio file"
    # smolagents expects a flat {arg_name: {type, description}} mapping,
    # NOT a JSON-schema object wrapper ({"type": "object", "properties": ...}).
    inputs = {
        "audio": {
            "type": "string",
            "description": "The audio file in base64 format",
        }
    }
    output_type = "string"

    def setup(self):
        # Created once per tool instance; requires HUGGINGFACE_API_KEY
        # to be set in the environment.
        self.model = InferenceClient(
            model="openai/whisper-large-v3",
            token=os.getenv("HUGGINGFACE_API_KEY"),
        )

    def forward(self, audio: str) -> str:
        audio_data = base64.b64decode(audio)
        # Normalize whatever container the caller sent into WAV bytes:
        # the inference endpoint accepts raw bytes / a file-like object,
        # not a pydub AudioSegment instance.
        audio_segment = AudioSegment.from_file(BytesIO(audio_data))
        wav_buffer = BytesIO()
        audio_segment.export(wav_buffer, format="wav")
        result = self.model.automatic_speech_recognition(wav_buffer.getvalue())
        # Recent huggingface_hub versions return an
        # AutomaticSpeechRecognitionOutput dataclass (attribute access);
        # older versions returned a plain dict — support both.
        return result["text"] if isinstance(result, dict) else result.text

transcribe_audio_tool = TranscribeAudioTool()
|
| 31 |
+
|
| 32 |
+
@tool
def transcribe_audio(audio: str) -> str:
    """
    Transcribe an audio file.

    Args:
        audio: The audio file in base64 format (any container pydub
            can read: wav, mp3, ogg, ...).

    Returns:
        The transcribed text.
    """
    # NOTE(review): a fresh InferenceClient is built on every call to keep
    # this function self-contained; requires HUGGINGFACE_API_KEY in the env.
    model = InferenceClient(
        model="openai/whisper-large-v3",
        token=os.getenv("HUGGINGFACE_API_KEY"),
    )
    audio_data = base64.b64decode(audio)
    # Re-encode to WAV bytes: the inference endpoint accepts raw bytes /
    # a file-like object, not a pydub AudioSegment instance.
    audio_segment = AudioSegment.from_file(BytesIO(audio_data))
    wav_buffer = BytesIO()
    audio_segment.export(wav_buffer, format="wav")
    result = model.automatic_speech_recognition(wav_buffer.getvalue())
    # Recent huggingface_hub versions return a dataclass with .text;
    # older versions returned a dict — support both.
    return result["text"] if isinstance(result, dict) else result.text
|
| 46 |
+
|
| 47 |
|
| 48 |
@tool
|
| 49 |
def audio_to_base64(file_path: str) -> str:
|
tools.py
CHANGED
|
@@ -2,7 +2,8 @@ from langchain_core.tools import tool as langchain_tool
|
|
| 2 |
from smolagents.tools import Tool, tool
|
| 3 |
from datetime import datetime
|
| 4 |
from typing import Literal, List, Union
|
| 5 |
-
from smolagents import
|
|
|
|
| 6 |
import pandas as pd
|
| 7 |
|
| 8 |
@tool
|
|
@@ -58,10 +59,9 @@ def sort_list(my_list: List[int], order: Literal["asc", "desc", "alphabetize", "
|
|
| 58 |
return sorted(my_list, reverse=how[order] == "desc")
|
| 59 |
|
| 60 |
#smolagents tools
|
| 61 |
-
web_search_tool = WebSearchTool()
|
| 62 |
-
duckduckgo_search_tool = DuckDuckGoSearchTool()
|
| 63 |
visit_webpage_tool = VisitWebpageTool()
|
| 64 |
-
|
|
|
|
| 65 |
|
| 66 |
@tool
|
| 67 |
def operate_two_numbers(num1: float, num2: float, operation: Literal["add", "subtract", "multiply", "divide", "power", "modulo"], decimal_places: int = 2)->float:
|
|
|
|
| 2 |
from smolagents.tools import Tool, tool
|
| 3 |
from datetime import datetime
|
| 4 |
from typing import Literal, List, Union
|
| 5 |
+
from smolagents import VisitWebpageTool
|
| 6 |
+
from langchain_community.tools.tavily_search import TavilySearchResults
|
| 7 |
import pandas as pd
|
| 8 |
|
| 9 |
@tool
|
|
|
|
| 59 |
return sorted(my_list, reverse=how[order] == "desc")
|
| 60 |
|
| 61 |
#smolagents tools
|
|
|
|
|
|
|
| 62 |
visit_webpage_tool = VisitWebpageTool()
|
| 63 |
+
tavily_search_tool = TavilySearchResults(k=3)
|
| 64 |
+
|
| 65 |
|
| 66 |
@tool
|
| 67 |
def operate_two_numbers(num1: float, num2: float, operation: Literal["add", "subtract", "multiply", "divide", "power", "modulo"], decimal_places: int = 2)->float:
|
vlm_tools.py
CHANGED
|
@@ -129,13 +129,13 @@ onnx_path = "vlm_assets/yolov3-8.onnx"
|
|
| 129 |
names_path = "vlm_assets/obj.names"
|
| 130 |
|
| 131 |
class ObjectDetectionTool(Tool):
|
|
|
|
| 132 |
description = """
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
"""
|
| 138 |
-
name = "object_detection"
|
| 139 |
inputs = {
|
| 140 |
"frames": {"type": "any", "description": "The list of frames (images) to detect objects in. Must be a List[str] or a List[np.ndarray]"}
|
| 141 |
}
|
|
|
|
| 129 |
names_path = "vlm_assets/obj.names"
|
| 130 |
|
| 131 |
class ObjectDetectionTool(Tool):
|
| 132 |
+
name = "object_detection"
|
| 133 |
description = """
|
| 134 |
+
Detect objects in a list of frames (images).
|
| 135 |
+
It takes a list of frames (images) as input and returns
|
| 136 |
+
a list of detected objects with labels, confidence, and bounding boxes.
|
| 137 |
+
The output type will be List[List[str]]
|
| 138 |
"""
|
|
|
|
| 139 |
inputs = {
|
| 140 |
"frames": {"type": "any", "description": "The list of frames (images) to detect objects in. Must be a List[str] or a List[np.ndarray]"}
|
| 141 |
}
|