Spaces:
Sleeping
Sleeping
Fix tools
Browse files- audio_tools.py +40 -0
- tools.py +4 -4
- vlm_tools.py +5 -5
audio_tools.py
CHANGED
|
@@ -1,9 +1,49 @@
|
|
| 1 |
import base64
|
|
|
|
| 2 |
from langchain_core.tools import tool as langchain_tool
|
| 3 |
from smolagents.tools import Tool, tool
|
| 4 |
from pydub import AudioSegment
|
| 5 |
from pyAudioAnalysis import audioSegmentation as aS
|
| 6 |
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
@tool
|
| 9 |
def audio_to_base64(file_path: str) -> str:
|
|
|
|
| 1 |
import base64
|
| 2 |
+
import os
|
| 3 |
from langchain_core.tools import tool as langchain_tool
|
| 4 |
from smolagents.tools import Tool, tool
|
| 5 |
from pydub import AudioSegment
|
| 6 |
from pyAudioAnalysis import audioSegmentation as aS
|
| 7 |
from io import BytesIO
|
| 8 |
+
from huggingface_hub import InferenceClient
|
| 9 |
+
|
| 10 |
+
class TranscribeAudioTool(Tool):
    """smolagents Tool that transcribes base64-encoded audio using the
    Hugging Face Inference API (openai/whisper-large-v3).

    Input:  a base64 string containing an audio file in any container
            pydub can read (wav, mp3, ogg, ...).
    Output: the transcribed text.
    """

    name = "transcribe_audio"
    description = "Transcribe an audio file"
    # smolagents expects a flat {arg_name: {type, description}} mapping,
    # NOT a JSON-schema object wrapper ({"type": "object", "properties": ...}).
    inputs = {
        "audio": {
            "type": "string",
            "description": "The audio file in base64 format",
        }
    }
    output_type = "string"

    def setup(self):
        # Created once per tool instance; requires HUGGINGFACE_API_KEY
        # to be set in the environment.
        self.model = InferenceClient(
            model="openai/whisper-large-v3",
            token=os.getenv("HUGGINGFACE_API_KEY"),
        )

    def forward(self, audio: str) -> str:
        audio_data = base64.b64decode(audio)
        # Normalize whatever container the caller sent into WAV bytes:
        # the inference endpoint accepts raw bytes / a file-like object,
        # not a pydub AudioSegment instance.
        audio_segment = AudioSegment.from_file(BytesIO(audio_data))
        wav_buffer = BytesIO()
        audio_segment.export(wav_buffer, format="wav")
        result = self.model.automatic_speech_recognition(wav_buffer.getvalue())
        # Recent huggingface_hub versions return an
        # AutomaticSpeechRecognitionOutput dataclass (attribute access);
        # older versions returned a plain dict — support both.
        return result["text"] if isinstance(result, dict) else result.text

transcribe_audio_tool = TranscribeAudioTool()
|
| 31 |
+
|
| 32 |
+
@tool
def transcribe_audio(audio: str) -> str:
    """
    Transcribe an audio file.

    Args:
        audio: The audio file in base64 format (any container pydub
            can read: wav, mp3, ogg, ...).

    Returns:
        The transcribed text.
    """
    # NOTE(review): a fresh InferenceClient is built on every call to keep
    # this function self-contained; requires HUGGINGFACE_API_KEY in the env.
    model = InferenceClient(
        model="openai/whisper-large-v3",
        token=os.getenv("HUGGINGFACE_API_KEY"),
    )
    audio_data = base64.b64decode(audio)
    # Re-encode to WAV bytes: the inference endpoint accepts raw bytes /
    # a file-like object, not a pydub AudioSegment instance.
    audio_segment = AudioSegment.from_file(BytesIO(audio_data))
    wav_buffer = BytesIO()
    audio_segment.export(wav_buffer, format="wav")
    result = model.automatic_speech_recognition(wav_buffer.getvalue())
    # Recent huggingface_hub versions return a dataclass with .text;
    # older versions returned a dict — support both.
    return result["text"] if isinstance(result, dict) else result.text
|
| 46 |
+
|
| 47 |
|
| 48 |
@tool
|
| 49 |
def audio_to_base64(file_path: str) -> str:
|
tools.py
CHANGED
|
@@ -2,7 +2,8 @@ from langchain_core.tools import tool as langchain_tool
|
|
| 2 |
from smolagents.tools import Tool, tool
|
| 3 |
from datetime import datetime
|
| 4 |
from typing import Literal, List, Union
|
| 5 |
-
from smolagents import
|
|
|
|
| 6 |
import pandas as pd
|
| 7 |
|
| 8 |
@tool
|
|
@@ -58,10 +59,9 @@ def sort_list(my_list: List[int], order: Literal["asc", "desc", "alphabetize", "
|
|
| 58 |
return sorted(my_list, reverse=how[order] == "desc")
|
| 59 |
|
| 60 |
#smolagents tools
|
| 61 |
-
web_search_tool = WebSearchTool()
|
| 62 |
-
duckduckgo_search_tool = DuckDuckGoSearchTool()
|
| 63 |
visit_webpage_tool = VisitWebpageTool()
|
| 64 |
-
|
|
|
|
| 65 |
|
| 66 |
@tool
|
| 67 |
def operate_two_numbers(num1: float, num2: float, operation: Literal["add", "subtract", "multiply", "divide", "power", "modulo"], decimal_places: int = 2)->float:
|
|
|
|
| 2 |
from smolagents.tools import Tool, tool
|
| 3 |
from datetime import datetime
|
| 4 |
from typing import Literal, List, Union
|
| 5 |
+
from smolagents import VisitWebpageTool
|
| 6 |
+
from langchain_community.tools.tavily_search import TavilySearchResults
|
| 7 |
import pandas as pd
|
| 8 |
|
| 9 |
@tool
|
|
|
|
| 59 |
return sorted(my_list, reverse=how[order] == "desc")
|
| 60 |
|
| 61 |
#smolagents tools
|
|
|
|
|
|
|
| 62 |
visit_webpage_tool = VisitWebpageTool()
|
| 63 |
+
tavily_search_tool = TavilySearchResults(k=3)
|
| 64 |
+
|
| 65 |
|
| 66 |
@tool
|
| 67 |
def operate_two_numbers(num1: float, num2: float, operation: Literal["add", "subtract", "multiply", "divide", "power", "modulo"], decimal_places: int = 2)->float:
|
vlm_tools.py
CHANGED
|
@@ -129,13 +129,13 @@ onnx_path = "vlm_assets/yolov3-8.onnx"
|
|
| 129 |
names_path = "vlm_assets/obj.names"
|
| 130 |
|
| 131 |
class ObjectDetectionTool(Tool):
|
|
|
|
| 132 |
description = """
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
"""
|
| 138 |
-
name = "object_detection"
|
| 139 |
inputs = {
|
| 140 |
"frames": {"type": "any", "description": "The list of frames (images) to detect objects in. Must be a List[str] or a List[np.ndarray]"}
|
| 141 |
}
|
|
|
|
| 129 |
names_path = "vlm_assets/obj.names"
|
| 130 |
|
| 131 |
class ObjectDetectionTool(Tool):
|
| 132 |
+
name = "object_detection"
|
| 133 |
description = """
|
| 134 |
+
Detect objects in a list of frames (images).
|
| 135 |
+
It takes a list of frames (images) as input and returns
|
| 136 |
+
a list of detected objects with labels, confidence, and bounding boxes.
|
| 137 |
+
The output type will be List[List[str]]
|
| 138 |
"""
|
|
|
|
| 139 |
inputs = {
|
| 140 |
"frames": {"type": "any", "description": "The list of frames (images) to detect objects in. Must be a List[str] or a List[np.ndarray]"}
|
| 141 |
}
|