Spaces:
Build error
Build error
Commit ·
967c2bb
1
Parent(s): b635feb
Add audio transcription tool using WhisperModel
Browse files
tools.py
CHANGED
|
@@ -10,6 +10,7 @@ from pathlib import Path
|
|
| 10 |
import base64
|
| 11 |
from openai import AzureOpenAI
|
| 12 |
from config import MODEL_NAME, MODEL_API_VERSION, MODEL_ENDPOINT, MODEL_KEY
|
|
|
|
| 13 |
|
| 14 |
#=========================================
|
| 15 |
# Search Tools
|
|
@@ -369,8 +370,6 @@ def analyze_image(question: str, path: str) -> str:
|
|
| 369 |
Returns:
|
| 370 |
str: The answer to the question about the image.
|
| 371 |
"""
|
| 372 |
-
# path = "data/cca530fc-4052-43b2-b130-b30968d8aa44.png"
|
| 373 |
-
|
| 374 |
client = AzureOpenAI(
|
| 375 |
api_version=MODEL_API_VERSION,
|
| 376 |
azure_endpoint=MODEL_ENDPOINT,
|
|
@@ -399,3 +398,29 @@ def analyze_image(question: str, path: str) -> str:
|
|
| 399 |
)
|
| 400 |
|
| 401 |
return response.choices[0].message.content.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
import base64
|
| 11 |
from openai import AzureOpenAI
|
| 12 |
from config import MODEL_NAME, MODEL_API_VERSION, MODEL_ENDPOINT, MODEL_KEY
|
| 13 |
+
from faster_whisper import WhisperModel
|
| 14 |
|
| 15 |
#=========================================
|
| 16 |
# Search Tools
|
|
|
|
| 370 |
Returns:
|
| 371 |
str: The answer to the question about the image.
|
| 372 |
"""
|
|
|
|
|
|
|
| 373 |
client = AzureOpenAI(
|
| 374 |
api_version=MODEL_API_VERSION,
|
| 375 |
azure_endpoint=MODEL_ENDPOINT,
|
|
|
|
| 398 |
)
|
| 399 |
|
| 400 |
return response.choices[0].message.content.strip()
|
| 401 |
+
|
| 402 |
+
#=========================================
|
| 403 |
+
# Audio Tools
|
| 404 |
+
#=========================================
|
| 405 |
+
# Shared Whisper model instance. It is created on first use and then reused,
# so repeated tool calls do not reload the model weights every time.
_WHISPER_MODEL = None


def _get_whisper_model():
    """Return the shared WhisperModel, constructing it on the first call."""
    global _WHISPER_MODEL
    if _WHISPER_MODEL is None:
        _WHISPER_MODEL = WhisperModel(
            model_size_or_path="small",
            device="cpu",
        )
    return _WHISPER_MODEL


@tool
def transcribe_audio(path: str) -> str:
    """
    Transcribe audio file and return the text.
    Args:
        path (str): The path to the audio file.
    Returns:
        str: The transcribed text.
    """
    # NOTE: the docstring above is left unchanged on purpose; the @tool
    # decorator presumably exposes it as the tool description -- confirm
    # before rewording it.
    model = _get_whisper_model()

    # Only the segments are needed; the second return value is discarded.
    segments, _ = model.transcribe(
        path,
        vad_filter=True,
        condition_on_previous_text=True,
        beam_size=5,
    )

    # Per the faster-whisper README, `segments` is a generator, so the actual
    # decoding happens while the join below consumes it.
    return "".join(seg.text for seg in segments).strip()
|