serverdaun committed on
Commit
967c2bb
·
1 Parent(s): b635feb

Add audio transcription tool using WhisperModel

Browse files
Files changed (1) hide show
  1. tools.py +27 -2
tools.py CHANGED
@@ -10,6 +10,7 @@ from pathlib import Path
10
  import base64
11
  from openai import AzureOpenAI
12
  from config import MODEL_NAME, MODEL_API_VERSION, MODEL_ENDPOINT, MODEL_KEY
 
13
 
14
  #=========================================
15
  # Search Tools
@@ -369,8 +370,6 @@ def analyze_image(question: str, path: str) -> str:
369
  Returns:
370
  str: The answer to the question about the image.
371
  """
372
- # path = "data/cca530fc-4052-43b2-b130-b30968d8aa44.png"
373
-
374
  client = AzureOpenAI(
375
  api_version=MODEL_API_VERSION,
376
  azure_endpoint=MODEL_ENDPOINT,
@@ -399,3 +398,29 @@ def analyze_image(question: str, path: str) -> str:
399
  )
400
 
401
  return response.choices[0].message.content.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  import base64
11
  from openai import AzureOpenAI
12
  from config import MODEL_NAME, MODEL_API_VERSION, MODEL_ENDPOINT, MODEL_KEY
13
+ from faster_whisper import WhisperModel
14
 
15
  #=========================================
16
  # Search Tools
 
370
  Returns:
371
  str: The answer to the question about the image.
372
  """
 
 
373
  client = AzureOpenAI(
374
  api_version=MODEL_API_VERSION,
375
  azure_endpoint=MODEL_ENDPOINT,
 
398
  )
399
 
400
  return response.choices[0].message.content.strip()
401
+
402
+ #=========================================
403
+ # Audio Tools
404
+ #=========================================
405
# Process-wide cache for the Whisper model. It is filled on first use so that
# importing this module stays cheap and repeated tool calls reuse one instance
# instead of constructing a new model on every transcription.
_WHISPER_MODEL = None


def _get_whisper_model():
    """Return the shared WhisperModel, constructing it on the first call."""
    global _WHISPER_MODEL
    if _WHISPER_MODEL is None:
        _WHISPER_MODEL = WhisperModel(
            model_size_or_path="small",
            device="cpu",
        )
    return _WHISPER_MODEL


@tool
def transcribe_audio(path: str) -> str:
    """
    Transcribe audio file and return the text.

    The underlying Whisper model ("small", running on CPU) is created once
    per process and reused by subsequent calls.

    Args:
        path (str): The path to the audio file.
    Returns:
        str: The transcribed text, with leading/trailing whitespace removed.
    """
    segments, _ = _get_whisper_model().transcribe(
        path,
        vad_filter=True,
        condition_on_previous_text=True,
        beam_size=5,
    )
    # Segment texts are concatenated without a separator, exactly as emitted
    # by the model; only the outer whitespace of the final string is trimmed.
    return "".join(seg.text for seg in segments).strip()