File size: 1,417 Bytes
5dfdf10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import whisper
from langchain_core.tools import tool
import os

@tool
def transcriber(audio_path: str, use_gpu: bool = False) -> str:
    """
    Transcribes an audio file with OpenAI Whisper (``tiny`` model).

    Parameters
    ----------
    audio_path : str or Path
        Path to an existing audio file (e.g. .wav, .mp3). Must be readable by ffmpeg.
        A leading ``~`` is expanded to the user's home directory.
    use_gpu: bool
        Pass True if you are in a colab GPU environment or you have an integrated Nvidia GPU

    Returns:
        str: Text of the transcript
    """

    model_size = "tiny"
    # Load directly onto the target device instead of load_model(...).cuda(),
    # which would first materialize the weights on CPU and then copy them over.
    ai_model = whisper.load_model(model_size, device="cuda" if use_gpu else "cpu")

    # ffmpeg does not expand "~", so do it here (also matches the docstring's
    # promise that path-like inputs work).
    audio_path = os.path.expanduser(os.fspath(audio_path))

    raw_transcript = ai_model.transcribe(
        audio_path,
        word_timestamps=False,
        no_speech_threshold=0.5,
        condition_on_previous_text=True,
        compression_ratio_threshold=2.0,
        # fp16 is only supported on GPU; passing it explicitly silences the
        # "FP16 is not supported on CPU" warning when running on CPU.
        fp16=use_gpu,
    )

    transcript = raw_transcript["text"]

    return transcript

if __name__ == "__main__":
    # Manual smoke test: transcribe a local file and print the result.
    # Expand "~" so home-relative paths also work when pasted here.
    audio_path = os.path.expanduser("data/temp/yt_audio.mp3")
    print("=" * 30, "\nTranscription\n", "=" * 30, "\n", transcriber(audio_path))


# TODO: include unit testing modules