import tempfile
import gradio as gr
from neon_tts_plugin_coqui import CoquiTTS
import whisper
import requests
import tempfile
# Language codes supported by the Coqui TTS plugin (keys of CoquiTTS.langs).
LANGUAGES = list(CoquiTTS.langs.keys())
default_lang = "en"
title = "Talk to (almost) anyone in the world"
# Module-level singletons: building the TTS engine and loading the Whisper
# "base" checkpoint are expensive, so both happen once at import time.
coquiTTS = CoquiTTS()
model_med = whisper.load_model("base")
def tts(audio, language):
    """Speech-to-speech pipeline: Whisper STT/translation, then Coqui TTS.

    Parameters
    ----------
    audio : str
        Filepath of the recorded input audio (gradio Audio, type="filepath").
    language : str
        Language code selected in the UI; passed to Whisper's translation
        decoding options and to the Coqui speaker selection.

    Returns
    -------
    str
        Path of a temporary WAV file with the synthesized speech.
    """
    # whisper_stt returns (transcript, translation, detected_lang); only the
    # translation is voiced. transcript/detected_lang are currently unused here.
    _transcript, translation, _detected_lang = whisper_stt(audio, language)
    # delete=False: the file must outlive this function so gradio can
    # stream it back to the client by path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(translation, fp, speaker={"language": language})
    return fp.name
def whisper_stt(audio, language):
    """Run Whisper on an audio file: detect language, transcribe, translate.

    Parameters
    ----------
    audio : str
        Filepath of the input audio.
    language : str
        Language code used for the 'translate' decoding pass.
        NOTE(review): Whisper's translate task targets English; this value is
        presumably treated as the source language — confirm intent.

    Returns
    -------
    tuple[str, str, str]
        (transcript text, translation text, detected language code)
    """
    print("Inside Whisper STT")  # fixed label: this is speech-to-text, not TTS
    # Load audio and pad/trim it to Whisper's fixed 30-second input window.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    # Log-Mel spectrogram, moved to the same device as the model.
    mel = whisper.log_mel_spectrogram(audio).to(model_med.device)
    # Detect the spoken language; probs maps language code -> probability.
    _, probs = model_med.detect_language(mel)
    lang = max(probs, key=probs.get)  # hoisted: computed once, reused below
    print(f"Detected language: {lang}")
    # fp16=False keeps decoding in fp32 so it also works on CPU.
    options_transc = whisper.DecodingOptions(fp16=False, language=lang, task='transcribe')
    options_transl = whisper.DecodingOptions(fp16=False, language=language, task='translate')
    result_transc = whisper.decode(model_med, mel, options_transc)
    result_transl = whisper.decode(model_med, mel, options_transl)
    print(f"transcript is : {result_transc.text}")
    print(f"translation is : {result_transl.text}")
    return result_transc.text, result_transl.text, lang
# UI wiring: record audio + pick a target language, submit through tts(),
# play the synthesized result. Indentation reconstructed to match the
# layout implied by the inline gradio comments.
with gr.Blocks() as blocks:
    gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>"
                + title
                + "</h1>")
    with gr.Row():  # equal_height=False
        with gr.Column():  # variant="panel"
            in_audio = gr.Audio(source="microphone", type="filepath",
                                label='Record your voice here')
            radio = gr.Radio(
                label="Language",
                choices=LANGUAGES,
                value=default_lang,
            )
            with gr.Row():  # mobile_collapse=False
                submit = gr.Button("Submit", variant="primary")
        audio = gr.Audio(label="Output", interactive=False)
    # actions: clicking Submit feeds (recording, language) into tts()
    # and routes the returned wav path to the output Audio widget.
    submit.click(
        tts,
        [in_audio, radio],
        [audio],
    )

blocks.launch()
|