# talkaway / app.py
"""Talkaway: record speech, transcribe/translate it with Whisper, speak it back with Coqui TTS."""
import tempfile

import gradio as gr
import requests
import whisper
from neon_tts_plugin_coqui import CoquiTTS

# Language codes supported by the Coqui TTS plugin (populates the UI radio).
LANGUAGES = list(CoquiTTS.langs.keys())
default_lang = "en"
title = "Talk to (almost) anyone in the world"

# Instantiate the TTS engine and load the Whisper ASR model once at startup,
# so per-request handlers only run inference.
coquiTTS = CoquiTTS()
model_med = whisper.load_model("base")
def tts(audio, language):
    """Run speech-to-text on the recording, then synthesize the translated text.

    Args:
        audio: Path to the recorded input audio file (Gradio ``filepath``).
        language: Language code forwarded to Whisper decoding and to the
            Coqui speaker selection.

    Returns:
        Path to a temporary WAV file containing the synthesized speech.
    """
    # whisper_stt returns (transcript, translation, detected_lang); only the
    # translation is spoken back, so the other two are ignored.
    _transcript, translation, _detected_lang = whisper_stt(audio, language)
    # delete=False: Gradio must be able to read the file after the handle closes.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(translation, fp, speaker={"language": language})
    return fp.name
def whisper_stt(audio, language):
    """Transcribe an audio file and translate it with Whisper.

    Args:
        audio: Path to the input audio file.
        language: Language code passed to the 'translate' decoding options.

    Returns:
        Tuple of ``(transcript_text, translated_text, detected_lang)``.
    """
    print("Inside Whisper STT")
    # Load audio and pad/trim it to fit Whisper's 30-second input window.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    # Make the log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(audio).to(model_med.device)
    # Detect the spoken language (highest-probability candidate).
    _, probs = model_med.detect_language(mel)
    lang = max(probs, key=probs.get)
    print(f"Detected language: {lang}")
    # Decode twice: transcribe in the detected language, and translate.
    # fp16=False keeps decoding CPU-compatible.
    options_transc = whisper.DecodingOptions(fp16=False, language=lang, task='transcribe')
    options_transl = whisper.DecodingOptions(fp16=False, language=language, task='translate')
    result_transc = whisper.decode(model_med, mel, options_transc)
    result_transl = whisper.decode(model_med, mel, options_transl)
    print(f"transcript is : {result_transc.text}")
    print(f"translation is : {result_transl.text}")
    return result_transc.text, result_transl.text, lang
# Build the web UI: a microphone input and language picker on the left,
# the synthesized audio output beside them, and a submit button wiring
# everything through tts().
with gr.Blocks() as blocks:
    gr.Markdown(f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>")
    with gr.Row():
        with gr.Column():
            mic_input = gr.Audio(
                source="microphone",
                type="filepath",
                label='Record your voice here',
            )
            lang_choice = gr.Radio(
                label="Language",
                choices=LANGUAGES,
                value=default_lang,
            )
            with gr.Row():
                submit_btn = gr.Button("Submit", variant="primary")
        tts_output = gr.Audio(label="Output", interactive=False)
    # On click: tts(audio_path, language) -> path of synthesized WAV.
    submit_btn.click(tts, [mic_input, lang_choice], [tts_output])
blocks.launch()