import tempfile import gradio as gr from neon_tts_plugin_coqui import CoquiTTS import whisper import requests import tempfile LANGUAGES = list(CoquiTTS.langs.keys()) default_lang = "en" title = "Talk to (almost) anyone in the world" coquiTTS = CoquiTTS() model_med = whisper.load_model("base") def tts(audio, language): #print(text, language) transcribe, text, lang = whisper_stt(audio,language) # return output with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: coquiTTS.get_tts(text, fp, speaker = {"language" : language}) return fp.name def whisper_stt(audio,language): print("Inside Whisper TTS") # load audio and pad/trim it to fit 30 seconds audio = whisper.load_audio(audio) audio = whisper.pad_or_trim(audio) # make log-Mel spectrogram and move to the same device as the model mel = whisper.log_mel_spectrogram(audio).to(model_med.device) # detect the spoken language _, probs = model_med.detect_language(mel) lang = max(probs, key=probs.get) print(f"Detected language: {max(probs, key=probs.get)}") # decode the audio options_transc = whisper.DecodingOptions(fp16 = False, language=lang, task='transcribe') #lang options_transl = whisper.DecodingOptions(fp16 = False, language=language, task='translate') #lang result_transc = whisper.decode(model_med, mel, options_transc) result_transl = whisper.decode(model_med, mel, options_transl) # print the recognized text print(f"transcript is : {result_transc.text}") print(f"translation is : {result_transl.text}") return result_transc.text, result_transl.text, lang with gr.Blocks() as blocks: gr.Markdown("