File size: 2,410 Bytes
e60f160
 
 
 
 
aa4b93b
 
 
e60f160
 
 
 
 
 
 
1ae6775
e60f160
 
 
 
1ae6775
e60f160
aa4b93b
 
 
e60f160
 
 
 
 
aa4b93b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e60f160
 
 
 
 
 
 
aa4b93b
e60f160
 
 
 
 
 
 
 
 
 
 
 
aa4b93b
e60f160
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import tempfile

import gradio as gr

from neon_tts_plugin_coqui import CoquiTTS
import whisper 
import requests 
import tempfile


# Keys of CoquiTTS.langs (presumably language codes such as "en"); offered as
# the choices of the "Language" radio in the UI below.
LANGUAGES = list(CoquiTTS.langs.keys())
# Radio option selected when the page first loads.
default_lang = "en"



# Text rendered as the centered <h1> heading of the page.
title = "Talk to (almost) anyone in the world"


# Single TTS engine instance created at import time and reused by tts().
coquiTTS = CoquiTTS()

# Whisper model created at import time and reused by whisper_stt().
# NOTE(review): the name says "med" but the checkpoint requested is "base".
model_med = whisper.load_model("base")

def tts(audio, language):
    """Translate recorded speech and synthesize the result as a WAV file.

    The recording is handed to ``whisper_stt`` and the translated text it
    returns is spoken by the module-level ``coquiTTS`` instance.

    Args:
        audio: Path of the recorded audio file (the input component is
            configured with ``type="filepath"``), or ``None``/empty when
            nothing was recorded before submitting.
        language: Language key forwarded to ``whisper_stt`` and used as the
            ``"language"`` entry of the TTS speaker settings.

    Returns:
        The path of the generated ``.wav`` file, or ``None`` when no audio
        was supplied.
    """
    # Nothing was recorded: bail out instead of failing inside whisper_stt
    # while trying to load a non-existent file.
    if not audio:
        return None

    # Only the translated text is spoken; the transcript and the detected
    # language are not needed here.
    _, text, _ = whisper_stt(audio, language)

    # delete=False keeps the file on disk after the context manager exits so
    # the returned path is still valid for the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(text, fp, speaker={"language": language})
        return fp.name

def whisper_stt(audio, language):
    """Run Whisper on a recording: detect its language, transcribe, translate.

    Args:
        audio: Path of the audio file to load.
        language: Value passed as the ``language`` option of the
            ``translate`` decoding pass.

    Returns:
        A ``(transcript, translation, detected_language)`` tuple, where the
        first two items are the decoded texts and the last is the most
        probable language reported by the model.
    """
    print("Inside Whisper TTS")

    # Load the waveform and pad/trim it to the fixed window Whisper decodes
    # (30 seconds, per the original comment).
    samples = whisper.pad_or_trim(whisper.load_audio(audio))

    # The log-Mel spectrogram has to live on the same device as the model.
    spectrogram = whisper.log_mel_spectrogram(samples).to(model_med.device)

    # Keep the language with the highest detection probability.
    _, language_probs = model_med.detect_language(spectrogram)
    lang = max(language_probs, key=language_probs.get)
    print(f"Detected language: {lang}")

    # Two decoding passes over the same spectrogram: one transcribing in the
    # detected language, one using the "translate" task.
    # NOTE(review): Whisper documents the translate task as producing
    # English; here `language` is the caller-selected language -- confirm
    # this yields the intended output.
    shared = {"fp16": False}
    transcribe_opts = whisper.DecodingOptions(language=lang, task='transcribe', **shared)
    translate_opts = whisper.DecodingOptions(language=language, task='translate', **shared)
    transcription = whisper.decode(model_med, spectrogram, transcribe_opts)
    translation = whisper.decode(model_med, spectrogram, translate_opts)

    # Log both recognized texts.
    print(f"transcript is : {transcription.text}")
    print(f"translation is : {translation.text}")

    return transcription.text, translation.text, lang
  

# Page layout: a centered heading, then one row holding the input column
# (recorder, language picker, submit button) next to the output player.
with gr.Blocks() as blocks:
    gr.Markdown(
        f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>"
    )

    with gr.Row():
        with gr.Column():
            # The recording is delivered to the handler as a file path.
            in_audio = gr.Audio(
                source="microphone",
                type="filepath",
                label='Record your voice here',
            )
            radio = gr.Radio(label="Language", choices=LANGUAGES, value=default_lang)
            with gr.Row():
                submit = gr.Button("Submit", variant="primary")

        # Read-only player for the synthesized speech.
        audio = gr.Audio(label="Output", interactive=False)

    # Wire the button: tts(recording, language) -> output audio.
    submit.click(fn=tts, inputs=[in_audio, radio], outputs=[audio])


# Start the web server for the assembled interface.
blocks.launch()