Update app.py
app.py CHANGED
@@ -21,7 +21,7 @@ from transformers import pipeline
 # Set an environment variable
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
-
+# Variables
 SAMPLE_RATE = 16000 # Hz
 MAX_AUDIO_SECONDS = 40 # wont try to transcribe if longer than this
 DESCRIPTION = '''
@@ -35,16 +35,10 @@ DESCRIPTION = '''
 '''
 PLACEHOLDER = """
 <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-   <img src="./
+   <img src="https://i.ibb.co/S35q17Q/My-Alexa-Logo.png" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55; ">
    <p style="font-size: 28px; margin-bottom: 2px; opacity: 0.65;">What's on your mind?</p>
 </div>
 """
-# PLACEHOLDER = """
-# <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-#     <img src="https://i.ibb.co/S35q17Q/My-Alexa-Logo.png" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55; ">
-#     <p style="font-size: 28px; margin-bottom: 2px; opacity: 0.65;">What's on your mind?</p>
-# </div>
-# """
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -73,6 +67,8 @@ terminators = [
 ### TTS model
 pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)
 
+
+
 def convert_audio(audio_filepath, tmpdir, utt_id):
     """
     Convert all files to monochannel 16 kHz wav files.
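
The body of convert_audio sits outside this hunk, so only the signature and the docstring's contract are visible. A minimal sketch of what such a helper typically does, assuming librosa and soundfile are available — the name and signature come from the diff, everything else is illustrative:

    import os
    import librosa
    import soundfile as sf

    SAMPLE_RATE = 16000  # Hz, mirrors the constant defined earlier in app.py

    def convert_audio(audio_filepath, tmpdir, utt_id):
        """Illustrative stand-in: resample any input to a mono 16 kHz wav."""
        # librosa.load resamples and downmixes to mono in one call
        data, sr = librosa.load(audio_filepath, sr=SAMPLE_RATE, mono=True)
        out_path = os.path.join(tmpdir, f"{utt_id}.wav")
        sf.write(out_path, data, sr)
        return out_path
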
@@ -197,10 +193,15 @@ def voice_player(history):
     Plays the generated response using the VITS-ljs model.
     Returns the audio player with the generated response.
     """
-    _, text = history
+    _, text = history[-1]
     voice = pipe(text)
-    voice = gr.Audio(value = (
-
+    voice = gr.Audio(value = (
+        voice["sampling_rate"],
+        voice["audio"].squeeze()),
+        type="numpy", autoplay=True,
+        label="MyAlexa Response",
+        show_label=True,
+        visible=True)
     return voice
 
 
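
For context on the new voice_player body: a transformers text-to-speech pipeline returns a dict holding the waveform under "audio" and its "sampling_rate", and gr.Audio accepts that pair directly as a numpy-type value. A self-contained sketch of the same conversion (the component kwargs mirror the diff; the prompt text is illustrative):

    import gradio as gr
    from transformers import pipeline

    pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

    out = pipe("What's on your mind?")  # {"audio": ndarray, "sampling_rate": int}
    player = gr.Audio(
        value=(out["sampling_rate"], out["audio"].squeeze()),  # (sr, 1-D samples)
        type="numpy",
        autoplay=True,
    )

The squeeze() drops the leading batch dimension the pipeline puts on the waveform, since Gradio expects a 1-D array in the (sample_rate, data) tuple.
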
@@ -226,7 +227,10 @@ with gr.Blocks(
             "<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
         )
 
-        audio_file = gr.Audio(
+        audio_file = gr.Audio(
+            sources=["microphone", "upload"],
+            type="filepath"
+        )
 
 
     with gr.Column():
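
With type="filepath", whatever the user records or uploads reaches downstream callbacks as a path string (or None while empty), which is what the conversion and ASR steps consume. A hypothetical length guard built on the MAX_AUDIO_SECONDS constant from the first hunk — the function name and messages are illustrative, not from app.py:

    import soundfile as sf

    MAX_AUDIO_SECONDS = 40  # mirrors the constant defined earlier in app.py

    def check_audio(audio_filepath):
        # gr.Audio(type="filepath") passes None until something is recorded/uploaded
        if audio_filepath is None:
            return "Please record or upload an audio file first."
        if sf.info(audio_filepath).duration > MAX_AUDIO_SECONDS:
            return f"Audio exceeds {MAX_AUDIO_SECONDS}s; transcription is skipped."
        return "OK"
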
@@ -238,7 +242,7 @@ with gr.Blocks(
             variant="primary"
         )
 
-        chat_input = gr.Textbox(
+        chat_input = gr.Textbox( # Shows the transcribed text
             label="Transcribed text:",
             interactive=False,
             placeholder="Transcribed text will appear here.",
@@ -246,12 +250,12 @@ with gr.Blocks(
             visible=True # set to True to see processing time of asr transcription
         )
 
-        out_audio = gr.Audio(
+        out_audio = gr.Audio( # Shows an audio player for the generated response
             value = None,
             label="Response Voice Player",
             show_label=True,
             visible=True # set to True to see processing time of tts audio generation
-        )
+        )
 
     chat_msg = chat_input.change(add_message, [chatbot, chat_input], [chatbot], api_name="add_message_in_chatbot")
     bot_msg = chat_msg.then(bot, [chatbot, chat_input], chatbot, api_name="bot_response_in_chatbot")
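
The last two lines wire a Gradio event chain: .change() fires whenever the transcription lands in chat_input, and .then() runs the bot response only after add_message has finished. A minimal self-contained sketch of the same pattern, with stub callbacks standing in for the real ones defined elsewhere in app.py:

    import gradio as gr

    def add_message(history, text):
        # append the transcribed text as a new user turn
        return history + [[text, None]]

    def bot(history, text):
        # fill in the assistant side of the last turn (stub reply)
        history[-1][1] = f"You said: {text}"
        return history

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot()
        chat_input = gr.Textbox()
        chat_msg = chat_input.change(add_message, [chatbot, chat_input], [chatbot])
        bot_msg = chat_msg.then(bot, [chatbot, chat_input], chatbot)

    demo.launch()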