Update app.py
app.py CHANGED
@@ -21,7 +21,7 @@ from transformers import pipeline
 # Set an environment variable
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
-
+# Variables
 SAMPLE_RATE = 16000 # Hz
 MAX_AUDIO_SECONDS = 40 # wont try to transcribe if longer than this
 DESCRIPTION = '''
@@ -35,16 +35,10 @@ DESCRIPTION = '''
 '''
 PLACEHOLDER = """
 <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-   <img src="./
+   <img src="https://i.ibb.co/S35q17Q/My-Alexa-Logo.png" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55; ">
    <p style="font-size: 28px; margin-bottom: 2px; opacity: 0.65;">What's on your mind?</p>
 </div>
 """
-# PLACEHOLDER = """
-# <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-#     <img src="https://i.ibb.co/S35q17Q/My-Alexa-Logo.png" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55; ">
-#     <p style="font-size: 28px; margin-bottom: 2px; opacity: 0.65;">What's on your mind?</p>
-# </div>
-# """
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -73,6 +67,8 @@ terminators = [
 ### TTS model
 pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)
 
+
+
 def convert_audio(audio_filepath, tmpdir, utt_id):
     """
     Convert all files to monochannel 16 kHz wav files.
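
The body of convert_audio sits outside this hunk, so only the signature and the docstring's contract are visible. A minimal sketch of what such a helper typically does, assuming librosa and soundfile are available — the name and signature come from the diff, everything else is illustrative:

    import os
    import librosa
    import soundfile as sf

    SAMPLE_RATE = 16000  # Hz, mirrors the constant defined earlier in app.py

    def convert_audio(audio_filepath, tmpdir, utt_id):
        """Illustrative stand-in: resample any input to a mono 16 kHz wav."""
        # librosa.load resamples and downmixes to mono in one call
        data, sr = librosa.load(audio_filepath, sr=SAMPLE_RATE, mono=True)
        out_path = os.path.join(tmpdir, f"{utt_id}.wav")
        sf.write(out_path, data, sr)
        return out_path
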
@@ -197,10 +193,15 @@ def voice_player(history):
     Plays the generated response using the VITS-ljs model.
     Returns the audio player with the generated response.
     """
-    _, text = history
+    _, text = history[-1]
     voice = pipe(text)
-    voice = gr.Audio(value = (
-
+    voice = gr.Audio(value = (
+        voice["sampling_rate"],
+        voice["audio"].squeeze()),
+        type="numpy", autoplay=True,
+        label="MyAlexa Response",
+        show_label=True,
+        visible=True)
     return voice
 
 
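
For context on the new voice_player body: a transformers text-to-speech pipeline returns a dict holding the waveform under "audio" and its "sampling_rate", and gr.Audio accepts that pair directly as a numpy-type value. A self-contained sketch of the same conversion (the component kwargs mirror the diff; the prompt text is illustrative):

    import gradio as gr
    from transformers import pipeline

    pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

    out = pipe("What's on your mind?")  # {"audio": ndarray, "sampling_rate": int}
    player = gr.Audio(
        value=(out["sampling_rate"], out["audio"].squeeze()),  # (sr, 1-D samples)
        type="numpy",
        autoplay=True,
    )

The squeeze() drops the leading batch dimension the pipeline puts on the waveform, since Gradio expects a 1-D array in the (sample_rate, data) tuple.
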
@@ -226,7 +227,10 @@ with gr.Blocks(
             "<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
         )
 
-        audio_file = gr.Audio(
+        audio_file = gr.Audio(
+            sources=["microphone", "upload"],
+            type="filepath"
+        )
 
 
     with gr.Column():
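
With type="filepath", whatever the user records or uploads reaches downstream callbacks as a path string (or None while empty), which is what the conversion and ASR steps consume. A hypothetical length guard built on the MAX_AUDIO_SECONDS constant from the first hunk — the function name and messages are illustrative, not from app.py:

    import soundfile as sf

    MAX_AUDIO_SECONDS = 40  # mirrors the constant defined earlier in app.py

    def check_audio(audio_filepath):
        # gr.Audio(type="filepath") passes None until something is recorded/uploaded
        if audio_filepath is None:
            return "Please record or upload an audio file first."
        if sf.info(audio_filepath).duration > MAX_AUDIO_SECONDS:
            return f"Audio exceeds {MAX_AUDIO_SECONDS}s; transcription is skipped."
        return "OK"
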
@@ -238,7 +242,7 @@ with gr.Blocks(
             variant="primary"
         )
 
-        chat_input = gr.Textbox(
+        chat_input = gr.Textbox( # Shows the transcribed text
             label="Transcribed text:",
             interactive=False,
             placeholder="Transcribed text will appear here.",
@@ -246,12 +250,12 @@ with gr.Blocks(
             visible=True # set to True to see processing time of asr transcription
         )
 
-        out_audio = gr.Audio(
+        out_audio = gr.Audio( # Shows an audio player for the generated response
             value = None,
             label="Response Voice Player",
             show_label=True,
             visible=True # set to True to see processing time of tts audio generation
-        )
+        )
 
     chat_msg = chat_input.change(add_message, [chatbot, chat_input], [chatbot], api_name="add_message_in_chatbot")
     bot_msg = chat_msg.then(bot, [chatbot, chat_input], chatbot, api_name="bot_response_in_chatbot")
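
The last two lines wire a Gradio event chain: .change() fires whenever the transcription lands in chat_input, and .then() runs the bot response only after add_message has finished. A minimal self-contained sketch of the same pattern, with stub callbacks standing in for the real ones defined elsewhere in app.py:

    import gradio as gr

    def add_message(history, text):
        # append the transcribed text as a new user turn
        return history + [[text, None]]

    def bot(history, text):
        # fill in the assistant side of the last turn (stub reply)
        history[-1][1] = f"You said: {text}"
        return history

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot()
        chat_input = gr.Textbox()
        chat_msg = chat_input.change(add_message, [chatbot, chat_input], [chatbot])
        bot_msg = chat_msg.then(bot, [chatbot, chat_input], chatbot)

    demo.launch()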