Spaces:

VanYsa
/

MyAlexa

Paused

App Files Files Community

VanYsa committed on Apr 29, 2024

Commit

14b3eb9

1 Parent(s): c3f6601

Added and organized comments

Browse files

Files changed (1) hide show

app.py +23 -19

app.py CHANGED Viewed

@@ -18,10 +18,11 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers import pipeline
 # Set an environment variable
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
-# Variables
 SAMPLE_RATE = 16000 # Hz
 MAX_AUDIO_SECONDS = 40 # won't try to transcribe if longer than this
 DESCRIPTION = '''
@@ -29,8 +30,8 @@ DESCRIPTION = '''
 <h1 style='text-align: center'>MyAlexa: Voice Chat Assistant</h1>
 <p style='text-align: center'>MyAlexa is a demo of a voice chat assistant with chat logs that accepts audio input and outputs an AI response. </p>
 <p>This space uses <a href="https://huggingface.co/nvidia/canary-1b"><b>NVIDIA Canary 1B</b></a> for Automatic Speech-to-text Recognition (ASR), <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama 3 8B Insruct</b></a> for the large language model (LLM) and <a href="https://huggingface.co/kakao-enterprise/vits-ljs"><b>VITS-ljs by Kakao Enterprise</b></a> for text to speech (TTS).</p>
-<p>This demo accepts audio inputs not more than 40 seconds long.</p>
-<p>Transcription and responses are limited to the English language.</p>
 </div>
 '''
 PLACEHOLDER = """
@@ -42,7 +43,7 @@ PLACEHOLDER = """
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-### ASR model
 canary_model = ASRModel.from_pretrained("nvidia/canary-1b").to(device)
 canary_model.eval()
 # make sure beam size always 1 for consistency
@@ -51,7 +52,7 @@ decoding_cfg = canary_model.cfg.decoding
 decoding_cfg.beam.beam_size = 1
 canary_model.change_decoding_strategy(decoding_cfg)
-### LLM model
 # Load the tokenizer and model
 llm_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
 llama3_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto")  # to("cuda:0")
@@ -64,11 +65,11 @@ terminators = [
     llm_tokenizer.convert_tokens_to_ids("<|eot_id|>")
 ]
-### TTS model
 pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)
 def convert_audio(audio_filepath, tmpdir, utt_id):
 	"""
 	Convert all files to monochannel 16 kHz wav files.
@@ -99,8 +100,8 @@ def convert_audio(audio_filepath, tmpdir, utt_id):
 def transcribe(audio_filepath):
 	"""
-	Transcribes a converted audio file.
-	Set to english language with punctuations.
 	Returns the transcribed text as a string.
 	"""
@@ -136,15 +137,15 @@ def transcribe(audio_filepath):
 def add_message(history, message):
 	"""
 	Adds the input message in the chatbot.
-	Returns the updated chatbot history.
 	"""
 	history.append((message, None))
 	return history
 def bot(history, message):
 	"""
-	Gets the bot's response and places the user and bot messages in the chatbot
-	Returns the appended chatbot history.
 	"""
 	response = bot_response(message, history)
 	lines = response.split("\n")
@@ -162,8 +163,8 @@ def bot(history, message):
 @spaces.GPU()
 def bot_response(message, history):
     """
-    Generates a streaming response using the llama3-8b model.
-	Set max_new_tokens = 100, temperature=0.6, and top_p=0.9
     Returns the generated response in string format.
     """
     conversation = []
@@ -175,7 +176,7 @@ def bot_response(message, history):
     outputs = llama3_model.generate(
 		input_ids,
-		max_new_tokens = 100,
 		eos_token_id = terminators,
 		do_sample=True,
 		temperature=0.6,
@@ -190,7 +191,7 @@ def bot_response(message, history):
 @spaces.GPU()
 def voice_player(history):
     """
-    Plays the generated response using the VITS-ljs model.
 	Returns the audio player with the generated response.
     """
     _, text = history[-1]
@@ -205,7 +206,9 @@ def voice_player(history):
 		visible=True)
     return voice
 with gr.Blocks(
 	title="MyAlexa",
 	css="""
@@ -251,13 +254,13 @@ with gr.Blocks(
 				visible=False # set to True to see processing time of asr transcription
 			)
-			gr.HTML("<p><b>Step 3 [Optional]:</b> Replay MyAlexa's voice response.</p>")
 			out_audio = gr.Audio( # Shows an audio player for the generated response
 				value = None,
-				label="Response Voice Player",
 				show_label=True,
-				visible=False # set to True to see processing time of initial tts audio generation
 			)
 	chat_msg = chat_input.change(add_message, [chatbot, chat_input], [chatbot], api_name="add_message_in_chatbot")
@@ -270,6 +273,7 @@ with gr.Blocks(
 		outputs = [chat_input]
 	)
 demo.queue()
 if __name__ == "__main__":
     demo.launch()

 from transformers import pipeline
+#### Variables ###
 # Set an environment variable
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 SAMPLE_RATE = 16000 # Hz
 MAX_AUDIO_SECONDS = 40 # won't try to transcribe if longer than this
 DESCRIPTION = '''
 <h1 style='text-align: center'>MyAlexa: Voice Chat Assistant</h1>
 <p style='text-align: center'>MyAlexa is a demo of a voice chat assistant with chat logs that accepts audio input and outputs an AI response. </p>
 <p>This space uses <a href="https://huggingface.co/nvidia/canary-1b"><b>NVIDIA Canary 1B</b></a> for Automatic Speech-to-text Recognition (ASR), <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama 3 8B Insruct</b></a> for the large language model (LLM) and <a href="https://huggingface.co/kakao-enterprise/vits-ljs"><b>VITS-ljs by Kakao Enterprise</b></a> for text to speech (TTS).</p>
+<p>This demo accepts audio inputs not more than 40 seconds long. Transcription and responses are limited to the English language.</p>
+<p>The LLM max_new_tokens, temperature and top_p are set to 512, 0.6 and 0.9 respectively</p>
 </div>
 '''
 PLACEHOLDER = """
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+### ASR model ###
 canary_model = ASRModel.from_pretrained("nvidia/canary-1b").to(device)
 canary_model.eval()
 # make sure beam size always 1 for consistency
 decoding_cfg.beam.beam_size = 1
 canary_model.change_decoding_strategy(decoding_cfg)
+### LLM model ###
 # Load the tokenizer and model
 llm_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
 llama3_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto")  # to("cuda:0")
     llm_tokenizer.convert_tokens_to_ids("<|eot_id|>")
 ]
+### TTS model ###
 pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)
+### Start of functions ###
 def convert_audio(audio_filepath, tmpdir, utt_id):
 	"""
 	Convert all files to monochannel 16 kHz wav files.
 def transcribe(audio_filepath):
 	"""
+	Transcribes a converted audio file using the asr model.
+	Set to the English language with punctuation.
 	Returns the transcribed text as a string.
 	"""
 def add_message(history, message):
 	"""
 	Adds the input message in the chatbot.
+	Returns the updated chatbot.
 	"""
 	history.append((message, None))
 	return history
 def bot(history, message):
 	"""
+	Gets the bot's response and adds it in the chatbot.
+	Returns the appended chatbot.
 	"""
 	response = bot_response(message, history)
 	lines = response.split("\n")
 @spaces.GPU()
 def bot_response(message, history):
     """
+    Generates a streaming response using the llm model.
+	Set max_new_tokens = 512, temperature=0.6, and top_p=0.9
     Returns the generated response in string format.
     """
     conversation = []
     outputs = llama3_model.generate(
 		input_ids,
+		max_new_tokens = 512,
 		eos_token_id = terminators,
 		do_sample=True,
 		temperature=0.6,
 @spaces.GPU()
 def voice_player(history):
     """
+    Plays the generated response using the tts model.
 	Returns the audio player with the generated response.
     """
     _, text = history[-1]
 		visible=True)
     return voice
+### End of functions ###
+### Interface using Blocks ###
 with gr.Blocks(
 	title="MyAlexa",
 	css="""
 				visible=False # set to True to see processing time of asr transcription
 			)
+			gr.HTML("<p><b>[Optional]:</b> Replay MyAlexa's voice response.</p>")
 			out_audio = gr.Audio( # Shows an audio player for the generated response
 				value = None,
+				label="Response Audio Player",
 				show_label=True,
+				visible=False # set to True to see processing time of the first tts audio generation
 			)
 	chat_msg = chat_input.change(add_message, [chatbot, chat_input], [chatbot], api_name="add_message_in_chatbot")
 		outputs = [chat_input]
 	)
+### Queue and launch the demo ###
 demo.queue()
 if __name__ == "__main__":
     demo.launch()