JLW committed on
Commit
1d01b9d
·
1 Parent(s): 068803e

Speak with Polly voices

Browse files
Files changed (3) hide show
  1. app.py +87 -16
  2. requirements.txt +4 -3
  3. videos/tempfile.mp4 +2 -2
app.py CHANGED
@@ -1,7 +1,10 @@
1
  import io
2
  import os
 
3
  from typing import Optional, Tuple
4
  import datetime
 
 
5
  import gradio as gr
6
  import requests
7
 
@@ -25,6 +28,8 @@ from openai.error import AuthenticationError, InvalidRequestError
25
  # Pertains to Express-inator functionality
26
  from langchain.prompts import PromptTemplate
27
 
 
 
28
  news_api_key = os.environ["NEWS_API_KEY"]
29
  tmdb_bearer_token = os.environ["TMDB_BEARER_TOKEN"]
30
 
@@ -33,9 +38,11 @@ TOOLS_LIST = ['serpapi', 'wolfram-alpha', 'google-search', 'pal-math', 'pal-colo
33
  TOOLS_DEFAULT_LIST = ['serpapi', 'pal-math']
34
  BUG_FOUND_MSG = "Congratulations, you've found a bug in this application!"
35
  AUTH_ERR_MSG = "Please paste your OpenAI key."
 
36
 
37
  # Pertains to Express-inator functionality
38
  NUM_WORDS_DEFAULT = 0
 
39
  FORMALITY_DEFAULT = "N/A"
40
  TEMPERATURE_DEFAULT = 0.5
41
  EMOTION_DEFAULT = "N/A"
@@ -46,6 +53,8 @@ PROMPT_TEMPLATE = PromptTemplate(
46
  template="Restate {num_words}{formality}{emotions}{translate_to}{literary_style}the following: \n{original_words}\n",
47
  )
48
 
 
 
49
 
50
  # UNCOMMENT TO USE WHISPER
51
  # warnings.filterwarnings("ignore")
@@ -128,6 +137,12 @@ def transform_text(desc, express_chain, num_words, formality,
128
  if literary_style != LITERARY_STYLE_DEFAULT:
129
  if literary_style == "Prose":
130
  literary_style_str = "as prose, "
 
 
 
 
 
 
131
  elif literary_style == "Poetry":
132
  literary_style_str = "as a poem, "
133
  elif literary_style == "Haiku":
@@ -159,9 +174,9 @@ def transform_text(desc, express_chain, num_words, formality,
159
  generated_text = desc
160
 
161
  # replace all newlines with <br> in generated_text
162
- generated_text = generated_text.replace("\n", "<br>")
163
 
164
- prompt_plus_generated = "<b>GPT prompt:</b> " + formatted_prompt + "<br/><br/><code>" + generated_text + "</code>"
165
 
166
  print("\n==== date/time: " + str(datetime.datetime.now() - datetime.timedelta(hours=5)) + " ====")
167
  print("prompt_plus_generated: " + prompt_plus_generated)
@@ -190,7 +205,7 @@ def set_openai_api_key(api_key):
190
  If no api_key, then None is returned.
191
  """
192
  if api_key:
193
- llm = OpenAI(temperature=0, openai_api_key=api_key)
194
  chain, express_chain = load_chain(TOOLS_DEFAULT_LIST, llm)
195
  return chain, express_chain, llm
196
 
@@ -284,8 +299,56 @@ def chat(
284
  text_to_display = hidden_text + "\n\n" + output
285
  history.append((inp, text_to_display))
286
 
287
- html_video, temp_file = do_html_video_speak(output)
288
- return history, history, html_video, temp_file, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
 
290
 
291
  def do_html_video_speak(words_to_speak):
@@ -355,13 +418,19 @@ with gr.Blocks(css=".gradio-container {background-color: lightgray}") as block:
355
  show_label=False, lines=1, type='password')
356
 
357
  with gr.Row():
358
- with gr.Column(scale=1, min_width=240):
359
  my_file = gr.File(label="Upload a file", type="file", visible=False)
360
  tmp_file = gr.File("videos/Masahiro.mp4", visible=False)
361
  tmp_file_url = "/file=" + tmp_file.value['name']
362
  htm_video = f'<video width="256" height="256" autoplay muted loop><source src={tmp_file_url} type="video/mp4" poster="Masahiro.png"></video>'
363
  video_html = gr.HTML(htm_video)
364
 
 
 
 
 
 
 
365
  with gr.Column(scale=3):
366
  chatbot = gr.Chatbot()
367
 
@@ -409,12 +478,12 @@ with gr.Blocks(css=".gradio-container {background-color: lightgray}") as block:
409
 
410
  with gr.Tab("Translate to"):
411
  translate_to_radio = gr.Radio(label="Translate to:", choices=[
412
- TRANSLATE_TO_DEFAULT, "Arabic", "British English", "Chinese (Simplified)", "Chinese (Traditional)",
413
- "Czech", "Danish", "Dutch", "English", "Finnish", "French", "German",
414
- "Greek", "Hebrew", "Hindi", "Hungarian", "Indonesian", "Italian", "Japanese",
415
- "Korean", "Norwegian", "Old English", "Polish", "Portuguese", "Romanian",
416
- "Russian", "Spanish", "Swedish", "Thai", "Turkish",
417
- "Vietnamese",
418
  "emojis", "Gen Z slang", "how the stereotypical Karen would say it", "Klingon",
419
  "Pirate", "Strange Planet expospeak technical talk", "Yoda"],
420
  value=TRANSLATE_TO_DEFAULT)
@@ -425,7 +494,7 @@ with gr.Blocks(css=".gradio-container {background-color: lightgray}") as block:
425
 
426
  with gr.Tab("Lit style"):
427
  literary_style_radio = gr.Radio(label="Literary style:", choices=[
428
- LITERARY_STYLE_DEFAULT, "Prose", "Poetry", "Haiku", "Limerick", "Joke", "Knock-knock"],
429
  value=LITERARY_STYLE_DEFAULT)
430
 
431
  literary_style_radio.change(update_foo,
@@ -491,7 +560,7 @@ with gr.Blocks(css=".gradio-container {background-color: lightgray}") as block:
491
 
492
  with gr.Tab("Max words"):
493
  num_words_slider = gr.Slider(label="Max number of words to generate (0 for don't care)",
494
- value=NUM_WORDS_DEFAULT, minimum=0, maximum=100, step=10)
495
  num_words_slider.change(update_foo,
496
  inputs=[num_words_slider, num_words_state],
497
  outputs=[num_words_state])
@@ -509,14 +578,16 @@ with gr.Blocks(css=".gradio-container {background-color: lightgray}") as block:
509
  anticipation_level_state, joy_level_state, trust_level_state, fear_level_state,
510
  surprise_level_state, sadness_level_state, disgust_level_state, anger_level_state,
511
  translate_to_state, literary_style_state],
512
- outputs=[chatbot, history_state, video_html, my_file, message])
 
513
 
514
  submit.click(chat, inputs=[message, history_state, chain_state, trace_chain_state,
515
  express_chain_state, num_words_state, formality_state,
516
  anticipation_level_state, joy_level_state, trust_level_state, fear_level_state,
517
  surprise_level_state, sadness_level_state, disgust_level_state, anger_level_state,
518
  translate_to_state, literary_style_state],
519
- outputs=[chatbot, history_state, video_html, my_file, message])
 
520
 
521
  openai_api_key_textbox.change(set_openai_api_key,
522
  inputs=[openai_api_key_textbox],
 
1
  import io
2
  import os
3
+ from contextlib import closing
4
  from typing import Optional, Tuple
5
  import datetime
6
+
7
+ import boto3
8
  import gradio as gr
9
  import requests
10
 
 
28
  # Pertains to Express-inator functionality
29
  from langchain.prompts import PromptTemplate
30
 
31
+ from polly_utils import PollyVoiceData, NEURAL_ENGINE
32
+
33
  news_api_key = os.environ["NEWS_API_KEY"]
34
  tmdb_bearer_token = os.environ["TMDB_BEARER_TOKEN"]
35
 
 
38
  TOOLS_DEFAULT_LIST = ['serpapi', 'pal-math']
39
  BUG_FOUND_MSG = "Congratulations, you've found a bug in this application!"
40
  AUTH_ERR_MSG = "Please paste your OpenAI key."
41
+ MAX_TOKENS = 512
42
 
43
  # Pertains to Express-inator functionality
44
  NUM_WORDS_DEFAULT = 0
45
+ MAX_WORDS = 400
46
  FORMALITY_DEFAULT = "N/A"
47
  TEMPERATURE_DEFAULT = 0.5
48
  EMOTION_DEFAULT = "N/A"
 
53
  template="Restate {num_words}{formality}{emotions}{translate_to}{literary_style}the following: \n{original_words}\n",
54
  )
55
 
56
+ POLLY_VOICE_DATA = PollyVoiceData()
57
+
58
 
59
  # UNCOMMENT TO USE WHISPER
60
  # warnings.filterwarnings("ignore")
 
137
  if literary_style != LITERARY_STYLE_DEFAULT:
138
  if literary_style == "Prose":
139
  literary_style_str = "as prose, "
140
+ elif literary_style == "Summary":
141
+ literary_style_str = "as a summary, "
142
+ elif literary_style == "Outline":
143
+ literary_style_str = "as an outline numbers and lower case letters"
144
+ elif literary_style == "Bullets":
145
+ literary_style_str = "as bullet points using bullets"
146
  elif literary_style == "Poetry":
147
  literary_style_str = "as a poem, "
148
  elif literary_style == "Haiku":
 
174
  generated_text = desc
175
 
176
  # replace all newlines with <br> in generated_text
177
+ generated_text = generated_text.replace("\n", "\n\n")
178
 
179
+ prompt_plus_generated = "GPT prompt: " + formatted_prompt + "\n\n" + generated_text
180
 
181
  print("\n==== date/time: " + str(datetime.datetime.now() - datetime.timedelta(hours=5)) + " ====")
182
  print("prompt_plus_generated: " + prompt_plus_generated)
 
205
  If no api_key, then None is returned.
206
  """
207
  if api_key:
208
+ llm = OpenAI(temperature=0, openai_api_key=api_key, max_tokens=MAX_TOKENS)
209
  chain, express_chain = load_chain(TOOLS_DEFAULT_LIST, llm)
210
  return chain, express_chain, llm
211
 
 
299
  text_to_display = hidden_text + "\n\n" + output
300
  history.append((inp, text_to_display))
301
 
302
+ # html_video, temp_file = do_html_video_speak(output)
303
+ html_audio, temp_aud_file = do_html_audio_speak(output, translate_to)
304
+
305
+ # return history, history, html_video, temp_file, ""
306
+ return history, history, html_audio, temp_aud_file, ""
307
+
308
+
309
+ def do_html_audio_speak(words_to_speak, polly_language):
310
+ polly_client = boto3.Session(
311
+ aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
312
+ aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
313
+ region_name=os.environ["AWS_DEFAULT_REGION"]
314
+ ).client('polly')
315
+
316
+ voice_id, language_code, engine = POLLY_VOICE_DATA.get_voice(polly_language, "Female")
317
+ if not voice_id:
318
+ voice_id = "Joanna"
319
+ language_code = "en-US"
320
+ engine = NEURAL_ENGINE
321
+ response = polly_client.synthesize_speech(
322
+ Text=words_to_speak,
323
+ OutputFormat='mp3',
324
+ VoiceId=voice_id,
325
+ LanguageCode=language_code,
326
+ Engine=engine
327
+ )
328
+
329
+ html_audio = '<pre>no audio</pre>'
330
+
331
+ # Save the audio stream returned by Amazon Polly to audios/tempfile.mp3
332
+ if "AudioStream" in response:
333
+ with closing(response["AudioStream"]) as stream:
334
+ # output = os.path.join("/tmp/", "speech.mp3")
335
+
336
+ try:
337
+ with open('audios/tempfile.mp3', 'wb') as f:
338
+ f.write(stream.read())
339
+ temp_aud_file = gr.File("audios/tempfile.mp3")
340
+ temp_aud_file_url = "/file=" + temp_aud_file.value['name']
341
+ html_audio = f'<audio autoplay><source src={temp_aud_file_url} type="audio/mp3"></audio>'
342
+ except IOError as error:
343
+ # Could not write to file, exit gracefully
344
+ print(error)
345
+ return None, None
346
+ else:
347
+ # The response didn't contain audio data, exit gracefully
348
+ print("Could not stream audio")
349
+ return None, None
350
+
351
+ return html_audio, "audios/tempfile.mp3"
352
 
353
 
354
  def do_html_video_speak(words_to_speak):
 
418
  show_label=False, lines=1, type='password')
419
 
420
  with gr.Row():
421
+ with gr.Column(scale=1, min_width=100, visible=False):
422
  my_file = gr.File(label="Upload a file", type="file", visible=False)
423
  tmp_file = gr.File("videos/Masahiro.mp4", visible=False)
424
  tmp_file_url = "/file=" + tmp_file.value['name']
425
  htm_video = f'<video width="256" height="256" autoplay muted loop><source src={tmp_file_url} type="video/mp4" poster="Masahiro.png"></video>'
426
  video_html = gr.HTML(htm_video)
427
 
428
+ # my_aud_file = gr.File(label="Audio file", type="file", visible=True)
429
+ tmp_aud_file = gr.File("audios/tempfile.mp3", visible=False)
430
+ tmp_aud_file_url = "/file=" + tmp_aud_file.value['name']
431
+ htm_audio = f'<audio><source src={tmp_aud_file_url} type="audio/mp3"></audio>'
432
+ audio_html = gr.HTML(htm_audio)
433
+
434
  with gr.Column(scale=3):
435
  chatbot = gr.Chatbot()
436
 
 
478
 
479
  with gr.Tab("Translate to"):
480
  translate_to_radio = gr.Radio(label="Translate to:", choices=[
481
+ TRANSLATE_TO_DEFAULT, "Arabic", "Arabic (Gulf)", "Catalan", "Chinese (Cantonese)", "Chinese (Mandarin)",
482
+ "Danish", "Dutch", "English (Australian)", "English (British)", "English (Indian)", "English (New Zealand)",
483
+ "English (South African)", "English (US)", "English (Welsh)", "Finnish", "French", "French (Canadian)",
484
+ "German", "German (Austrian)", "Hindi", "Icelandic", "Italian", "Japanese", "Korean", "Norwegian", "Polish",
485
+ "Portuguese (Brazilian)", "Portuguese (European)", "Romanian", "Russian", "Spanish (European)",
486
+ "Spanish (Mexican)", "Spanish (US)", "Swedish", "Turkish", "Welsh",
487
  "emojis", "Gen Z slang", "how the stereotypical Karen would say it", "Klingon",
488
  "Pirate", "Strange Planet expospeak technical talk", "Yoda"],
489
  value=TRANSLATE_TO_DEFAULT)
 
494
 
495
  with gr.Tab("Lit style"):
496
  literary_style_radio = gr.Radio(label="Literary style:", choices=[
497
+ LITERARY_STYLE_DEFAULT, "Prose", "Summary", "Outline", "Bullets", "Poetry", "Haiku", "Limerick", "Joke", "Knock-knock"],
498
  value=LITERARY_STYLE_DEFAULT)
499
 
500
  literary_style_radio.change(update_foo,
 
560
 
561
  with gr.Tab("Max words"):
562
  num_words_slider = gr.Slider(label="Max number of words to generate (0 for don't care)",
563
+ value=NUM_WORDS_DEFAULT, minimum=0, maximum=MAX_WORDS, step=10)
564
  num_words_slider.change(update_foo,
565
  inputs=[num_words_slider, num_words_state],
566
  outputs=[num_words_state])
 
578
  anticipation_level_state, joy_level_state, trust_level_state, fear_level_state,
579
  surprise_level_state, sadness_level_state, disgust_level_state, anger_level_state,
580
  translate_to_state, literary_style_state],
581
+ # outputs=[chatbot, history_state, video_html, my_file, message])
582
+ outputs=[chatbot, history_state, audio_html, tmp_aud_file, message])
583
 
584
  submit.click(chat, inputs=[message, history_state, chain_state, trace_chain_state,
585
  express_chain_state, num_words_state, formality_state,
586
  anticipation_level_state, joy_level_state, trust_level_state, fear_level_state,
587
  surprise_level_state, sadness_level_state, disgust_level_state, anger_level_state,
588
  translate_to_state, literary_style_state],
589
+ # outputs=[chatbot, history_state, video_html, my_file, message])
590
+ outputs=[chatbot, history_state, audio_html, tmp_aud_file, message])
591
 
592
  openai_api_key_textbox.change(set_openai_api_key,
593
  inputs=[openai_api_key_textbox],
requirements.txt CHANGED
@@ -1,8 +1,9 @@
1
- openai==0.26.0
2
  gradio==3.16.1
3
  google-search-results
4
  google-api-python-client==2.72.0
5
  wolframalpha
6
- langchain==0.0.61
7
- requests~=2.28.1
8
  git+https://github.com/openai/whisper.git
 
 
1
+ openai==0.26.1
2
  gradio==3.16.1
3
  google-search-results
4
  google-api-python-client==2.72.0
5
  wolframalpha
6
+ langchain==0.0.63
7
+ requests==2.28.2
8
  git+https://github.com/openai/whisper.git
9
+ boto3
videos/tempfile.mp4 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:355cfbe21252ee7bd7b3cc6ea13e68abc209330bd139abb0d24e301d42e74b57
3
- size 75
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80aa42c48e322cc8da957e8d2e2aba7f1cd5675abb998146facf83a279a1c07d
3
+ size 80784