Speak with Polly voices
Browse files- app.py +87 -16
- requirements.txt +4 -3
- videos/tempfile.mp4 +2 -2
app.py
CHANGED
|
@@ -1,7 +1,10 @@
|
|
| 1 |
import io
|
| 2 |
import os
|
|
|
|
| 3 |
from typing import Optional, Tuple
|
| 4 |
import datetime
|
|
|
|
|
|
|
| 5 |
import gradio as gr
|
| 6 |
import requests
|
| 7 |
|
|
@@ -25,6 +28,8 @@ from openai.error import AuthenticationError, InvalidRequestError
|
|
| 25 |
# Pertains to Express-inator functionality
|
| 26 |
from langchain.prompts import PromptTemplate
|
| 27 |
|
|
|
|
|
|
|
| 28 |
news_api_key = os.environ["NEWS_API_KEY"]
|
| 29 |
tmdb_bearer_token = os.environ["TMDB_BEARER_TOKEN"]
|
| 30 |
|
|
@@ -33,9 +38,11 @@ TOOLS_LIST = ['serpapi', 'wolfram-alpha', 'google-search', 'pal-math', 'pal-colo
|
|
| 33 |
TOOLS_DEFAULT_LIST = ['serpapi', 'pal-math']
|
| 34 |
BUG_FOUND_MSG = "Congratulations, you've found a bug in this application!"
|
| 35 |
AUTH_ERR_MSG = "Please paste your OpenAI key."
|
|
|
|
| 36 |
|
| 37 |
# Pertains to Express-inator functionality
|
| 38 |
NUM_WORDS_DEFAULT = 0
|
|
|
|
| 39 |
FORMALITY_DEFAULT = "N/A"
|
| 40 |
TEMPERATURE_DEFAULT = 0.5
|
| 41 |
EMOTION_DEFAULT = "N/A"
|
|
@@ -46,6 +53,8 @@ PROMPT_TEMPLATE = PromptTemplate(
|
|
| 46 |
template="Restate {num_words}{formality}{emotions}{translate_to}{literary_style}the following: \n{original_words}\n",
|
| 47 |
)
|
| 48 |
|
|
|
|
|
|
|
| 49 |
|
| 50 |
# UNCOMMENT TO USE WHISPER
|
| 51 |
# warnings.filterwarnings("ignore")
|
|
@@ -128,6 +137,12 @@ def transform_text(desc, express_chain, num_words, formality,
|
|
| 128 |
if literary_style != LITERARY_STYLE_DEFAULT:
|
| 129 |
if literary_style == "Prose":
|
| 130 |
literary_style_str = "as prose, "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
elif literary_style == "Poetry":
|
| 132 |
literary_style_str = "as a poem, "
|
| 133 |
elif literary_style == "Haiku":
|
|
@@ -159,9 +174,9 @@ def transform_text(desc, express_chain, num_words, formality,
|
|
| 159 |
generated_text = desc
|
| 160 |
|
| 161 |
# replace all newlines with <br> in generated_text
|
| 162 |
-
generated_text = generated_text.replace("\n", "
|
| 163 |
|
| 164 |
-
prompt_plus_generated = "
|
| 165 |
|
| 166 |
print("\n==== date/time: " + str(datetime.datetime.now() - datetime.timedelta(hours=5)) + " ====")
|
| 167 |
print("prompt_plus_generated: " + prompt_plus_generated)
|
|
@@ -190,7 +205,7 @@ def set_openai_api_key(api_key):
|
|
| 190 |
If no api_key, then None is returned.
|
| 191 |
"""
|
| 192 |
if api_key:
|
| 193 |
-
llm = OpenAI(temperature=0, openai_api_key=api_key)
|
| 194 |
chain, express_chain = load_chain(TOOLS_DEFAULT_LIST, llm)
|
| 195 |
return chain, express_chain, llm
|
| 196 |
|
|
@@ -284,8 +299,56 @@ def chat(
|
|
| 284 |
text_to_display = hidden_text + "\n\n" + output
|
| 285 |
history.append((inp, text_to_display))
|
| 286 |
|
| 287 |
-
html_video, temp_file = do_html_video_speak(output)
|
| 288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
|
| 290 |
|
| 291 |
def do_html_video_speak(words_to_speak):
|
|
@@ -355,13 +418,19 @@ with gr.Blocks(css=".gradio-container {background-color: lightgray}") as block:
|
|
| 355 |
show_label=False, lines=1, type='password')
|
| 356 |
|
| 357 |
with gr.Row():
|
| 358 |
-
with gr.Column(scale=1, min_width=
|
| 359 |
my_file = gr.File(label="Upload a file", type="file", visible=False)
|
| 360 |
tmp_file = gr.File("videos/Masahiro.mp4", visible=False)
|
| 361 |
tmp_file_url = "/file=" + tmp_file.value['name']
|
| 362 |
htm_video = f'<video width="256" height="256" autoplay muted loop><source src={tmp_file_url} type="video/mp4" poster="Masahiro.png"></video>'
|
| 363 |
video_html = gr.HTML(htm_video)
|
| 364 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
with gr.Column(scale=3):
|
| 366 |
chatbot = gr.Chatbot()
|
| 367 |
|
|
@@ -409,12 +478,12 @@ with gr.Blocks(css=".gradio-container {background-color: lightgray}") as block:
|
|
| 409 |
|
| 410 |
with gr.Tab("Translate to"):
|
| 411 |
translate_to_radio = gr.Radio(label="Translate to:", choices=[
|
| 412 |
-
TRANSLATE_TO_DEFAULT, "Arabic", "
|
| 413 |
-
"
|
| 414 |
-
"
|
| 415 |
-
"
|
| 416 |
-
"
|
| 417 |
-
"
|
| 418 |
"emojis", "Gen Z slang", "how the stereotypical Karen would say it", "Klingon",
|
| 419 |
"Pirate", "Strange Planet expospeak technical talk", "Yoda"],
|
| 420 |
value=TRANSLATE_TO_DEFAULT)
|
|
@@ -425,7 +494,7 @@ with gr.Blocks(css=".gradio-container {background-color: lightgray}") as block:
|
|
| 425 |
|
| 426 |
with gr.Tab("Lit style"):
|
| 427 |
literary_style_radio = gr.Radio(label="Literary style:", choices=[
|
| 428 |
-
LITERARY_STYLE_DEFAULT, "Prose", "Poetry", "Haiku", "Limerick", "Joke", "Knock-knock"],
|
| 429 |
value=LITERARY_STYLE_DEFAULT)
|
| 430 |
|
| 431 |
literary_style_radio.change(update_foo,
|
|
@@ -491,7 +560,7 @@ with gr.Blocks(css=".gradio-container {background-color: lightgray}") as block:
|
|
| 491 |
|
| 492 |
with gr.Tab("Max words"):
|
| 493 |
num_words_slider = gr.Slider(label="Max number of words to generate (0 for don't care)",
|
| 494 |
-
value=NUM_WORDS_DEFAULT, minimum=0, maximum=
|
| 495 |
num_words_slider.change(update_foo,
|
| 496 |
inputs=[num_words_slider, num_words_state],
|
| 497 |
outputs=[num_words_state])
|
|
@@ -509,14 +578,16 @@ with gr.Blocks(css=".gradio-container {background-color: lightgray}") as block:
|
|
| 509 |
anticipation_level_state, joy_level_state, trust_level_state, fear_level_state,
|
| 510 |
surprise_level_state, sadness_level_state, disgust_level_state, anger_level_state,
|
| 511 |
translate_to_state, literary_style_state],
|
| 512 |
-
outputs=[chatbot, history_state, video_html, my_file, message])
|
|
|
|
| 513 |
|
| 514 |
submit.click(chat, inputs=[message, history_state, chain_state, trace_chain_state,
|
| 515 |
express_chain_state, num_words_state, formality_state,
|
| 516 |
anticipation_level_state, joy_level_state, trust_level_state, fear_level_state,
|
| 517 |
surprise_level_state, sadness_level_state, disgust_level_state, anger_level_state,
|
| 518 |
translate_to_state, literary_style_state],
|
| 519 |
-
outputs=[chatbot, history_state, video_html, my_file, message])
|
|
|
|
| 520 |
|
| 521 |
openai_api_key_textbox.change(set_openai_api_key,
|
| 522 |
inputs=[openai_api_key_textbox],
|
|
|
|
| 1 |
import io
|
| 2 |
import os
|
| 3 |
+
from contextlib import closing
|
| 4 |
from typing import Optional, Tuple
|
| 5 |
import datetime
|
| 6 |
+
|
| 7 |
+
import boto3
|
| 8 |
import gradio as gr
|
| 9 |
import requests
|
| 10 |
|
|
|
|
| 28 |
# Pertains to Express-inator functionality
|
| 29 |
from langchain.prompts import PromptTemplate
|
| 30 |
|
| 31 |
+
from polly_utils import PollyVoiceData, NEURAL_ENGINE
|
| 32 |
+
|
| 33 |
news_api_key = os.environ["NEWS_API_KEY"]
|
| 34 |
tmdb_bearer_token = os.environ["TMDB_BEARER_TOKEN"]
|
| 35 |
|
|
|
|
| 38 |
TOOLS_DEFAULT_LIST = ['serpapi', 'pal-math']
|
| 39 |
BUG_FOUND_MSG = "Congratulations, you've found a bug in this application!"
|
| 40 |
AUTH_ERR_MSG = "Please paste your OpenAI key."
|
| 41 |
+
MAX_TOKENS = 512
|
| 42 |
|
| 43 |
# Pertains to Express-inator functionality
|
| 44 |
NUM_WORDS_DEFAULT = 0
|
| 45 |
+
MAX_WORDS = 400
|
| 46 |
FORMALITY_DEFAULT = "N/A"
|
| 47 |
TEMPERATURE_DEFAULT = 0.5
|
| 48 |
EMOTION_DEFAULT = "N/A"
|
|
|
|
| 53 |
template="Restate {num_words}{formality}{emotions}{translate_to}{literary_style}the following: \n{original_words}\n",
|
| 54 |
)
|
| 55 |
|
| 56 |
+
POLLY_VOICE_DATA = PollyVoiceData()
|
| 57 |
+
|
| 58 |
|
| 59 |
# UNCOMMENT TO USE WHISPER
|
| 60 |
# warnings.filterwarnings("ignore")
|
|
|
|
| 137 |
if literary_style != LITERARY_STYLE_DEFAULT:
|
| 138 |
if literary_style == "Prose":
|
| 139 |
literary_style_str = "as prose, "
|
| 140 |
+
elif literary_style == "Summary":
|
| 141 |
+
literary_style_str = "as a summary, "
|
| 142 |
+
elif literary_style == "Outline":
|
| 143 |
+
literary_style_str = "as an outline numbers and lower case letters"
|
| 144 |
+
elif literary_style == "Bullets":
|
| 145 |
+
literary_style_str = "as bullet points using bullets"
|
| 146 |
elif literary_style == "Poetry":
|
| 147 |
literary_style_str = "as a poem, "
|
| 148 |
elif literary_style == "Haiku":
|
|
|
|
| 174 |
generated_text = desc
|
| 175 |
|
| 176 |
# double every newline in generated_text (each "\n" becomes "\n\n")
|
| 177 |
+
generated_text = generated_text.replace("\n", "\n\n")
|
| 178 |
|
| 179 |
+
prompt_plus_generated = "GPT prompt: " + formatted_prompt + "\n\n" + generated_text
|
| 180 |
|
| 181 |
print("\n==== date/time: " + str(datetime.datetime.now() - datetime.timedelta(hours=5)) + " ====")
|
| 182 |
print("prompt_plus_generated: " + prompt_plus_generated)
|
|
|
|
| 205 |
If no api_key, then None is returned.
|
| 206 |
"""
|
| 207 |
if api_key:
|
| 208 |
+
llm = OpenAI(temperature=0, openai_api_key=api_key, max_tokens=MAX_TOKENS)
|
| 209 |
chain, express_chain = load_chain(TOOLS_DEFAULT_LIST, llm)
|
| 210 |
return chain, express_chain, llm
|
| 211 |
|
|
|
|
| 299 |
text_to_display = hidden_text + "\n\n" + output
|
| 300 |
history.append((inp, text_to_display))
|
| 301 |
|
| 302 |
+
# html_video, temp_file = do_html_video_speak(output)
|
| 303 |
+
html_audio, temp_aud_file = do_html_audio_speak(output, translate_to)
|
| 304 |
+
|
| 305 |
+
# return history, history, html_video, temp_file, ""
|
| 306 |
+
return history, history, html_audio, temp_aud_file, ""
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
def do_html_audio_speak(words_to_speak, polly_language):
    """Synthesize ``words_to_speak`` with Amazon Polly and build an HTML audio tag.

    The MP3 returned by Polly is written to ``audios/tempfile.mp3`` (overwriting
    any previous contents), wrapped in a ``gr.File`` to obtain a servable
    ``/file=`` URL, and embedded in an autoplaying ``<audio>`` element.

    Args:
        words_to_speak: Text passed to Polly as the ``Text`` argument.
        polly_language: Language name handed to ``POLLY_VOICE_DATA.get_voice``
            together with the gender ``"Female"``.
            NOTE(review): presumably one of the "Translate to" choices --
            confirm against ``PollyVoiceData``.

    Returns:
        ``(html_audio, "audios/tempfile.mp3")`` on success, or ``(None, None)``
        when the response carries no ``AudioStream`` or an ``IOError`` occurs
        while writing/publishing the file.

    Raises:
        KeyError: if ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY`` or
            ``AWS_DEFAULT_REGION`` is missing from the environment.
    """
    # Credentials and region are read from the environment on every call.
    session = boto3.Session(
        aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
        region_name=os.environ["AWS_DEFAULT_REGION"],
    )
    polly = session.client('polly')

    voice_id, language_code, engine = POLLY_VOICE_DATA.get_voice(polly_language, "Female")
    if not voice_id:
        # No voice found for the requested language: fall back to US English.
        voice_id, language_code, engine = "Joanna", "en-US", NEURAL_ENGINE

    synthesis = polly.synthesize_speech(
        Text=words_to_speak,
        OutputFormat='mp3',
        VoiceId=voice_id,
        LanguageCode=language_code,
        Engine=engine,
    )

    if "AudioStream" not in synthesis:
        # The response didn't contain audio data, exit gracefully
        print("Could not stream audio")
        return None, None

    audio_path = "audios/tempfile.mp3"
    # closing() guarantees the Polly stream is released on every exit path.
    with closing(synthesis["AudioStream"]) as stream:
        try:
            with open(audio_path, 'wb') as mp3_out:
                mp3_out.write(stream.read())
            # gr.File exposes the path under which Gradio serves the file.
            audio_url = "/file=" + gr.File(audio_path).value['name']
            html_audio = f'<audio autoplay><source src={audio_url} type="audio/mp3"></audio>'
        except IOError as error:
            # Could not write to file, exit gracefully
            print(error)
            return None, None

    return html_audio, audio_path
|
| 352 |
|
| 353 |
|
| 354 |
def do_html_video_speak(words_to_speak):
|
|
|
|
| 418 |
show_label=False, lines=1, type='password')
|
| 419 |
|
| 420 |
with gr.Row():
|
| 421 |
+
with gr.Column(scale=1, min_width=100, visible=False):
|
| 422 |
my_file = gr.File(label="Upload a file", type="file", visible=False)
|
| 423 |
tmp_file = gr.File("videos/Masahiro.mp4", visible=False)
|
| 424 |
tmp_file_url = "/file=" + tmp_file.value['name']
|
| 425 |
htm_video = f'<video width="256" height="256" autoplay muted loop><source src={tmp_file_url} type="video/mp4" poster="Masahiro.png"></video>'
|
| 426 |
video_html = gr.HTML(htm_video)
|
| 427 |
|
| 428 |
+
# my_aud_file = gr.File(label="Audio file", type="file", visible=True)
|
| 429 |
+
tmp_aud_file = gr.File("audios/tempfile.mp3", visible=False)
|
| 430 |
+
tmp_aud_file_url = "/file=" + tmp_aud_file.value['name']
|
| 431 |
+
htm_audio = f'<audio><source src={tmp_aud_file_url} type="audio/mp3"></audio>'
|
| 432 |
+
audio_html = gr.HTML(htm_audio)
|
| 433 |
+
|
| 434 |
with gr.Column(scale=3):
|
| 435 |
chatbot = gr.Chatbot()
|
| 436 |
|
|
|
|
| 478 |
|
| 479 |
with gr.Tab("Translate to"):
|
| 480 |
translate_to_radio = gr.Radio(label="Translate to:", choices=[
|
| 481 |
+
TRANSLATE_TO_DEFAULT, "Arabic", "Arabic (Gulf)", "Catalan", "Chinese (Cantonese)", "Chinese (Mandarin)",
|
| 482 |
+
"Danish", "Dutch", "English (Australian)", "English (British)", "English (Indian)", "English (New Zealand)",
|
| 483 |
+
"English (South African)", "English (US)", "English (Welsh)", "Finnish", "French", "French (Canadian)",
|
| 484 |
+
"German", "German (Austrian)", "Hindi", "Icelandic", "Italian", "Japanese", "Korean", "Norwegian", "Polish",
|
| 485 |
+
"Portuguese (Brazilian)", "Portuguese (European)", "Romanian", "Russian", "Spanish (European)",
|
| 486 |
+
"Spanish (Mexican)", "Spanish (US)", "Swedish", "Turkish", "Welsh",
|
| 487 |
"emojis", "Gen Z slang", "how the stereotypical Karen would say it", "Klingon",
|
| 488 |
"Pirate", "Strange Planet expospeak technical talk", "Yoda"],
|
| 489 |
value=TRANSLATE_TO_DEFAULT)
|
|
|
|
| 494 |
|
| 495 |
with gr.Tab("Lit style"):
|
| 496 |
literary_style_radio = gr.Radio(label="Literary style:", choices=[
|
| 497 |
+
LITERARY_STYLE_DEFAULT, "Prose", "Summary", "Outline", "Bullets", "Poetry", "Haiku", "Limerick", "Joke", "Knock-knock"],
|
| 498 |
value=LITERARY_STYLE_DEFAULT)
|
| 499 |
|
| 500 |
literary_style_radio.change(update_foo,
|
|
|
|
| 560 |
|
| 561 |
with gr.Tab("Max words"):
|
| 562 |
num_words_slider = gr.Slider(label="Max number of words to generate (0 for don't care)",
|
| 563 |
+
value=NUM_WORDS_DEFAULT, minimum=0, maximum=MAX_WORDS, step=10)
|
| 564 |
num_words_slider.change(update_foo,
|
| 565 |
inputs=[num_words_slider, num_words_state],
|
| 566 |
outputs=[num_words_state])
|
|
|
|
| 578 |
anticipation_level_state, joy_level_state, trust_level_state, fear_level_state,
|
| 579 |
surprise_level_state, sadness_level_state, disgust_level_state, anger_level_state,
|
| 580 |
translate_to_state, literary_style_state],
|
| 581 |
+
# outputs=[chatbot, history_state, video_html, my_file, message])
|
| 582 |
+
outputs=[chatbot, history_state, audio_html, tmp_aud_file, message])
|
| 583 |
|
| 584 |
submit.click(chat, inputs=[message, history_state, chain_state, trace_chain_state,
|
| 585 |
express_chain_state, num_words_state, formality_state,
|
| 586 |
anticipation_level_state, joy_level_state, trust_level_state, fear_level_state,
|
| 587 |
surprise_level_state, sadness_level_state, disgust_level_state, anger_level_state,
|
| 588 |
translate_to_state, literary_style_state],
|
| 589 |
+
# outputs=[chatbot, history_state, video_html, my_file, message])
|
| 590 |
+
outputs=[chatbot, history_state, audio_html, tmp_aud_file, message])
|
| 591 |
|
| 592 |
openai_api_key_textbox.change(set_openai_api_key,
|
| 593 |
inputs=[openai_api_key_textbox],
|
requirements.txt
CHANGED
|
@@ -1,8 +1,9 @@
|
|
| 1 |
-
openai==0.26.
|
| 2 |
gradio==3.16.1
|
| 3 |
google-search-results
|
| 4 |
google-api-python-client==2.72.0
|
| 5 |
wolframalpha
|
| 6 |
-
langchain==0.0.
|
| 7 |
-
requests
|
| 8 |
git+https://github.com/openai/whisper.git
|
|
|
|
|
|
| 1 |
+
openai==0.26.1
|
| 2 |
gradio==3.16.1
|
| 3 |
google-search-results
|
| 4 |
google-api-python-client==2.72.0
|
| 5 |
wolframalpha
|
| 6 |
+
langchain==0.0.63
|
| 7 |
+
requests==2.28.2
|
| 8 |
git+https://github.com/openai/whisper.git
|
| 9 |
+
boto3
|
videos/tempfile.mp4
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:80aa42c48e322cc8da957e8d2e2aba7f1cd5675abb998146facf83a279a1c07d
|
| 3 |
+
size 80784
|