Spaces:
Running
Running
Update app.py
Browse files
Updates for o4, tts via gpt-4o
app.py
CHANGED
|
@@ -496,7 +496,7 @@ class DialogueItem(BaseModel):
|
|
| 496 |
class Dialogue(BaseModel):
|
| 497 |
scratchpad: str
|
| 498 |
dialogue: List[DialogueItem]
|
| 499 |
-
|
| 500 |
def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None) -> bytes:
|
| 501 |
client = OpenAI(
|
| 502 |
api_key=api_key or os.getenv("OPENAI_API_KEY"),
|
|
@@ -511,6 +511,25 @@ def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None) -> byt
|
|
| 511 |
for chunk in response.iter_bytes():
|
| 512 |
file.write(chunk)
|
| 513 |
return file.getvalue()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 514 |
|
| 515 |
|
| 516 |
from functools import wraps
|
|
@@ -531,10 +550,12 @@ def conditional_llm(model, api_base=None, api_key=None):
|
|
| 531 |
def generate_audio(
|
| 532 |
files: list,
|
| 533 |
openai_api_key: str = None,
|
| 534 |
-
text_model: str = "o1-2024-12-17", #"o1-preview-2024-09-12",
|
| 535 |
audio_model: str = "tts-1",
|
| 536 |
speaker_1_voice: str = "alloy",
|
| 537 |
speaker_2_voice: str = "echo",
|
|
|
|
|
|
|
| 538 |
api_base: str = None,
|
| 539 |
intro_instructions: str = '',
|
| 540 |
text_instructions: str = '',
|
|
@@ -578,8 +599,6 @@ def generate_audio(
|
|
| 578 |
with file_path.open("r", encoding="utf-8") as f:
|
| 579 |
text = f.read()
|
| 580 |
combined_text += text + "\n\n"
|
| 581 |
-
|
| 582 |
-
|
| 583 |
# Configure the LLM based on selected model and api_base
|
| 584 |
@retry(retry=retry_if_exception_type(ValidationError))
|
| 585 |
@conditional_llm(model=text_model, api_base=api_base, api_key=openai_api_key)
|
|
@@ -642,7 +661,8 @@ def generate_audio(
|
|
| 642 |
for line in llm_output.dialogue:
|
| 643 |
transcript_line = f"{line.speaker}: {line.text}"
|
| 644 |
voice = speaker_1_voice if line.speaker == "speaker-1" else speaker_2_voice
|
| 645 |
-
|
|
|
|
| 646 |
futures.append((future, transcript_line))
|
| 647 |
characters += len(line.text)
|
| 648 |
|
|
@@ -675,7 +695,7 @@ def generate_audio(
|
|
| 675 |
def validate_and_generate_audio(*args):
|
| 676 |
files = args[0]
|
| 677 |
if not files:
|
| 678 |
-
return None, None, None, "Please upload at least one PDF file before generating audio."
|
| 679 |
try:
|
| 680 |
audio_file, transcript, original_text = generate_audio(*args)
|
| 681 |
return audio_file, transcript, original_text, None # Return None as the error when successful
|
|
@@ -741,7 +761,6 @@ with gr.Blocks(title="PDF to Audio", css="""
|
|
| 741 |
|
| 742 |
with gr.Row(elem_id="main_container"):
|
| 743 |
with gr.Column(scale=2):
|
| 744 |
-
#files = gr.Files(label="PDFs", file_types=["pdf"], )
|
| 745 |
files = gr.Files(label="PDFs (.pdf), markdown (.md, .mmd), or text files (.txt)", file_types=[".pdf", ".PDF", ".md", ".mmd", ".txt"], )
|
| 746 |
|
| 747 |
openai_api_key = gr.Textbox(
|
|
@@ -753,7 +772,7 @@ with gr.Blocks(title="PDF to Audio", css="""
|
|
| 753 |
text_model = gr.Dropdown(
|
| 754 |
label="Text Generation Model",
|
| 755 |
choices=STANDARD_TEXT_MODELS,
|
| 756 |
-
value="o1-preview-2024-09-12", #"gpt-4o-mini",
|
| 757 |
info="Select the model to generate the dialogue text.",
|
| 758 |
)
|
| 759 |
audio_model = gr.Dropdown(
|
|
@@ -774,6 +793,20 @@ with gr.Blocks(title="PDF to Audio", css="""
|
|
| 774 |
value="echo",
|
| 775 |
info="Select the voice for Speaker 2.",
|
| 776 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 777 |
api_base = gr.Textbox(
|
| 778 |
label="Custom API Base",
|
| 779 |
placeholder="Enter custom API base URL if using a custom/local model...",
|
|
@@ -852,7 +885,8 @@ with gr.Blocks(title="PDF to Audio", css="""
|
|
| 852 |
fn=validate_and_generate_audio,
|
| 853 |
inputs=[
|
| 854 |
files, openai_api_key, text_model, audio_model,
|
| 855 |
-
speaker_1_voice, speaker_2_voice,
|
|
|
|
| 856 |
intro_instructions, text_instructions, scratch_pad_instructions,
|
| 857 |
prelude_dialog, podcast_dialog_instructions,
|
| 858 |
edited_transcript, # placeholder for edited_transcript
|
|
@@ -881,7 +915,8 @@ with gr.Blocks(title="PDF to Audio", css="""
|
|
| 881 |
inputs=[
|
| 882 |
use_edited_transcript, edited_transcript,
|
| 883 |
files, openai_api_key, text_model, audio_model,
|
| 884 |
-
speaker_1_voice, speaker_2_voice,
|
|
|
|
| 885 |
intro_instructions, text_instructions, scratch_pad_instructions,
|
| 886 |
prelude_dialog, podcast_dialog_instructions,
|
| 887 |
user_feedback, original_text_output
|
|
@@ -908,7 +943,7 @@ with gr.Blocks(title="PDF to Audio", css="""
|
|
| 908 |
#demo.queue(max_size=20, default_concurrency_limit=32)
|
| 909 |
|
| 910 |
# Launch the Gradio app
|
| 911 |
-
if __name__ == "__main__":
|
| 912 |
-
demo.launch(share=True)
|
| 913 |
|
| 914 |
-
|
|
|
|
| 496 |
class Dialogue(BaseModel):
|
| 497 |
scratchpad: str
|
| 498 |
dialogue: List[DialogueItem]
|
| 499 |
+
'''
|
| 500 |
def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None) -> bytes:
|
| 501 |
client = OpenAI(
|
| 502 |
api_key=api_key or os.getenv("OPENAI_API_KEY"),
|
|
|
|
| 511 |
for chunk in response.iter_bytes():
|
| 512 |
file.write(chunk)
|
| 513 |
return file.getvalue()
|
| 514 |
+
'''
|
| 515 |
+
def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None,
            speaker_instructions: str = 'Speak in an emotive and friendly tone.') -> bytes:
    """Synthesize ``text`` through the OpenAI speech endpoint and return the audio bytes.

    Args:
        text: The text to be spoken.
        voice: Voice name forwarded to the API as ``voice``.
        audio_model: Speech model name forwarded to the API as ``model``.
        api_key: OpenAI API key. When falsy, the ``OPENAI_API_KEY``
            environment variable is used instead.
        speaker_instructions: Free-form delivery guidance forwarded to the
            API as ``instructions``. Omitted from the request when empty.

    Returns:
        The full streamed response body, concatenated into one ``bytes``
        object.
    """
    client = OpenAI(
        api_key=api_key or os.getenv("OPENAI_API_KEY"),
    )

    request = {
        "model": audio_model,
        "voice": voice,
        "input": text,
    }
    # Callers may pass an empty string; an empty instruction carries no
    # information, so leave the field out of the request entirely.
    # NOTE(review): the UI help text says instructions are used with
    # gpt-4o-mini-tts only -- confirm other audio models tolerate the field.
    if speaker_instructions:
        request["instructions"] = speaker_instructions

    with client.audio.speech.with_streaming_response.create(**request) as response:
        # Drain the stream while the response context is still open.
        return b"".join(response.iter_bytes())
|
| 532 |
+
|
| 533 |
|
| 534 |
|
| 535 |
from functools import wraps
|
|
|
|
| 550 |
def generate_audio(
|
| 551 |
files: list,
|
| 552 |
openai_api_key: str = None,
|
| 553 |
+
text_model: str = "o4-mini", #o1-2024-12-17", #"o1-preview-2024-09-12",
|
| 554 |
audio_model: str = "tts-1",
|
| 555 |
speaker_1_voice: str = "alloy",
|
| 556 |
speaker_2_voice: str = "echo",
|
| 557 |
+
speaker_1_instructions: str = '',
|
| 558 |
+
speaker_2_instructions: str = '',
|
| 559 |
api_base: str = None,
|
| 560 |
intro_instructions: str = '',
|
| 561 |
text_instructions: str = '',
|
|
|
|
| 599 |
with file_path.open("r", encoding="utf-8") as f:
|
| 600 |
text = f.read()
|
| 601 |
combined_text += text + "\n\n"
|
|
|
|
|
|
|
| 602 |
# Configure the LLM based on selected model and api_base
|
| 603 |
@retry(retry=retry_if_exception_type(ValidationError))
|
| 604 |
@conditional_llm(model=text_model, api_base=api_base, api_key=openai_api_key)
|
|
|
|
| 661 |
for line in llm_output.dialogue:
|
| 662 |
transcript_line = f"{line.speaker}: {line.text}"
|
| 663 |
voice = speaker_1_voice if line.speaker == "speaker-1" else speaker_2_voice
|
| 664 |
+
speaker_instructions=speaker_1_instructions if line.speaker == "speaker-1" else speaker_2_instructions
|
| 665 |
+
future = executor.submit(get_mp3, line.text, voice, audio_model, openai_api_key, speaker_instructions, )
|
| 666 |
futures.append((future, transcript_line))
|
| 667 |
characters += len(line.text)
|
| 668 |
|
|
|
|
| 695 |
def validate_and_generate_audio(*args):
|
| 696 |
files = args[0]
|
| 697 |
if not files:
|
| 698 |
+
return None, None, None, "Please upload at least one PDF (or MD/MMD/TXT) file before generating audio."
|
| 699 |
try:
|
| 700 |
audio_file, transcript, original_text = generate_audio(*args)
|
| 701 |
return audio_file, transcript, original_text, None # Return None as the error when successful
|
|
|
|
| 761 |
|
| 762 |
with gr.Row(elem_id="main_container"):
|
| 763 |
with gr.Column(scale=2):
|
|
|
|
| 764 |
files = gr.Files(label="PDFs (.pdf), markdown (.md, .mmd), or text files (.txt)", file_types=[".pdf", ".PDF", ".md", ".mmd", ".txt"], )
|
| 765 |
|
| 766 |
openai_api_key = gr.Textbox(
|
|
|
|
| 772 |
text_model = gr.Dropdown(
|
| 773 |
label="Text Generation Model",
|
| 774 |
choices=STANDARD_TEXT_MODELS,
|
| 775 |
+
value="o3-mini", "o4-mini", #"o1-preview-2024-09-12", #"gpt-4o-mini",
|
| 776 |
info="Select the model to generate the dialogue text.",
|
| 777 |
)
|
| 778 |
audio_model = gr.Dropdown(
|
|
|
|
| 793 |
value="echo",
|
| 794 |
info="Select the voice for Speaker 2.",
|
| 795 |
)
|
| 796 |
+
speaker_1_instructions = gr.Textbox(
|
| 797 |
+
label="Speaker 1 instructions",
|
| 798 |
+
value="Speak in an emotive and friendly tone.",
|
| 799 |
+
info="Speaker 1 instructions (used with gpt-4o-mini-tts only)",
|
| 800 |
+
interactive=True,
|
| 801 |
+
)
|
| 802 |
+
|
| 803 |
+
speaker_2_instructions = gr.Textbox(
|
| 804 |
+
label="Speaker 2 instructions",
|
| 805 |
+
value="Speak in a friendly, but serious tone.",
|
| 806 |
+
info="Speaker 2 instructions (used with gpt-4o-mini-tts only)",
|
| 807 |
+
interactive=True,
|
| 808 |
+
)
|
| 809 |
+
|
| 810 |
api_base = gr.Textbox(
|
| 811 |
label="Custom API Base",
|
| 812 |
placeholder="Enter custom API base URL if using a custom/local model...",
|
|
|
|
| 885 |
fn=validate_and_generate_audio,
|
| 886 |
inputs=[
|
| 887 |
files, openai_api_key, text_model, audio_model,
|
| 888 |
+
speaker_1_voice, speaker_2_voice, speaker_1_instructions, speaker_2_instructions,
|
| 889 |
+
api_base,
|
| 890 |
intro_instructions, text_instructions, scratch_pad_instructions,
|
| 891 |
prelude_dialog, podcast_dialog_instructions,
|
| 892 |
edited_transcript, # placeholder for edited_transcript
|
|
|
|
| 915 |
inputs=[
|
| 916 |
use_edited_transcript, edited_transcript,
|
| 917 |
files, openai_api_key, text_model, audio_model,
|
| 918 |
+
speaker_1_voice, speaker_2_voice, speaker_1_instructions, speaker_2_instructions,
|
| 919 |
+
api_base,
|
| 920 |
intro_instructions, text_instructions, scratch_pad_instructions,
|
| 921 |
prelude_dialog, podcast_dialog_instructions,
|
| 922 |
user_feedback, original_text_output
|
|
|
|
| 943 |
#demo.queue(max_size=20, default_concurrency_limit=32)
|
| 944 |
|
| 945 |
# Launch the Gradio app
|
| 946 |
+
#if __name__ == "__main__":
|
| 947 |
+
# demo.launch(share=True)
|
| 948 |
|
| 949 |
+
demo.launch()
|