Spaces:

lamm-mit
/

PDF2Audio

Running

App Files Files Community

mjbuehler committed on Apr 17

Commit

9aef995

verified ·

1 Parent(s): 773f681

Update app.py

Browse files

Updates for o4, tts via gpt-4o

Files changed (1) hide show

app.py +48 -13

app.py CHANGED Viewed

@@ -496,7 +496,7 @@ class DialogueItem(BaseModel):
 class Dialogue(BaseModel):
     scratchpad: str
     dialogue: List[DialogueItem]
 def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None) -> bytes:
     client = OpenAI(
         api_key=api_key or os.getenv("OPENAI_API_KEY"),
@@ -511,6 +511,25 @@ def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None) -> byt
             for chunk in response.iter_bytes():
                 file.write(chunk)
             return file.getvalue()
 from functools import wraps
@@ -531,10 +550,12 @@ def conditional_llm(model, api_base=None, api_key=None):
 def generate_audio(
     files: list,
     openai_api_key: str = None,
-    text_model: str = "o1-2024-12-17", #"o1-preview-2024-09-12",
     audio_model: str = "tts-1",
     speaker_1_voice: str = "alloy",
     speaker_2_voice: str = "echo",
     api_base: str = None,
     intro_instructions: str = '',
     text_instructions: str = '',
@@ -578,8 +599,6 @@ def generate_audio(
                 with file_path.open("r", encoding="utf-8") as f:
                     text = f.read()
                     combined_text += text + "\n\n"
     # Configure the LLM based on selected model and api_base
     @retry(retry=retry_if_exception_type(ValidationError))
     @conditional_llm(model=text_model, api_base=api_base, api_key=openai_api_key)
@@ -642,7 +661,8 @@ def generate_audio(
         for line in llm_output.dialogue:
             transcript_line = f"{line.speaker}: {line.text}"
             voice = speaker_1_voice if line.speaker == "speaker-1" else speaker_2_voice
-            future = executor.submit(get_mp3, line.text, voice, audio_model, openai_api_key)
             futures.append((future, transcript_line))
             characters += len(line.text)
@@ -675,7 +695,7 @@ def generate_audio(
 def validate_and_generate_audio(*args):
     files = args[0]
     if not files:
-        return None, None, None, "Please upload at least one PDF file before generating audio."
     try:
         audio_file, transcript, original_text = generate_audio(*args)
         return audio_file, transcript, original_text, None  # Return None as the error when successful
@@ -741,7 +761,6 @@ with gr.Blocks(title="PDF to Audio", css="""
     with gr.Row(elem_id="main_container"):
         with gr.Column(scale=2):
-            #files = gr.Files(label="PDFs", file_types=["pdf"], )
             files = gr.Files(label="PDFs (.pdf), markdown (.md, .mmd), or text files (.txt)", file_types=[".pdf", ".PDF", ".md", ".mmd", ".txt"], )
             openai_api_key = gr.Textbox(
@@ -753,7 +772,7 @@ with gr.Blocks(title="PDF to Audio", css="""
             text_model = gr.Dropdown(
                 label="Text Generation Model",
                 choices=STANDARD_TEXT_MODELS,
-                value="o1-preview-2024-09-12", #"gpt-4o-mini",
                 info="Select the model to generate the dialogue text.",
             )
             audio_model = gr.Dropdown(
@@ -774,6 +793,20 @@ with gr.Blocks(title="PDF to Audio", css="""
                 value="echo",
                 info="Select the voice for Speaker 2.",
             )
             api_base = gr.Textbox(
                 label="Custom API Base",
                 placeholder="Enter custom API base URL if using a custom/local model...",
@@ -852,7 +885,8 @@ with gr.Blocks(title="PDF to Audio", css="""
         fn=validate_and_generate_audio,
         inputs=[
             files, openai_api_key, text_model, audio_model,
-            speaker_1_voice, speaker_2_voice, api_base,
             intro_instructions, text_instructions, scratch_pad_instructions,
             prelude_dialog, podcast_dialog_instructions,
             edited_transcript,  # placeholder for edited_transcript
@@ -881,7 +915,8 @@ with gr.Blocks(title="PDF to Audio", css="""
         inputs=[
             use_edited_transcript, edited_transcript,
             files, openai_api_key, text_model, audio_model,
-            speaker_1_voice, speaker_2_voice, api_base,
             intro_instructions, text_instructions, scratch_pad_instructions,
             prelude_dialog, podcast_dialog_instructions,
             user_feedback, original_text_output
@@ -908,7 +943,7 @@ with gr.Blocks(title="PDF to Audio", css="""
 #demo.queue(max_size=20, default_concurrency_limit=32)
 # Launch the Gradio app
-if __name__ == "__main__":
-    demo.launch(share=True)
-#demo.launch(server_name="127.0.0.1", server_port=7860)

 # Container pairing a free-form scratchpad string with a list of dialogue items.
 # NOTE(review): presumably the structured LLM output that generate_audio iterates
 # via `llm_output.dialogue` (reading `.speaker` and `.text`) — confirm against the full file.
 class Dialogue(BaseModel):
     # Free-form text field; its intended content is not shown in this view.
     scratchpad: str
     # Dialogue entries, one DialogueItem each.
     dialogue: List[DialogueItem]
+'''
 def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None) -> bytes:
     client = OpenAI(
         api_key=api_key or os.getenv("OPENAI_API_KEY"),
             for chunk in response.iter_bytes():
                 file.write(chunk)
             return file.getvalue()
+'''
+def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None,
+           speaker_instructions: str ='Speak in an emotive and friendly tone.') -> bytes:
+    """Synthesize `text` through the OpenAI speech endpoint and return the audio bytes.
+
+    Args:
+        text: The text to speak (sent as `input`).
+        voice: Voice name (sent as `voice`).
+        audio_model: Speech model name (sent as `model`).
+        api_key: API key; when falsy, the OPENAI_API_KEY environment variable is used.
+        speaker_instructions: Tone/style guidance (sent as `instructions`).
+            Left out of the request entirely when empty.
+
+    Returns:
+        The full streamed response body as bytes. No response format is
+        requested here, so the endpoint's default format applies.
+    """
+    client = OpenAI(
+        api_key=api_key or os.getenv("OPENAI_API_KEY"),
+    )
+    request_kwargs = {"model": audio_model, "voice": voice, "input": text}
+    # Callers may pass '' (generate_audio defaults to it), and the UI describes
+    # instructions as used with gpt-4o-mini-tts only, so do not send an empty
+    # `instructions` field to models that have no use for it.
+    if speaker_instructions:
+        request_kwargs["instructions"] = speaker_instructions
+    with client.audio.speech.with_streaming_response.create(**request_kwargs) as response:
+        # Concatenate the streamed chunks in one pass.
+        return b"".join(response.iter_bytes())
 from functools import wraps
 def generate_audio(
     files: list,
     openai_api_key: str = None,
+    text_model: str = "o4-mini", #o1-2024-12-17", #"o1-preview-2024-09-12",
     audio_model: str = "tts-1",
     speaker_1_voice: str = "alloy",
     speaker_2_voice: str = "echo",
+    speaker_1_instructions: str = '',
+    speaker_2_instructions: str = '',
     api_base: str = None,
     intro_instructions: str = '',
     text_instructions: str = '',
                 with file_path.open("r", encoding="utf-8") as f:
                     text = f.read()
                     combined_text += text + "\n\n"
     # Configure the LLM based on selected model and api_base
     @retry(retry=retry_if_exception_type(ValidationError))
     @conditional_llm(model=text_model, api_base=api_base, api_key=openai_api_key)
         for line in llm_output.dialogue:
             transcript_line = f"{line.speaker}: {line.text}"
             voice = speaker_1_voice if line.speaker == "speaker-1" else speaker_2_voice
+            speaker_instructions=speaker_1_instructions if line.speaker == "speaker-1" else speaker_2_instructions
+            future = executor.submit(get_mp3, line.text, voice, audio_model, openai_api_key, speaker_instructions, )
             futures.append((future, transcript_line))
             characters += len(line.text)
 def validate_and_generate_audio(*args):
     files = args[0]
     if not files:
+        return None, None, None, "Please upload at least one PDF (or MD/MMD/TXT) file before generating audio."
     try:
         audio_file, transcript, original_text = generate_audio(*args)
         return audio_file, transcript, original_text, None  # Return None as the error when successful
     with gr.Row(elem_id="main_container"):
         with gr.Column(scale=2):
             files = gr.Files(label="PDFs (.pdf), markdown (.md, .mmd), or text files (.txt)", file_types=[".pdf", ".PDF", ".md", ".mmd", ".txt"], )
             openai_api_key = gr.Textbox(
             text_model = gr.Dropdown(
                 label="Text Generation Model",
                 choices=STANDARD_TEXT_MODELS,
+                # Default selection; the quoted names after it are commented-out alternatives.
+                # A bare "o4-mini" after `value=` would be a positional argument following a
+                # keyword argument (SyntaxError), so it must stay behind the comment marker.
+                # NOTE(review): confirm "o3-mini" is present in STANDARD_TEXT_MODELS.
+                value="o3-mini", #"o4-mini", #"o1-preview-2024-09-12", #"gpt-4o-mini",
                 info="Select the model to generate the dialogue text.",
             )
             audio_model = gr.Dropdown(
                 value="echo",
                 info="Select the voice for Speaker 2.",
             )
+            speaker_1_instructions = gr.Textbox(
+                label="Speaker 1 instructions",
+                value="Speak in an emotive and friendly tone.",
+                info="Speaker 1 instructions (used with gpt-4o-mini-tts only)",
+                interactive=True,
+            )
+            speaker_2_instructions = gr.Textbox(
+                label="Speaker 2 instructions",
+                value="Speak in a friendly, but serious tone.",
+                info="Speaker 2 instructions (used with gpt-4o-mini-tts only)",
+                interactive=True,
+            )
             api_base = gr.Textbox(
                 label="Custom API Base",
                 placeholder="Enter custom API base URL if using a custom/local model...",
         fn=validate_and_generate_audio,
         inputs=[
             files, openai_api_key, text_model, audio_model,
+            speaker_1_voice, speaker_2_voice, speaker_1_instructions, speaker_2_instructions,
+            api_base,
             intro_instructions, text_instructions, scratch_pad_instructions,
             prelude_dialog, podcast_dialog_instructions,
             edited_transcript,  # placeholder for edited_transcript
         inputs=[
             use_edited_transcript, edited_transcript,
             files, openai_api_key, text_model, audio_model,
+            speaker_1_voice, speaker_2_voice, speaker_1_instructions, speaker_2_instructions,
+            api_base,
             intro_instructions, text_instructions, scratch_pad_instructions,
             prelude_dialog, podcast_dialog_instructions,
             user_feedback, original_text_output
 #demo.queue(max_size=20, default_concurrency_limit=32)
 # Launch the Gradio app
+#if __name__ == "__main__":
+#    demo.launch(share=True)
+demo.launch()