Spaces:

sagar007
/

Multimodal_App

Build error

App Files Files Community

sagar007 committed on Aug 26, 2024

Commit

1b825cc

verified ·

1 Parent(s): 78e7cbb

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -17

app.py CHANGED Viewed

@@ -10,6 +10,11 @@ import spaces
 from parler_tts import ParlerTTSForConditionalGeneration
 import soundfile as sf
 import tempfile
 # Install flash-attention
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
@@ -63,9 +68,20 @@ vision_processor = AutoProcessor.from_pretrained(VISION_MODEL_ID, trust_remote_c
 tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1").to(device)
 tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
 # Helper functions
-@spaces.GPU(timeout=300)  # Increase timeout to 5 minutes
-def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_tokens=1024, top_p=1.0, top_k=20):
     try:
         conversation = [{"role": "system", "content": system_prompt}]
         for prompt, answer in history:
@@ -91,27 +107,31 @@ def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_t
             streamer=streamer,
         )
-        with torch.no_grad():
-            thread = Thread(target=text_model.generate, kwargs=generate_kwargs)
-            thread.start()
         buffer = ""
         audio_buffer = np.array([])
         for new_text in streamer:
             buffer += new_text
-            # Generate speech for the new text
-            tts_input_ids = tts_tokenizer(new_text, return_tensors="pt").input_ids.to(device)
-            tts_description = "A clear and natural voice reads the text with moderate speed and expression."
-            tts_description_ids = tts_tokenizer(tts_description, return_tensors="pt").input_ids.to(device)
-            with torch.no_grad():
-                audio_generation = tts_model.generate(input_ids=tts_description_ids, prompt_input_ids=tts_input_ids)
-            new_audio = audio_generation.cpu().numpy().squeeze()
-            audio_buffer = np.concatenate((audio_buffer, new_audio))
             yield history + [[message, buffer]], (tts_model.config.sampling_rate, audio_buffer)
     except Exception as e:
         print(f"An error occurred: {str(e)}")
         yield history + [[message, f"An error occurred: {str(e)}"]], None
@@ -190,7 +210,7 @@ custom_suggestions = """
 </div>
 """
-# Gradio interface
 with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     body_background_fill="#0b0f19",
     body_text_color="#e2e8f0",
@@ -213,11 +233,12 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
             max_new_tokens = gr.Slider(minimum=128, maximum=8192, step=1, value=1024, label="Max new tokens")
             top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=1.0, label="top_p")
             top_k = gr.Slider(minimum=1, maximum=20, step=1, value=20, label="top_k")
         submit_btn = gr.Button("Submit", variant="primary")
         clear_btn = gr.Button("Clear Chat", variant="secondary")
-        submit_btn.click(stream_text_chat, [msg, chatbot, system_prompt, temperature, max_new_tokens, top_p, top_k], [chatbot, audio_output])
         clear_btn.click(lambda: None, None, chatbot, queue=False)
     with gr.Tab("Vision Model (Phi-3.5-vision)"):

 from parler_tts import ParlerTTSForConditionalGeneration
 import soundfile as sf
 import tempfile
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+# Global thread pool (2 workers) used to run TTS generation in the background
+executor = ThreadPoolExecutor(max_workers=2)
 # Install flash-attention
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1").to(device)
 tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
+# Generate speech audio for `text`: tokenizes the text and a fixed voice description, runs tts_model.generate, and returns the result as a squeezed NumPy array on the CPU.
+async def generate_speech(text, tts_model, tts_tokenizer):  # NOTE(review): declared async but contains no await; stream_text_chat passes it to run_in_executor, which would yield an un-awaited coroutine object instead of audio -- confirm whether this should be a plain def
+    tts_input_ids = tts_tokenizer(text, return_tensors="pt").input_ids.to(device)  # token ids of the text to be spoken
+    tts_description = "A clear and natural voice reads the text with moderate speed and expression."
+    tts_description_ids = tts_tokenizer(tts_description, return_tensors="pt").input_ids.to(device)  # token ids of the voice description
+    with torch.no_grad():  # inference only; no gradients needed
+        audio_generation = tts_model.generate(input_ids=tts_description_ids, prompt_input_ids=tts_input_ids)  # description is passed as input_ids, the spoken text as prompt_input_ids
+    return audio_generation.cpu().numpy().squeeze()
 # Helper functions
+@spaces.GPU(timeout=300)
+async def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_tokens=1024, top_p=1.0, top_k=20, use_tts=True):
     try:
         conversation = [{"role": "system", "content": system_prompt}]
         for prompt, answer in history:
             streamer=streamer,
         )
+        thread = Thread(target=text_model.generate, kwargs=generate_kwargs)
+        thread.start()
         buffer = ""
         audio_buffer = np.array([])
+        tts_future = None
         for new_text in streamer:
             buffer += new_text
+            if use_tts and len(buffer) > 50:  # Start TTS generation when buffer has enough content
+                if tts_future is None:
+                    tts_future = asyncio.get_event_loop().run_in_executor(
+                        executor, generate_speech, buffer, tts_model, tts_tokenizer
+                    )
             yield history + [[message, buffer]], (tts_model.config.sampling_rate, audio_buffer)
+        # Wait for TTS to complete if it's still running
+        if use_tts and tts_future is not None:
+            audio_buffer = await tts_future
+        # Final yield with complete text and audio
+        yield history + [[message, buffer]], (tts_model.config.sampling_rate, audio_buffer)
     except Exception as e:
         print(f"An error occurred: {str(e)}")
         yield history + [[message, f"An error occurred: {str(e)}"]], None
 </div>
 """
+# Gradio interface
 with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     body_background_fill="#0b0f19",
     body_text_color="#e2e8f0",
             max_new_tokens = gr.Slider(minimum=128, maximum=8192, step=1, value=1024, label="Max new tokens")
             top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=1.0, label="top_p")
             top_k = gr.Slider(minimum=1, maximum=20, step=1, value=20, label="top_k")
+            use_tts = gr.Checkbox(label="Enable Text-to-Speech", value=True)
         submit_btn = gr.Button("Submit", variant="primary")
         clear_btn = gr.Button("Clear Chat", variant="secondary")
+        submit_btn.click(stream_text_chat, [msg, chatbot, system_prompt, temperature, max_new_tokens, top_p, top_k, use_tts], [chatbot, audio_output])
         clear_btn.click(lambda: None, None, chatbot, queue=False)
     with gr.Tab("Vision Model (Phi-3.5-vision)"):