Spaces:

EyeSeeUIUC
/

EyeSee_chi

Sleeping

App Files Files Community

Niki Zhang committed on May 27, 2024

Commit

cf1091a

verified ·

1 Parent(s): 9434e0e

Update app.py

Browse files

Combine with TTS module

Files changed (1) hide show

app.py +86 -21

app.py CHANGED Viewed

@@ -18,6 +18,16 @@ from caption_anything.segmenter import build_segmenter
 from caption_anything.utils.chatbot import ConversationBot, build_chatbot_tools, get_new_image_name
 from segment_anything import sam_model_registry
 import easyocr
 args = parse_augment()
 args.segmenter = "huge"
@@ -102,12 +112,12 @@ def init_openai_api_key(api_key=""):
     print(text_refiner)
     openai_available = text_refiner is not None
     if openai_available:
-        return [gr.update(visible=True)]*6 + [gr.update(visible=False)]*2 + [text_refiner, visual_chatgpt, None]
     else:
-        return [gr.update(visible=False)]*6 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']
 def init_wo_openai_api_key():
-        return  [gr.update(visible=False)]*4 + [gr.update(visible=True)]*2 + [gr.update(visible=False)]*2 + [None, None, None]
 def get_click_prompt(chat_input, click_state, click_mode):
     inputs = json.loads(chat_input)
@@ -256,7 +266,8 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
 def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
-                   out_state, click_index_state, input_mask_state, input_points_state, input_labels_state):
     print("state",state)
     click_index = click_index_state
@@ -291,13 +302,23 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
             print("new_cap",new_cap)
             refined_image_input = create_bubble_frame(np.array(origin_image_input), new_cap, click_index, input_mask,
                                                       input_points=input_points, input_labels=input_labels)
-            txt2speech(new_cap)
-            yield state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state
-        else:
-            txt2speech(generated_caption)
-            yield state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state
@@ -531,6 +552,7 @@ def create_ui():
                             interactive=True,
                             label="Generated Caption Length",
                         )
                         enable_wiki = gr.Radio(
                             choices=["Yes", "No"],
                             value="No",
@@ -541,6 +563,7 @@ def create_ui():
                     examples=examples,
                     inputs=[example_image],
                 )
             with gr.Column(scale=0.5):
                 with gr.Column(visible=True) as module_key_input:
                     openai_api_key = gr.Textbox(
@@ -567,18 +590,52 @@ def create_ui():
                         with gr.Row():
                             clear_button_text = gr.Button(value="Clear Text", interactive=True)
                             submit_button_text = gr.Button(value="Submit", interactive=True, variant="primary")
         openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
                               outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
-                                       modules_not_need_gpt2, module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box])
         enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
                                     outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
                                              modules_not_need_gpt,
-                                             modules_not_need_gpt2, module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box])
         disable_chatGPT_button.click(init_wo_openai_api_key,
                                      outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
                                               modules_not_need_gpt,
-                                              modules_not_need_gpt2, module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box])
         enable_chatGPT_button.click(
             lambda: (None, [], [], [[], [], []], "", "", ""),
@@ -663,13 +720,19 @@ def create_ui():
         submit_button_click.click(
-        submit_caption,
-        inputs=[image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
-                out_state, click_index_state, input_mask_state, input_points_state, input_labels_state],
-        outputs=[chatbot, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state],
-        show_progress=True, queue=True
-    )
@@ -683,6 +746,9 @@ def create_ui():
             show_progress=False, queue=True
         )
         return iface
@@ -690,4 +756,3 @@ if __name__ == '__main__':
     iface = create_ui()
     iface.queue(concurrency_count=5, api_open=False, max_size=10)
     iface.launch(server_name="0.0.0.0", enable_queue=True)

 from caption_anything.utils.chatbot import ConversationBot, build_chatbot_tools, get_new_image_name
 from segment_anything import sam_model_registry
 import easyocr
+import tts
+article = """
+<div style='margin:20px auto;'>
+<p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
+</div>
+"""
 args = parse_augment()
 args.segmenter = "huge"
     print(text_refiner)
     openai_available = text_refiner is not None
     if openai_available:
+        return [gr.update(visible=True)]*7 + [gr.update(visible=False)]*2 + [text_refiner, visual_chatgpt, None]
     else:
+        return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']
 def init_wo_openai_api_key():
+        return  [gr.update(visible=False)]*4 + [gr.update(visible=True)]*3 + [gr.update(visible=False)]*2 + [None, None, None]
 def get_click_prompt(chat_input, click_state, click_mode):
     inputs = json.loads(chat_input)
 def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
+                   out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
+                   input_text, input_language, input_audio, input_mic, use_mic, agree):
     print("state",state)
     click_index = click_index_state
             print("new_cap",new_cap)
             refined_image_input = create_bubble_frame(np.array(origin_image_input), new_cap, click_index, input_mask,
                                                       input_points=input_points, input_labels=input_labels)
+            try:
+                waveform_visual, audio_output = tts.predict(new_cap, input_language, input_audio, input_mic, use_mic, agree)
+                print("error tts")
+                yield state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+            except Exception as e:
+                state = state + [(None, f"Error during TTS prediction: {str(e)}")]
+                print(f"Error during TTS prediction: {str(e)}")
+                yield state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
+        else:
+            try:
+                waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
+                yield state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+            except Exception as e:
+                state = state + [(None, f"Error during TTS prediction: {str(e)}")]
+                print(f"Error during TTS prediction: {str(e)}")
+                yield state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
                             interactive=True,
                             label="Generated Caption Length",
                         )
+                        # Whether to enable integrating wiki content into the caption
                         enable_wiki = gr.Radio(
                             choices=["Yes", "No"],
                             value="No",
                     examples=examples,
                     inputs=[example_image],
                 )
             with gr.Column(scale=0.5):
                 with gr.Column(visible=True) as module_key_input:
                     openai_api_key = gr.Textbox(
                         with gr.Row():
                             clear_button_text = gr.Button(value="Clear Text", interactive=True)
                             submit_button_text = gr.Button(value="Submit", interactive=True, variant="primary")
+                # TTS interface hidden initially
+                with gr.Column(visible=False) as tts_interface:
+                    input_text = gr.Textbox(label="Text Prompt", value="Hello, World !, here is an example of light voice cloning. Try to upload your best audio samples quality")
+                    input_language = gr.Dropdown(label="Language", choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"], value="en")
+                    input_audio = gr.Audio(label="Reference Audio", type="filepath", value="examples/female.wav")
+                    input_mic = gr.Audio(source="microphone", type="filepath", label="Use Microphone for Reference")
+                    use_mic = gr.Checkbox(label="Check to use Microphone as Reference", value=False)
+                    agree = gr.Checkbox(label="Agree", value=True)
+                    output_waveform = gr.Video(label="Waveform Visual")
+                    output_audio = gr.Audio(label="Synthesised Audio")
+                    with gr.Row():
+                        submit_tts = gr.Button(value="Submit", interactive=True)
+                        clear_tts = gr.Button(value="Clear", interactive=True)
+        def clear_tts_fields():
+            return [gr.update(value=""), gr.update(value=""), None, None, gr.update(value=False), gr.update(value=True), None, None]
+        submit_tts.click(
+            tts.predict,
+            inputs=[input_text, input_language, input_audio, input_mic, use_mic, agree],
+            outputs=[output_waveform, output_audio],
+            queue=True
+        )
+        clear_tts.click(
+            clear_tts_fields,
+            inputs=None,
+            outputs=[input_text, input_language, input_audio, input_mic, use_mic, agree, output_waveform, output_audio],
+            queue=False
+        )
         openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
                               outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
+                                       modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box])
         enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
                                     outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
                                              modules_not_need_gpt,
+                                             modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box])
         disable_chatGPT_button.click(init_wo_openai_api_key,
                                      outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
                                               modules_not_need_gpt,
+                                              modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box])
         enable_chatGPT_button.click(
             lambda: (None, [], [], [[], [], []], "", "", ""),
         submit_button_click.click(
+            submit_caption,
+            inputs=[
+        image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
+        out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
+        input_text, input_language, input_audio, input_mic, use_mic, agree
+    ],
+            outputs=[
+                chatbot, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,
+                output_waveform, output_audio
+            ],
+            show_progress=True,
+            queue=True
+        )
             show_progress=False, queue=True
         )
         return iface
     iface = create_ui()
     iface.queue(concurrency_count=5, api_open=False, max_size=10)
     iface.launch(server_name="0.0.0.0", enable_queue=True)