jnjj commited on
Commit
5baf180
verified
1 Parent(s): e700713

Update gradio_ui.py

Browse files
Files changed (1) hide show
  1. gradio_ui.py +72 -57
gradio_ui.py CHANGED
@@ -1,22 +1,27 @@
1
  import gradio as gr
2
  import os
 
3
  from whisper_tts import WhisperTTS
4
  from ollama_chatbotTTS import OllamaChat
5
  from text_to_speech import TextToSpeech
6
  from sync_audio_video import AudioVideoSync
7
- import re
8
 
 
 
9
  os.system("ollama serve &")
10
 
11
- # Paths
12
  THUMBNAILS_DIR = "thumbnails"
13
  VIDEO_DIR = "sample_video"
14
 
15
  def get_thumbnail_images():
16
  if not os.path.exists(THUMBNAILS_DIR):
17
  return []
18
- return [(os.path.splitext(f)[0], os.path.join(THUMBNAILS_DIR, f))
19
- for f in os.listdir(THUMBNAILS_DIR) if f.endswith((".png", ".jpg", ".jpeg"))]
 
 
 
20
 
21
  thumbnail_images = get_thumbnail_images()
22
  avatar_names = [name for name, _ in thumbnail_images]
@@ -26,8 +31,8 @@ def find_matching_video(file_name):
26
  if not os.path.exists(VIDEO_DIR):
27
  return None
28
  for video in os.listdir(VIDEO_DIR):
29
- video_name, ext = os.path.splitext(video)
30
- if video_name.lower() == file_name and ext in [".mp4", ".avi", ".mov"]:
31
  return os.path.join(VIDEO_DIR, video)
32
  return None
33
 
@@ -43,76 +48,86 @@ def check_enable_process_button(selected_name, audio_file, transcribed_text):
43
  return gr.update(interactive=False)
44
 
45
  def process_pipeline(audio_file, transcribed_text, selected_name):
 
46
  if audio_file:
47
- whisper_tts = WhisperTTS()
48
- transcribed_text = whisper_tts.transcribe_audio(audio_file)
49
- yield transcribed_text, "", None, None # Show transcribed text first
50
-
 
51
  if not transcribed_text.strip():
52
  yield "Warning: Please provide valid text.", "", None, None
53
  return
54
-
55
- ollama_chat = OllamaChat()
56
- chatbot_response = ollama_chat.get_response(transcribed_text)
57
- chatbot_response = re.sub(r"<think>|</think>", "", chatbot_response).strip()
58
- yield transcribed_text, chatbot_response, None, None # Show chatbot response next
59
 
60
- if not chatbot_response:
 
 
 
 
 
 
61
  yield transcribed_text, "Warning: No chatbot response.", None, None
62
  return
63
 
 
64
  tts = TextToSpeech()
65
- output_audio_path = tts.synthesize(chatbot_response)
66
- yield transcribed_text, chatbot_response, output_audio_path, None # Show generated speech
67
 
 
68
  if not selected_name:
69
- yield transcribed_text, chatbot_response, output_audio_path, "Warning: Select an avatar."
70
  return
71
 
72
- input_video = find_matching_video(selected_name.lower())
73
- if not input_video:
74
- yield transcribed_text, chatbot_response, output_audio_path, "Warning: No matching video."
75
  return
76
 
77
  sync = AudioVideoSync()
78
- output_video_path = sync.sync_audio_video(input_video, output_audio_path)
79
- yield transcribed_text, chatbot_response, output_audio_path, output_video_path # Show final video
80
-
81
- with gr.Blocks() as demo:
82
- gr.Markdown("## Personalized Avatar Video")
83
-
84
- with gr.Row():
85
- with gr.Column():
86
- audio_input = gr.Audio(type="filepath", label="Audio Input")
87
- transcribed_text_output = gr.Textbox(label="Edit and Process Text")
88
- chatbot_response_output = gr.Textbox(label="Assistant Response")
89
- gr.Markdown("### Select an Avatar")
90
- selected_avatar = gr.Radio(choices=avatar_names, label="Select an Avatar")
91
- avatar_display = gr.Image(label="Selected Avatar", width=150, height=150)
92
- process_button = gr.Button("Generate Lip-Sync Video", interactive=False)
93
-
94
- with gr.Column():
95
- tts_audio_output = gr.Audio(label="Generated Speech")
96
- video_output = gr.Video(label="Final Lip-Synced Video")
97
-
98
- selected_avatar.change(update_avatar_display, inputs=[selected_avatar], outputs=[avatar_display])
99
- selected_avatar.change(check_enable_process_button, inputs=[selected_avatar, audio_input, transcribed_text_output], outputs=[process_button])
100
- audio_input.change(check_enable_process_button, inputs=[selected_avatar, audio_input, transcribed_text_output], outputs=[process_button])
101
- transcribed_text_output.change(check_enable_process_button, inputs=[selected_avatar, audio_input, transcribed_text_output], outputs=[process_button])
102
-
103
- process_button.click(
104
- process_pipeline,
105
- inputs=[audio_input, transcribed_text_output, selected_avatar],
106
- outputs=[transcribed_text_output, chatbot_response_output, tts_audio_output, video_output]
107
- )
 
 
 
 
 
 
 
108
 
109
  if __name__ == "__main__":
 
110
  demo.launch(
111
  server_name="0.0.0.0",
112
  server_port=7860,
113
- # opcionalmente:
114
- share=True, # para obtener un link público
115
- inbrowser=True, # para abrir automáticamente el navegador
116
- # prevent_thread_lock=True # si quieres que el script no bloquee el hilo principal
117
  )
118
-
 
1
  import gradio as gr
2
  import os
3
+ import re
4
  from whisper_tts import WhisperTTS
5
  from ollama_chatbotTTS import OllamaChat
6
  from text_to_speech import TextToSpeech
7
  from sync_audio_video import AudioVideoSync
 
8
 
9
# Install and start the Ollama server at import time.
# NOTE(review): piping a remote install script straight into `sh` executes
# unreviewed code, and the background `ollama serve &` is fire-and-forget
# (no readiness check before the first chat request) — confirm this is
# acceptable for the target deployment host.
os.system("curl https://ollama.com/install.sh | sh")
os.system("ollama serve &")

# Directories scanned for avatar thumbnails and their matching sample videos.
THUMBNAILS_DIR = "thumbnails"
VIDEO_DIR = "sample_video"
16
 
17
def get_thumbnail_images():
    """Collect (name, path) pairs for every thumbnail image.

    Scans THUMBNAILS_DIR for .png/.jpg/.jpeg files (case-insensitive);
    the name is the filename without its extension. Returns an empty
    list when the directory does not exist.
    """
    if not os.path.exists(THUMBNAILS_DIR):
        return []
    images = []
    for filename in os.listdir(THUMBNAILS_DIR):
        if filename.lower().endswith((".png", ".jpg", ".jpeg")):
            stem = os.path.splitext(filename)[0]
            images.append((stem, os.path.join(THUMBNAILS_DIR, filename)))
    return images
25
 
26
# Computed once at import time: (name, path) pairs for the avatar gallery,
# and the bare names used as the Radio widget's choices.
thumbnail_images = get_thumbnail_images()
avatar_names = [name for name, _ in thumbnail_images]
 
31
  if not os.path.exists(VIDEO_DIR):
32
  return None
33
  for video in os.listdir(VIDEO_DIR):
34
+ name, ext = os.path.splitext(video)
35
+ if name.lower() == file_name and ext.lower() in (".mp4", ".avi", ".mov"):
36
  return os.path.join(VIDEO_DIR, video)
37
  return None
38
 
 
48
  return gr.update(interactive=False)
49
 
50
def process_pipeline(audio_file, transcribed_text, selected_name):
    """Run the full avatar pipeline, yielding partial results at each stage.

    Yields 4-tuples of (transcribed_text, chatbot_response, tts_audio_path,
    video_path_or_warning) so the UI can update progressively after each step.

    Args:
        audio_file: optional path to a recorded audio file; when given it is
            transcribed and overrides ``transcribed_text``.
        transcribed_text: text to send to the chatbot (user-editable).
        selected_name: avatar name chosen in the UI, matched (case-insensitively)
            against the video files in VIDEO_DIR.
    """
    # 1) If audio was provided, transcribe it first and show the text.
    if audio_file:
        whisper = WhisperTTS()
        transcribed_text = whisper.transcribe_audio(audio_file)
        yield transcribed_text, "", None, None

    # 2) Validate the text. Guard against None as well as blank strings:
    #    with neither audio nor text, transcribed_text may not be a str.
    if not transcribed_text or not transcribed_text.strip():
        yield "Warning: Please provide valid text.", "", None, None
        return

    # 3) Chatbot response; strip <think>...</think> markers emitted by the model.
    ollama = OllamaChat()
    resp = ollama.get_response(transcribed_text)
    resp = re.sub(r"<think>|</think>", "", resp).strip()
    yield transcribed_text, resp, None, None

    if not resp:
        yield transcribed_text, "Warning: No chatbot response.", None, None
        return

    # 4) Text-to-speech synthesis of the chatbot response.
    tts = TextToSpeech()
    audio_out = tts.synthesize(resp)
    yield transcribed_text, resp, audio_out, None

    # 5) Lip-synced video.
    if not selected_name:
        yield transcribed_text, resp, audio_out, "Warning: Select an avatar."
        return

    # BUG FIX: find_matching_video compares lower-cased video names against
    # its argument, so the avatar name must be lower-cased here (the previous
    # revision did this; it was lost in the rewrite, making any avatar name
    # containing an uppercase letter fail to match its video).
    vid_in = find_matching_video(selected_name.lower())
    if not vid_in:
        yield transcribed_text, resp, audio_out, "Warning: No matching video."
        return

    sync = AudioVideoSync()
    vid_out = sync.sync_audio_video(vid_in, audio_out)
    yield transcribed_text, resp, audio_out, vid_out
90
+
91
def build_demo() -> gr.Blocks:
    """Assemble and return the Gradio Blocks UI for the avatar pipeline."""
    with gr.Blocks() as demo:
        gr.Markdown("## Personalized Avatar Video")

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(type="filepath", label="Audio Input")
                text_box = gr.Textbox(label="Edit and Process Text")
                response_box = gr.Textbox(label="Assistant Response")
                gr.Markdown("### Select an Avatar")
                avatar_radio = gr.Radio(choices=avatar_names, label="Select an Avatar")
                avatar_image = gr.Image(label="Selected Avatar", width=150, height=150)
                generate_btn = gr.Button("Generate Lip-Sync Video", interactive=False)

            with gr.Column():
                speech_audio = gr.Audio(label="Generated Speech")
                final_video = gr.Video(label="Final Lip-Synced Video")

        # Wire up the events.
        avatar_radio.change(update_avatar_display, inputs=[avatar_radio], outputs=[avatar_image])

        # The process button unlocks only when an avatar plus text or audio
        # are present; re-check on every relevant input change.
        gate_inputs = [avatar_radio, audio_input, text_box]
        avatar_radio.change(check_enable_process_button, inputs=gate_inputs, outputs=[generate_btn])
        audio_input.change(check_enable_process_button, inputs=gate_inputs, outputs=[generate_btn])
        text_box.change(check_enable_process_button, inputs=gate_inputs, outputs=[generate_btn])

        generate_btn.click(
            process_pipeline,
            inputs=[audio_input, text_box, avatar_radio],
            outputs=[text_box, response_box, speech_audio, final_video],
        )

    # Enable request queueing so the pipeline's streamed yields reach the client.
    return demo.queue(max_size=100000)
125
 
126
if __name__ == "__main__":
    demo = build_demo()
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (needed inside containers)
        server_port=7860,
        share=True,       # also expose a public gradio.live link
        inbrowser=True,   # open the local browser automatically
    )