jnjj commited on
Commit
5baf180
verified
1 Parent(s): e700713

Update gradio_ui.py

Browse files
Files changed (1) hide show
  1. gradio_ui.py +72 -57
gradio_ui.py CHANGED
@@ -1,22 +1,27 @@
1
  import gradio as gr
2
  import os
 
3
  from whisper_tts import WhisperTTS
4
  from ollama_chatbotTTS import OllamaChat
5
  from text_to_speech import TextToSpeech
6
  from sync_audio_video import AudioVideoSync
7
- import re
8
 
 
 
9
  os.system("ollama serve &")
10
 
11
- # Paths
12
  THUMBNAILS_DIR = "thumbnails"
13
  VIDEO_DIR = "sample_video"
14
 
15
  def get_thumbnail_images():
16
  if not os.path.exists(THUMBNAILS_DIR):
17
  return []
18
- return [(os.path.splitext(f)[0], os.path.join(THUMBNAILS_DIR, f))
19
- for f in os.listdir(THUMBNAILS_DIR) if f.endswith((".png", ".jpg", ".jpeg"))]
 
 
 
20
 
21
  thumbnail_images = get_thumbnail_images()
22
  avatar_names = [name for name, _ in thumbnail_images]
@@ -26,8 +31,8 @@ def find_matching_video(file_name):
26
  if not os.path.exists(VIDEO_DIR):
27
  return None
28
  for video in os.listdir(VIDEO_DIR):
29
- video_name, ext = os.path.splitext(video)
30
- if video_name.lower() == file_name and ext in [".mp4", ".avi", ".mov"]:
31
  return os.path.join(VIDEO_DIR, video)
32
  return None
33
 
@@ -43,76 +48,86 @@ def check_enable_process_button(selected_name, audio_file, transcribed_text):
43
  return gr.update(interactive=False)
44
 
45
  def process_pipeline(audio_file, transcribed_text, selected_name):
 
46
  if audio_file:
47
- whisper_tts = WhisperTTS()
48
- transcribed_text = whisper_tts.transcribe_audio(audio_file)
49
- yield transcribed_text, "", None, None # Show transcribed text first
50
-
 
51
  if not transcribed_text.strip():
52
  yield "Warning: Please provide valid text.", "", None, None
53
  return
54
-
55
- ollama_chat = OllamaChat()
56
- chatbot_response = ollama_chat.get_response(transcribed_text)
57
- chatbot_response = re.sub(r"<think>|</think>", "", chatbot_response).strip()
58
- yield transcribed_text, chatbot_response, None, None # Show chatbot response next
59
 
60
- if not chatbot_response:
 
 
 
 
 
 
61
  yield transcribed_text, "Warning: No chatbot response.", None, None
62
  return
63
 
 
64
  tts = TextToSpeech()
65
- output_audio_path = tts.synthesize(chatbot_response)
66
- yield transcribed_text, chatbot_response, output_audio_path, None # Show generated speech
67
 
 
68
  if not selected_name:
69
- yield transcribed_text, chatbot_response, output_audio_path, "Warning: Select an avatar."
70
  return
71
 
72
- input_video = find_matching_video(selected_name.lower())
73
- if not input_video:
74
- yield transcribed_text, chatbot_response, output_audio_path, "Warning: No matching video."
75
  return
76
 
77
  sync = AudioVideoSync()
78
- output_video_path = sync.sync_audio_video(input_video, output_audio_path)
79
- yield transcribed_text, chatbot_response, output_audio_path, output_video_path # Show final video
80
-
81
- with gr.Blocks() as demo:
82
- gr.Markdown("## Personalized Avatar Video")
83
-
84
- with gr.Row():
85
- with gr.Column():
86
- audio_input = gr.Audio(type="filepath", label="Audio Input")
87
- transcribed_text_output = gr.Textbox(label="Edit and Process Text")
88
- chatbot_response_output = gr.Textbox(label="Assistant Response")
89
- gr.Markdown("### Select an Avatar")
90
- selected_avatar = gr.Radio(choices=avatar_names, label="Select an Avatar")
91
- avatar_display = gr.Image(label="Selected Avatar", width=150, height=150)
92
- process_button = gr.Button("Generate Lip-Sync Video", interactive=False)
93
-
94
- with gr.Column():
95
- tts_audio_output = gr.Audio(label="Generated Speech")
96
- video_output = gr.Video(label="Final Lip-Synced Video")
97
-
98
- selected_avatar.change(update_avatar_display, inputs=[selected_avatar], outputs=[avatar_display])
99
- selected_avatar.change(check_enable_process_button, inputs=[selected_avatar, audio_input, transcribed_text_output], outputs=[process_button])
100
- audio_input.change(check_enable_process_button, inputs=[selected_avatar, audio_input, transcribed_text_output], outputs=[process_button])
101
- transcribed_text_output.change(check_enable_process_button, inputs=[selected_avatar, audio_input, transcribed_text_output], outputs=[process_button])
102
-
103
- process_button.click(
104
- process_pipeline,
105
- inputs=[audio_input, transcribed_text_output, selected_avatar],
106
- outputs=[transcribed_text_output, chatbot_response_output, tts_audio_output, video_output]
107
- )
 
 
 
 
 
 
 
108
 
109
  if __name__ == "__main__":
 
110
  demo.launch(
111
  server_name="0.0.0.0",
112
  server_port=7860,
113
- # opcionalmente:
114
- share=True, # para obtener un link público
115
- inbrowser=True, # para abrir automáticamente el navegador
116
- # prevent_thread_lock=True # si quieres que el script no bloquee el hilo principal
117
  )
118
-
 
1
  import gradio as gr
2
  import os
3
+ import re
4
  from whisper_tts import WhisperTTS
5
  from ollama_chatbotTTS import OllamaChat
6
  from text_to_speech import TextToSpeech
7
  from sync_audio_video import AudioVideoSync
 
8
 
9
# Install and start the Ollama server at import time.
# NOTE(review): piping a remote install script straight into `sh` executes
# unreviewed code, and the background `ollama serve &` is fire-and-forget
# (no readiness check before the first chat request) — confirm this is
# acceptable for the target deployment host.
os.system("curl https://ollama.com/install.sh | sh")
os.system("ollama serve &")

# Directories scanned for avatar thumbnails and their matching sample videos.
THUMBNAILS_DIR = "thumbnails"
VIDEO_DIR = "sample_video"
16
 
17
def get_thumbnail_images():
    """Collect (name, path) pairs for every thumbnail image.

    Scans THUMBNAILS_DIR for .png/.jpg/.jpeg files (case-insensitive);
    the name is the filename without its extension. Returns an empty
    list when the directory does not exist.
    """
    if not os.path.exists(THUMBNAILS_DIR):
        return []
    images = []
    for filename in os.listdir(THUMBNAILS_DIR):
        if filename.lower().endswith((".png", ".jpg", ".jpeg")):
            stem = os.path.splitext(filename)[0]
            images.append((stem, os.path.join(THUMBNAILS_DIR, filename)))
    return images
25
 
26
# Computed once at import time: (name, path) pairs for the avatar gallery,
# and the bare names used as the Radio widget's choices.
thumbnail_images = get_thumbnail_images()
avatar_names = [name for name, _ in thumbnail_images]
 
31
  if not os.path.exists(VIDEO_DIR):
32
  return None
33
  for video in os.listdir(VIDEO_DIR):
34
+ name, ext = os.path.splitext(video)
35
+ if name.lower() == file_name and ext.lower() in (".mp4", ".avi", ".mov"):
36
  return os.path.join(VIDEO_DIR, video)
37
  return None
38
 
 
48
  return gr.update(interactive=False)
49
 
50
def process_pipeline(audio_file, transcribed_text, selected_name):
    """Run the full avatar pipeline, yielding partial results at each stage.

    Yields 4-tuples of (transcribed_text, chatbot_response, tts_audio_path,
    video_path_or_warning) so the UI can update progressively after each step.

    Args:
        audio_file: optional path to a recorded audio file; when given it is
            transcribed and overrides ``transcribed_text``.
        transcribed_text: text to send to the chatbot (user-editable).
        selected_name: avatar name chosen in the UI, matched (case-insensitively)
            against the video files in VIDEO_DIR.
    """
    # 1) If audio was provided, transcribe it first and show the text.
    if audio_file:
        whisper = WhisperTTS()
        transcribed_text = whisper.transcribe_audio(audio_file)
        yield transcribed_text, "", None, None

    # 2) Validate the text. Guard against None as well as blank strings:
    #    with neither audio nor text, transcribed_text may not be a str.
    if not transcribed_text or not transcribed_text.strip():
        yield "Warning: Please provide valid text.", "", None, None
        return

    # 3) Chatbot response; strip <think>...</think> markers emitted by the model.
    ollama = OllamaChat()
    resp = ollama.get_response(transcribed_text)
    resp = re.sub(r"<think>|</think>", "", resp).strip()
    yield transcribed_text, resp, None, None

    if not resp:
        yield transcribed_text, "Warning: No chatbot response.", None, None
        return

    # 4) Text-to-speech synthesis of the chatbot response.
    tts = TextToSpeech()
    audio_out = tts.synthesize(resp)
    yield transcribed_text, resp, audio_out, None

    # 5) Lip-synced video.
    if not selected_name:
        yield transcribed_text, resp, audio_out, "Warning: Select an avatar."
        return

    # BUG FIX: find_matching_video compares lower-cased video names against
    # its argument, so the avatar name must be lower-cased here (the previous
    # revision did this; it was lost in the rewrite, making any avatar name
    # containing an uppercase letter fail to match its video).
    vid_in = find_matching_video(selected_name.lower())
    if not vid_in:
        yield transcribed_text, resp, audio_out, "Warning: No matching video."
        return

    sync = AudioVideoSync()
    vid_out = sync.sync_audio_video(vid_in, audio_out)
    yield transcribed_text, resp, audio_out, vid_out
90
+
91
def build_demo() -> gr.Blocks:
    """Assemble and return the Gradio Blocks UI for the avatar pipeline."""
    with gr.Blocks() as demo:
        gr.Markdown("## Personalized Avatar Video")

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(type="filepath", label="Audio Input")
                text_box = gr.Textbox(label="Edit and Process Text")
                response_box = gr.Textbox(label="Assistant Response")
                gr.Markdown("### Select an Avatar")
                avatar_radio = gr.Radio(choices=avatar_names, label="Select an Avatar")
                avatar_image = gr.Image(label="Selected Avatar", width=150, height=150)
                generate_btn = gr.Button("Generate Lip-Sync Video", interactive=False)

            with gr.Column():
                speech_audio = gr.Audio(label="Generated Speech")
                final_video = gr.Video(label="Final Lip-Synced Video")

        # Wire up the events.
        avatar_radio.change(update_avatar_display, inputs=[avatar_radio], outputs=[avatar_image])

        # The process button unlocks only when an avatar plus text or audio
        # are present; re-check on every relevant input change.
        gate_inputs = [avatar_radio, audio_input, text_box]
        avatar_radio.change(check_enable_process_button, inputs=gate_inputs, outputs=[generate_btn])
        audio_input.change(check_enable_process_button, inputs=gate_inputs, outputs=[generate_btn])
        text_box.change(check_enable_process_button, inputs=gate_inputs, outputs=[generate_btn])

        generate_btn.click(
            process_pipeline,
            inputs=[audio_input, text_box, avatar_radio],
            outputs=[text_box, response_box, speech_audio, final_video],
        )

    # Enable request queueing so the pipeline's streamed yields reach the client.
    return demo.queue(max_size=100000)
125
 
126
if __name__ == "__main__":
    demo = build_demo()
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (needed inside containers)
        server_port=7860,
        share=True,       # also expose a public gradio.live link
        inbrowser=True,   # open the local browser automatically
    )