LTTEAM committed on
Commit
5c5adf3
·
verified ·
1 Parent(s): 1a0bd0a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -57
app.py CHANGED
@@ -3,13 +3,15 @@ import sys
3
  import importlib
4
  import json
5
  import asyncio
 
6
  from datetime import datetime
7
 
8
  import torch
9
  import gradio as gr
10
  import pydub
11
  import edge_tts
12
- import srt
 
13
 
14
  # --- 1) Đảm bảo src/ có trong Python path để import ChatterboxVC ---
15
  script_dir = os.path.dirname(os.path.abspath(__file__))
@@ -46,11 +48,15 @@ def yield_vc_updates(log_msg=None, audio_data=None, file_list=None, log_append=T
46
  log_update = gr.update(value="\n".join(global_log_messages_vc))
47
 
48
  # audio output
49
- audio_update = gr.update(visible=(audio_data is not None),
50
- value=audio_data if audio_data is not None else None)
 
 
51
  # file-download output
52
- files_update = gr.update(visible=(file_list is not None),
53
- value=file_list if file_list is not None else [])
 
 
54
 
55
  yield log_update, audio_update, files_update
56
 
@@ -69,7 +75,7 @@ def load_edge_tts_voices(json_path="voices.json"):
69
 
70
  edge_choices, edge_code_map = load_edge_tts_voices()
71
 
72
- # --- 5) TTS Edge với rate & volume ---
73
  async def _edge_tts_async(text, disp, rate_pct, vol_pct):
74
  code = edge_code_map.get(disp)
75
  rate_str = f"{rate_pct:+d}%"
@@ -82,37 +88,86 @@ def run_edge_tts(text, disp, rate_pct, vol_pct):
82
  path = asyncio.run(_edge_tts_async(text, disp, rate_pct, vol_pct))
83
  return path, path
84
 
85
- # --- 6) Sinh audio từ SRT (có rate & vol) ---
86
- def synthesize_srt_audio(srt_path: str, disp_voice: str, work_dir: str,
87
- rate_pct: int, vol_pct: int) -> str:
88
- with open(srt_path, "r", encoding="utf-8") as f:
89
- subs = list(srt.parse(f.read()))
90
- combined = pydub.AudioSegment.empty()
91
- current_ms = 0
92
-
93
- for sub in subs:
94
- start_ms = int(sub.start.total_seconds() * 1000)
95
- end_ms = int(sub.end.total_seconds() * 1000)
96
- dur_ms = end_ms - start_ms
97
-
98
- # silence until start
99
- if start_ms > current_ms:
100
- combined += pydub.AudioSegment.silent(duration=start_ms - current_ms)
101
-
102
- tmp_wav, _ = run_edge_tts(sub.content, disp_voice, rate_pct, vol_pct)
103
- tts_audio = pydub.AudioSegment.from_file(tmp_wav)
104
-
105
- # crop/pad để match dur
106
- if len(tts_audio) > dur_ms:
107
- tts_audio = tts_audio[:dur_ms]
 
 
 
 
 
 
 
 
 
 
 
 
108
  else:
109
- tts_audio += pydub.AudioSegment.silent(duration=dur_ms - len(tts_audio))
110
-
111
- combined += tts_audio
112
- current_ms = end_ms
113
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  out_path = os.path.join(work_dir, "srt_source.wav")
115
- combined.export(out_path, format="wav")
 
 
 
 
 
 
 
 
 
116
  return out_path
117
 
118
  # --- 7) Voice Conversion chính ---
@@ -174,9 +229,9 @@ def generate_vc(
174
  os.remove(tmp)
175
  yield from yield_vc_updates(f"Xử lý đoạn {i+1}/{len(chunks)}")
176
  # ghép lại
177
- combined = pydub.AudioSegment.empty()
178
  for p in temp_paths:
179
- combined += pydub.AudioSegment.from_file(p)
180
  final = os.path.join(work_dir, "combined.wav")
181
  combined.export(final, format="wav")
182
  outputs.append(final)
@@ -192,7 +247,7 @@ def generate_vc(
192
  yield from yield_vc_updates(f"Lỗi: {e}")
193
  raise
194
 
195
- # cuốn cùng: luôn trả về cả audio đầu tiên và danh sách files cho download
196
  first = outputs[0] if outputs else None
197
  yield from yield_vc_updates(log_msg=None, audio_data=first, file_list=outputs)
198
 
@@ -288,18 +343,18 @@ with gr.Blocks(title="Chuyển Giọng Nói AI") as demo:
288
  # Toggle SRT
289
  def toggle_srt(v):
290
  return (
291
- gr.update(visible=v), # srt_file
292
- gr.update(visible=v), # srt_voice
293
- gr.update(visible=v), # srt_rate
294
- gr.update(visible=v), # srt_vol
295
- gr.update(visible=not v), # use_edge
296
- gr.update(visible=not v), # edge_text
297
- gr.update(visible=not v), # edge_voice
298
- gr.update(visible=not v), # edge_rate
299
- gr.update(visible=not v), # edge_vol
300
- gr.update(visible=not v), # gen_edge_btn
301
- gr.update(visible=not v), # edge_audio
302
- gr.update(visible=not v) # src_audio
303
  )
304
  use_srt.change(
305
  fn=toggle_srt,
@@ -314,13 +369,13 @@ with gr.Blocks(title="Chuyển Giọng Nói AI") as demo:
314
  # Toggle Edge TTS
315
  def toggle_edge(v):
316
  return (
317
- gr.update(visible=v), # edge_text
318
- gr.update(visible=v), # edge_voice
319
- gr.update(visible=v), # edge_rate
320
- gr.update(visible=v), # edge_vol
321
- gr.update(visible=v), # gen_edge_btn
322
- gr.update(visible=v), # edge_audio
323
- gr.update(visible=not v) # src_audio
324
  )
325
  use_edge.change(
326
  fn=toggle_edge,
 
3
  import importlib
4
  import json
5
  import asyncio
6
+ import tempfile
7
  from datetime import datetime
8
 
9
  import torch
10
  import gradio as gr
11
  import pydub
12
  import edge_tts
13
+ import pysrt
14
+ from pydub import AudioSegment
15
 
16
  # --- 1) Đảm bảo src/ có trong Python path để import ChatterboxVC ---
17
  script_dir = os.path.dirname(os.path.abspath(__file__))
 
48
  log_update = gr.update(value="\n".join(global_log_messages_vc))
49
 
50
  # audio output
51
+ audio_update = gr.update(
52
+ visible=(audio_data is not None),
53
+ value=audio_data if audio_data is not None else None
54
+ )
55
  # file-download output
56
+ files_update = gr.update(
57
+ visible=(file_list is not None),
58
+ value=file_list if file_list is not None else []
59
+ )
60
 
61
  yield log_update, audio_update, files_update
62
 
 
75
 
76
  edge_choices, edge_code_map = load_edge_tts_voices()
77
 
78
+ # --- 5) TTS Edge với rate & volume (cho trường hợp nhập text trực tiếp) ---
79
  async def _edge_tts_async(text, disp, rate_pct, vol_pct):
80
  code = edge_code_map.get(disp)
81
  rate_str = f"{rate_pct:+d}%"
 
88
  path = asyncio.run(_edge_tts_async(text, disp, rate_pct, vol_pct))
89
  return path, path
90
 
91
+ # --- 6) TTS from SRT sử dụng pysrt + chia nhỏ text nếu quá dài ---
92
async def _tts_save_segment(text: str, voice_code: str, rate_pct: int, vol_pct: int, path: str) -> bool:
    """Synthesize one piece of text with Edge TTS and save it to ``path``.

    Args:
        text: Text to speak.
        voice_code: Edge TTS voice identifier passed as ``voice=``.
        rate_pct: Speaking-rate offset in percent; formatted as e.g. ``+10%``.
        vol_pct: Volume offset in percent; formatted as e.g. ``-5%``.
        path: Destination file for the synthesized audio.

    Returns:
        True if audio was saved to ``path``. False if ``text`` is blank or
        the service raised ``NoAudioReceived``.
    """
    # Nothing to speak: skip the network round trip entirely.
    if not text or not text.strip():
        return False

    # The ``+d`` format code rejects floats (e.g. 10.0), so coerce first.
    rate_str = f"{int(rate_pct):+d}%"
    vol_str = f"{int(vol_pct):+d}%"

    communicate = edge_tts.Communicate(text, voice=voice_code, rate=rate_str, volume=vol_str)
    try:
        await communicate.save(path)
    except edge_tts.exceptions.NoAudioReceived:
        # Segment too short to yield audio; the caller simply skips it.
        return False
    return True
105
+
106
async def _generate_audio_from_srt(
    srt_path: str,
    tmp_dir: str,
    out_path: str,
    voice_code: str,
    rate_pct: int,
    vol_pct: int,
    max_chars: int = 200,
):
    """Read an .srt file, speak every subtitle with Edge TTS and export one WAV.

    Each subtitle's text is flattened to a single line and split into pieces
    of at most ``max_chars`` characters. Every piece is synthesized into its
    own file under ``tmp_dir``, and the resulting audio is concatenated in
    subtitle order. Subtitle timestamps are not used: the pieces are joined
    back to back.

    Args:
        srt_path: Path of the UTF-8 encoded .srt file.
        tmp_dir: Existing directory that receives the per-piece audio files.
        out_path: Destination of the combined WAV file.
        voice_code: Edge TTS voice identifier.
        rate_pct: Speaking-rate offset in percent.
        vol_pct: Volume offset in percent.
        max_chars: Maximum number of characters sent to the TTS service per
            request.

    Raises:
        ValueError: If no audio could be synthesized at all (empty subtitle
            file, or every piece was rejected). Without this, ``out_path``
            would be missing or left over from a previous run.
    """
    # Local import keeps this block self-contained.
    import textwrap

    subs = pysrt.open(srt_path, encoding='utf-8')
    combined = AudioSegment.empty()

    for i, sub in enumerate(subs):
        text = sub.text.replace('\n', ' ')
        # Split on whitespace so words are not cut in half; only a single
        # word longer than max_chars is broken mid-word. Blank text gives
        # no pieces, so no request is made for it.
        parts = textwrap.wrap(text, width=max_chars)

        for j, part in enumerate(parts):
            # NOTE(review): the file is named .wav, but Edge TTS presumably
            # delivers compressed audio; from_file() is relied on to detect
            # the real format - confirm.
            seg_path = os.path.join(tmp_dir, f"seg_{i}_{j}.wav")
            ok = await _tts_save_segment(part, voice_code, rate_pct, vol_pct, seg_path)
            if ok:
                combined += AudioSegment.from_file(seg_path)

    if len(combined) == 0:
        raise ValueError(f"No audio could be synthesized from SRT file: {srt_path}")

    combined.export(out_path, format="wav")
143
+
144
def synthesize_srt_audio(
    srt_path: str,
    disp_voice: str,
    work_dir: str,
    rate_pct: int,
    vol_pct: int
) -> str:
    """Synchronously render an .srt file to a WAV using Edge TTS.

    Args:
        srt_path: Path of the subtitle file to speak.
        disp_voice: Display name of the voice; looked up in ``edge_code_map``.
        work_dir: Directory that receives the result, ``srt_source.wav``.
        rate_pct: Speaking-rate offset in percent.
        vol_pct: Volume offset in percent.

    Returns:
        Path of the generated WAV file, ready for the voice-conversion
        pipeline.

    Raises:
        ValueError: If ``disp_voice`` has no entry in ``edge_code_map``.
    """
    # Resolve the display name to the voice code; fail early with a clear
    # message instead of handing None to the TTS client.
    voice_code = edge_code_map.get(disp_voice)
    if voice_code is None:
        raise ValueError(f"Unknown Edge TTS voice: {disp_voice!r}")

    out_path = os.path.join(work_dir, "srt_source.wav")

    # The per-segment files are only needed while combining, so the scratch
    # directory is removed as soon as the final WAV has been exported.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # asyncio.run creates the event loop and always closes it again.
        asyncio.run(
            _generate_audio_from_srt(
                srt_path, tmp_dir, out_path,
                voice_code, rate_pct, vol_pct
            )
        )
    return out_path
172
 
173
  # --- 7) Voice Conversion chính ---
 
229
  os.remove(tmp)
230
  yield from yield_vc_updates(f"Xử lý đoạn {i+1}/{len(chunks)}")
231
  # ghép lại
232
+ combined = AudioSegment.empty()
233
  for p in temp_paths:
234
+ combined += AudioSegment.from_file(p)
235
  final = os.path.join(work_dir, "combined.wav")
236
  combined.export(final, format="wav")
237
  outputs.append(final)
 
247
  yield from yield_vc_updates(f"Lỗi: {e}")
248
  raise
249
 
250
+ # trả về audio đầu tiên và danh sách file để download
251
  first = outputs[0] if outputs else None
252
  yield from yield_vc_updates(log_msg=None, audio_data=first, file_list=outputs)
253
 
 
343
  # Toggle SRT
344
  def toggle_srt(v):
345
  return (
346
+ gr.update(visible=v), # srt_file
347
+ gr.update(visible=v), # srt_voice
348
+ gr.update(visible=v), # srt_rate
349
+ gr.update(visible=v), # srt_vol
350
+ gr.update(visible=not v),# use_edge
351
+ gr.update(visible=not v),# edge_text
352
+ gr.update(visible=not v),# edge_voice
353
+ gr.update(visible=not v),# edge_rate
354
+ gr.update(visible=not v),# edge_vol
355
+ gr.update(visible=not v),# gen_edge_btn
356
+ gr.update(visible=not v),# edge_audio
357
+ gr.update(visible=not v) # src_audio
358
  )
359
  use_srt.change(
360
  fn=toggle_srt,
 
369
  # Toggle Edge TTS
370
  def toggle_edge(v):
371
  return (
372
+ gr.update(visible=v), # edge_text
373
+ gr.update(visible=v), # edge_voice
374
+ gr.update(visible=v), # edge_rate
375
+ gr.update(visible=v), # edge_vol
376
+ gr.update(visible=v), # gen_edge_btn
377
+ gr.update(visible=v), # edge_audio
378
+ gr.update(visible=not v) # src_audio
379
  )
380
  use_edge.change(
381
  fn=toggle_edge,