oicui commited on
Commit
e3826a0
·
verified ·
1 Parent(s): 51caa9d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -96
app.py CHANGED
@@ -70,26 +70,38 @@ def resolve_audio_prompt(language_id: str, provided_path: str | None) -> str | N
70
  return LANGUAGE_CONFIG.get(language_id, {}).get("audio")
71
 
72
 
73
- # ============================
74
- # SMART CHUNKING (TỐI ƯU)
75
- # ============================
76
 
77
- def smart_chunk_text(text: str, max_chars: int = 500) -> list[str]:
78
  """
79
- Chia text thành các đoạn (chunk) ngắn:
80
- - Ưu tiên tách theo câu.
81
- - Nếu câu quá dài thì tách tiếp theo từ.
82
- - Gộp nhiều câu nhỏ vào 1 chunk để giảm số lần gọi model.
83
  """
84
- # Normalize khoảng trắng
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  text = re.sub(r"\s+", " ", text.strip())
86
  if not text:
87
  return []
88
  if len(text) <= max_chars:
89
  return [text]
90
 
91
- # Hỗ trợ nhiều dấu câu đa ngôn ngữ: . ! ? … ؟ ، : ؛ ।
92
- sentences = re.split(r'(?<=[\.!\?…؟،:؛।])\s+', text)
93
 
94
  chunks: list[str] = []
95
  current = ""
@@ -99,26 +111,10 @@ def smart_chunk_text(text: str, max_chars: int = 500) -> list[str]:
99
  if not sent:
100
  continue
101
 
102
- # Nếu bản thân câu đã dài hơn max_chars -> chia mềm theo từ
103
- if len(sent) > max_chars:
104
- words = sent.split()
105
- temp = ""
106
- for w in words:
107
- if len(temp) + len(w) + 1 > max_chars:
108
- if temp:
109
- chunks.append(temp.strip())
110
- temp = ""
111
- temp += w + " "
112
- if temp:
113
- chunks.append(temp.strip())
114
- continue
115
-
116
- # Nếu gộp thêm câu mà vẫn không vượt max_chars -> gộp chung
117
  if len(current) + len(sent) + 1 <= max_chars:
118
  current += sent + " "
119
  else:
120
- if current:
121
- chunks.append(current.strip())
122
  current = sent + " "
123
 
124
  if current:
@@ -127,45 +123,32 @@ def smart_chunk_text(text: str, max_chars: int = 500) -> list[str]:
127
  return [c for c in chunks if c]
128
 
129
 
130
- def concat_audio_torch(chunks: list[torch.Tensor],
131
- crossfade_ms: int = 10,
132
- sr: int = 24000) -> torch.Tensor:
 
 
133
  """
134
- Nối nhiều đoạn audio (1D tensor) bằng crossfade nhẹ để tránh tiếng "click".
135
  """
136
  if not chunks:
137
  return torch.empty(0)
138
 
139
- if len(chunks) == 1 or crossfade_ms <= 0:
140
- return torch.cat(chunks, dim=-1)
141
-
142
- output = chunks[0]
143
- crossfade = int(crossfade_ms * sr / 1000)
144
 
145
- for i in range(1, len(chunks)):
146
- a = output
147
- b = chunks[i]
 
 
148
 
149
- # Đảm bảo crossfade không lớn hơn độ dài đoạn
150
- cf = min(crossfade, a.shape[-1], b.shape[-1])
151
- if cf <= 0:
152
- output = torch.cat([a, b], dim=-1)
153
- continue
154
-
155
- fade_out = torch.linspace(1.0, 0.0, steps=cf, device=a.device, dtype=a.dtype)
156
- fade_in = torch.linspace(0.0, 1.0, steps=cf, device=b.device, dtype=b.dtype)
157
-
158
- a_tail = a[..., -cf:] * fade_out
159
- b_head = b[..., :cf] * fade_in
160
 
161
- mixed = a_tail + b_head
162
- a_main = a[..., :-cf]
163
- b_rest = b[..., cf:]
164
-
165
- output = torch.cat([a_main, mixed, b_rest], dim=-1)
166
-
167
- return output
168
 
 
 
 
169
 
170
  @spaces.GPU
171
  def generate_tts_audio(
@@ -182,55 +165,46 @@ def generate_tts_audio(
182
  if current_model is None:
183
  raise RuntimeError("TTS model not loaded.")
184
 
185
- # --- SEED LOGIC ---
186
  if seed_num_input == 0:
187
  seed_num_input = random.randint(1, 2**32 - 1)
188
- print(f"🌱 Random seed generated: {seed_num_input}")
189
- else:
190
- print(f"🌱 Using provided seed: {seed_num_input}")
191
-
192
  set_seed(int(seed_num_input))
193
 
 
 
 
194
  chosen_prompt = audio_prompt_path_input or default_audio_for_ui(language_id)
195
  generate_kwargs = {
196
  "exaggeration": exaggeration_input,
197
  "temperature": temperature_input,
198
  "cfg_weight": cfgw_input,
 
199
  }
200
- if chosen_prompt:
201
- generate_kwargs["audio_prompt_path"] = chosen_prompt
202
 
203
- # 💡 DÙNG SMART CHUNKING TỐI ƯU
204
  chunks = smart_chunk_text(text_input, max_chars=500)
205
- print(f"📚 Total chunks: {len(chunks)}")
206
-
207
  all_audio: list[torch.Tensor] = []
208
 
209
- for idx, chunk in enumerate(chunks, start=1):
210
- print(f"🎧 Rendering chunk {idx}/{len(chunks)} (len={len(chunk)} chars)")
211
  wav = current_model.generate(chunk, language_id=language_id, **generate_kwargs)
212
  all_audio.append(wav.squeeze(0).cpu())
213
 
214
- # 🔗 NỐI AUDIO VỚI CROSSFADE NHẸ
215
- final_audio = concat_audio_torch(
216
  all_audio,
217
- crossfade_ms=12,
218
  sr=current_model.sr
219
  )
220
 
221
- # RETURN AUDIO + SEED
222
  return (current_model.sr, final_audio.numpy()), str(seed_num_input)
223
 
224
 
225
-
226
- # ============================
227
- # GRADIO UI
228
- # ============================
229
 
230
  with gr.Blocks() as demo:
231
  gr.Markdown("""
232
- # 🎙️ Multi Language Realistic Voice Cloner
233
- Generate long-form multilingual speech with reference audio styling and smart chunking (crossfaded).
234
  """)
235
 
236
  gr.Markdown(get_supported_languages_display())
@@ -238,22 +212,16 @@ with gr.Blocks() as demo:
238
  with gr.Row():
239
  with gr.Column():
240
  initial_lang = "en"
241
- text = gr.Textbox(
242
- value=default_text_for_ui(initial_lang),
243
- label="Text to synthesize",
244
- lines=8
245
- )
246
  language_id = gr.Dropdown(
247
  choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
248
  value=initial_lang,
249
  label="Language"
250
  )
251
- ref_wav = gr.Audio(
252
- sources=["upload", "microphone"],
253
- type="filepath",
254
- label="Reference Audio (Optional)",
255
- value=default_audio_for_ui(initial_lang)
256
- )
257
  exaggeration = gr.Slider(0.25, 2, step=.05, label="Exaggeration", value=.5)
258
  cfg_weight = gr.Slider(0.2, 1, step=.05, label="CFG Weight", value=0.5)
259
 
@@ -263,7 +231,6 @@ with gr.Blocks() as demo:
263
 
264
  run_btn = gr.Button("Generate", variant="primary")
265
 
266
- # OUTPUT COLUMN
267
  with gr.Column():
268
  audio_output = gr.Audio(label="Output Audio")
269
  seed_output = gr.Textbox(label="Seed Used", interactive=False)
@@ -274,15 +241,13 @@ with gr.Blocks() as demo:
274
  language_id.change(
275
  fn=on_lang_change,
276
  inputs=[language_id, ref_wav, text],
277
- outputs=[ref_wav, text],
278
- show_progress=False
279
  )
280
 
281
- # CONNECT BUTTON
282
  run_btn.click(
283
  fn=generate_tts_audio,
284
  inputs=[text, language_id, ref_wav, exaggeration, temp, seed_num, cfg_weight],
285
- outputs=[audio_output, seed_output],
286
  )
287
 
288
  demo.launch(mcp_server=True, share=True)
 
70
  return LANGUAGE_CONFIG.get(language_id, {}).get("audio")
71
 
72
 
73
+ # ===========================================================
74
+ # THÊM HÀM NGẮT NGHỈ TỰ NHIÊN ⭐
75
+ # ===========================================================
76
 
77
+ def natural_pause_text(text: str) -> str:
78
  """
79
+ Thêm ngắt nghỉ tự nhiên để AI phát âm giống người thật.
80
+ - '...' = nghỉ dài
81
+ - '…' = nghỉ ngắn
 
82
  """
83
+ # Nghỉ dài sau dấu kết câu
84
+ text = re.sub(r"([\.!\?])\s+", r"\1 ... ", text)
85
+
86
+ # Nghỉ ngắn sau dấu phẩy
87
+ text = re.sub(r"(,)\s+", r"\1 … ", text)
88
+
89
+ return text.strip()
90
+
91
+
92
+ # ===========================================================
93
+ # SMART CHUNKING
94
+ # ===========================================================
95
+
96
+ def smart_chunk_text(text: str, max_chars: int = 500) -> list[str]:
97
+
98
  text = re.sub(r"\s+", " ", text.strip())
99
  if not text:
100
  return []
101
  if len(text) <= max_chars:
102
  return [text]
103
 
104
+ sentences = re.split(r'(?<=[\.!\?…])\s+', text)
 
105
 
106
  chunks: list[str] = []
107
  current = ""
 
111
  if not sent:
112
  continue
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  if len(current) + len(sent) + 1 <= max_chars:
115
  current += sent + " "
116
  else:
117
+ chunks.append(current.strip())
 
118
  current = sent + " "
119
 
120
  if current:
 
123
  return [c for c in chunks if c]
124
 
125
 
126
+ # ===========================================================
127
+ # ⭐ HÀM GHÉP AUDIO CÓ CHÈN KHOẢNG LẶNG GIỐNG NGƯỜI ⭐
128
+ # ===========================================================
129
+
130
def concat_audio_with_pause(chunks, pause_ms: int = 220, sr: int = 24000):
    """Join audio chunks with a natural silent pause between them.

    Args:
        chunks: list of 1-D audio tensors (one per sentence/chunk).
        pause_ms: pause duration, in milliseconds, inserted between
            consecutive chunks (no trailing pause after the last one).
        sr: sample rate of the audio, used to size the pause.

    Returns:
        A single 1-D tensor with ``len(chunks) - 1`` pauses interleaved,
        or an empty tensor when ``chunks`` is empty.
    """
    if not chunks:
        return torch.empty(0)

    # Match the dtype/device of the incoming audio so torch.cat never
    # has to promote (or fail) on mixed inputs; the original created the
    # silence with the float32/CPU defaults regardless of the chunks.
    silence = torch.zeros(
        int(sr * pause_ms / 1000),
        dtype=chunks[0].dtype,
        device=chunks[0].device,
    )

    pieces: list[torch.Tensor] = []
    for i, chunk in enumerate(chunks):
        pieces.append(chunk)
        if i < len(chunks) - 1:
            # torch.cat copies its inputs, so the same silence tensor can
            # be reused for every gap (no .clone() needed).
            pieces.append(silence)

    return torch.cat(pieces, dim=-1)
 
 
 
 
 
 
 
 
 
 
147
 
 
 
 
 
 
 
 
148
 
149
+ # ===========================================================
150
+ # TTS GENERATE
151
+ # ===========================================================
152
 
153
  @spaces.GPU
154
  def generate_tts_audio(
 
165
  if current_model is None:
166
  raise RuntimeError("TTS model not loaded.")
167
 
 
168
  if seed_num_input == 0:
169
  seed_num_input = random.randint(1, 2**32 - 1)
 
 
 
 
170
  set_seed(int(seed_num_input))
171
 
172
+ # ⭐ Thêm ngắt nghỉ vào TEXT
173
+ text_input = natural_pause_text(text_input)
174
+
175
  chosen_prompt = audio_prompt_path_input or default_audio_for_ui(language_id)
176
  generate_kwargs = {
177
  "exaggeration": exaggeration_input,
178
  "temperature": temperature_input,
179
  "cfg_weight": cfgw_input,
180
+ "audio_prompt_path": chosen_prompt
181
  }
 
 
182
 
 
183
  chunks = smart_chunk_text(text_input, max_chars=500)
 
 
184
  all_audio: list[torch.Tensor] = []
185
 
186
+ for chunk in chunks:
 
187
  wav = current_model.generate(chunk, language_id=language_id, **generate_kwargs)
188
  all_audio.append(wav.squeeze(0).cpu())
189
 
190
+ # Ghép audio PAUSE giống người
191
+ final_audio = concat_audio_with_pause(
192
  all_audio,
193
+ pause_ms=230, # Điều chỉnh độ tự nhiên tại đây
194
  sr=current_model.sr
195
  )
196
 
 
197
  return (current_model.sr, final_audio.numpy()), str(seed_num_input)
198
 
199
 
200
+ # ===========================================================
201
+ # GRADIO UI
202
+ # ===========================================================
 
203
 
204
  with gr.Blocks() as demo:
205
  gr.Markdown("""
206
+ # 🎙️ Multi Language Realistic Voice Cloner
207
+ Now with **Natural Human Pausing System** (Text + Audio Pause)
208
  """)
209
 
210
  gr.Markdown(get_supported_languages_display())
 
212
  with gr.Row():
213
  with gr.Column():
214
  initial_lang = "en"
215
+ text = gr.Textbox(value=default_text_for_ui(initial_lang), label="Text to synthesize", lines=8)
 
 
 
 
216
  language_id = gr.Dropdown(
217
  choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
218
  value=initial_lang,
219
  label="Language"
220
  )
221
+ ref_wav = gr.Audio(sources=["upload","microphone"], type="filepath",
222
+ label="Reference Audio (Optional)",
223
+ value=default_audio_for_ui(initial_lang))
224
+
 
 
225
  exaggeration = gr.Slider(0.25, 2, step=.05, label="Exaggeration", value=.5)
226
  cfg_weight = gr.Slider(0.2, 1, step=.05, label="CFG Weight", value=0.5)
227
 
 
231
 
232
  run_btn = gr.Button("Generate", variant="primary")
233
 
 
234
  with gr.Column():
235
  audio_output = gr.Audio(label="Output Audio")
236
  seed_output = gr.Textbox(label="Seed Used", interactive=False)
 
241
  language_id.change(
242
  fn=on_lang_change,
243
  inputs=[language_id, ref_wav, text],
244
+ outputs=[ref_wav, text]
 
245
  )
246
 
 
247
  run_btn.click(
248
  fn=generate_tts_audio,
249
  inputs=[text, language_id, ref_wav, exaggeration, temp, seed_num, cfg_weight],
250
+ outputs=[audio_output, seed_output]
251
  )
252
 
253
  demo.launch(mcp_server=True, share=True)