oicui committed on
Commit
51caa9d
·
verified ·
1 Parent(s): f8b6238

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -17
app.py CHANGED
@@ -36,8 +36,10 @@ def default_text_for_ui(lang: str) -> str:
36
  def get_supported_languages_display() -> str:
37
  items = [f"**{name}** (`{code}`)" for code, name in sorted(SUPPORTED_LANGUAGES.items())]
38
  mid = len(items)//2
39
- return f"### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)} total)\n" \
40
- f"{' '.join(items[:mid])}\n\n{' • '.join(items[mid:])}"
 
 
41
 
42
  def get_or_load_model():
43
  global MODEL
@@ -67,27 +69,104 @@ def resolve_audio_prompt(language_id: str, provided_path: str | None) -> str | N
67
  return provided_path
68
  return LANGUAGE_CONFIG.get(language_id, {}).get("audio")
69
 
70
- # --- text splitter ---
71
- def split_text_into_chunks(text: str, max_chars: int = 500) -> list[str]:
 
 
 
 
 
 
 
 
 
 
 
72
  text = re.sub(r"\s+", " ", text.strip())
 
 
73
  if len(text) <= max_chars:
74
  return [text]
75
 
76
- sentences = re.split(r'(?<=[.!?।،])\s+', text)
77
- chunks, current_chunk = [], ""
 
 
 
78
 
79
  for sent in sentences:
80
- if len(current_chunk) + len(sent) < max_chars:
81
- current_chunk += " " + sent
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  else:
83
- chunks.append(current_chunk.strip())
84
- current_chunk = sent
85
- if current_chunk:
86
- chunks.append(current_chunk.strip())
 
 
87
 
88
  return [c for c in chunks if c]
89
 
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  @spaces.GPU
92
  def generate_tts_audio(
93
  text_input: str,
@@ -121,14 +200,23 @@ def generate_tts_audio(
121
  if chosen_prompt:
122
  generate_kwargs["audio_prompt_path"] = chosen_prompt
123
 
124
- chunks = split_text_into_chunks(text_input)
125
- all_audio = []
 
126
 
127
- for chunk in chunks:
 
 
 
128
  wav = current_model.generate(chunk, language_id=language_id, **generate_kwargs)
129
  all_audio.append(wav.squeeze(0).cpu())
130
 
131
- final_audio = torch.cat(all_audio, dim=-1)
 
 
 
 
 
132
 
133
  # RETURN AUDIO + SEED
134
  return (current_model.sr, final_audio.numpy()), str(seed_num_input)
@@ -142,7 +230,7 @@ def generate_tts_audio(
142
  with gr.Blocks() as demo:
143
  gr.Markdown("""
144
  # 🎙️ Multi Language Realistic Voice Cloner
145
- Generate long-form multilingual speech with reference audio styling and auto-chunking.
146
  """)
147
 
148
  gr.Markdown(get_supported_languages_display())
 
36
def get_supported_languages_display() -> str:
    """Render the supported-language list as a two-row Markdown string.

    Languages are sorted by code and split into two roughly equal rows,
    each entry formatted as **Name** (`code`).
    """
    entries = []
    for code, name in sorted(SUPPORTED_LANGUAGES.items()):
        entries.append(f"**{name}** (`{code}`)")
    half = len(entries) // 2
    header = f"### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)} total)\n"
    top_row = " • ".join(entries[:half])
    bottom_row = " • ".join(entries[half:])
    return header + top_row + "\n\n" + bottom_row
43
 
44
  def get_or_load_model():
45
  global MODEL
 
69
  return provided_path
70
  return LANGUAGE_CONFIG.get(language_id, {}).get("audio")
71
 
72
+
73
+ # ============================
74
+ # SMART CHUNKING (TỐI ƯU)
75
+ # ============================
76
+
77
def smart_chunk_text(text: str, max_chars: int = 500) -> list[str]:
    """Split *text* into chunks of at most ``max_chars`` characters.

    Strategy:
      - Split on sentence-ending punctuation first (multilingual set:
        . ! ? … ؟ ، : ؛ ।).
      - Greedily pack consecutive short sentences into one chunk to reduce
        the number of model calls.
      - A single sentence longer than ``max_chars`` is soft-split on word
        boundaries (a lone word longer than ``max_chars`` still becomes its
        own oversized chunk rather than being cut mid-word).

    Args:
        text: raw input text; whitespace is normalized before chunking.
        max_chars: soft upper bound on chunk length in characters.

    Returns:
        Ordered list of non-empty chunks; empty list for blank input.
    """
    # Normalize whitespace so length checks are meaningful.
    text = re.sub(r"\s+", " ", text.strip())
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]

    # Split after multilingual sentence terminators: . ! ? … ؟ ، : ؛ ।
    sentences = re.split(r'(?<=[\.!\?…؟،:؛।])\s+', text)

    chunks: list[str] = []
    current = ""

    for sent in sentences:
        sent = sent.strip()
        if not sent:
            continue

        # Sentence alone exceeds the budget -> soft-split on words.
        if len(sent) > max_chars:
            # BUGFIX: flush the pending buffer first; otherwise the
            # word-level chunks would be emitted *before* earlier sentences,
            # scrambling the output order.
            if current:
                chunks.append(current.strip())
                current = ""
            temp = ""
            for w in sent.split():
                if len(temp) + len(w) + 1 > max_chars:
                    if temp:
                        chunks.append(temp.strip())
                    temp = ""
                temp += w + " "
            if temp:
                chunks.append(temp.strip())
            continue

        # Sentence still fits when appended -> keep packing.
        if len(current) + len(sent) + 1 <= max_chars:
            current += sent + " "
        else:
            if current:
                chunks.append(current.strip())
            current = sent + " "

    if current:
        chunks.append(current.strip())

    return [c for c in chunks if c]
128
 
129
 
130
def concat_audio_torch(chunks: list[torch.Tensor],
                       crossfade_ms: int = 10,
                       sr: int = 24000) -> torch.Tensor:
    """Join 1-D audio tensors, blending each seam with a short linear
    crossfade so chunk boundaries don't produce audible clicks.

    Args:
        chunks: 1-D waveform tensors to join in order.
        crossfade_ms: overlap duration in milliseconds (<= 0 disables fading).
        sr: sample rate used to convert milliseconds to samples.

    Returns:
        A single joined tensor; an empty tensor when ``chunks`` is empty.
    """
    if not chunks:
        return torch.empty(0)

    # Plain concatenation when there is nothing to blend.
    if crossfade_ms <= 0 or len(chunks) == 1:
        return torch.cat(chunks, dim=-1)

    overlap = int(crossfade_ms * sr / 1000)
    result = chunks[0]

    for nxt in chunks[1:]:
        # Never overlap more samples than either side actually has.
        n = min(overlap, result.shape[-1], nxt.shape[-1])
        if n <= 0:
            result = torch.cat([result, nxt], dim=-1)
            continue

        ramp_down = torch.linspace(1.0, 0.0, steps=n, device=result.device, dtype=result.dtype)
        ramp_up = torch.linspace(0.0, 1.0, steps=n, device=nxt.device, dtype=nxt.dtype)

        seam = result[..., -n:] * ramp_down + nxt[..., :n] * ramp_up
        result = torch.cat([result[..., :-n], seam, nxt[..., n:]], dim=-1)

    return result
168
+
169
+
170
  @spaces.GPU
171
  def generate_tts_audio(
172
  text_input: str,
 
200
  if chosen_prompt:
201
  generate_kwargs["audio_prompt_path"] = chosen_prompt
202
 
203
+ # 💡 DÙNG SMART CHUNKING TỐI ƯU
204
+ chunks = smart_chunk_text(text_input, max_chars=500)
205
+ print(f"📚 Total chunks: {len(chunks)}")
206
 
207
+ all_audio: list[torch.Tensor] = []
208
+
209
+ for idx, chunk in enumerate(chunks, start=1):
210
+ print(f"🎧 Rendering chunk {idx}/{len(chunks)} (len={len(chunk)} chars)")
211
  wav = current_model.generate(chunk, language_id=language_id, **generate_kwargs)
212
  all_audio.append(wav.squeeze(0).cpu())
213
 
214
+ # 🔗 NỐI AUDIO VỚI CROSSFADE NHẸ
215
+ final_audio = concat_audio_torch(
216
+ all_audio,
217
+ crossfade_ms=12,
218
+ sr=current_model.sr
219
+ )
220
 
221
  # RETURN AUDIO + SEED
222
  return (current_model.sr, final_audio.numpy()), str(seed_num_input)
 
230
  with gr.Blocks() as demo:
231
  gr.Markdown("""
232
  # 🎙️ Multi Language Realistic Voice Cloner
233
+ Generate long-form multilingual speech with reference audio styling and smart chunking (crossfaded).
234
  """)
235
 
236
  gr.Markdown(get_supported_languages_display())