Hi, I want to use this to train with my own voice (voice cloning). Can you tell me how, or could you add a voice for me?

#3
by samxiao0 - opened
Files changed (4) hide show
  1. README.md +2 -2
  2. app.py +66 -126
  3. generation_counter.json +1 -1
  4. vertex_client.py +10 -16
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- title: Ringg Squirrel TTS V1.0
3
- emoji: 🐿️
4
  colorFrom: pink
5
  colorTo: blue
6
  sdk: gradio
 
1
  ---
2
+ title: Ringg TTS V1.0
3
+ emoji: 😻
4
  colorFrom: pink
5
  colorTo: blue
6
  sdk: gradio
app.py CHANGED
@@ -5,7 +5,6 @@ from pathlib import Path
5
  import uuid
6
  import fcntl
7
  import time
8
- import tempfile
9
  from vertex_client import get_vertex_client
10
 
11
  # gr.NO_RELOAD = False
@@ -153,9 +152,8 @@ def synthesize_speech(text, voice_id):
153
 
154
  if success and audio_bytes:
155
  print("✅ Synthesized audio using Vertex AI")
156
- # Save binary audio to temp file in system temp directory
157
- temp_dir = tempfile.gettempdir()
158
- audio_file = os.path.join(temp_dir, f"ringg_{str(uuid.uuid4())}.wav")
159
  with open(audio_file, "wb") as f:
160
  f.write(audio_bytes)
161
 
@@ -172,7 +170,7 @@ def synthesize_speech(text, voice_id):
172
  rtf_no_vocoder
173
  ) = ""
174
 
175
- status_msg = ""
176
 
177
  return (
178
  audio_file,
@@ -222,7 +220,7 @@ with gr.Blocks(
222
 
223
  # Best Practices Section
224
  gr.Markdown("""
225
- ## 📝 Best Practices for Best Results
226
  - **Supported Languages:** Hindi and English only
227
  - **Check spelling carefully:** Misspelled words may be mispronounced
228
  - **Punctuation matters:** Use proper punctuation for natural pauses and intonation
@@ -230,48 +228,41 @@ with gr.Blocks(
230
  - **Numbers & dates:** Write numbers as words for better pronunciation (e.g., "twenty-five" instead of "25")
231
  """)
232
 
233
- # Input Section - Text, Voice, and Character Count grouped together
234
- with gr.Group():
235
- # Text Input
236
- text_input = gr.Textbox(
237
- label="Text (max 300 characters)",
238
- placeholder="Type or paste your text here (max 300 characters)...",
239
- lines=6,
240
- max_lines=10,
241
- max_length=300,
242
- )
243
- # Voice Selection
244
- voices = get_voices()
245
- voice_choices = {display: vid for display, vid in voices}
246
-
247
- voice_dropdown = gr.Dropdown(
248
- choices=list(voice_choices.keys()),
249
- label="Choose a voice style",
250
- info=f"{len(voices)} voices available",
251
- value=list(voice_choices.keys())[0] if voices else None,
252
- show_label=False,
253
- )
254
- # Character count display
255
- char_count = gr.Code(
256
- "Character count: 0 / 300",
257
- show_line_numbers=False,
258
- show_label=False,
259
- )
260
-
261
- # Audio output section
262
- gr.Markdown("### 🎧 Audio Result")
263
- audio_output = gr.Audio(label="Generated Audio", type="filepath")
264
- status = gr.Markdown("", visible=True)
265
- metrics_header = gr.Markdown("**📊 Metrics**", visible=False)
266
- metrics_output = gr.Code(
267
- label="Performance Metrics",
268
- language="json",
269
- interactive=False,
270
- visible=False,
271
  )
272
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  generate_btn = gr.Button("🎬 Generate Speech", variant="primary", size="lg")
274
 
 
275
  with gr.Row():
276
  example_btn1 = gr.Button("English Example", size="sm")
277
  example_btn2 = gr.Button("Hindi Example", size="sm")
@@ -289,103 +280,52 @@ with gr.Blocks(
289
  def update_char_count(text):
290
  """Update character count as user types"""
291
  count = len(text) if text else 0
292
- return f"Character count: {count} / 300"
293
 
294
  def load_example_text(example_text):
295
  """Load example text and update character count"""
296
  count = len(example_text)
297
- return example_text, f"Character count: {count} / 300"
298
 
299
  def clear_text():
300
  """Clear text input"""
301
- return "", "Character count: 0 / 300"
302
 
303
  def on_generate(text, voice_display):
304
- """Generate speech using the distill model."""
305
- # Validate inputs
306
- if not text or not text.strip():
307
- error_msg = "⚠️ Please enter some text"
308
- yield (
309
- None,
310
- error_msg,
311
- gr.update(visible=False),
312
- gr.update(visible=False),
313
- f"**🌍 Generations:** {load_counter()}",
314
- )
315
- return
316
-
317
  voice_id = voice_choices.get(voice_display)
318
- if not voice_id:
319
- error_msg = "⚠️ Please select a voice"
320
- yield (
321
- None,
322
- error_msg,
323
- gr.update(visible=False),
324
- gr.update(visible=False),
325
- f"**🌍 Generations:** {load_counter()}",
326
- )
327
- return
328
-
329
- # Show loading state initially
330
- yield (
331
- None,
332
- "⏳ Loading...",
333
- gr.update(visible=False),
334
- gr.update(visible=False),
335
- f"**🌍 Generations:** {load_counter()}",
336
  )
337
 
338
- # Synthesize speech
339
- vertex_client = get_vertex_client()
340
- success, audio_bytes, metrics = vertex_client.synthesize(text, voice_id)
341
-
342
- if success and audio_bytes:
343
- # Save audio file in system temp directory
344
- temp_dir = tempfile.gettempdir()
345
- audio_file = os.path.join(
346
- temp_dir, f"ringg_{str(uuid.uuid4())}.wav"
347
- )
348
- with open(audio_file, "wb") as f:
349
- f.write(audio_bytes)
350
-
351
- # Increment counter
352
  new_count = increment_counter()
353
 
354
- # Format metrics
355
- metrics_json = ""
356
- has_metrics = False
357
- if metrics:
358
- has_metrics = True
359
- metrics_json = json.dumps(
360
- {
361
- "total_time": f"{metrics.get('t', 0):.3f}s",
362
- "rtf": f"{metrics.get('rtf', 0):.4f}",
363
- "audio_duration": f"{metrics.get('wav_seconds', 0):.2f}s",
364
- "vocoder_time": f"{metrics.get('t_vocoder', 0):.3f}s",
365
- "no_vocoder_time": f"{metrics.get('t_no_vocoder', 0):.3f}s",
366
- "rtf_no_vocoder": f"{metrics.get('rtf_no_vocoder', 0):.4f}",
367
- },
368
- indent=2,
369
- )
370
-
371
- # Yield success result
372
- yield (
373
- audio_file,
374
- "",
375
- gr.update(visible=has_metrics),
376
- gr.update(value=metrics_json, visible=has_metrics),
377
- f"**🌍 Generations:** {new_count}",
378
- )
379
- else:
380
- # Yield failure result
381
- yield (
382
- None,
383
- "❌ Failed to generate",
384
- gr.update(visible=False),
385
- gr.update(visible=False),
386
- f"**🌍 Generations:** {load_counter()}",
387
  )
388
 
 
 
 
 
 
 
 
389
  def refresh_counter_on_load():
390
  """Refresh the universal generation counter when the UI loads/reloads"""
391
  return f"**🌍 Generations since last reload:** {load_counter()}"
@@ -417,7 +357,7 @@ with gr.Blocks(
417
  inputs=[text_input, voice_dropdown],
418
  outputs=[
419
  audio_output,
420
- status,
421
  metrics_header,
422
  metrics_output,
423
  generation_counter,
 
5
  import uuid
6
  import fcntl
7
  import time
 
8
  from vertex_client import get_vertex_client
9
 
10
  # gr.NO_RELOAD = False
 
152
 
153
  if success and audio_bytes:
154
  print("✅ Synthesized audio using Vertex AI")
155
+ # Save binary audio to temp file
156
+ audio_file = f"/tmp/ringg_{str(uuid.uuid4())}.wav"
 
157
  with open(audio_file, "wb") as f:
158
  f.write(audio_bytes)
159
 
 
170
  rtf_no_vocoder
171
  ) = ""
172
 
173
+ status_msg = "✅ Audio generated successfully!"
174
 
175
  return (
176
  audio_file,
 
220
 
221
  # Best Practices Section
222
  gr.Markdown("""
223
+ ### 📝 Best Practices for Best Results
224
  - **Supported Languages:** Hindi and English only
225
  - **Check spelling carefully:** Misspelled words may be mispronounced
226
  - **Punctuation matters:** Use proper punctuation for natural pauses and intonation
 
228
  - **Numbers & dates:** Write numbers as words for better pronunciation (e.g., "twenty-five" instead of "25")
229
  """)
230
 
231
+ # Text Input
232
+ text_input = gr.Textbox(
233
+ label="Text (max 300 characters)",
234
+ placeholder="Type or paste your text here (max 300 characters)...",
235
+ lines=6,
236
+ max_lines=10,
237
+ max_length=300,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  )
239
 
240
+ # Character count display
241
+ char_count = gr.Markdown("**Character count:** 0 / 300")
242
+
243
+ with gr.Row():
244
+ with gr.Column(scale=1):
245
+ # Voice Selection
246
+ voices = get_voices()
247
+ voice_choices = {display: vid for display, vid in voices}
248
+
249
+ voice_dropdown = gr.Dropdown(
250
+ choices=list(voice_choices.keys()),
251
+ label="Choose a voice style",
252
+ info=f"{len(voices)} voices available",
253
+ value=list(voice_choices.keys())[0] if voices else None,
254
+ )
255
+
256
+ with gr.Column(scale=1):
257
+ audio_output = gr.Audio(label="Listen to your audio", type="filepath")
258
+ metrics_header = gr.Markdown("### 📊 Generation Metrics", visible=False)
259
+ metrics_output = gr.Code(
260
+ label="Metrics", language="json", interactive=False, visible=False
261
+ )
262
+
263
  generate_btn = gr.Button("🎬 Generate Speech", variant="primary", size="lg")
264
 
265
+ gr.Markdown("#### 🎯 Try these examples:")
266
  with gr.Row():
267
  example_btn1 = gr.Button("English Example", size="sm")
268
  example_btn2 = gr.Button("Hindi Example", size="sm")
 
280
  def update_char_count(text):
281
  """Update character count as user types"""
282
  count = len(text) if text else 0
283
+ return f"**Character count:** {count} / 300"
284
 
285
  def load_example_text(example_text):
286
  """Load example text and update character count"""
287
  count = len(example_text)
288
+ return example_text, f"**Character count:** {count} / 300"
289
 
290
  def clear_text():
291
  """Clear text input"""
292
+ return "", "**Character count:** 0 / 300"
293
 
294
  def on_generate(text, voice_display):
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  voice_id = voice_choices.get(voice_display)
296
+ audio_file, _status, t_time, rtf, wav_dur, voc_time, no_voc_time, rtf_no_voc = (
297
+ synthesize_speech(text, voice_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  )
299
 
300
+ # Get fresh counter from file
301
+ new_count = load_counter()
302
+ if audio_file:
303
+ # Atomically increment the UNIVERSAL counter
 
 
 
 
 
 
 
 
 
 
304
  new_count = increment_counter()
305
 
306
+ # Format metrics as JSON string (only if available)
307
+ has_metrics = any([t_time, rtf, wav_dur, voc_time, no_voc_time, rtf_no_voc])
308
+ metrics_json = ""
309
+ if has_metrics:
310
+ metrics_json = json.dumps(
311
+ {
312
+ "total_time": t_time,
313
+ "rtf": rtf,
314
+ "audio_duration": wav_dur,
315
+ "vocoder_time": voc_time,
316
+ "no_vocoder_time": no_voc_time,
317
+ "rtf_no_vocoder": rtf_no_voc,
318
+ },
319
+ indent=2,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  )
321
 
322
+ return (
323
+ audio_file,
324
+ gr.update(visible=has_metrics),
325
+ gr.update(value=metrics_json, visible=has_metrics),
326
+ f"**🌍 Generations:** {new_count}",
327
+ )
328
+
329
  def refresh_counter_on_load():
330
  """Refresh the universal generation counter when the UI loads/reloads"""
331
  return f"**🌍 Generations since last reload:** {load_counter()}"
 
357
  inputs=[text_input, voice_dropdown],
358
  outputs=[
359
  audio_output,
360
+ # status_output,
361
  metrics_header,
362
  metrics_output,
363
  generation_counter,
generation_counter.json CHANGED
@@ -1 +1 @@
1
- {"count": 11, "last_updated": 1763749917.869355}
 
1
+ {"count": 3, "last_updated": 1762495500.191227}
vertex_client.py CHANGED
@@ -57,7 +57,7 @@ class VertexAIClient:
57
 
58
  def initialize(self) -> bool:
59
  """
60
- Initialize Vertex AI and find the zipvoice_base_distill endpoint.
61
 
62
  Returns:
63
  True if initialization successful, False otherwise
@@ -80,20 +80,16 @@ class VertexAIClient:
80
  )
81
  logger.info("Vertex AI initialized for project desivocalprod01")
82
 
83
- # Find distill endpoint
84
  for endpoint in aiplatform.Endpoint.list():
85
- if endpoint.display_name == "zipvoice_base_distill":
86
  self.endpoint = endpoint
87
- logger.info(f"Found zipvoice_base_distill endpoint: {endpoint.resource_name}")
88
- break
 
89
 
90
- # Check if endpoint is found
91
- if not self.endpoint:
92
- logger.error("zipvoice_base_distill endpoint not found in Vertex AI")
93
- return False
94
-
95
- self.initialized = True
96
- return True
97
 
98
  except Exception as e:
99
  logger.error(f"Failed to initialize Vertex AI: {e}")
@@ -132,7 +128,7 @@ class VertexAIClient:
132
 
133
  def synthesize(self, text: str, voice_id: str, timeout: int = 60) -> Tuple[bool, Optional[bytes], Optional[Dict[str, Any]]]:
134
  """
135
- Synthesize speech from text using Vertex AI distill endpoint.
136
 
137
  Args:
138
  text: Text to synthesize
@@ -147,12 +143,11 @@ class VertexAIClient:
147
  return False, None, None
148
 
149
  try:
150
- logger.info(f"Synthesizing text (length: {len(text)}) with voice {voice_id} using distill model")
151
  response = self.endpoint.raw_predict(
152
  body=json.dumps({
153
  "text": text,
154
  "voice_id": voice_id,
155
- "model_type": "distill",
156
  }),
157
  headers={"Content-Type": "application/json"},
158
  )
@@ -191,7 +186,6 @@ class VertexAIClient:
191
  return False, None, None
192
 
193
 
194
-
195
  # Global instance
196
  _vertex_client = None
197
 
 
57
 
58
  def initialize(self) -> bool:
59
  """
60
+ Initialize Vertex AI and find the zipvoice endpoint.
61
 
62
  Returns:
63
  True if initialization successful, False otherwise
 
80
  )
81
  logger.info("Vertex AI initialized for project desivocalprod01")
82
 
83
+ # Find the zipvoice endpoint
84
  for endpoint in aiplatform.Endpoint.list():
85
+ if endpoint.display_name == "zipvoice":
86
  self.endpoint = endpoint
87
+ self.initialized = True
88
+ logger.info(f"Found zipvoice endpoint: {endpoint.resource_name}")
89
+ return True
90
 
91
+ logger.error("zipvoice endpoint not found in Vertex AI")
92
+ return False
 
 
 
 
 
93
 
94
  except Exception as e:
95
  logger.error(f"Failed to initialize Vertex AI: {e}")
 
128
 
129
  def synthesize(self, text: str, voice_id: str, timeout: int = 60) -> Tuple[bool, Optional[bytes], Optional[Dict[str, Any]]]:
130
  """
131
+ Synthesize speech from text using Vertex AI endpoint.
132
 
133
  Args:
134
  text: Text to synthesize
 
143
  return False, None, None
144
 
145
  try:
146
+ logger.info(f"Synthesizing text (length: {len(text)}) with voice {voice_id}")
147
  response = self.endpoint.raw_predict(
148
  body=json.dumps({
149
  "text": text,
150
  "voice_id": voice_id,
 
151
  }),
152
  headers={"Content-Type": "application/json"},
153
  )
 
186
  return False, None, None
187
 
188
 
 
189
  # Global instance
190
  _vertex_client = None
191