Spaces:

RinggAI
/

Ringg-TTS-v1.0

Running

App Files Files Community

Hi, I want to use this to train with my own voice (voice cloning). Can you tell me how, or can you add a voice for me?

by samxiao0 - opened Nov 10, 2025

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

+79

-145

Files changed (4) hide show

README.md +2 -2
app.py +66 -126
generation_counter.json +1 -1
vertex_client.py +10 -16

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
-title: Ringg Squirrel TTS V1.0
-emoji: 🐿️
 colorFrom: pink
 colorTo: blue
 sdk: gradio

 ---
+title: Ringg TTS V1.0
+emoji: 😻
 colorFrom: pink
 colorTo: blue
 sdk: gradio

app.py CHANGED Viewed

@@ -5,7 +5,6 @@ from pathlib import Path
 import uuid
 import fcntl
 import time
-import tempfile
 from vertex_client import get_vertex_client
 # gr.NO_RELOAD = False
@@ -153,9 +152,8 @@ def synthesize_speech(text, voice_id):
         if success and audio_bytes:
             print("✅ Synthesized audio using Vertex AI")
-            # Save binary audio to temp file in system temp directory
-            temp_dir = tempfile.gettempdir()
-            audio_file = os.path.join(temp_dir, f"ringg_{str(uuid.uuid4())}.wav")
             with open(audio_file, "wb") as f:
                 f.write(audio_bytes)
@@ -172,7 +170,7 @@ def synthesize_speech(text, voice_id):
                     rtf_no_vocoder
                 ) = ""
-            status_msg = ""
             return (
                 audio_file,
@@ -222,7 +220,7 @@ with gr.Blocks(
     # Best Practices Section
     gr.Markdown("""
-    ## 📝 Best Practices for Best Results
     - **Supported Languages:** Hindi and English only
     - **Check spelling carefully:** Misspelled words may be mispronounced
     - **Punctuation matters:** Use proper punctuation for natural pauses and intonation
@@ -230,48 +228,41 @@ with gr.Blocks(
     - **Numbers & dates:** Write numbers as words for better pronunciation (e.g., "twenty-five" instead of "25")
     """)
-    # Input Section - Text, Voice, and Character Count grouped together
-    with gr.Group():
-        # Text Input
-        text_input = gr.Textbox(
-            label="Text (max 300 characters)",
-            placeholder="Type or paste your text here (max 300 characters)...",
-            lines=6,
-            max_lines=10,
-            max_length=300,
-        )
-        # Voice Selection
-        voices = get_voices()
-        voice_choices = {display: vid for display, vid in voices}
-        voice_dropdown = gr.Dropdown(
-            choices=list(voice_choices.keys()),
-            label="Choose a voice style",
-            info=f"{len(voices)} voices available",
-            value=list(voice_choices.keys())[0] if voices else None,
-            show_label=False,
-        )
-        # Character count display
-        char_count = gr.Code(
-            "Character count: 0 / 300",
-            show_line_numbers=False,
-            show_label=False,
-        )
-    # Audio output section
-    gr.Markdown("### 🎧 Audio Result")
-    audio_output = gr.Audio(label="Generated Audio", type="filepath")
-    status = gr.Markdown("", visible=True)
-    metrics_header = gr.Markdown("**📊 Metrics**", visible=False)
-    metrics_output = gr.Code(
-        label="Performance Metrics",
-        language="json",
-        interactive=False,
-        visible=False,
     )
     generate_btn = gr.Button("🎬 Generate Speech", variant="primary", size="lg")
     with gr.Row():
         example_btn1 = gr.Button("English Example", size="sm")
         example_btn2 = gr.Button("Hindi Example", size="sm")
@@ -289,103 +280,52 @@ with gr.Blocks(
     def update_char_count(text):
         """Update character count as user types"""
         count = len(text) if text else 0
-        return f"Character count: {count} / 300"
     def load_example_text(example_text):
         """Load example text and update character count"""
         count = len(example_text)
-        return example_text, f"Character count: {count} / 300"
     def clear_text():
         """Clear text input"""
-        return "", "Character count: 0 / 300"
     def on_generate(text, voice_display):
-        """Generate speech using the distill model."""
-        # Validate inputs
-        if not text or not text.strip():
-            error_msg = "⚠️ Please enter some text"
-            yield (
-                None,
-                error_msg,
-                gr.update(visible=False),
-                gr.update(visible=False),
-                f"**🌍 Generations:** {load_counter()}",
-            )
-            return
         voice_id = voice_choices.get(voice_display)
-        if not voice_id:
-            error_msg = "⚠️ Please select a voice"
-            yield (
-                None,
-                error_msg,
-                gr.update(visible=False),
-                gr.update(visible=False),
-                f"**🌍 Generations:** {load_counter()}",
-            )
-            return
-        # Show loading state initially
-        yield (
-            None,
-            "⏳ Loading...",
-            gr.update(visible=False),
-            gr.update(visible=False),
-            f"**🌍 Generations:** {load_counter()}",
         )
-        # Synthesize speech
-        vertex_client = get_vertex_client()
-        success, audio_bytes, metrics = vertex_client.synthesize(text, voice_id)
-        if success and audio_bytes:
-            # Save audio file in system temp directory
-            temp_dir = tempfile.gettempdir()
-            audio_file = os.path.join(
-                temp_dir, f"ringg_{str(uuid.uuid4())}.wav"
-            )
-            with open(audio_file, "wb") as f:
-                f.write(audio_bytes)
-            # Increment counter
             new_count = increment_counter()
-            # Format metrics
-            metrics_json = ""
-            has_metrics = False
-            if metrics:
-                has_metrics = True
-                metrics_json = json.dumps(
-                    {
-                        "total_time": f"{metrics.get('t', 0):.3f}s",
-                        "rtf": f"{metrics.get('rtf', 0):.4f}",
-                        "audio_duration": f"{metrics.get('wav_seconds', 0):.2f}s",
-                        "vocoder_time": f"{metrics.get('t_vocoder', 0):.3f}s",
-                        "no_vocoder_time": f"{metrics.get('t_no_vocoder', 0):.3f}s",
-                        "rtf_no_vocoder": f"{metrics.get('rtf_no_vocoder', 0):.4f}",
-                    },
-                    indent=2,
-                )
-            # Yield success result
-            yield (
-                audio_file,
-                "",
-                gr.update(visible=has_metrics),
-                gr.update(value=metrics_json, visible=has_metrics),
-                f"**🌍 Generations:** {new_count}",
-            )
-        else:
-            # Yield failure result
-            yield (
-                None,
-                "❌ Failed to generate",
-                gr.update(visible=False),
-                gr.update(visible=False),
-                f"**🌍 Generations:** {load_counter()}",
             )
     def refresh_counter_on_load():
         """Refresh the universal generation counter when the UI loads/reloads"""
         return f"**🌍 Generations since last reload:** {load_counter()}"
@@ -417,7 +357,7 @@ with gr.Blocks(
         inputs=[text_input, voice_dropdown],
         outputs=[
             audio_output,
-            status,
             metrics_header,
             metrics_output,
             generation_counter,

 import uuid
 import fcntl
 import time
 from vertex_client import get_vertex_client
 # gr.NO_RELOAD = False
         if success and audio_bytes:
             print("✅ Synthesized audio using Vertex AI")
+            # Save binary audio to temp file
+            audio_file = f"/tmp/ringg_{str(uuid.uuid4())}.wav"
             with open(audio_file, "wb") as f:
                 f.write(audio_bytes)
                     rtf_no_vocoder
                 ) = ""
+            status_msg = "✅ Audio generated successfully!"
             return (
                 audio_file,
     # Best Practices Section
     gr.Markdown("""
+    ### 📝 Best Practices for Best Results
     - **Supported Languages:** Hindi and English only
     - **Check spelling carefully:** Misspelled words may be mispronounced
     - **Punctuation matters:** Use proper punctuation for natural pauses and intonation
     - **Numbers & dates:** Write numbers as words for better pronunciation (e.g., "twenty-five" instead of "25")
     """)
+    # Text Input
+    text_input = gr.Textbox(
+        label="Text (max 300 characters)",
+        placeholder="Type or paste your text here (max 300 characters)...",
+        lines=6,
+        max_lines=10,
+        max_length=300,
     )
+    # Character count display
+    char_count = gr.Markdown("**Character count:** 0 / 300")
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Voice Selection
+            voices = get_voices()
+            voice_choices = {display: vid for display, vid in voices}
+            voice_dropdown = gr.Dropdown(
+                choices=list(voice_choices.keys()),
+                label="Choose a voice style",
+                info=f"{len(voices)} voices available",
+                value=list(voice_choices.keys())[0] if voices else None,
+            )
+        with gr.Column(scale=1):
+            audio_output = gr.Audio(label="Listen to your audio", type="filepath")
+            metrics_header = gr.Markdown("### 📊 Generation Metrics", visible=False)
+            metrics_output = gr.Code(
+                label="Metrics", language="json", interactive=False, visible=False
+            )
     generate_btn = gr.Button("🎬 Generate Speech", variant="primary", size="lg")
+    gr.Markdown("#### 🎯 Try these examples:")
     with gr.Row():
         example_btn1 = gr.Button("English Example", size="sm")
         example_btn2 = gr.Button("Hindi Example", size="sm")
     def update_char_count(text):
         """Update character count as user types"""
         count = len(text) if text else 0
+        return f"**Character count:** {count} / 300"
     def load_example_text(example_text):
         """Load example text and update character count"""
         count = len(example_text)
+        return example_text, f"**Character count:** {count} / 300"
     def clear_text():
         """Clear text input"""
+        return "", "**Character count:** 0 / 300"
     def on_generate(text, voice_display):
         voice_id = voice_choices.get(voice_display)
+        audio_file, _status, t_time, rtf, wav_dur, voc_time, no_voc_time, rtf_no_voc = (
+            synthesize_speech(text, voice_id)
         )
+        # Get fresh counter from file
+        new_count = load_counter()
+        if audio_file:
+            # Atomically increment the UNIVERSAL counter
             new_count = increment_counter()
+        # Format metrics as JSON string (only if available)
+        has_metrics = any([t_time, rtf, wav_dur, voc_time, no_voc_time, rtf_no_voc])
+        metrics_json = ""
+        if has_metrics:
+            metrics_json = json.dumps(
+                {
+                    "total_time": t_time,
+                    "rtf": rtf,
+                    "audio_duration": wav_dur,
+                    "vocoder_time": voc_time,
+                    "no_vocoder_time": no_voc_time,
+                    "rtf_no_vocoder": rtf_no_voc,
+                },
+                indent=2,
             )
+        return (
+            audio_file,
+            gr.update(visible=has_metrics),
+            gr.update(value=metrics_json, visible=has_metrics),
+            f"**🌍 Generations:** {new_count}",
+        )
     def refresh_counter_on_load():
         """Refresh the universal generation counter when the UI loads/reloads"""
         return f"**🌍 Generations since last reload:** {load_counter()}"
         inputs=[text_input, voice_dropdown],
         outputs=[
             audio_output,
+            # status_output,
             metrics_header,
             metrics_output,
             generation_counter,

generation_counter.json CHANGED Viewed

	@@ -1 +1 @@
1	- {"count": 11, "last_updated": 1763749917.869355}


1	+ {"count": 3, "last_updated": 1762495500.191227}

vertex_client.py CHANGED Viewed

@@ -57,7 +57,7 @@ class VertexAIClient:
     def initialize(self) -> bool:
         """
-        Initialize Vertex AI and find the zipvoice_base_distill endpoint.
         Returns:
             True if initialization successful, False otherwise
@@ -80,20 +80,16 @@ class VertexAIClient:
             )
             logger.info("Vertex AI initialized for project desivocalprod01")
-            # Find distill endpoint
             for endpoint in aiplatform.Endpoint.list():
-                if endpoint.display_name == "zipvoice_base_distill":
                     self.endpoint = endpoint
-                    logger.info(f"Found zipvoice_base_distill endpoint: {endpoint.resource_name}")
-                    break
-            # Check if endpoint is found
-            if not self.endpoint:
-                logger.error("zipvoice_base_distill endpoint not found in Vertex AI")
-                return False
-            self.initialized = True
-            return True
         except Exception as e:
             logger.error(f"Failed to initialize Vertex AI: {e}")
@@ -132,7 +128,7 @@ class VertexAIClient:
     def synthesize(self, text: str, voice_id: str, timeout: int = 60) -> Tuple[bool, Optional[bytes], Optional[Dict[str, Any]]]:
         """
-        Synthesize speech from text using Vertex AI distill endpoint.
         Args:
             text: Text to synthesize
@@ -147,12 +143,11 @@ class VertexAIClient:
                 return False, None, None
         try:
-            logger.info(f"Synthesizing text (length: {len(text)}) with voice {voice_id} using distill model")
             response = self.endpoint.raw_predict(
                 body=json.dumps({
                     "text": text,
                     "voice_id": voice_id,
-                    "model_type": "distill",
                 }),
                 headers={"Content-Type": "application/json"},
             )
@@ -191,7 +186,6 @@ class VertexAIClient:
             return False, None, None
 # Global instance
 _vertex_client = None

     def initialize(self) -> bool:
         """
+        Initialize Vertex AI and find the zipvoice endpoint.
         Returns:
             True if initialization successful, False otherwise
             )
             logger.info("Vertex AI initialized for project desivocalprod01")
+            # Find the zipvoice endpoint
             for endpoint in aiplatform.Endpoint.list():
+                if endpoint.display_name == "zipvoice":
                     self.endpoint = endpoint
+                    self.initialized = True
+                    logger.info(f"Found zipvoice endpoint: {endpoint.resource_name}")
+                    return True
+            logger.error("zipvoice endpoint not found in Vertex AI")
+            return False
         except Exception as e:
             logger.error(f"Failed to initialize Vertex AI: {e}")
     def synthesize(self, text: str, voice_id: str, timeout: int = 60) -> Tuple[bool, Optional[bytes], Optional[Dict[str, Any]]]:
         """
+        Synthesize speech from text using Vertex AI endpoint.
         Args:
             text: Text to synthesize
                 return False, None, None
         try:
+            logger.info(f"Synthesizing text (length: {len(text)}) with voice {voice_id}")
             response = self.endpoint.raw_predict(
                 body=json.dumps({
                     "text": text,
                     "voice_id": voice_id,
                 }),
                 headers={"Content-Type": "application/json"},
             )
             return False, None, None
 # Global instance
 _vertex_client = None