Pocket-TTS

Sleeping

App Files Files Community

Nymbo committed on Jan 19

Commit

a23619f

verified ·

1 Parent(s): 4d21128

Update app.py

Browse files

Files changed (1) hide show

app.py +100 -2

app.py CHANGED Viewed

@@ -105,10 +105,76 @@ def _init_pocket(
             "sample_rate": model.sample_rate,
         })
         print(f"Pocket TTS initialized. Sample rate: {model.sample_rate} Hz")
     except Exception as e:
         raise gr.Error(f"Failed to initialize Pocket TTS model: {str(e)}")
 def _convert_to_wav(audio_path: str) -> str:
     """Convert audio file to WAV format if needed.
@@ -181,7 +247,40 @@ def _get_voice_state(voice_name: str | None, custom_audio_path: str | None):
     if voice_name in _POCKET_STATE["voice_states"]:
         return _POCKET_STATE["voice_states"][voice_name]
-    # Load and cache voice state
     voice_path = PRESET_VOICES[voice_name]
     print(f"Loading preset voice '{voice_name}' from: {voice_path}")
@@ -381,7 +480,6 @@ with gr.Blocks() as demo:
                 label="Generated Speech",
                 streaming=True,
                 autoplay=True,
-                buttons=["download"],
             )
             with gr.Accordion("Advanced Options", open=False):

             "sample_rate": model.sample_rate,
         })
         print(f"Pocket TTS initialized. Sample rate: {model.sample_rate} Hz")
+        # Auto-create missing embeddings if voice cloning is available
+        if model.has_voice_cloning:
+            _create_missing_embeddings(model)
+        else:
+            print("Voice cloning not available - using pre-computed embeddings only")
     except Exception as e:
         raise gr.Error(f"Failed to initialize Pocket TTS model: {str(e)}")
+def _create_missing_embeddings(model) -> None:
+    """Create embeddings for any voices that have audio files but no embedding."""
+    import os
+    from pocket_tts.data.audio import audio_read
+    from pocket_tts.data.audio_utils import convert_audio
+    import safetensors.torch
+    voices_dir = os.path.join(os.path.dirname(__file__), "voices")
+    embeddings_dir = os.path.join(os.path.dirname(__file__), "embeddings")
+    if not os.path.exists(voices_dir):
+        return
+    os.makedirs(embeddings_dir, exist_ok=True)
+    audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a')
+    for voice_name, voice_path in PRESET_VOICES.items():
+        embedding_path = os.path.join(embeddings_dir, f"{voice_name}.safetensors")
+        # Skip if embedding already exists or no local file
+        if os.path.exists(embedding_path) or voice_path is None:
+            continue
+        # Skip fallback HuggingFace voices
+        if voice_path.startswith("hf://"):
+            continue
+        print(f"Creating embedding for '{voice_name}'...")
+        try:
+            # Convert to WAV if needed
+            audio_path = voice_path
+            if not voice_path.lower().endswith('.wav'):
+                from pydub import AudioSegment
+                import tempfile
+                audio = AudioSegment.from_file(voice_path)
+                temp_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
+                audio.export(temp_wav.name, format='wav')
+                audio_path = temp_wav.name
+            # Read and encode audio
+            audio, sr = audio_read(audio_path)
+            audio_tensor = convert_audio(audio, sr, model.config.mimi.sample_rate, 1)
+            with torch.no_grad():
+                audio_prompt = model._encode_audio(audio_tensor.unsqueeze(0).to(model.device))
+            # Save embedding
+            safetensors.torch.save_file(
+                {"audio_prompt": audio_prompt.cpu()},
+                embedding_path
+            )
+            print(f"  Saved: {embedding_path}")
+        except Exception as e:
+            print(f"  Error creating embedding for {voice_name}: {e}")
 def _convert_to_wav(audio_path: str) -> str:
     """Convert audio file to WAV format if needed.
     if voice_name in _POCKET_STATE["voice_states"]:
         return _POCKET_STATE["voice_states"][voice_name]
+    # Check for pre-computed embedding first (no voice cloning needed)
+    import os
+    embeddings_dir = os.path.join(os.path.dirname(__file__), "embeddings")
+    embedding_path = os.path.join(embeddings_dir, f"{voice_name}.safetensors")
+    if os.path.exists(embedding_path):
+        print(f"Loading pre-computed embedding for '{voice_name}' from: {embedding_path}")
+        import safetensors.torch
+        from pocket_tts.modules.stateful_module import init_states
+        # Load the audio prompt embedding
+        state_dict = safetensors.torch.load_file(embedding_path)
+        audio_prompt = state_dict["audio_prompt"].to(model.device)
+        # Create fresh model state and condition it with the audio prompt
+        # (same logic as model.get_state_for_audio_prompt uses internally)
+        voice_state = init_states(model.flow_lm, batch_size=1, sequence_length=1000)
+        model._run_flow_lm_and_increment_step(model_state=voice_state, audio_conditioning=audio_prompt)
+        # Detach all tensors to make them leaf tensors (required for deepcopy)
+        def detach_tensors(obj):
+            if isinstance(obj, torch.Tensor):
+                return obj.detach().clone()
+            elif isinstance(obj, dict):
+                return {k: detach_tensors(v) for k, v in obj.items()}
+            else:
+                return obj
+        voice_state = detach_tensors(voice_state)
+        _POCKET_STATE["voice_states"][voice_name] = voice_state
+        return voice_state
+    # Fall back to voice cloning (requires auth)
     voice_path = PRESET_VOICES[voice_name]
     print(f"Loading preset voice '{voice_name}' from: {voice_path}")
                 label="Generated Speech",
                 streaming=True,
                 autoplay=True,
             )
             with gr.Accordion("Advanced Options", open=False):