instVC

Running

App Files Files Community

Nymbo committed on Jan 19

Commit

b0e1ce1

verified ·

1 Parent(s): 9ec85a2

Create app.py

Browse files

Files changed (1) hide show

app.py +457 -0

app.py ADDED Viewed

	@@ -0,0 +1,457 @@

+import gradio as gr
+import io
+import wave
+import numpy as np
+# Lazy imports for optional dependencies
+try:
+    import torch  # type: ignore
+except Exception:  # pragma: no cover
+    torch = None  # type: ignore
+try:
+    from pocket_tts import TTSModel  # type: ignore
+except Exception:  # pragma: no cover
+    TTSModel = None  # type: ignore
+# Global state for lazy initialization.
+# Written by _init_pocket() (model, sample rate, initialized flag) and by
+# _get_voice_state() (voice state cache); read by pocket_tts_stream().
+_POCKET_STATE = {
+    "initialized": False,  # flipped to True once the model has been loaded
+    "model": None,  # the loaded TTSModel instance; None until initialized
+    "voice_states": {},  # cache: preset voice name -> voice state
+    "sample_rate": 24000,  # placeholder; overwritten with model.sample_rate
+}
+# Fallback voices from kyutai/tts-voices (used if no local voices found)
+# Maps the voice name shown in the UI to the hf:// path of its recording.
+_FALLBACK_VOICES = {
+    "alba": "hf://kyutai/tts-voices/alba-mackenna/casual.wav",
+    "marius": "hf://kyutai/tts-voices/voice-donations/Selfie.wav",
+    "javert": "hf://kyutai/tts-voices/voice-donations/Butter.wav",
+    "jean": "hf://kyutai/tts-voices/ears/p010/freeform_speech_01.wav",
+    "fantine": "hf://kyutai/tts-voices/vctk/p244_023.wav",
+    "cosette": "hf://kyutai/tts-voices/expresso/ex04-ex02_confused_001_channel1_499s.wav",
+    "eponine": "hf://kyutai/tts-voices/vctk/p262_023.wav",
+    "azelma": "hf://kyutai/tts-voices/vctk/p303_023.wav",
+}
+def _get_available_voices() -> dict[str, str]:
+    """Get available voices, preferring local files over HuggingFace.
+    Scans the voices/ directory next to this file for audio files
+    (WAV, MP3, FLAC, OGG, M4A). The voice name is the file name without
+    its extension. Entries are visited in sorted order so the resulting
+    mapping, and therefore the default (first) voice, is the same on every
+    run and platform.
+    Returns:
+        Mapping of voice name to audio path. Local voices are used
+        exclusively when at least one is found; otherwise a copy of the
+        HuggingFace preset voices is returned.
+    """
+    import os
+    audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a')
+    voices_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "voices")
+    local_voices = {}
+    if os.path.isdir(voices_dir):
+        # os.listdir() returns entries in arbitrary order; sort them so the
+        # dropdown order and the default voice are deterministic.
+        for entry in sorted(os.listdir(voices_dir)):
+            entry_path = os.path.join(voices_dir, entry)
+            # Skip sub-directories that merely look like audio files.
+            if entry.lower().endswith(audio_extensions) and os.path.isfile(entry_path):
+                local_voices[os.path.splitext(entry)[0]] = entry_path
+    # If we found local voices, use those exclusively
+    if local_voices:
+        print(f"Found {len(local_voices)} local voice(s): {list(local_voices.keys())}")
+        return local_voices
+    # Fall back to HuggingFace voices
+    print("No local voices found, using HuggingFace preset voices")
+    # Return a copy so callers cannot mutate the module-level presets.
+    return dict(_FALLBACK_VOICES)
+# Scan voices at import time; the result feeds the voice dropdown and the
+# preset lookup in _get_voice_state(). The scan runs once, so files added to
+# the voices directory afterwards are not picked up by this mapping.
+PRESET_VOICES = _get_available_voices()
+def _init_pocket(
+    temp: float = 0.7,
+    lsd_decode_steps: int = 1,
+    noise_clamp: float | None = None,
+    eos_threshold: float = -4.0,
+) -> None:
+    """Lazily load the Pocket TTS model, reloading it when settings change.
+    The model is loaded on first use. The settings it was loaded with are
+    remembered in _POCKET_STATE["params"]; a later call with different
+    settings reloads the model instead of silently ignoring the new values.
+    Cached voice states are dropped on every (re)load because they were
+    computed by the previous model instance.
+    Args:
+        temp: Sampling temperature passed to TTSModel.load_model.
+        lsd_decode_steps: Number of LSD decode steps.
+        noise_clamp: Noise clamp value, or None to disable clamping.
+        eos_threshold: End-of-sequence threshold.
+    Raises:
+        gr.Error: If pocket-tts or PyTorch is missing, or loading fails.
+    """
+    if TTSModel is None:
+        raise gr.Error(
+            "pocket-tts is not installed. Please install with: pip install pocket-tts"
+        )
+    if torch is None:
+        raise gr.Error("PyTorch is not installed. Please install torch>=2.5.0")
+    # Normalize once so the comparison below is not fooled by int/float mixes
+    # coming from the UI sliders.
+    params = (
+        float(temp),
+        int(lsd_decode_steps),
+        float(noise_clamp) if noise_clamp is not None else None,
+        float(eos_threshold),
+    )
+    if _POCKET_STATE["initialized"] and _POCKET_STATE.get("params") == params:
+        return
+    print("Initializing Pocket TTS...")
+    # NOTE: no device is selected here; load_model() is called without a
+    # device argument, so the library default applies.
+    try:
+        model = TTSModel.load_model(
+            temp=params[0],
+            lsd_decode_steps=params[1],
+            noise_clamp=params[2],
+            eos_threshold=params[3],
+        )
+        _POCKET_STATE.update({
+            "initialized": True,
+            "model": model,
+            "sample_rate": model.sample_rate,
+            "params": params,
+            "voice_states": {},
+        })
+        print(f"Pocket TTS initialized. Sample rate: {model.sample_rate} Hz")
+    except Exception as e:
+        raise gr.Error(f"Failed to initialize Pocket TTS model: {str(e)}") from e
+def _convert_to_wav(audio_path: str) -> str:
+    """Convert an audio file to WAV format if needed.
+    Paths that already end in .wav (case-insensitive) are returned unchanged.
+    Otherwise the audio is converted into a new temporary .wav file, trying
+    pydub first (needs ffmpeg) and soundfile second.
+    Args:
+        audio_path: Path of the source audio file.
+    Returns:
+        The original path for WAV input, else the path of the temporary WAV
+        file. The caller owns that temporary file.
+    Raises:
+        gr.Error: If neither pydub nor soundfile can convert the file.
+    """
+    import os
+    import tempfile
+    # Check if already WAV
+    if audio_path.lower().endswith('.wav'):
+        return audio_path
+    print(f"Converting {audio_path} to WAV format...")
+    # Reserve a temp file path; only the path is needed, so close the handle.
+    tmp_fd, wav_path = tempfile.mkstemp(suffix=".wav")
+    os.close(tmp_fd)
+    # Try pydub first (better MP3 support via ffmpeg)
+    try:
+        from pydub import AudioSegment
+    except ImportError:
+        AudioSegment = None  # pydub not installed, try soundfile
+    if AudioSegment is not None:
+        try:
+            audio = AudioSegment.from_file(audio_path)
+            # export() returns the open output file object; close it so the
+            # handle is not leaked.
+            audio.export(wav_path, format="wav").close()
+            print(f"Converted via pydub to: {wav_path}")
+            return wav_path
+        except Exception as e:
+            print(f"pydub conversion failed: {e}, trying soundfile...")
+    # Fall back to soundfile
+    try:
+        import soundfile as sf
+        audio_data, sample_rate = sf.read(audio_path)
+        sf.write(wav_path, audio_data, sample_rate)
+        print(f"Converted via soundfile to: {wav_path}")
+        return wav_path
+    except Exception as e:
+        # Both converters failed: do not leave the empty temp file behind.
+        try:
+            os.remove(wav_path)
+        except OSError:
+            pass  # best-effort cleanup; the conversion error matters more
+        raise gr.Error(f"Failed to convert audio file: {str(e)}. Please upload a WAV file directly or install pydub+ffmpeg for MP3 support.") from e
+def _get_voice_state(voice_name: str | None, custom_audio_path: str | None):
+    """Get or create voice state for generation.
+    A custom audio file, when given, takes priority and is never cached.
+    Otherwise the named preset is used, falling back to the first available
+    preset when the name is missing or unknown. Preset states are cached in
+    _POCKET_STATE["voice_states"].
+    Args:
+        voice_name: Name of preset voice (a key of PRESET_VOICES)
+        custom_audio_path: Path to custom audio file for voice cloning
+    Returns:
+        The value returned by model.get_state_for_audio_prompt()
+    Raises:
+        gr.Error: If the model is not loaded or no voice is available.
+    """
+    import os
+    model = _POCKET_STATE["model"]
+    if model is None:
+        raise gr.Error("Pocket TTS model is not initialized.")
+    def _state_from_audio(source_path: str):
+        """Build a voice state, removing any temporary WAV created for it."""
+        wav_path = _convert_to_wav(source_path)
+        try:
+            return model.get_state_for_audio_prompt(wav_path)
+        finally:
+            # _convert_to_wav() returns a fresh temp file for non-WAV input.
+            # Assumes the returned state no longer needs the file on disk
+            # - TODO confirm against pocket-tts.
+            if wav_path != source_path:
+                try:
+                    os.remove(wav_path)
+                except OSError:
+                    pass  # best-effort cleanup
+    # Custom audio takes priority
+    if custom_audio_path:
+        print(f"Loading custom voice from: {custom_audio_path}")
+        return _state_from_audio(custom_audio_path)
+    # Use preset voice
+    if not voice_name or voice_name not in PRESET_VOICES:
+        # Default to first available voice
+        voice_name = next(iter(PRESET_VOICES), None)
+        if not voice_name:
+            raise gr.Error("No voices available. Add audio files to the voices/ directory.")
+    # Check cache
+    cache = _POCKET_STATE["voice_states"]
+    if voice_name in cache:
+        return cache[voice_name]
+    # Load and cache voice state
+    voice_path = PRESET_VOICES[voice_name]
+    print(f"Loading preset voice '{voice_name}' from: {voice_path}")
+    voice_state = _state_from_audio(voice_path)
+    cache[voice_name] = voice_state
+    return voice_state
+def _audio_np_to_int16(audio_np: np.ndarray) -> np.ndarray:
+    """Clamp float samples to [-1.0, 1.0] and scale them to int16 PCM."""
+    # Clip first so out-of-range samples saturate instead of wrapping around.
+    scaled = np.multiply(np.clip(audio_np, -1.0, 1.0), 32767.0)
+    return scaled.astype(np.int16)
+def _wav_bytes_from_int16(audio_int16: np.ndarray, sample_rate: int) -> bytes:
+    """Serialize int16 samples as an in-memory mono 16-bit PCM WAV file."""
+    sink = io.BytesIO()
+    writer = wave.open(sink, "wb")
+    try:
+        # (channels, sample width in bytes, frame rate, frame count,
+        # compression type, compression name); the frame count is fixed up
+        # from the data actually written.
+        writer.setparams((1, 2, sample_rate, 0, "NONE", "not compressed"))
+        writer.writeframes(audio_int16.tobytes())
+    finally:
+        writer.close()
+    return sink.getvalue()
+def _split_into_sentences(text: str) -> list[str]:
+    """Break text into sentence-sized chunks for chunk-by-chunk generation.
+    A chunk boundary is any run of whitespace that directly follows '.', '!'
+    or '?'; the punctuation stays attached to the chunk it ends.
+    """
+    import re
+    # The lookbehind keeps the terminator with its sentence; runs such as
+    # "..." or "?!" only split after their last character.
+    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
+    # Drop empty pieces and surrounding whitespace.
+    return [trimmed for piece in pieces if (trimmed := piece.strip())]
+def pocket_tts_stream(
+    text: str,
+    voice: str,
+    custom_audio,
+    temperature: float,
+    lsd_decode_steps: int,
+    noise_clamp: float | None,
+    eos_threshold: float,
+    frames_after_eos: int,
+):
+    """Generate speech with sentence-level streaming.
+    Splits text into sentences, generates each one in full, and yields WAV
+    bytes. The first yield is held back until about 5 seconds of audio have
+    accumulated (or the text is exhausted); after that every sentence is
+    yielded as soon as it is generated. Empty audio chunks are not yielded.
+    Args:
+        text: Text to synthesize.
+        voice: Preset voice name; ignored when custom_audio is given.
+        custom_audio: Path of a reference audio file, or a falsy value.
+        temperature: Sampling temperature for the model.
+        lsd_decode_steps: Number of LSD decode steps.
+        noise_clamp: Noise clamp; None or a value <= 0 disables it.
+        eos_threshold: End-of-sequence threshold.
+        frames_after_eos: Extra frames after EOS; a value <= 0 passes None.
+    Yields:
+        bytes: A complete mono 16-bit WAV file per chunk.
+    Raises:
+        gr.Error: On empty text, generation failure, or when no audio
+            samples were produced at all.
+    """
+    if not text or not text.strip():
+        raise gr.Error("Please enter text to synthesize.")
+    # Initialize model with current parameters
+    _init_pocket(
+        temp=temperature,
+        lsd_decode_steps=lsd_decode_steps,
+        noise_clamp=noise_clamp if noise_clamp and noise_clamp > 0 else None,
+        eos_threshold=eos_threshold,
+    )
+    model = _POCKET_STATE["model"]
+    sample_rate = _POCKET_STATE["sample_rate"]
+    # Get voice state
+    voice_state = _get_voice_state(voice, custom_audio if custom_audio else None)
+    # Split text into sentences for natural chunking
+    sentences = _split_into_sentences(text)
+    if not sentences:
+        raise gr.Error("No valid sentences found in text.")
+    # UI sliders can deliver floats; the model is given a frame count as int.
+    extra_frames = (
+        int(frames_after_eos) if frames_after_eos and frames_after_eos > 0 else None
+    )
+    # Buffer for initial audio - wait for ~5 seconds before yielding first chunk
+    # This prevents stuttering from short first sentences
+    min_initial_samples = int(sample_rate * 5)
+    pending = []
+    pending_samples = 0
+    initial_buffer_yielded = False
+    produced_any = False
+    last_idx = len(sentences) - 1
+    try:
+        for idx, sentence in enumerate(sentences):
+            # Generate complete audio for this sentence (non-streaming per sentence)
+            audio = model.generate_audio(
+                voice_state,
+                sentence,
+                frames_after_eos=extra_frames,
+                copy_state=True,
+            )
+            # Convert tensor to numpy
+            audio_np = audio.cpu().numpy() if hasattr(audio, 'cpu') else audio
+            if initial_buffer_yielded:
+                # After initial buffer, yield each sentence immediately
+                if len(audio_np):
+                    produced_any = True
+                    yield _wav_bytes_from_int16(_audio_np_to_int16(audio_np), sample_rate)
+                continue
+            # Accumulate in buffer until we have enough audio
+            pending.append(audio_np)
+            pending_samples += len(audio_np)
+            # Flush once we have enough or this is the last sentence
+            if pending_samples >= min_initial_samples or idx == last_idx:
+                if pending_samples:
+                    produced_any = True
+                    combined = np.concatenate(pending, axis=0)
+                    yield _wav_bytes_from_int16(_audio_np_to_int16(combined), sample_rate)
+                pending = []
+                pending_samples = 0
+                initial_buffer_yielded = True
+    except gr.Error:
+        raise
+    except Exception as e:
+        # Keep the UI message short, but only mark it as cut when it was.
+        message = str(e)
+        if len(message) > 200:
+            message = f"{message[:200]}..."
+        raise gr.Error(f"Error during speech generation: {message}") from e
+    if not produced_any:
+        raise gr.Error("No audio was generated.")
+def generate_tts(
+    text: str,
+    voice: str,
+    custom_audio,
+    temperature: float,
+    lsd_decode_steps: int,
+    noise_clamp: float,
+    eos_threshold: float,
+    frames_after_eos: int,
+):
+    """Gradio event handler: forward all inputs to pocket_tts_stream.
+    Yields every chunk that pocket_tts_stream yields, unchanged.
+    """
+    yield from pocket_tts_stream(
+        text=text,
+        voice=voice,
+        custom_audio=custom_audio,
+        temperature=temperature,
+        lsd_decode_steps=lsd_decode_steps,
+        noise_clamp=noise_clamp,
+        eos_threshold=eos_threshold,
+        frames_after_eos=frames_after_eos,
+    )
+# --- Gradio UI ---
+# Two-column layout: the left column holds the text, voice selection and the
+# generate button; the right column holds the streamed audio player and the
+# advanced generation settings.
+with gr.Blocks() as demo:
+    gr.HTML(
+        "<h1 style='text-align: center;'>Pocket-TTS</h1>"
+        "<p style='text-align: center;'>Powered by kyutai/pocket-tts | Lightweight TTS on CPU</p>"
+    )
+    with gr.Row():
+        with gr.Column():
+            # Text input
+            text_input = gr.Textbox(
+                label="Input Text",
+                placeholder="Enter the text you want to convert to speech here...",
+                lines=5,
+                value="Hello! This is a test of the Pocket text to speech model. It runs efficiently on CPU and supports voice cloning.",
+            )
+            # Voice selection
+            with gr.Group():
+                gr.Markdown("### Voice Selection")
+                gr.Markdown("Select a preset voice OR upload your own WAV file for voice cloning.")
+                # Choices come from PRESET_VOICES, which is built once at import time.
+                voice_dropdown = gr.Dropdown(
+                    choices=list(PRESET_VOICES.keys()),
+                    label="Preset Voice",
+                    value=list(PRESET_VOICES.keys())[0] if PRESET_VOICES else None,
+                    info="Select a pre-loaded voice. Ignored if custom audio is uploaded.",
+                )
+                gr.Markdown("--- OR ---")
+                # type="filepath" makes the handler receive a path string,
+                # which is what _get_voice_state() passes to _convert_to_wav().
+                ref_audio_input = gr.Audio(
+                    label="Custom Voice (WAV)",
+                    type="filepath",
+                    sources=["upload", "microphone"],
+                )
+            generate_btn = gr.Button(
+                "Generate Speech",
+                variant="primary",
+            )
+        with gr.Column():
+            # streaming=True: the handler is a generator that yields WAV bytes.
+            audio_output = gr.Audio(
+                label="Generated Speech",
+                streaming=True,
+                autoplay=True,
+                buttons=["download"],
+            )
+            with gr.Accordion("Advanced Options", open=False):
+                temp_slider = gr.Slider(
+                    minimum=0.1,
+                    maximum=1.5,
+                    value=0.7,
+                    step=0.05,
+                    label="Temperature",
+                    info="Controls randomness. Higher = more varied, lower = more consistent.",
+                )
+                lsd_steps_slider = gr.Slider(
+                    minimum=1,
+                    maximum=10,
+                    value=1,
+                    step=1,
+                    label="LSD Decode Steps",
+                    info="Number of generation steps. Higher = potentially better quality but slower.",
+                )
+                # 0.0 is treated as "disabled": pocket_tts_stream() converts
+                # non-positive values to None before loading the model.
+                noise_clamp_slider = gr.Slider(
+                    minimum=0.0,
+                    maximum=5.0,
+                    value=0.0,
+                    step=0.1,
+                    label="Noise Clamp",
+                    info="Maximum value for noise sampling. 0 = disabled.",
+                )
+                eos_threshold_slider = gr.Slider(
+                    minimum=-10.0,
+                    maximum=0.0,
+                    value=-4.0,
+                    step=0.5,
+                    label="EOS Threshold",
+                    info="Threshold for end-of-sequence detection. More negative = longer audio.",
+                )
+                # 0 makes pocket_tts_stream() pass frames_after_eos=None.
+                frames_after_eos_slider = gr.Slider(
+                    minimum=0,
+                    maximum=10,
+                    value=2,
+                    step=1,
+                    label="Frames After EOS",
+                    info="Additional frames to generate after EOS detection.",
+                )
+    # Connect inputs
+    # Order must match the positional parameters of generate_tts().
+    generate_inputs = [
+        text_input,
+        voice_dropdown,
+        ref_audio_input,
+        temp_slider,
+        lsd_steps_slider,
+        noise_clamp_slider,
+        eos_threshold_slider,
+        frames_after_eos_slider,
+    ]
+    # Both the button and pressing Enter in the textbox trigger generation;
+    # each is registered under its own api_name.
+    generate_btn.click(
+        fn=generate_tts,
+        inputs=generate_inputs,
+        outputs=audio_output,
+        api_name="generate_speech",
+    )
+    text_input.submit(
+        fn=generate_tts,
+        inputs=generate_inputs,
+        outputs=audio_output,
+        api_name="generate_speech_enter",
+    )
+# Launch only when run as a script, with the request queue enabled.
+if __name__ == "__main__":
+    demo.queue().launch(debug=True, theme="Nymbo/Nymbo_Theme")