Spaces:

StaticFace
/

TTS

Sleeping

App Files Files Community

StaticFace committed on Feb 6

Commit

ffccc5e

verified ·

1 Parent(s): a77e673

Create app.py

Browse files

Files changed (1) hide show

app.py +77 -0

app.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import os
+import sys
+import tempfile
+import gradio as gr
+import numpy as np
+import soundfile as sf
+from huggingface_hub import snapshot_download
+# Hub repository that hosts both the ONNX model files and the Python
+# wrapper module (pocket_tts_onnx.py) imported further down.
+MODEL_REPO = "KevinAHM/pocket-tts-onnx"
+# setdefault() only applies these values when the variables are not already
+# present in the environment, so an operator-provided setting wins.
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+os.environ.setdefault("OMP_NUM_THREADS", "2")
+# Fetch the listed files from MODEL_REPO at import time; the returned value
+# is the local directory holding the snapshot.
+# NOTE(review): no `revision` is pinned, so the latest repo contents are
+# used, and pocket_tts_onnx.py from that repo is imported (executed) below.
+# Consider pinning a commit if the repo is not under your control.
+_repo_dir = snapshot_download(
+    repo_id=MODEL_REPO,
+    allow_patterns=[
+        "pocket_tts_onnx.py",
+        "onnx/*",
+        "tokenizer.model",
+        "text_conditioner.onnx",
+        "reference_sample.wav",
+        "requirements.txt",
+    ],
+)
+# Put the snapshot directory first on sys.path so the import on the next
+# line resolves to the downloaded pocket_tts_onnx.py. This import must stay
+# after the download and the path insert.
+sys.path.insert(0, _repo_dir)
+from pocket_tts_onnx import PocketTTSOnnx
+_tts_cache = {}
+def get_tts(temperature: float, lsd_steps: int):
+    key = (float(temperature), int(lsd_steps))
+    if key not in _tts_cache:
+        _tts_cache[key] = PocketTTSOnnx(temperature=float(temperature), lsd_steps=int(lsd_steps))
+    return _tts_cache[key]
+def synthesize(ref_audio_path, text, temperature, lsd_steps):
+    text = (text or "").strip()
+    if not ref_audio_path:
+        raise gr.Error("Upload a reference audio file.")
+    if not text:
+        raise gr.Error("Enter some text.")
+    tts = get_tts(temperature, lsd_steps)
+    audio = tts.generate(text=text, voice=ref_audio_path)
+    sr = getattr(tts, "sample_rate", 24000)
+    audio_np = np.asarray(audio)
+    if audio_np.ndim > 1:
+        audio_np = audio_np.squeeze()
+    out_path = os.path.join(tempfile.gettempdir(), "pocket_tts_out.wav")
+    sf.write(out_path, audio_np, sr)
+    return out_path
+with gr.Blocks() as demo:
+    gr.Markdown("# Pocket TTS ONNX (Voice Cloning)\nUpload a short reference voice sample, type text, and generate audio.")
+    with gr.Row():
+        ref_audio = gr.Audio(label="Reference Audio", type="filepath")
+        text = gr.Textbox(label="Text", lines=6, value="Hello, this is a test of voice cloning.")
+    with gr.Row():
+        temperature = gr.Slider(0.1, 1.2, value=0.7, step=0.05, label="Temperature")
+        lsd_steps = gr.Slider(1, 10, value=10, step=1, label="LSD Steps")
+    generate = gr.Button("Generate", variant="primary")
+    out_audio = gr.Audio(label="Output", type="filepath")
+    generate.click(
+        fn=synthesize,
+        inputs=[ref_audio, text, temperature, lsd_steps],
+        outputs=[out_audio],
+        api_name="generate",
+    )
+if __name__ == "__main__":
+    demo.launch()