Spaces:

StaticFace
/

TTS

Sleeping

App Files Files Community

StaticFace committed on Feb 6

Commit

3d73fc7

verified ·

1 Parent(s): a09d229

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -31

app.py CHANGED Viewed

@@ -27,13 +27,10 @@ _OriginalInferenceSession = ort.InferenceSession
 def _PatchedInferenceSession(*args, **kwargs):
     """Build an ONNX Runtime session with this app's CPU settings forced on.

     Drop-in wrapper around the original ``ort.InferenceSession`` (kept in
     ``_OriginalInferenceSession``). Whatever session options the caller
     supplies are reused and overridden in place with: all graph
     optimizations enabled, sequential execution, ``CPU_THREADS`` intra-op
     threads and a single inter-op thread. All other arguments are
     forwarded unchanged.

     Returns:
         The session object produced by ``_OriginalInferenceSession``.
     """
     # ``sess_options`` is the second positional parameter of
     # ``InferenceSession``; pick it up from there as well so it is never
     # passed twice (once positionally, once as a keyword).
     passed_positionally = len(args) > 1
     so = args[1] if passed_positionally else kwargs.get("sess_options")
     if so is None:
         # Only allocate fresh options when the caller gave none (or None).
         so = ort.SessionOptions()

     so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
     so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
     so.intra_op_num_threads = CPU_THREADS
     so.inter_op_num_threads = 1

     if passed_positionally:
         args = (args[0], so) + tuple(args[2:])
     else:
         kwargs["sess_options"] = so
     return _OriginalInferenceSession(*args, **kwargs)
@@ -43,25 +40,25 @@ from pocket_tts_onnx import PocketTTSOnnx
 tts_cache = {}
-def get_tts(precision: str, temperature: float, lsd_steps: int):
-    key = (precision, float(temperature), int(lsd_steps))
     if key not in tts_cache:
         tts_cache[key] = PocketTTSOnnx(
-            precision=precision,
             temperature=float(temperature),
             lsd_steps=int(lsd_steps),
             device="cpu",
         )
     return tts_cache[key]
-def synthesize(ref_audio_path, text, precision, temperature, lsd_steps):
     text = (text or "").strip()
     if not ref_audio_path:
         raise gr.Error("Upload a reference audio file.")
     if not text:
         raise gr.Error("Enter some text.")
-    tts = get_tts(precision, temperature, int(lsd_steps))
     audio = tts.generate(text=text, voice=ref_audio_path)
     sr = getattr(tts, "SAMPLE_RATE", 24000)
@@ -71,36 +68,22 @@ def synthesize(ref_audio_path, text, precision, temperature, lsd_steps):
     out_path = os.path.join(tempfile.gettempdir(), "pocket_tts_out.wav")
     sf.write(out_path, audio_np, sr)
-    info = (
-        f"CPU_THREADS = {CPU_THREADS}\n"
-        f"precision = {precision}\n"
-        f"temperature = {tts.temperature}\n"
-        f"lsd_steps = {tts.lsd_steps}\n"
-        f"sample_rate = {sr}"
-    )
-    return out_path, info
 with gr.Blocks() as demo:
-    gr.Markdown("# Pocket TTS ONNX\nReference audio + text → output audio")
-    info_box = gr.Textbox(label="Runtime Info", value=f"CPU_THREADS = {CPU_THREADS}", lines=6)
     with gr.Row():
-        ref_audio = gr.Audio(label="Reference Audio", type="filepath")
-        text = gr.Textbox(label="Text", lines=6, value="Hello, this is a test of voice cloning.")
     with gr.Row():
-        precision = gr.Dropdown(["int8", "fp32"], value="int8", label="Precision")
-        temperature = gr.Slider(0.1, 1.2, value=0.7, step=0.05, label="Temperature")
-        lsd_steps = gr.Slider(1, 20, value=10, step=1, label="LSD Steps")
-    generate = gr.Button("Generate", variant="primary")
-    out_audio = gr.Audio(label="Output", type="filepath")
     generate.click(
         fn=synthesize,
-        inputs=[ref_audio, text, precision, temperature, lsd_steps],
-        outputs=[out_audio, info_box],
         api_name="generate",
     )

 def _PatchedInferenceSession(*args, **kwargs):
     """Build an ONNX Runtime session with this app's CPU settings forced on.

     Drop-in wrapper around the original ``ort.InferenceSession`` (kept in
     ``_OriginalInferenceSession``). Whatever session options the caller
     supplies are reused and overridden in place with: all graph
     optimizations enabled, sequential execution, ``CPU_THREADS`` intra-op
     threads and a single inter-op thread. All other arguments are
     forwarded unchanged.

     Returns:
         The session object produced by ``_OriginalInferenceSession``.
     """
     # ``sess_options`` is the second positional parameter of
     # ``InferenceSession``; pick it up from there as well so it is never
     # passed twice (once positionally, once as a keyword).
     passed_positionally = len(args) > 1
     so = args[1] if passed_positionally else kwargs.get("sess_options")
     if so is None:
         # Only allocate fresh options when the caller gave none (or None).
         so = ort.SessionOptions()

     so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
     so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
     so.intra_op_num_threads = CPU_THREADS
     so.inter_op_num_threads = 1

     if passed_positionally:
         args = (args[0], so) + tuple(args[2:])
     else:
         kwargs["sess_options"] = so
     return _OriginalInferenceSession(*args, **kwargs)
 tts_cache = {}
+def get_tts(temperature: float, lsd_steps: int):
+    key = (float(temperature), int(lsd_steps))
     if key not in tts_cache:
         tts_cache[key] = PocketTTSOnnx(
+            precision="int8",
             temperature=float(temperature),
             lsd_steps=int(lsd_steps),
             device="cpu",
         )
     return tts_cache[key]
+def synthesize(ref_audio_path, text, temperature, lsd_steps):
     text = (text or "").strip()
     if not ref_audio_path:
         raise gr.Error("Upload a reference audio file.")
     if not text:
         raise gr.Error("Enter some text.")
+    tts = get_tts(temperature, int(lsd_steps))
     audio = tts.generate(text=text, voice=ref_audio_path)
     sr = getattr(tts, "SAMPLE_RATE", 24000)
     out_path = os.path.join(tempfile.gettempdir(), "pocket_tts_out.wav")
     sf.write(out_path, audio_np, sr)
+    return out_path
 with gr.Blocks() as demo:
     with gr.Row():
+        ref_audio = gr.Audio(type="filepath")
+        text = gr.Textbox(lines=6, value="Hello, this is a test.")
     with gr.Row():
+        temperature = gr.Slider(0.1, 1.2, value=0.7, step=0.05)
+        lsd_steps = gr.Slider(1, 20, value=10, step=1)
+    generate = gr.Button("Generate")
+    out_audio = gr.Audio(type="filepath")
     generate.click(
         fn=synthesize,
+        inputs=[ref_audio, text, temperature, lsd_steps],
+        outputs=[out_audio],
         api_name="generate",
     )