import gradio as gr
from voxcpm import VoxCPM
import soundfile as sf
import tempfile
import numpy as np
# Process-wide cache for the VoxCPM model; stays None until the first request.
model = None


def load_model():
    """Return the shared VoxCPM model, creating it on first use.

    The model is built once via ``VoxCPM.from_pretrained`` for the
    "openbmb/VoxCPM2" checkpoint with ``load_denoiser=False`` and stored in
    the module-level ``model`` variable; later calls return that same object.
    """
    global model
    if model is not None:
        return model
    model = VoxCPM.from_pretrained("openbmb/VoxCPM2", load_denoiser=False)
    return model
def _to_mono_float32(samples):
    """Convert raw audio samples to a mono float32 waveform."""
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        # Scale by the full-scale value of the actual integer dtype
        # (32768 for int16, 2**31 for int32) instead of assuming int16.
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        # Floating-point audio is taken as already normalised; only cast.
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Multi-channel data is laid out as (samples, channels); average the
        # channels so a 1-D waveform is handed to the model.
        samples = samples.mean(axis=1)
    return samples


def generate_speech(text, ref_audio, mode):
    """Synthesize ``text`` to a WAV file and return the file's path.

    Args:
        text: Text to speak. For voice design the caller is expected to
            prefix it with a voice description in parentheses, as the UI
            placeholder explains.
        ref_audio: ``(sample_rate, samples)`` tuple for the reference voice,
            or ``None`` when no reference was supplied.
        mode: Label of the selected mode. A label ending in "Voice Design"
            selects plain generation; anything else selects voice cloning
            whenever ``ref_audio`` is present.

    Returns:
        Path of a temporary ``.wav`` file holding the generated audio. The
        file is created with ``delete=False`` and is not removed here.
    """
    m = load_model()

    # Match on the ASCII part of the label so the decision does not depend on
    # the exact encoding of the emoji prefix; a missing mode is not design.
    wants_design = isinstance(mode, str) and mode.endswith("Voice Design")

    if wants_design or ref_audio is None:
        # Plain TTS / voice design driven by the description inside the text.
        wav = m.generate(text=text, cfg_value=2.0, inference_timesteps=10)
    else:
        # Voice cloning using the reference audio.
        ref_sr, ref_wav = ref_audio
        ref_wav = _to_mono_float32(ref_wav)
        # NOTE(review): the ref_audio/ref_sr keyword names are kept exactly as
        # before; confirm them against the installed voxcpm version.
        wav = m.generate(
            text=text,
            ref_audio=ref_wav,
            ref_sr=ref_sr,
            cfg_value=2.0,
            inference_timesteps=10,
        )

    # Reserve a unique path, then close the handle before soundfile reopens
    # it: this avoids leaking a descriptor per request and lets the write
    # succeed on platforms that forbid opening an already-open file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, wav, m.tts_model.sample_rate)
    return out_path
def _build_demo():
    """Assemble the Gradio interface that wraps ``generate_speech``.

    The inputs are listed in the positional order of the handler's
    parameters: text, reference audio, mode.
    """
    # Free-form text to synthesize.
    text_box = gr.Textbox(
        label="Teks yang mau diucapkan",
        placeholder="Ketik teks di sini... (untuk Voice Design, awali dengan (deskripsi suara)teks)",
    )
    # Optional reference recording, delivered to the handler as numpy data.
    reference_audio = gr.Audio(
        label="ποΈ Upload audio referensi (untuk Voice Cloning)",
        type="numpy",
        sources=["upload", "microphone"],
    )
    # Mode selector; voice cloning is preselected.
    mode_radio = gr.Radio(
        choices=["π¨ Voice Design", "ποΈ Voice Cloning"],
        value="ποΈ Voice Cloning",
        label="Mode",
    )
    return gr.Interface(
        fn=generate_speech,
        inputs=[text_box, reference_audio, mode_radio],
        outputs=gr.Audio(label="Hasil suara"),
        title="VoxCPM TTS API",
        description="**Voice Cloning:** upload audio referensi + ketik teks\n**Voice Design:** tulis (deskripsi suara)teks, tanpa audio referensi",
    )


demo = _build_demo()
# Start the web server as soon as the module is executed.
demo.launch()