"""Gradio front end for VoxCPM text-to-speech (voice design and voice cloning)."""

import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
from voxcpm import VoxCPM

# Radio-button labels; shared by the UI definition and the mode check so the
# two can never drift apart.
MODE_VOICE_DESIGN = "🎨 Voice Design"
MODE_VOICE_CLONING = "🎙️ Voice Cloning"

# Lazily-initialised model instance shared by all requests.
model = None


def load_model():
    """Return the shared VoxCPM model, loading it on first use."""
    global model
    if model is None:
        model = VoxCPM.from_pretrained("openbmb/VoxCPM2", load_denoiser=False)
    return model


def _to_float_mono(samples):
    """Convert reference audio samples to a 1-D float32 array.

    Signed-integer PCM is scaled into [-1.0, 1.0) using the full range of its
    own dtype (so int16 is divided by 32768, int32 by 2**31). Any other dtype
    is cast to float32 without rescaling, so already-normalised float audio
    is not attenuated. Arrays with more than one dimension are averaged over
    axis 1.
    """
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.signedinteger):
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # NOTE(review): assumes a (samples, channels) layout and that the
        # model wants mono input -- confirm against the Gradio/VoxCPM docs.
        samples = samples.mean(axis=1)
    return samples


def generate_speech(text, ref_audio, mode):
    """Synthesise speech for *text* and return the path of the written WAV file.

    Args:
        text: Text to speak. For voice design the UI asks the user to prefix
            it with a voice description in parentheses.
        ref_audio: ``(sample_rate, samples)`` tuple from the Gradio audio
            input, or ``None`` when no reference audio was supplied.
        mode: Selected mode label from the radio input.

    Returns:
        Path to a temporary ``.wav`` file holding the generated audio. The
        file is created with ``delete=False`` and is not removed here.

    Raises:
        gr.Error: If *text* is empty or contains only whitespace.
    """
    if not text or not text.strip():
        # Fail fast with a UI-visible message instead of loading the model
        # and sending it empty input.
        raise gr.Error("Teks tidak boleh kosong.")

    m = load_model()

    if mode == MODE_VOICE_DESIGN or ref_audio is None:
        # Plain TTS / voice design driven by the description in the text.
        wav = m.generate(text=text, cfg_value=2.0, inference_timesteps=10)
    else:
        # Voice cloning using the uploaded/recorded reference audio.
        ref_sr, ref_wav = ref_audio
        # NOTE(review): the ref_audio/ref_sr keyword names are kept from the
        # original call -- confirm they match the installed VoxCPM API.
        wav = m.generate(
            text=text,
            ref_audio=_to_float_mono(ref_wav),
            ref_sr=ref_sr,
            cfg_value=2.0,
            inference_timesteps=10,
        )

    # Reserve a unique path and close the handle before writing, so no file
    # descriptor leaks and soundfile is not writing to a file we hold open.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, wav, m.tts_model.sample_rate)
    return out_path


demo = gr.Interface(
    fn=generate_speech,
    inputs=[
        gr.Textbox(
            label="Teks yang mau diucapkan",
            placeholder="Ketik teks di sini... (untuk Voice Design, awali dengan (deskripsi suara)teks)",
        ),
        gr.Audio(
            label="🎙️ Upload audio referensi (untuk Voice Cloning)",
            type="numpy",
            sources=["upload", "microphone"],
        ),
        gr.Radio(
            choices=[MODE_VOICE_DESIGN, MODE_VOICE_CLONING],
            value=MODE_VOICE_CLONING,
            label="Mode",
        ),
    ],
    outputs=gr.Audio(label="Hasil suara"),
    title="VoxCPM TTS API",
    description="**Voice Cloning:** upload audio referensi + ketik teks\n**Voice Design:** tulis (deskripsi suara)teks, tanpa audio referensi",
)

# Start the Gradio server when the script is executed.
demo.launch()