Spaces:
Running
Running
| import ctypes | |
| import gradio as gr | |
| import os | |
| import numpy as np | |
| from pydub import AudioSegment | |
# 1. Locate and load the native engine.
# The Dockerfile builds the shared object under the name libnanotts.so.
LIB_PATH = "./libnanotts.so"
if not os.path.exists(LIB_PATH):
    raise FileNotFoundError(
        f"Could not find {LIB_PATH}. "
        "Ensure the Dockerfile compiled it correctly."
    )
lib = ctypes.CDLL(LIB_PATH)

# 2. Describe the C API to ctypes.
# NOTE(review): these declarations are meant to mirror nanotts.h, which is
# not visible from this file -- confirm against the header when it changes.
_FLOAT_BUF = ctypes.POINTER(ctypes.c_float)  # float*
_INT_OUT = ctypes.POINTER(ctypes.c_int)      # int* (output parameter)

lib.load_model.restype = ctypes.c_int
lib.free_model.restype = None

# generate_wav_from_ref(ref_path, text, wav_out, samples, channels, sr,
#                       is_stereo) -> int
lib.generate_wav_from_ref.argtypes = [
    ctypes.c_char_p,             # const char* ref_path
    ctypes.c_char_p,             # const char* text
    ctypes.POINTER(_FLOAT_BUF),  # float** wav_out
    _INT_OUT,                    # int* samples
    _INT_OUT,                    # int* channels
    _INT_OUT,                    # int* sr
    ctypes.c_int,                # int is_stereo (1=yes)
]
lib.generate_wav_from_ref.restype = ctypes.c_int

# save_wav(filename, wav_data, samples, channels, sr) -> int
lib.save_wav.argtypes = [
    ctypes.c_char_p,
    _FLOAT_BUF,
    ctypes.c_int,
    ctypes.c_int,
    ctypes.c_int,
]
lib.save_wav.restype = ctypes.c_int

# 3. Load the model a single time, when the module is imported.
# A non-zero status is only reported on stdout; start-up continues.
print("Initializing MossTTS-Nano Model...")
init_status = lib.load_model()
if init_status != 0:
    print(f"Failed to load model. Error code: {init_status}")
def preprocess_audio(input_path):
    """
    Re-encode the audio at *input_path* as a 48000 Hz, mono, 16-bit WAV.

    The converted audio is always written to "processed_ref.wav" in the
    current working directory, overwriting any previous file of that name.

    Returns the path of the converted file, or None when loading or
    exporting fails (the error is printed, not raised).
    """
    target_path = "processed_ref.wav"
    try:
        segment = AudioSegment.from_file(input_path)
        # Normalise step by step: sample rate, channel count, sample width.
        segment = segment.set_frame_rate(48000)
        segment = segment.set_channels(1)
        segment = segment.set_sample_width(2)  # 2 bytes per sample = 16-bit
        segment.export(target_path, format="wav")
    except Exception as e:
        print(f"Audio Preprocessing Error: {e}")
        return None
    return target_path
def _free_engine_buffer(buffer_ptr):
    """Best-effort release of a sample buffer handed out by the C engine.

    NOTE(review): this assumes the engine allocates the buffer with the C
    runtime's malloc(), so that libc's free() is the matching deallocator --
    confirm against the native sources.

    Failure to reach libc is reported on stdout instead of being silently
    ignored; it never raises, so a successful synthesis is still returned.
    """
    if not buffer_ptr:
        return  # NULL pointer: nothing to release.
    try:
        libc = ctypes.CDLL("libc.so.6")
        # Declare the signature so the pointer is passed at full width and
        # no meaningless int return value is read back.
        libc.free.argtypes = [ctypes.c_void_p]
        libc.free.restype = None
        libc.free(ctypes.cast(buffer_ptr, ctypes.c_void_p))
    except (OSError, AttributeError) as exc:
        print(f"Warning: could not free engine audio buffer: {exc}")


def inference(ref_audio, text):
    """Synthesize *text* in the voice of *ref_audio* using the C engine.

    Args:
        ref_audio: Path to the reference recording (any format accepted by
            preprocess_audio).
        text: The text to speak. Empty or whitespace-only text is rejected.

    Returns:
        A ``(path, status)`` tuple. On success ``path`` is "output.wav" and
        ``status`` is "Success!"; on any failure ``path`` is None and
        ``status`` describes the problem.
    """
    if not ref_audio or not text or not text.strip():
        return None, "Please provide both a reference voice and text."

    # Step A: convert the reference into the WAV layout used by the engine.
    clean_ref = preprocess_audio(ref_audio)
    if not clean_ref:
        return None, "Failed to process reference audio file."

    # Step B: output parameters filled in by the C call.
    wav_ptr = ctypes.POINTER(ctypes.c_float)()
    samples = ctypes.c_int()
    channels = ctypes.c_int()
    sr = ctypes.c_int()
    output_filename = "output.wav"

    # Step C: run the engine; a return value of 0 means success.
    result = lib.generate_wav_from_ref(
        clean_ref.encode('utf-8'),
        text.encode('utf-8'),
        ctypes.byref(wav_ptr),
        ctypes.byref(samples),
        ctypes.byref(channels),
        ctypes.byref(sr),
        1,  # Stereo output
    )
    if result != 0:
        # The buffer is deliberately not freed here: its state after a
        # failed call is not known from this file.
        return None, f"C Engine Error: {result}"

    try:
        # Never hand a NULL or empty buffer to save_wav.
        if not wav_ptr or samples.value <= 0:
            return None, "C Engine returned no audio data."

        # Drop any file left by an earlier request, so a failed save cannot
        # return stale audio labelled as success.
        try:
            os.remove(output_filename)
        except OSError:
            # Missing file is the normal case; if it exists but cannot be
            # removed, fall through and let save_wav try to overwrite it.
            pass

        # Step D: write the generated samples to disk.
        # NOTE(review): save_wav's return convention is not visible here, so
        # success is judged by the file existing afterwards.
        lib.save_wav(output_filename.encode('utf-8'), wav_ptr, samples, channels, sr)
    finally:
        # Step E: release the engine's buffer even if saving raised.
        _free_engine_buffer(wav_ptr)

    if not os.path.exists(output_filename):
        return None, "Failed to write the generated audio file."
    return output_filename, "Success!"
# 4. Gradio Web Interface
def _build_demo():
    """Create the Blocks app: inputs on the left, results on the right.

    NOTE(review): the nesting was reconstructed from flattened source; the
    click handler and the examples are placed at Blocks level -- confirm
    against the deployed layout.
    """
    with gr.Blocks(theme=gr.themes.Soft()) as ui:
        gr.Markdown("# 🚀 NanoTTS C-Inference Engine")
        gr.Markdown("Zero-dependency C implementation of MOSS-TTS-Nano. Fast, tiny, and runs entirely on CPU.")

        with gr.Row():
            # Left column: what to say, whose voice to use, and the trigger.
            with gr.Column():
                text_box = gr.Textbox(
                    label="Text to Speak",
                    placeholder="Enter the text you want the AI to say...",
                    lines=3,
                )
                voice_input = gr.Audio(
                    label="Reference Voice (Clone)",
                    type="filepath",
                )
                run_button = gr.Button("Synthesize Audio", variant="primary")

            # Right column: synthesized audio plus a read-only status line.
            with gr.Column():
                result_audio = gr.Audio(label="Generated Result")
                status_box = gr.Textbox(label="Status", interactive=False)

        # inference takes (reference audio, text) and returns (audio, status).
        run_button.click(
            fn=inference,
            inputs=[voice_input, text_box],
            outputs=[result_audio, status_box],
        )

        sample_rows = [
            ["./asserts/audio/ljs.wav", "Hello, I am a tiny C based voice cloning engine."],
        ]
        gr.Examples(examples=sample_rows, inputs=[voice_input, text_box])

    return ui


demo = _build_demo()

if __name__ == "__main__":
    demo.launch()