mossttsnano / app.py
arjunbroepic's picture
Update app.py
3c8062c verified
import ctypes
import gradio as gr
import os
import numpy as np
from pydub import AudioSegment
# 1. Load the compiled Linux library
# The Dockerfile names the build artifact libnanotts.so; fail fast with a
# readable error if it is missing instead of letting ctypes raise a bare OSError.
LIB_PATH = "./libnanotts.so"
_library_present = os.path.exists(LIB_PATH)
if not _library_present:
    raise FileNotFoundError(
        f"Could not find {LIB_PATH}. Ensure the Dockerfile compiled it correctly."
    )
lib = ctypes.CDLL(LIB_PATH)
# 2. Define C-API Argument and Return Types
# Declared so ctypes converts arguments and return values explicitly; the
# signatures are meant to mirror the nanotts.h header.
_c_float_p = ctypes.POINTER(ctypes.c_float)
_c_int_p = ctypes.POINTER(ctypes.c_int)

lib.load_model.restype = ctypes.c_int
lib.free_model.restype = None

# generate_wav_from_ref(ref_path, text, out_wav_ptr, out_samples,
#                       out_channels, out_sr, is_stereo) -> int
lib.generate_wav_from_ref.restype = ctypes.c_int
lib.generate_wav_from_ref.argtypes = [
    ctypes.c_char_p,             # const char* ref_path
    ctypes.c_char_p,             # const char* text
    ctypes.POINTER(_c_float_p),  # float** wav_out
    _c_int_p,                    # int* samples
    _c_int_p,                    # int* channels
    _c_int_p,                    # int* sr
    ctypes.c_int,                # int is_stereo (1=yes)
]

# save_wav(filename, wav_data, samples, channels, sr) -> int
lib.save_wav.restype = ctypes.c_int
lib.save_wav.argtypes = [
    ctypes.c_char_p,  # filename
    _c_float_p,       # wav_data
    ctypes.c_int,     # samples
    ctypes.c_int,     # channels
    ctypes.c_int,     # sr
]
# 3. Initialize the model once on startup
# A non-zero status is only reported; the app still starts afterwards.
print("Initializing MossTTS-Nano Model...")
init_status = lib.load_model()
if init_status:
    print(f"Failed to load model. Error code: {init_status}")
def preprocess_audio(input_path):
    """
    Re-encode an audio file into the format handed to the C engine.

    The audio at ``input_path`` is resampled to 48000 Hz, reduced to one
    channel, set to a 2-byte (16-bit) sample width and exported as WAV to
    ``processed_ref.wav``.

    Returns the path of the written file, or ``None`` if any step raised
    (the error is printed).
    """
    target_path = "processed_ref.wav"
    try:
        source_clip = AudioSegment.from_file(input_path)
        # Standardize to 48k, mono (usually better for extraction), 16-bit
        resampled = source_clip.set_frame_rate(48000)
        mono = resampled.set_channels(1)
        pcm16 = mono.set_sample_width(2)
        pcm16.export(target_path, format="wav")
    except Exception as e:
        print(f"Audio Preprocessing Error: {e}")
        return None
    return target_path
def inference(ref_audio, text):
if not ref_audio or not text:
return None, "Please provide both a reference voice and text."
# Step A: Convert input to compatible WAV
clean_ref = preprocess_audio(ref_audio)
if not clean_ref:
return None, "Failed to process reference audio file."
# Step B: Prepare pointers for C output
wav_ptr = ctypes.POINTER(ctypes.c_float)()
samples = ctypes.c_int()
channels = ctypes.c_int()
sr = ctypes.c_int()
output_filename = "output.wav"
# Step C: Call the C Engine
# result 0 means success
result = lib.generate_wav_from_ref(
clean_ref.encode('utf-8'),
text.encode('utf-8'),
ctypes.byref(wav_ptr),
ctypes.byref(samples),
ctypes.byref(channels),
ctypes.byref(sr),
1 # Stereo output
)
if result != 0:
return None, f"C Engine Error: {result}"
# Step D: Save the generated buffer to a WAV file
lib.save_wav(output_filename.encode('utf-8'), wav_ptr, samples, channels, sr)
# Step E: Free the memory allocated by the C library using standard libc
# This prevents memory leaks in your Space
try:
libc = ctypes.CDLL("libc.so.6")
libc.free(wav_ptr)
except:
pass # Fallback if libc naming differs
return output_filename, "Success!"
# 4. Gradio Web Interface
# One row with two columns: the first holds the inputs and the trigger button,
# the second holds the generated audio and a read-only status line.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 NanoTTS C-Inference Engine")
    gr.Markdown("Zero-dependency C implementation of MOSS-TTS-Nano. Fast, tiny, and runs entirely on CPU.")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                lines=3,
                label="Text to Speak",
                placeholder="Enter the text you want the AI to say...",
            )
            input_audio = gr.Audio(type="filepath", label="Reference Voice (Clone)")
            submit_btn = gr.Button("Synthesize Audio", variant="primary")

        with gr.Column():
            audio_out = gr.Audio(label="Generated Result")
            status_out = gr.Textbox(label="Status", interactive=False)

    # Wire the button to the synthesis function: (voice, text) -> (audio, status).
    submit_btn.click(
        fn=inference,
        inputs=[input_audio, input_text],
        outputs=[audio_out, status_out],
    )

    # Pre-filled example row; column order matches the inputs list.
    _example_rows = [
        ["./asserts/audio/ljs.wav", "Hello, I am a tiny C based voice cloning engine."],
    ]
    gr.Examples(examples=_example_rows, inputs=[input_audio, input_text])

if __name__ == "__main__":
    demo.launch()