Spaces:
Running
Running
File size: 4,807 Bytes
23e8911 eb2daf6 3c8062c eb2daf6 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 eb2daf6 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c 23e8911 3c8062c eb2daf6 3c8062c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | import ctypes
import gradio as gr
import os
import numpy as np
from pydub import AudioSegment
# 1. Locate and load the native engine.
# The Dockerfile is expected to produce the shared object under this name.
LIB_PATH = "./libnanotts.so"
if not os.path.exists(LIB_PATH):
    raise FileNotFoundError(f"Could not find {LIB_PATH}. Ensure the Dockerfile compiled it correctly.")
lib = ctypes.CDLL(LIB_PATH)

# 2. Declare the C signatures so ctypes converts arguments and results
# correctly (intended to mirror the prototypes in nanotts.h).
_FLOAT_PTR = ctypes.POINTER(ctypes.c_float)
_INT_PTR = ctypes.POINTER(ctypes.c_int)

lib.load_model.restype = ctypes.c_int
lib.free_model.restype = None

# generate_wav_from_ref(ref_path, text, wav_out, samples, channels, sr, is_stereo)
lib.generate_wav_from_ref.restype = ctypes.c_int
lib.generate_wav_from_ref.argtypes = [
    ctypes.c_char_p,            # const char* ref_path
    ctypes.c_char_p,            # const char* text
    ctypes.POINTER(_FLOAT_PTR),  # float** wav_out
    _INT_PTR,                   # int* samples
    _INT_PTR,                   # int* channels
    _INT_PTR,                   # int* sr
    ctypes.c_int,               # int is_stereo (1=yes)
]

# save_wav(filename, wav_data, samples, channels, sr)
lib.save_wav.restype = ctypes.c_int
lib.save_wav.argtypes = [
    ctypes.c_char_p,
    _FLOAT_PTR,
    ctypes.c_int,
    ctypes.c_int,
    ctypes.c_int,
]

# 3. Load the model a single time at import; a non-zero status is reported
# on stdout and startup continues.
print("Initializing MossTTS-Nano Model...")
init_status = lib.load_model()
if init_status != 0:
    print(f"Failed to load model. Error code: {init_status}")
def preprocess_audio(input_path):
    """
    Re-encode the reference recording as a 48000 Hz, mono, 16-bit WAV file.

    The converted audio is written to "processed_ref.wav" in the working
    directory. Returns that path on success, or None (after printing the
    error) when the file cannot be read or converted.
    """
    converted_path = "processed_ref.wav"
    try:
        clip = AudioSegment.from_file(input_path)
        # Normalise step by step: sample rate, channel count, sample width.
        clip = clip.set_frame_rate(48000)
        clip = clip.set_channels(1)
        clip = clip.set_sample_width(2)
        clip.export(converted_path, format="wav")
    except Exception as err:
        print(f"Audio Preprocessing Error: {err}")
        return None
    return converted_path
def _release_engine_buffer(buffer_ptr):
    """Free a sample buffer returned by the C engine using libc ``free``.

    NOTE(review): this assumes the engine allocates the buffer with the C
    runtime's ``malloc``; confirm against nanotts.h.
    """
    try:
        libc = ctypes.CDLL("libc.so.6")
    except OSError as exc:
        # Best effort: on a platform where libc has another name the buffer
        # is leaked, but the failure is reported instead of hidden.
        print(f"Could not load libc to free the audio buffer: {exc}")
        return
    # Declare the pointer-sized argument explicitly so it is never truncated.
    libc.free.argtypes = [ctypes.c_void_p]
    libc.free.restype = None
    libc.free(buffer_ptr)


def inference(ref_audio, text):
    """Synthesize ``text`` in the voice of ``ref_audio`` via the C engine.

    Args:
        ref_audio: Path to the reference recording (the Gradio audio input
            is configured with ``type="filepath"``).
        text: The text to speak.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is
        "output.wav" on success and ``None`` on any failure, in which case
        ``status_message`` describes what went wrong.
    """
    # Reject missing inputs, including text that is only whitespace.
    if not ref_audio or not text or not text.strip():
        return None, "Please provide both a reference voice and text."

    # Do not call into the engine if the model failed to load at startup.
    if init_status != 0:
        return None, f"Model is not loaded (error code: {init_status})."

    # Step A: Convert input to compatible WAV
    clean_ref = preprocess_audio(ref_audio)
    if not clean_ref:
        return None, "Failed to process reference audio file."

    # Step B: Prepare out-parameters the C engine fills in
    wav_ptr = ctypes.POINTER(ctypes.c_float)()
    samples = ctypes.c_int()
    channels = ctypes.c_int()
    sr = ctypes.c_int()
    output_filename = "output.wav"

    # Step C: Call the C engine; a result of 0 means success
    result = lib.generate_wav_from_ref(
        clean_ref.encode("utf-8"),
        text.encode("utf-8"),
        ctypes.byref(wav_ptr),
        ctypes.byref(samples),
        ctypes.byref(channels),
        ctypes.byref(sr),
        1,  # Stereo output
    )
    if result != 0:
        return None, f"C Engine Error: {result}"
    if not wav_ptr:
        # A NULL buffer must not be handed to save_wav.
        return None, "C Engine returned no audio data."

    try:
        # Step D: Save the generated buffer to a WAV file. Remove any stale
        # output first so the existence check below reflects this call only.
        if os.path.exists(output_filename):
            os.remove(output_filename)
        save_status = lib.save_wav(
            output_filename.encode("utf-8"), wav_ptr, samples, channels, sr
        )
        if not os.path.isfile(output_filename):
            return None, (
                f"Failed to write {output_filename} "
                f"(save_wav returned {save_status})."
            )
    finally:
        # Step E: Always release the engine's buffer, even if saving failed,
        # so repeated requests do not leak memory.
        _release_engine_buffer(wav_ptr)

    return output_filename, "Success!"
# 4. Gradio web interface: inputs on the left, results on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 NanoTTS C-Inference Engine")
    gr.Markdown("Zero-dependency C implementation of MOSS-TTS-Nano. Fast, tiny, and runs entirely on CPU.")

    with gr.Row():
        # Left column: what to say, whose voice to use, and the trigger.
        with gr.Column():
            input_text = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter the text you want the AI to say...",
                lines=3,
            )
            input_audio = gr.Audio(label="Reference Voice (Clone)", type="filepath")
            submit_btn = gr.Button("Synthesize Audio", variant="primary")

        # Right column: the synthesized audio and a status line.
        with gr.Column():
            audio_out = gr.Audio(label="Generated Result")
            status_out = gr.Textbox(label="Status", interactive=False)

    # inference() takes (reference audio, text) and returns (audio path, status).
    submit_btn.click(
        inference,
        [input_audio, input_text],
        [audio_out, status_out],
    )

    # One ready-made example; each row is ordered like the inputs list.
    gr.Examples(
        examples=[
            ["./asserts/audio/ljs.wav", "Hello, I am a tiny C based voice cloning engine."],
        ],
        inputs=[input_audio, input_text],
    )

if __name__ == "__main__":
    demo.launch()
|