File size: 4,807 Bytes
23e8911
eb2daf6
 
3c8062c
 
eb2daf6
3c8062c
 
 
 
 
23e8911
3c8062c
 
 
 
23e8911
3c8062c
 
 
 
23e8911
3c8062c
 
 
 
 
 
 
23e8911
3c8062c
 
 
23e8911
3c8062c
 
 
 
 
23e8911
3c8062c
23e8911
3c8062c
 
 
 
 
23e8911
3c8062c
 
 
 
 
 
 
 
 
 
 
 
 
 
23e8911
3c8062c
 
 
 
23e8911
3c8062c
 
 
 
 
 
23e8911
 
 
 
 
3c8062c
 
 
 
23e8911
3c8062c
 
23e8911
 
 
 
3c8062c
23e8911
eb2daf6
23e8911
3c8062c
23e8911
3c8062c
 
23e8911
3c8062c
 
 
 
 
 
 
 
 
23e8911
3c8062c
 
 
 
23e8911
 
 
3c8062c
 
 
 
 
 
 
 
 
 
23e8911
 
3c8062c
 
23e8911
3c8062c
 
 
 
 
 
 
 
 
 
eb2daf6
3c8062c
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import ctypes
import gradio as gr
import os
import numpy as np
from pydub import AudioSegment

# 1. Locate and load the compiled Linux shared library.
# The Dockerfile is expected to build it under the name libnanotts.so,
# and the path is resolved relative to the current working directory.
LIB_PATH = "./libnanotts.so"
if os.path.exists(LIB_PATH):
    lib = ctypes.CDLL(LIB_PATH)
else:
    raise FileNotFoundError(f"Could not find {LIB_PATH}. Ensure the Dockerfile compiled it correctly.")

# 2. Declare the C-API signatures so ctypes converts arguments and return
# values correctly (intended to mirror the nanotts.h header).
lib.load_model.restype = ctypes.c_int
lib.free_model.restype = None

# int generate_wav_from_ref(const char* ref_path, const char* text,
#                           float** wav_out, int* samples, int* channels,
#                           int* sr, int is_stereo /* 1 = yes */)
lib.generate_wav_from_ref.restype = ctypes.c_int
lib.generate_wav_from_ref.argtypes = (
    [ctypes.c_char_p] * 2                               # ref_path, text
    + [ctypes.POINTER(ctypes.POINTER(ctypes.c_float))]  # wav_out
    + [ctypes.POINTER(ctypes.c_int)] * 3                # samples, channels, sr
    + [ctypes.c_int]                                    # is_stereo
)

# int save_wav(filename, wav_data, samples, channels, sr)
lib.save_wav.restype = ctypes.c_int
lib.save_wav.argtypes = (
    [ctypes.c_char_p, ctypes.POINTER(ctypes.c_float)]   # filename, wav_data
    + [ctypes.c_int] * 3                                # samples, channels, sr
)

# 3. Load the model a single time at import; a non-zero status is only
# reported on stdout and the module keeps loading.
print("Initializing MossTTS-Nano Model...")
init_status = lib.load_model()
if init_status:
    print(f"Failed to load model. Error code: {init_status}")

def preprocess_audio(input_path):
    """
    Normalize a reference recording and write it to ``processed_ref.wav``.

    The audio at ``input_path`` is decoded with pydub, resampled to
    48000 Hz, reduced to a single channel, converted to 2-byte (16-bit)
    samples and exported as WAV, which is the format the C engine is
    meant to consume.

    Returns the path of the written file, or ``None`` if any step raised
    (the error is printed rather than propagated).
    """
    target_path = "processed_ref.wav"
    try:
        segment = AudioSegment.from_file(input_path)
        # Apply each normalization step separately: rate, channels, width.
        segment = segment.set_frame_rate(48000)
        segment = segment.set_channels(1)
        segment = segment.set_sample_width(2)
        segment.export(target_path, format="wav")
    except Exception as e:
        print(f"Audio Preprocessing Error: {e}")
        return None
    return target_path

def _free_c_buffer(buffer_ptr):
    """Release a buffer returned by the C engine using libc's ``free``.

    Best-effort: a NULL pointer is ignored, and if libc cannot be loaded
    under the name ``libc.so.6`` a warning is printed and the buffer is
    left allocated instead of raising.
    """
    if not buffer_ptr:
        return  # NULL pointer: nothing to release
    try:
        libc = ctypes.CDLL("libc.so.6")
    except OSError as e:
        # libc is named differently on this platform; report the leak
        # instead of hiding it.
        print(f"Warning: could not load libc to free audio buffer: {e}")
        return
    # Declare the signature so the pointer is passed at full width and no
    # bogus int return value is read from a void function.
    libc.free.argtypes = [ctypes.c_void_p]
    libc.free.restype = None
    libc.free(ctypes.cast(buffer_ptr, ctypes.c_void_p))


def inference(ref_audio, text):
    """Synthesize ``text`` in the voice of ``ref_audio`` via the C engine.

    Args:
        ref_audio: Path to the reference recording (any format pydub can
            decode); falsy values are rejected.
        text: Text to speak; empty or whitespace-only text is rejected.

    Returns:
        A ``(path, status)`` tuple. ``path`` is ``"output.wav"`` on success
        and ``None`` on any failure; ``status`` is a human-readable message
        for the UI.
    """
    if not ref_audio or not text or not text.strip():
        return None, "Please provide both a reference voice and text."

    # init_status is set at import time by lib.load_model(); calling into
    # the engine without a loaded model is not safe.
    if init_status != 0:
        return None, f"Model is not loaded (error code {init_status})."

    # Step A: Convert input to compatible WAV
    clean_ref = preprocess_audio(ref_audio)
    if not clean_ref:
        return None, "Failed to process reference audio file."

    # Step B: Prepare out-parameters the C engine fills in
    wav_ptr = ctypes.POINTER(ctypes.c_float)()
    samples = ctypes.c_int()
    channels = ctypes.c_int()
    sr = ctypes.c_int()

    output_filename = "output.wav"

    # Step C: Call the C engine (a result of 0 means success)
    result = lib.generate_wav_from_ref(
        clean_ref.encode('utf-8'),
        text.encode('utf-8'),
        ctypes.byref(wav_ptr),
        ctypes.byref(samples),
        ctypes.byref(channels),
        ctypes.byref(sr),
        1,  # Stereo output
    )

    if result != 0:
        return None, f"C Engine Error: {result}"

    try:
        # Never hand a NULL or empty buffer to save_wav.
        if not wav_ptr or samples.value <= 0:
            return None, "C Engine returned no audio data."

        # Remove any file left by a previous request so that a failed save
        # cannot be mistaken for a fresh result.
        try:
            os.remove(output_filename)
        except OSError:
            pass  # nothing to remove, or not removable; the save is still attempted

        # Step D: Save the generated buffer to a WAV file
        save_status = lib.save_wav(
            output_filename.encode('utf-8'),
            wav_ptr,
            samples.value,
            channels.value,
            sr.value,
        )
    finally:
        # Step E: Always release the engine-allocated buffer, even when
        # saving fails, so repeated requests do not leak memory.
        _free_c_buffer(wav_ptr)

    # save_wav's return convention is not documented here, so success is
    # judged by the file actually existing; the code is reported for diagnosis.
    if not os.path.exists(output_filename):
        return None, f"Failed to write {output_filename} (save_wav returned {save_status})."

    return output_filename, "Success!"

# 4. Gradio Web Interface
# Two-column layout: inputs (text, reference voice, button) on the left,
# results (audio player, status line) on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 NanoTTS C-Inference Engine")
    gr.Markdown("Zero-dependency C implementation of MOSS-TTS-Nano. Fast, tiny, and runs entirely on CPU.")

    with gr.Row():
        # Input side
        with gr.Column():
            input_text = gr.Textbox(
                lines=3,
                label="Text to Speak",
                placeholder="Enter the text you want the AI to say...",
            )
            # type="filepath" hands inference() a path on disk
            input_audio = gr.Audio(type="filepath", label="Reference Voice (Clone)")
            submit_btn = gr.Button("Synthesize Audio", variant="primary")

        # Output side
        with gr.Column():
            audio_out = gr.Audio(label="Generated Result")
            status_out = gr.Textbox(interactive=False, label="Status")

    # inference() returns (audio path, status message), matching the two outputs.
    submit_btn.click(
        outputs=[audio_out, status_out],
        inputs=[input_audio, input_text],
        fn=inference,
    )

    # One clickable sample row: reference clip plus text.
    gr.Examples(
        inputs=[input_audio, input_text],
        examples=[
            ["./asserts/audio/ljs.wav", "Hello, I am a tiny C based voice cloning engine."],
        ],
    )

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()