Spaces:

arjunbroepic
/

mossttsnano

Running

App Files Files Community

mossttsnano / app.py

arjunbroepic

Update app.py

3c8062c verified 18 days ago

raw

history blame contribute delete

4.81 kB

	import ctypes
	import gradio as gr
	import os
	import numpy as np
	from pydub import AudioSegment

	# 1. Locate and load the native inference library.
	# The Dockerfile names the compiled shared object libnanotts.so; the path
	# below is relative, so it is resolved against the working directory.
	LIB_PATH = "./libnanotts.so"
	if os.path.exists(LIB_PATH):
	    lib = ctypes.CDLL(LIB_PATH)
	else:
	    # Stop right away with a build hint when the shared object is absent.
	    raise FileNotFoundError(f"Could not find {LIB_PATH}. Ensure the Dockerfile compiled it correctly.")

	# 2. Declare the C-API signatures for ctypes.
	# These declarations are intended to mirror the prototypes in nanotts.h.
	_FLOAT_PTR = ctypes.POINTER(ctypes.c_float)
	_INT_PTR = ctypes.POINTER(ctypes.c_int)

	lib.load_model.restype = ctypes.c_int
	lib.free_model.restype = None

	# generate_wav_from_ref(ref_path, text, wav_out, samples, channels, sr, is_stereo)
	lib.generate_wav_from_ref.restype = ctypes.c_int
	lib.generate_wav_from_ref.argtypes = [
	    ctypes.c_char_p,             # const char* ref_path
	    ctypes.c_char_p,             # const char* text
	    ctypes.POINTER(_FLOAT_PTR),  # float** wav_out
	    _INT_PTR,                    # int* samples
	    _INT_PTR,                    # int* channels
	    _INT_PTR,                    # int* sr
	    ctypes.c_int,                # int is_stereo (1=yes)
	]

	# save_wav(filename, wav_data, samples, channels, sr)
	lib.save_wav.restype = ctypes.c_int
	lib.save_wav.argtypes = [
	    ctypes.c_char_p,
	    _FLOAT_PTR,
	    ctypes.c_int,
	    ctypes.c_int,
	    ctypes.c_int,
	]

	# 3. Load the model a single time, while this module is being executed.
	print("Initializing MossTTS-Nano Model...")
	init_status = lib.load_model()
	if init_status:
	    # A non-zero status is only reported here; start-up carries on regardless.
	    print(f"Failed to load model. Error code: {init_status}")

	def preprocess_audio(input_path):
	    """Re-encode a reference clip as 48 kHz, mono, 16-bit PCM WAV.

	    The converted audio is always exported to the fixed relative path
	    ``processed_ref.wav``.

	    Args:
	        input_path: Path of the audio file to convert.

	    Returns:
	        The path of the converted file, or ``None`` when loading,
	        converting or exporting raised an exception (the error is printed).
	    """
	    target_path = "processed_ref.wav"
	    try:
	        clip = AudioSegment.from_file(input_path)
	        # Normalise step by step: sample rate, channel count, sample width.
	        clip = clip.set_frame_rate(48000)
	        clip = clip.set_channels(1)
	        clip = clip.set_sample_width(2)
	        clip.export(target_path, format="wav")
	    except Exception as e:
	        print(f"Audio Preprocessing Error: {e}")
	        return None
	    return target_path

	def inference(ref_audio, text):
	    """Synthesize ``text`` in the voice of the ``ref_audio`` recording.

	    Args:
	        ref_audio: Path of the reference recording (the UI supplies a
	            filepath).
	        text: The text to speak.

	    Returns:
	        A ``(audio_path, status_message)`` tuple. ``audio_path`` is
	        ``"output.wav"`` on success and ``None`` when an input is missing,
	        the reference audio cannot be converted, or the C engine returns a
	        non-zero code.
	    """
	    # Whitespace-only text has nothing to speak, so treat it like missing text.
	    if not ref_audio or not text or not text.strip():
	        return None, "Please provide both a reference voice and text."

	    # Step A: Convert input to compatible WAV
	    clean_ref = preprocess_audio(ref_audio)
	    if not clean_ref:
	        return None, "Failed to process reference audio file."

	    # Step B: Prepare the out-parameters the C engine fills in
	    wav_ptr = ctypes.POINTER(ctypes.c_float)()
	    samples = ctypes.c_int()
	    channels = ctypes.c_int()
	    sr = ctypes.c_int()

	    output_filename = "output.wav"

	    # Step C: Call the C engine; a result of 0 means success
	    result = lib.generate_wav_from_ref(
	        clean_ref.encode('utf-8'),
	        text.encode('utf-8'),
	        ctypes.byref(wav_ptr),
	        ctypes.byref(samples),
	        ctypes.byref(channels),
	        ctypes.byref(sr),
	        1,  # Stereo output
	    )

	    if result != 0:
	        return None, f"C Engine Error: {result}"

	    try:
	        # Step D: Save the generated buffer to a WAV file
	        # NOTE(review): save_wav's return code is not checked because its
	        # success convention is not visible here - confirm against nanotts.h.
	        lib.save_wav(output_filename.encode('utf-8'), wav_ptr, samples, channels, sr)
	    finally:
	        # Step E: Release the engine-allocated buffer even if saving raised,
	        # so repeated requests do not leak memory.
	        _free_native_buffer(wav_ptr)

	    return output_filename, "Success!"


	def _free_native_buffer(buffer_ptr):
	    """Best-effort release of a C-allocated buffer through libc's ``free``."""
	    if not buffer_ptr:
	        # NULL pointer: nothing was handed over, nothing to release.
	        return
	    try:
	        libc = ctypes.CDLL("libc.so.6")
	        libc.free.argtypes = [ctypes.c_void_p]
	        libc.free.restype = None
	        libc.free(ctypes.cast(buffer_ptr, ctypes.c_void_p))
	    except (OSError, AttributeError) as exc:
	        # libc may be named differently or lack the symbol on this platform;
	        # report it rather than failing the request or hiding it silently.
	        print(f"Could not free native audio buffer: {exc}")

	# 4. Gradio web interface: inputs on the left, results on the right.
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	    gr.Markdown("# 🚀 NanoTTS C-Inference Engine")
	    gr.Markdown("Zero-dependency C implementation of MOSS-TTS-Nano. Fast, tiny, and runs entirely on CPU.")

	    with gr.Row():
	        # Input column: the text, the voice to clone and the trigger button.
	        with gr.Column():
	            input_text = gr.Textbox(
	                lines=3,
	                label="Text to Speak",
	                placeholder="Enter the text you want the AI to say...",
	            )
	            input_audio = gr.Audio(type="filepath", label="Reference Voice (Clone)")
	            submit_btn = gr.Button("Synthesize Audio", variant="primary")

	        # Output column: the synthesized clip and a status line.
	        with gr.Column():
	            audio_out = gr.Audio(label="Generated Result")
	            status_out = gr.Textbox(label="Status", interactive=False)

	    # Input order follows inference(ref_audio, text); the outputs receive
	    # its (audio_path, status_message) return value.
	    submit_btn.click(
	        fn=inference,
	        inputs=[input_audio, input_text],
	        outputs=[audio_out, status_out],
	    )

	    # NOTE(review): the example path reads "asserts", not "assets" - confirm
	    # that this directory name is intended.
	    gr.Examples(
	        inputs=[input_audio, input_text],
	        examples=[["./asserts/audio/ljs.wav", "Hello, I am a tiny C based voice cloning engine."]],
	    )

	# Start the web server only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()