# app.py — LFM2-Audio speech-to-speech Gradio demo (uploaded via huggingface_hub)
"""
Gradio app for LFM2-Audio speech-to-speech demo
Compatible with Hugging Face Spaces
"""
import gradio as gr
import numpy as np
import torch
import torchaudio
from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality
# Load models
# Hugging Face repo that hosts the processor, the LM, and the Mimi codec weights.
HF_REPO = "LiquidAI/LFM2-Audio-1.5B"
print("Loading processor...")
# Text tokenizer + audio front-end; .eval() switches off training-mode layers.
processor = LFM2AudioProcessor.from_pretrained(HF_REPO).eval()
print("Loading model...")
# The end-to-end audio language model used by generate_interleaved().
model = LFM2AudioModel.from_pretrained(HF_REPO).eval()
print("Loading audio codec...")
# Mimi neural codec, reused from the processor; decodes audio tokens to waveform.
mimi = processor.mimi.eval()
# Move to CUDA if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
mimi = mimi.to(device)
print(f"Models loaded on {device}")
def generate_response(audio_input, temperature, top_k, chat_state):
    """Run one speech-to-speech turn against the loaded LFM2-Audio model.

    Args:
        audio_input: Gradio ``type="numpy"`` audio — a ``(sample_rate, samples)``
            tuple, or ``None`` if nothing was recorded.
        temperature: audio sampling temperature; ``0`` selects greedy decoding.
        top_k: audio top-k cutoff; ``0`` disables top-k filtering.
        chat_state: the running ``ChatState`` carried in ``gr.State``.

    Returns:
        ``(audio_output, full_text, chat_state)`` where ``audio_output`` is a
        ``(24000, ndarray)`` tuple for ``gr.Audio`` (or ``None`` when the model
        produced no audio) and ``full_text`` is the cleaned text transcript.
    """
    if audio_input is None:
        return None, "Please record audio first", chat_state

    # Parse audio input: Gradio numpy audio is (rate, samples).
    rate, wav = audio_input

    # Convert to torch tensor; normalize 16-bit PCM into [-1, 1).
    if wav.dtype == np.int16:
        wav_tensor = torch.tensor(wav / 32768.0, dtype=torch.float32)
    else:
        wav_tensor = torch.tensor(wav, dtype=torch.float32)

    # Ensure mono: downmix (samples, channels) recordings by averaging channels.
    if len(wav_tensor.shape) > 1:
        wav_tensor = wav_tensor.mean(dim=-1)
    # add_audio expects shape (channels, samples), so add channel dimension.
    if len(wav_tensor.shape) == 1:
        wav_tensor = wav_tensor.unsqueeze(0)

    # Initialize chat state if empty: seed the system prompt exactly once.
    # (A fresh ChatState appears to hold a single text entry — TODO confirm
    # against the liquid_audio ChatState implementation.)
    if len(chat_state.text) == 1:
        chat_state.new_turn("system")
        chat_state.add_text("Respond with interleaved text and audio.")
        chat_state.end_turn()

    # Add user audio as a complete turn.
    chat_state.new_turn("user")
    chat_state.add_audio(wav_tensor, rate)
    chat_state.end_turn()

    # Start assistant turn; the model's tokens are appended below.
    chat_state.new_turn("assistant")

    # Slider value 0 means "feature disabled" for both parameters.
    temp = None if temperature == 0 else float(temperature)
    topk = None if top_k == 0 else int(top_k)

    # Generate response, collecting text tokens, audio frames, and a parallel
    # modality flag list for chat_state.append().
    text_out = []
    audio_out = []
    modality_out = []
    full_text = ""
    print("Generating response...")
    with torch.no_grad():
        for t in model.generate_interleaved(
            **chat_state,
            max_new_tokens=1024,
            audio_temperature=temp,
            audio_top_k=topk,
        ):
            if t.numel() == 1:  # Text token: single element
                text_out.append(t)
                modality_out.append(LFMModality.TEXT)
                decoded = processor.text.decode(t)
                full_text += decoded
                print(decoded, end="", flush=True)
            elif t.numel() == 8:  # Audio token: one frame of 8 codebook ids
                audio_out.append(t)
                modality_out.append(LFMModality.AUDIO_OUT)
    print("\nGeneration complete")

    # Clean up text: strip the end-of-text marker and surrounding whitespace.
    full_text = full_text.replace("<|text_end|>", "").strip()

    # Decode audio (remove last end-of-audio token) through the Mimi codec.
    if len(audio_out) > 1:
        mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0).to(device)
        with torch.no_grad():
            waveform = mimi.decode(mimi_codes)[0]
        # Convert to numpy for Gradio.
        audio_np = waveform.cpu().numpy()
        audio_output = (24000, audio_np.T)  # Gradio expects (rate, data)
    else:
        audio_output = None

    # Update chat state with the assistant's generated tokens so the next
    # turn has full conversational context.
    if text_out and audio_out:
        chat_state.append(
            text=torch.stack(text_out, 1),
            audio_out=torch.stack(audio_out, 1),
            modality_flag=torch.tensor(modality_out, device=device),
        )
    # NOTE(review): new_turn("user") here plus the new_turn("user") at the top
    # of the next call may open two user turns — verify against ChatState's
    # turn semantics.
    chat_state.end_turn()
    chat_state.new_turn("user")

    return audio_output, full_text, chat_state
def reset_chat():
    """Start the conversation over.

    Returns a fresh ``ChatState`` plus empty text and audio values so the
    bound Gradio outputs (state, textbox, audio player) are all cleared.
    """
    fresh_state = ChatState(processor)
    return fresh_state, "", None
# Create Gradio interface.
# Layout: left column = recorder + sampling controls + buttons,
# right column = text transcript + synthesized audio reply.
with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo:
    gr.Markdown("""
# LFM2-Audio Speech-to-Speech Chat
Talk to LFM2-Audio! Record your voice and get a response with both text and audio.
**How to use:**
1. Click the microphone button to record your voice
2. Adjust temperature and top-k parameters if needed (or leave defaults)
3. Click "Generate Response"
4. Listen to the audio response and read the text transcription
**Note:** This model runs on GPU. If you experience long wait times, the Space might be on CPU or heavily loaded.
""")

    # Per-session conversation history, threaded through generate_response.
    chat_state = gr.State(ChatState(processor))

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="Record your voice"
            )
            with gr.Row():
                # 0 is a sentinel meaning "greedy" (see generate_response).
                temperature = gr.Slider(
                    minimum=0,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Temperature (0 for greedy)",
                    info="Higher = more creative, lower = more deterministic"
                )
                # 0 is a sentinel meaning "no top-k filtering".
                top_k = gr.Slider(
                    minimum=0,
                    maximum=100,
                    value=4,
                    step=1,
                    label="Top-k (0 for no filtering)",
                    info="Number of top tokens to sample from"
                )
            generate_btn = gr.Button("Generate Response", variant="primary")
            reset_btn = gr.Button("Reset Chat")
        with gr.Column():
            text_output = gr.Textbox(
                label="Assistant Response (Text)",
                lines=4,
                interactive=False
            )
            audio_output = gr.Audio(
                label="Assistant Response (Audio)",
                type="numpy",
                interactive=False
            )

    gr.Markdown("""
### About LFM2-Audio
LFM2-Audio-1.5B is Liquid AI's first end-to-end audio foundation model. It supports:
- Real-time speech-to-speech conversations
- Low-latency interleaved text and audio generation
- Natural flowing conversations
[Learn more](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/)
""")

    # Event handlers
    generate_btn.click(
        fn=generate_response,
        inputs=[audio_input, temperature, top_k, chat_state],
        outputs=[audio_output, text_output, chat_state]
    )
    reset_btn.click(
        fn=reset_chat,
        outputs=[chat_state, text_output, audio_output]
    )

if __name__ == "__main__":
    demo.launch()