Spaces:

Rcarvalo
/

speech-to-speech

Runtime error

File size: 6,297 Bytes

71c51fd

"""
Gradio app for LFM2-Audio speech-to-speech demo
Compatible with Hugging Face Spaces
"""

import gradio as gr
import numpy as np
import torch
import torchaudio

from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality

# Hugging Face Hub repository that holds the processor and model weights
HF_REPO = "LiquidAI/LFM2-Audio-1.5B"

# Pick the target device up front: CUDA when visible, CPU otherwise
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading processor...")
processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
processor = processor.eval()

print("Loading model...")
model = LFM2AudioModel.from_pretrained(HF_REPO)
model = model.eval()

print("Loading audio codec...")
# The codec is taken from the processor rather than loaded separately
mimi = processor.mimi.eval()

# Place the model and the codec on the selected device
model = model.to(device)
mimi = mimi.to(device)

print(f"Models loaded on {device}")


def _to_mono_float(wav):
    """Return *wav* as a 1-D float32 tensor.

    Signed-integer PCM of any width is divided by its full-scale value
    (32768 for int16); every other dtype is cast to float32 unchanged.
    Input with more than one dimension is averaged over its last axis.
    """
    samples = np.asarray(wav)
    if np.issubdtype(samples.dtype, np.signedinteger):
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples / full_scale

    wav_tensor = torch.tensor(samples, dtype=torch.float32)
    if wav_tensor.dim() > 1:
        wav_tensor = wav_tensor.mean(dim=-1)
    return wav_tensor


def generate_response(audio_input, temperature, top_k, chat_state):
    """Run one speech-to-speech turn and return the assistant's reply.

    Args:
        audio_input: ``(sample_rate, samples)`` tuple from the Gradio audio
            component, or ``None`` when nothing has been recorded.
        temperature: Audio sampling temperature; ``0`` is passed to the
            model as ``None``.
        top_k: Audio top-k; ``0`` is passed to the model as ``None``.
        chat_state: Conversation state. It is updated in place with the user
            audio and, when both text and audio were produced, with the
            generated tokens.

    Returns:
        ``(audio_output, text, chat_state)``. ``audio_output`` is a
        ``(24000, samples)`` tuple, or ``None`` when fewer than two audio
        frames were generated. When there is no usable recording the text is
        a prompt to record and the state is returned untouched.
    """
    if audio_input is None:
        return None, "Please record audio first", chat_state

    rate, wav = audio_input

    # An empty recording carries nothing to encode; treat it like no input
    # instead of opening a turn for it.
    if np.size(wav) == 0:
        return None, "Please record audio first", chat_state

    wav_tensor = _to_mono_float(wav)

    # Seed the conversation with the system prompt on first use.
    # NOTE(review): assumes len(chat_state.text) == 1 only holds for a fresh
    # state -- confirm against ChatState, since generated text is appended
    # stacked along dim 1.
    if len(chat_state.text) == 1:
        chat_state.new_turn("system")
        chat_state.add_text("Respond with interleaved text and audio.")
        chat_state.end_turn()

    # User turn: the recorded audio
    chat_state.new_turn("user")
    chat_state.add_audio(wav_tensor, rate)
    chat_state.end_turn()

    # Assistant turn stays open while tokens are generated
    chat_state.new_turn("assistant")

    # A slider value of 0 disables the corresponding sampling control
    temp = None if temperature == 0 else float(temperature)
    topk = None if top_k == 0 else int(top_k)

    text_out = []
    audio_out = []
    modality_out = []
    text_pieces = []

    print("Generating response...")
    with torch.no_grad():
        for t in model.generate_interleaved(
            **chat_state,
            max_new_tokens=1024,
            audio_temperature=temp,
            audio_top_k=topk,
        ):
            if t.numel() == 1:  # Text token
                text_out.append(t)
                modality_out.append(LFMModality.TEXT)
                decoded = processor.text.decode(t)
                text_pieces.append(decoded)
                print(decoded, end="", flush=True)
            elif t.numel() == 8:  # Audio token
                audio_out.append(t)
                modality_out.append(LFMModality.AUDIO_OUT)

    print("\nGeneration complete")

    # Clean up text
    full_text = "".join(text_pieces).replace("<|text_end|>", "").strip()

    # Decode audio (remove last end-of-audio token)
    if len(audio_out) > 1:
        mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0).to(device)
        with torch.no_grad():
            waveform = mimi.decode(mimi_codes)[0]

        # Convert to numpy for Gradio
        audio_np = waveform.cpu().numpy()
        audio_output = (24000, audio_np.T)  # Gradio expects (rate, data)
    else:
        audio_output = None

    # Record the generated tokens in the conversation history
    if text_out and audio_out:
        chat_state.append(
            text=torch.stack(text_out, 1),
            audio_out=torch.stack(audio_out, 1),
            modality_flag=torch.tensor(modality_out, device=device),
        )

    # Close the assistant turn. The next user turn is opened at the start of
    # the next call; opening it here as well would leave two consecutive
    # user-turn headers in the state from the second exchange onwards.
    chat_state.end_turn()

    return audio_output, full_text, chat_state


def reset_chat():
    """Start a new conversation.

    Returns a freshly constructed ``ChatState`` together with an empty
    string and ``None``, which blank the text box and the audio player wired
    to the reset button.
    """
    fresh_state = ChatState(processor)
    cleared_text = ""
    cleared_audio = None
    return fresh_state, cleared_text, cleared_audio


# Build the Gradio UI: inputs on the left, assistant output on the right
with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo:
    gr.Markdown("""
    # LFM2-Audio Speech-to-Speech Chat

    Talk to LFM2-Audio! Record your voice and get a response with both text and audio.

    **How to use:**
    1. Click the microphone button to record your voice
    2. Adjust temperature and top-k parameters if needed (or leave defaults)
    3. Click "Generate Response"
    4. Listen to the audio response and read the text transcription

    **Note:** This model runs on GPU. If you experience long wait times, the Space might be on CPU or heavily loaded.
    """)

    # Conversation history carried between button clicks
    chat_state = gr.State(ChatState(processor))

    with gr.Row():
        # Left column: recording, sampling controls and action buttons
        with gr.Column():
            audio_input = gr.Audio(
                label="Record your voice",
                sources=["microphone"],
                type="numpy",
            )

            with gr.Row():
                temperature = gr.Slider(
                    label="Temperature (0 for greedy)",
                    info="Higher = more creative, lower = more deterministic",
                    minimum=0,
                    maximum=2.0,
                    step=0.1,
                    value=1.0,
                )
                top_k = gr.Slider(
                    label="Top-k (0 for no filtering)",
                    info="Number of top tokens to sample from",
                    minimum=0,
                    maximum=100,
                    step=1,
                    value=4,
                )

            generate_btn = gr.Button("Generate Response", variant="primary")
            reset_btn = gr.Button("Reset Chat")

        # Right column: the assistant's reply as text and as audio
        with gr.Column():
            text_output = gr.Textbox(
                label="Assistant Response (Text)",
                interactive=False,
                lines=4,
            )
            audio_output = gr.Audio(
                label="Assistant Response (Audio)",
                interactive=False,
                type="numpy",
            )

    gr.Markdown("""
    ### About LFM2-Audio

    LFM2-Audio-1.5B is Liquid AI's first end-to-end audio foundation model. It supports:
    - Real-time speech-to-speech conversations
    - Low-latency interleaved text and audio generation
    - Natural flowing conversations

    [Learn more](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/)
    """)

    # Wire the buttons: generate consumes the recording and sampling controls,
    # reset replaces the state and clears both output widgets
    generate_btn.click(
        generate_response,
        inputs=[audio_input, temperature, top_k, chat_state],
        outputs=[audio_output, text_output, chat_state],
    )
    reset_btn.click(
        reset_chat,
        outputs=[chat_state, text_output, audio_output],
    )


# Start the Gradio server only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()