ONNX export of LFM2.5-Audio-1.5B for cross-platform inference.
LFM2.5-Audio is a multimodal model supporting three modes: ASR (speech-to-text), TTS (text-to-speech), and interleaved text-and-audio generation. The export ships in the following precision variants:
| Decoder | Vocoder | Size | Platform | Use Case |
|---|---|---|---|---|
| Q4 | Q4 | ~1.5GB | WebGPU, Server | Recommended for most uses |
| FP16 | FP16 | ~3.2GB | Server | Higher quality |
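Each precision in the table maps to a file-name suffix in the onnx/ directory listed below. A minimal sketch of that convention (the modelFile helper is illustrative, not part of the export):

```javascript
// Illustrative only: resolve a component file name from a precision choice,
// following the suffix convention in the directory listing below.
function modelFile(name, precision = "q4") {
  const suffix = precision === "fp32" ? "" : `_${precision}`; // "", "_fp16", "_q4"
  return `${name}${suffix}.onnx`; // modelFile("decoder", "q4") -> "decoder_q4.onnx"
}
```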
onnx/
├── decoder.onnx                      # LFM2 backbone (FP32)
├── decoder.onnx_data*
├── decoder_fp16.onnx                 # LFM2 backbone (FP16)
├── decoder_fp16.onnx_data*
├── decoder_q4.onnx                   # LFM2 backbone (Q4, recommended)
├── decoder_q4.onnx_data
├── audio_encoder.onnx                # Conformer encoder for ASR (FP32)
├── audio_encoder.onnx_data
├── audio_encoder_fp16.onnx           # Conformer encoder (FP16)
├── audio_encoder_fp16.onnx_data
├── audio_encoder_q4.onnx             # Conformer encoder (Q4)
├── audio_encoder_q4.onnx_data
├── audio_embedding.onnx              # Audio code embeddings (FP32)
├── audio_embedding_fp16.onnx         # Audio code embeddings (FP16)
├── audio_embedding_q4.onnx           # Audio code embeddings (Q4)
├── audio_detokenizer.onnx            # Neural vocoder STFT (FP32)
├── audio_detokenizer.onnx_data
├── audio_detokenizer_fp16.onnx       # Neural vocoder (FP16)
├── audio_detokenizer_fp16.onnx_data
├── audio_detokenizer_q4.onnx         # Neural vocoder (Q4)
├── audio_detokenizer_q4.onnx_data
├── vocoder_depthformer.onnx          # Audio codebook prediction (FP32)
├── vocoder_depthformer.onnx_data
├── vocoder_depthformer_fp16.onnx     # Audio codebook prediction (FP16)
├── vocoder_depthformer_fp16.onnx_data
├── vocoder_depthformer_q4.onnx       # Audio codebook prediction (Q4)
├── vocoder_depthformer_q4.onnx_data
├── embed_tokens.bin                  # Text embeddings (binary)
├── embed_tokens.json                 # Text embeddings metadata
├── audio_embedding.bin               # Audio embeddings (binary, for direct lookup)
├── audio_embedding.json              # Audio embeddings metadata
└── mel_config.json                   # Mel spectrogram configuration
\* Models larger than 2 GB split their weights across multiple external-data files: decoder.onnx_data, decoder.onnx_data_1, decoder.onnx_data_2, etc. All data files must be in the same directory as the corresponding .onnx file.
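If you drive ONNX Runtime directly, these split files are passed as external data when the session is created; the loadSession helper in the browser example below does this generically. A minimal sketch for the FP32 decoder, assuming onnxruntime-web and the file names listed above:

```javascript
import * as ort from "onnxruntime-web/webgpu";

const base = "https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B-ONNX/resolve/main/onnx";

// Every split weight file present in the repo must be listed here; the FP32
// decoder spreads its weights over decoder.onnx_data, decoder.onnx_data_1, etc.
const decoder = await ort.InferenceSession.create(`${base}/decoder.onnx`, {
  executionProviders: ["webgpu"],
  externalData: [
    { path: "decoder.onnx_data", data: `${base}/decoder.onnx_data` },
    { path: "decoder.onnx_data_1", data: `${base}/decoder.onnx_data_1` },
    { path: "decoder.onnx_data_2", data: `${base}/decoder.onnx_data_2` },
  ],
});
```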
Use the onnx-export repository for inference.
git clone https://github.com/Liquid4All/onnx-export.git
cd onnx-export
uv sync
Transcribe audio to text:
uv run lfm2-audio-infer /path/to/LFM2.5-Audio-1.5B-ONNX \
--mode asr \
--audio input.wav \
--precision q4
Generate audio from text:
uv run lfm2-audio-infer /path/to/LFM2.5-Audio-1.5B-ONNX \
--mode tts \
--prompt "Hello, this is a test of text to speech synthesis." \
--output output.wav \
--precision q4
Options:
--system "Perform TTS. Use the UK female voice." - Custom system prompt--audio-temperature 0.8 - Audio sampling temperature--audio-top-k 64 - Top-k sampling for audioGenerate interleaved text and audio response from audio input:
uv run lfm2-audio-infer /path/to/LFM2.5-Audio-1.5B-ONNX \
--mode interleaved \
--audio input.wav \
--output output.wav \
--precision q4
Or from a text prompt:
uv run lfm2-audio-infer /path/to/LFM2.5-Audio-1.5B-ONNX \
--mode interleaved \
--prompt "Respond with audio" \
--output output.wav \
--precision q4
All CLI options:
uv run lfm2-audio-infer --help
| Option | Description |
|---|---|
| `--mode` | `asr`, `tts`, or `interleaved` |
| `--precision` | `fp16`, `q4`, or `q8` (default: `fp32`) |
| `--audio` | Input audio file (WAV) |
| `--output` | Output audio file (WAV) |
| `--prompt` | Text prompt |
| `--system` | System prompt |
| `--max-tokens` | Maximum tokens to generate |
| `--temperature` | Text sampling temperature |
| `--audio-temperature` | Audio sampling temperature |
| `--audio-top-k` | Top-k sampling for audio |
| `--seed` | Random seed for reproducibility |
For browser inference, install the dependencies:

npm install onnxruntime-web @huggingface/transformers

WebGPU is required for browser inference. To enable it:

- Open chrome://flags/#enable-unsafe-webgpu, enable the flag, and restart the browser
- Check chrome://gpu for "WebGPU" status
- Test navigator.gpu.requestAdapter() in the DevTools console

import * as ort from "onnxruntime-web/webgpu";
import { AutoTokenizer } from "@huggingface/transformers";
// Check WebGPU availability
if (!navigator.gpu) {
throw new Error("WebGPU not available. Enable at chrome://flags/#enable-unsafe-webgpu");
}
ort.env.wasm.numThreads = 1;
const modelId = "LiquidAI/LFM2.5-Audio-1.5B-ONNX";
const modelBase = `https://huggingface.co/${modelId}/resolve/main`;
// Load tokenizer
const tokenizer = await AutoTokenizer.from_pretrained(modelId);
// Load ONNX sessions
async function loadSession(name, dataFiles = 1) {
const onnxPath = `${modelBase}/onnx/${name}.onnx`;
const externalData = [];
for (let i = 0; i < dataFiles; i++) {
const suffix = i === 0 ? "" : `_${i}`;
const fileName = `${name}.onnx_data${suffix}`;
externalData.push({ path: fileName, data: `${modelBase}/onnx/${fileName}` });
}
return ort.InferenceSession.create(onnxPath, {
executionProviders: ["webgpu"],
externalData,
});
}
// Load models (Q4 recommended for WebGPU)
const decoder = await loadSession("decoder_q4");
const audioEmbedding = await loadSession("audio_embedding_q4");
const detokenizer = await loadSession("audio_detokenizer_q4");
const depthformer = await loadSession("vocoder_depthformer_q4");
// Load text embeddings binary
const embedResponse = await fetch(`${modelBase}/onnx/embed_tokens.bin`);
const embedBuffer = await embedResponse.arrayBuffer();
const embedMetaResponse = await fetch(`${modelBase}/onnx/embed_tokens.json`);
const embedMeta = await embedMetaResponse.json();
const embedWeight = new Float32Array(embedBuffer);
function getTextEmbeddings(ids) {
const hiddenSize = embedMeta.hidden_size;
const embeds = new Float32Array(ids.length * hiddenSize);
for (let i = 0; i < ids.length; i++) {
const offset = ids[i] * hiddenSize;
embeds.set(embedWeight.subarray(offset, offset + hiddenSize), i * hiddenSize);
}
return new ort.Tensor("float32", embeds, [1, ids.length, hiddenSize]);
}
// Model config
const hiddenSize = 2048;
const numCodebooks = 8;
const codebookVocab = 2049;
// TTS example
const text = "Hello, this is a test.";
const prompt = `<|startoftext|><|im_start|>system
Perform TTS. Use the UK female voice.<|im_end|>
<|im_start|>user
${text}<|im_end|>
<|im_start|>assistant
`;
const inputIds = tokenizer.encode(prompt);
let embeds = getTextEmbeddings(inputIds);
// Initialize KV cache
const cache = {};
for (const name of decoder.inputNames) {
if (name.startsWith("past_conv")) {
cache[name] = new ort.Tensor("float32", new Float32Array(hiddenSize * 3), [1, hiddenSize, 3]);
} else if (name.startsWith("past_key_values")) {
cache[name] = new ort.Tensor("float32", new Float32Array(0), [1, 8, 0, 64]);
}
}
// Generation loop
const audioCodes = [];
let inAudioMode = false;
let curLen = inputIds.length;
for (let step = 0; step < 1024; step++) {
const attentionMask = new ort.Tensor("int64", new BigInt64Array(curLen).fill(1n), [1, curLen]);
const outputs = await decoder.run({ inputs_embeds: embeds, attention_mask: attentionMask, ...cache });
// Update cache
for (const [name, tensor] of Object.entries(outputs)) {
if (name.startsWith("present_conv")) {
cache[name.replace("present_conv", "past_conv")] = tensor;
} else if (name.startsWith("present.")) {
cache[name.replace("present.", "past_key_values.")] = tensor;
}
}
if (inAudioMode) {
// Use depthformer to generate audio codes
const hiddenStates = outputs.hidden_states; // [1, seqLen, hiddenSize]
const seqLen = hiddenStates.dims[1];
const lastHidden = hiddenStates.data.subarray((seqLen - 1) * hiddenSize, seqLen * hiddenSize);
// Autoregressive codebook generation (8 steps per frame)
const frameCodes = await generateAudioFrame(depthformer, lastHidden);
if (frameCodes[0] === 2048) {
// End of audio
break;
}
audioCodes.push(frameCodes);
// Get audio embeddings for feedback
const audioTokens = frameCodes.map((code, cb) => cb * codebookVocab + code);
const audioEmbedsResult = await audioEmbedding.run({
audio_codes: new ort.Tensor("int64", new BigInt64Array(audioTokens.map(BigInt)), [1, 8])
});
// Sum embeddings across codebooks
embeds = sumEmbeddings(audioEmbedsResult.audio_embeds);
} else {
// Text generation
const logits = outputs.logits;
const nextToken = argmax(logits);
if (nextToken === 128) {
// <|audio_start|> - switch to audio mode
inAudioMode = true;
}
embeds = getTextEmbeddings([nextToken]);
}
curLen++;
}
// Decode audio codes to waveform using detokenizer + ISTFT
const waveform = await decodeAudio(detokenizer, audioCodes);
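The helpers referenced above (argmax, sumEmbeddings, generateAudioFrame, decodeAudio) are not shipped with the export. Sketches of the two simplest ones, continuing the example above and assuming logits of shape [1, seqLen, vocabSize] and audio_embeds of shape [1, numCodebooks, hiddenSize]:

```javascript
// Greedy decoding: pick the highest-scoring token at the last position.
// Assumes logits dims are [1, seqLen, vocabSize].
function argmax(logits) {
  const [, seqLen, vocabSize] = logits.dims;
  const last = logits.data.subarray((seqLen - 1) * vocabSize, seqLen * vocabSize);
  let best = 0;
  for (let i = 1; i < vocabSize; i++) {
    if (last[i] > last[best]) best = i;
  }
  return best;
}

// Sum per-codebook audio embeddings into a single [1, 1, hiddenSize] input
// for the next decoder step. Assumes dims are [1, numCodebooks, hiddenSize].
function sumEmbeddings(audioEmbeds) {
  const [, numCodebooks, hiddenSize] = audioEmbeds.dims;
  const summed = new Float32Array(hiddenSize);
  for (let cb = 0; cb < numCodebooks; cb++) {
    for (let j = 0; j < hiddenSize; j++) {
      summed[j] += audioEmbeds.data[cb * hiddenSize + j];
    }
  }
  return new ort.Tensor("float32", summed, [1, 1, hiddenSize]);
}
```

generateAudioFrame (the autoregressive depthformer loop over the 8 codebooks) and decodeAudio (detokenizer plus ISTFT) depend on the exact input and output names of those models; see the onnx-export repository's reference implementation for complete versions.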
This model is released under the LFM 1.0 License.