Spaces:
Configuration error
Configuration error
Commit ·
c72d46d
1
Parent(s): 23b5df6
Fix TTS bug - Error: input 'depth_slices_in' is missing in 'feeds'. (#3)
Browse files
- Add TTS vocoder bug fix implementation plan (fc95264d266ba550a3c08488f3846da4901cd3c0)
- Fix vocoder missing inputs: depth_slices_in, seqlens_k, total_seq_len (f99f0de4d792e0f83a4b04476249a62053d1b2a1)
- Remove implementation plan (content moved to PR description) (6cca669dae910884bdb81267eba3adbe1aef9839)
Co-authored-by: Pau Labarta Bajo <Paulescu@users.noreply.huggingface.co>
- audio-model.js +19 -3
audio-model.js
CHANGED
|
@@ -507,7 +507,7 @@ export class AudioModel {
|
|
| 507 |
// On WebGPU: keep KV cache on GPU to avoid GPU→CPU→GPU roundtrips between steps
|
| 508 |
try {
|
| 509 |
const vocoderOpts = device === 'webgpu'
|
| 510 |
-
? { preferredOutputLocation: { new_keys: 'gpu-buffer', new_values: 'gpu-buffer' } }
|
| 511 |
: {};
|
| 512 |
this.vocoderSession = await loadOnnxWithExternalData('vocoder_depthformer', 95, quantConfig.vocoder, vocoderOpts);
|
| 513 |
} catch (e) {
|
|
@@ -964,17 +964,24 @@ export class AudioModel {
|
|
| 964 |
// Pre-allocate data arrays
|
| 965 |
const stepIdxData = new BigInt64Array(1);
|
| 966 |
const prevTokenData = new BigInt64Array(1);
|
|
|
|
|
|
|
| 967 |
|
| 968 |
// Pre-allocate tensors that can be reused
|
| 969 |
this._vocoderCache = {
|
| 970 |
hiddenTensor: null, // Created per-call since hiddenState changes
|
| 971 |
stepIdxData,
|
| 972 |
prevTokenData,
|
|
|
|
|
|
|
| 973 |
// Pre-create reusable tensors (ONNX Runtime reads from the data array)
|
| 974 |
stepIdxTensor: new ort.Tensor('int64', stepIdxData, []),
|
| 975 |
prevTokenTensor: new ort.Tensor('int64', prevTokenData, [1]),
|
|
|
|
|
|
|
| 976 |
emptyKeysData: new Float32Array(0),
|
| 977 |
emptyValuesData: new Float32Array(0),
|
|
|
|
| 978 |
// Reusable sampling arrays
|
| 979 |
scaledLogits: new Float32Array(2049), // codebook vocab size
|
| 980 |
indices: new Uint16Array(2049), // Use typed array for faster reset
|
|
@@ -1019,32 +1026,41 @@ export class AudioModel {
|
|
| 1019 |
let pastKeys = new ort.Tensor(
|
| 1020 |
'float32',
|
| 1021 |
cache.emptyKeysData,
|
| 1022 |
-
[numLayers, 1,
|
| 1023 |
);
|
| 1024 |
let pastValues = new ort.Tensor(
|
| 1025 |
'float32',
|
| 1026 |
cache.emptyValuesData,
|
| 1027 |
-
[numLayers, 1,
|
| 1028 |
);
|
| 1029 |
|
| 1030 |
// Reuse step_idx and prev_token tensors by updating their data
|
| 1031 |
cache.stepIdxData[0] = 0n;
|
| 1032 |
cache.prevTokenData[0] = 0n;
|
| 1033 |
|
|
|
|
|
|
|
|
|
|
| 1034 |
for (let i = 0; i < numCodebooks; i++) {
|
| 1035 |
// Update mutable tensor data (tensor objects reuse the underlying data arrays)
|
| 1036 |
cache.stepIdxData[0] = BigInt(i);
|
| 1037 |
cache.prevTokenData[0] = BigInt(prevToken);
|
|
|
|
|
|
|
| 1038 |
|
| 1039 |
const feeds = {
|
| 1040 |
hidden_states: hiddenTensor,
|
|
|
|
| 1041 |
step_idx: cache.stepIdxTensor,
|
| 1042 |
prev_token: cache.prevTokenTensor,
|
| 1043 |
past_keys: pastKeys,
|
| 1044 |
past_values: pastValues,
|
|
|
|
|
|
|
| 1045 |
};
|
| 1046 |
|
| 1047 |
const outputs = await this.vocoderSession.run(feeds);
|
|
|
|
| 1048 |
const logits = outputs.logits.data;
|
| 1049 |
const vocabSize = logits.length;
|
| 1050 |
|
|
|
|
| 507 |
// On WebGPU: keep KV cache on GPU to avoid GPU→CPU→GPU roundtrips between steps
|
| 508 |
try {
|
| 509 |
const vocoderOpts = device === 'webgpu'
|
| 510 |
+
? { preferredOutputLocation: { new_keys: 'gpu-buffer', new_values: 'gpu-buffer', depth_slices: 'gpu-buffer' } }
|
| 511 |
: {};
|
| 512 |
this.vocoderSession = await loadOnnxWithExternalData('vocoder_depthformer', 95, quantConfig.vocoder, vocoderOpts);
|
| 513 |
} catch (e) {
|
|
|
|
| 964 |
// Pre-allocate data arrays
|
| 965 |
const stepIdxData = new BigInt64Array(1);
|
| 966 |
const prevTokenData = new BigInt64Array(1);
|
| 967 |
+
const seqlensKData = new Int32Array(1);
|
| 968 |
+
const totalSeqLenData = new Int32Array(1);
|
| 969 |
|
| 970 |
// Pre-allocate tensors that can be reused
|
| 971 |
this._vocoderCache = {
|
| 972 |
hiddenTensor: null, // Created per-call since hiddenState changes
|
| 973 |
stepIdxData,
|
| 974 |
prevTokenData,
|
| 975 |
+
seqlensKData,
|
| 976 |
+
totalSeqLenData,
|
| 977 |
// Pre-create reusable tensors (ONNX Runtime reads from the data array)
|
| 978 |
stepIdxTensor: new ort.Tensor('int64', stepIdxData, []),
|
| 979 |
prevTokenTensor: new ort.Tensor('int64', prevTokenData, [1]),
|
| 980 |
+
seqlensKTensor: new ort.Tensor('int32', seqlensKData, [1]),
|
| 981 |
+
totalSeqLenTensor: new ort.Tensor('int32', totalSeqLenData, []),
|
| 982 |
emptyKeysData: new Float32Array(0),
|
| 983 |
emptyValuesData: new Float32Array(0),
|
| 984 |
+
emptyDepthSlicesData: new Float32Array(8 * 1024), // zeros for step 0
|
| 985 |
// Reusable sampling arrays
|
| 986 |
scaledLogits: new Float32Array(2049), // codebook vocab size
|
| 987 |
indices: new Uint16Array(2049), // Use typed array for faster reset
|
|
|
|
| 1026 |
let pastKeys = new ort.Tensor(
|
| 1027 |
'float32',
|
| 1028 |
cache.emptyKeysData,
|
| 1029 |
+
[numLayers, 1, numKvHeads, 0, headDim]
|
| 1030 |
);
|
| 1031 |
let pastValues = new ort.Tensor(
|
| 1032 |
'float32',
|
| 1033 |
cache.emptyValuesData,
|
| 1034 |
+
[numLayers, 1, numKvHeads, 0, headDim]
|
| 1035 |
);
|
| 1036 |
|
| 1037 |
// Reuse step_idx and prev_token tensors by updating their data
|
| 1038 |
cache.stepIdxData[0] = 0n;
|
| 1039 |
cache.prevTokenData[0] = 0n;
|
| 1040 |
|
| 1041 |
+
// depth_slices_in: zeros at step 0 (model ignores it), then fed back from output
|
| 1042 |
+
let depthSlicesIn = new ort.Tensor('float32', cache.emptyDepthSlicesData, [1, 8, 1024]);
|
| 1043 |
+
|
| 1044 |
for (let i = 0; i < numCodebooks; i++) {
|
| 1045 |
// Update mutable tensor data (tensor objects reuse the underlying data arrays)
|
| 1046 |
cache.stepIdxData[0] = BigInt(i);
|
| 1047 |
cache.prevTokenData[0] = BigInt(prevToken);
|
| 1048 |
+
cache.seqlensKData[0] = i;
|
| 1049 |
+
cache.totalSeqLenData[0] = i + 1;
|
| 1050 |
|
| 1051 |
const feeds = {
|
| 1052 |
hidden_states: hiddenTensor,
|
| 1053 |
+
depth_slices_in: depthSlicesIn,
|
| 1054 |
step_idx: cache.stepIdxTensor,
|
| 1055 |
prev_token: cache.prevTokenTensor,
|
| 1056 |
past_keys: pastKeys,
|
| 1057 |
past_values: pastValues,
|
| 1058 |
+
seqlens_k: cache.seqlensKTensor,
|
| 1059 |
+
total_seq_len: cache.totalSeqLenTensor,
|
| 1060 |
};
|
| 1061 |
|
| 1062 |
const outputs = await this.vocoderSession.run(feeds);
|
| 1063 |
+
depthSlicesIn = outputs.depth_slices;
|
| 1064 |
const logits = outputs.logits.data;
|
| 1065 |
const vocabSize = logits.length;
|
| 1066 |
|