ykhrustalev Paulescu committed on
Commit
c72d46d
·
1 Parent(s): 23b5df6

Fix TTS bug - Error: input 'depth_slices_in' is missing in 'feeds'. (#3)

Browse files

- Add TTS vocoder bug fix implementation plan (fc95264d266ba550a3c08488f3846da4901cd3c0)
- Fix vocoder missing inputs: depth_slices_in, seqlens_k, total_seq_len (f99f0de4d792e0f83a4b04476249a62053d1b2a1)
- Remove implementation plan (content moved to PR description) (6cca669dae910884bdb81267eba3adbe1aef9839)


Co-authored-by: Pau Labarta Bajo <Paulescu@users.noreply.huggingface.co>

Files changed (1) hide show
  1. audio-model.js +19 -3
audio-model.js CHANGED
@@ -507,7 +507,7 @@ export class AudioModel {
507
  // On WebGPU: keep KV cache on GPU to avoid GPU→CPU→GPU roundtrips between steps
508
  try {
509
  const vocoderOpts = device === 'webgpu'
510
- ? { preferredOutputLocation: { new_keys: 'gpu-buffer', new_values: 'gpu-buffer' } }
511
  : {};
512
  this.vocoderSession = await loadOnnxWithExternalData('vocoder_depthformer', 95, quantConfig.vocoder, vocoderOpts);
513
  } catch (e) {
@@ -964,17 +964,24 @@ export class AudioModel {
964
  // Pre-allocate data arrays
965
  const stepIdxData = new BigInt64Array(1);
966
  const prevTokenData = new BigInt64Array(1);
 
 
967
 
968
  // Pre-allocate tensors that can be reused
969
  this._vocoderCache = {
970
  hiddenTensor: null, // Created per-call since hiddenState changes
971
  stepIdxData,
972
  prevTokenData,
 
 
973
  // Pre-create reusable tensors (ONNX Runtime reads from the data array)
974
  stepIdxTensor: new ort.Tensor('int64', stepIdxData, []),
975
  prevTokenTensor: new ort.Tensor('int64', prevTokenData, [1]),
 
 
976
  emptyKeysData: new Float32Array(0),
977
  emptyValuesData: new Float32Array(0),
 
978
  // Reusable sampling arrays
979
  scaledLogits: new Float32Array(2049), // codebook vocab size
980
  indices: new Uint16Array(2049), // Use typed array for faster reset
@@ -1019,32 +1026,41 @@ export class AudioModel {
1019
  let pastKeys = new ort.Tensor(
1020
  'float32',
1021
  cache.emptyKeysData,
1022
- [numLayers, 1, 0, numKvHeads, headDim]
1023
  );
1024
  let pastValues = new ort.Tensor(
1025
  'float32',
1026
  cache.emptyValuesData,
1027
- [numLayers, 1, 0, numKvHeads, headDim]
1028
  );
1029
 
1030
  // Reuse step_idx and prev_token tensors by updating their data
1031
  cache.stepIdxData[0] = 0n;
1032
  cache.prevTokenData[0] = 0n;
1033
 
 
 
 
1034
  for (let i = 0; i < numCodebooks; i++) {
1035
  // Update mutable tensor data (tensor objects reuse the underlying data arrays)
1036
  cache.stepIdxData[0] = BigInt(i);
1037
  cache.prevTokenData[0] = BigInt(prevToken);
 
 
1038
 
1039
  const feeds = {
1040
  hidden_states: hiddenTensor,
 
1041
  step_idx: cache.stepIdxTensor,
1042
  prev_token: cache.prevTokenTensor,
1043
  past_keys: pastKeys,
1044
  past_values: pastValues,
 
 
1045
  };
1046
 
1047
  const outputs = await this.vocoderSession.run(feeds);
 
1048
  const logits = outputs.logits.data;
1049
  const vocabSize = logits.length;
1050
 
 
507
  // On WebGPU: keep KV cache on GPU to avoid GPU→CPU→GPU roundtrips between steps
508
  try {
509
  const vocoderOpts = device === 'webgpu'
510
+ ? { preferredOutputLocation: { new_keys: 'gpu-buffer', new_values: 'gpu-buffer', depth_slices: 'gpu-buffer' } }
511
  : {};
512
  this.vocoderSession = await loadOnnxWithExternalData('vocoder_depthformer', 95, quantConfig.vocoder, vocoderOpts);
513
  } catch (e) {
 
964
  // Pre-allocate data arrays
965
  const stepIdxData = new BigInt64Array(1);
966
  const prevTokenData = new BigInt64Array(1);
967
+ const seqlensKData = new Int32Array(1);
968
+ const totalSeqLenData = new Int32Array(1);
969
 
970
  // Pre-allocate tensors that can be reused
971
  this._vocoderCache = {
972
  hiddenTensor: null, // Created per-call since hiddenState changes
973
  stepIdxData,
974
  prevTokenData,
975
+ seqlensKData,
976
+ totalSeqLenData,
977
  // Pre-create reusable tensors (ONNX Runtime reads from the data array)
978
  stepIdxTensor: new ort.Tensor('int64', stepIdxData, []),
979
  prevTokenTensor: new ort.Tensor('int64', prevTokenData, [1]),
980
+ seqlensKTensor: new ort.Tensor('int32', seqlensKData, [1]),
981
+ totalSeqLenTensor: new ort.Tensor('int32', totalSeqLenData, []),
982
  emptyKeysData: new Float32Array(0),
983
  emptyValuesData: new Float32Array(0),
984
+ emptyDepthSlicesData: new Float32Array(8 * 1024), // zeros for step 0
985
  // Reusable sampling arrays
986
  scaledLogits: new Float32Array(2049), // codebook vocab size
987
  indices: new Uint16Array(2049), // Use typed array for faster reset
 
1026
  let pastKeys = new ort.Tensor(
1027
  'float32',
1028
  cache.emptyKeysData,
1029
+ [numLayers, 1, numKvHeads, 0, headDim]
1030
  );
1031
  let pastValues = new ort.Tensor(
1032
  'float32',
1033
  cache.emptyValuesData,
1034
+ [numLayers, 1, numKvHeads, 0, headDim]
1035
  );
1036
 
1037
  // Reuse step_idx and prev_token tensors by updating their data
1038
  cache.stepIdxData[0] = 0n;
1039
  cache.prevTokenData[0] = 0n;
1040
 
1041
+ // depth_slices_in: zeros at step 0 (model ignores it), then fed back from output
1042
+ let depthSlicesIn = new ort.Tensor('float32', cache.emptyDepthSlicesData, [1, 8, 1024]);
1043
+
1044
  for (let i = 0; i < numCodebooks; i++) {
1045
  // Update mutable tensor data (tensor objects reuse the underlying data arrays)
1046
  cache.stepIdxData[0] = BigInt(i);
1047
  cache.prevTokenData[0] = BigInt(prevToken);
1048
+ cache.seqlensKData[0] = i;
1049
+ cache.totalSeqLenData[0] = i + 1;
1050
 
1051
  const feeds = {
1052
  hidden_states: hiddenTensor,
1053
+ depth_slices_in: depthSlicesIn,
1054
  step_idx: cache.stepIdxTensor,
1055
  prev_token: cache.prevTokenTensor,
1056
  past_keys: pastKeys,
1057
  past_values: pastValues,
1058
+ seqlens_k: cache.seqlensKTensor,
1059
+ total_seq_len: cache.totalSeqLenTensor,
1060
  };
1061
 
1062
  const outputs = await this.vocoderSession.run(feeds);
1063
+ depthSlicesIn = outputs.depth_slices;
1064
  const logits = outputs.logits.data;
1065
  const vocabSize = logits.length;
1066