Spaces:
Configuration error
Configuration error
Commit ·
c72d46d
1
Parent(s): 23b5df6
Fix TTS bug - Error: input 'depth_slices_in' is missing in 'feeds'. (#3)
Browse files
- Add TTS vocoder bug fix implementation plan (fc95264d266ba550a3c08488f3846da4901cd3c0)
- Fix vocoder missing inputs: depth_slices_in, seqlens_k, total_seq_len (f99f0de4d792e0f83a4b04476249a62053d1b2a1)
- Remove implementation plan (content moved to PR description) (6cca669dae910884bdb81267eba3adbe1aef9839)
Co-authored-by: Pau Labarta Bajo <Paulescu@users.noreply.huggingface.co>
- audio-model.js +19 -3
audio-model.js
CHANGED
|
@@ -507,7 +507,7 @@ export class AudioModel {
|
|
| 507 |
// On WebGPU: keep KV cache on GPU to avoid GPU→CPU→GPU roundtrips between steps
|
| 508 |
try {
|
| 509 |
const vocoderOpts = device === 'webgpu'
|
| 510 |
-
? { preferredOutputLocation: { new_keys: 'gpu-buffer', new_values: 'gpu-buffer' } }
|
| 511 |
: {};
|
| 512 |
this.vocoderSession = await loadOnnxWithExternalData('vocoder_depthformer', 95, quantConfig.vocoder, vocoderOpts);
|
| 513 |
} catch (e) {
|
|
@@ -964,17 +964,24 @@ export class AudioModel {
|
|
| 964 |
// Pre-allocate data arrays
|
| 965 |
const stepIdxData = new BigInt64Array(1);
|
| 966 |
const prevTokenData = new BigInt64Array(1);
|
|
|
|
|
|
|
| 967 |
|
| 968 |
// Pre-allocate tensors that can be reused
|
| 969 |
this._vocoderCache = {
|
| 970 |
hiddenTensor: null, // Created per-call since hiddenState changes
|
| 971 |
stepIdxData,
|
| 972 |
prevTokenData,
|
|
|
|
|
|
|
| 973 |
// Pre-create reusable tensors (ONNX Runtime reads from the data array)
|
| 974 |
stepIdxTensor: new ort.Tensor('int64', stepIdxData, []),
|
| 975 |
prevTokenTensor: new ort.Tensor('int64', prevTokenData, [1]),
|
|
|
|
|
|
|
| 976 |
emptyKeysData: new Float32Array(0),
|
| 977 |
emptyValuesData: new Float32Array(0),
|
|
|
|
| 978 |
// Reusable sampling arrays
|
| 979 |
scaledLogits: new Float32Array(2049), // codebook vocab size
|
| 980 |
indices: new Uint16Array(2049), // Use typed array for faster reset
|
|
@@ -1019,32 +1026,41 @@ export class AudioModel {
|
|
| 1019 |
let pastKeys = new ort.Tensor(
|
| 1020 |
'float32',
|
| 1021 |
cache.emptyKeysData,
|
| 1022 |
-
[numLayers, 1,
|
| 1023 |
);
|
| 1024 |
let pastValues = new ort.Tensor(
|
| 1025 |
'float32',
|
| 1026 |
cache.emptyValuesData,
|
| 1027 |
-
[numLayers, 1,
|
| 1028 |
);
|
| 1029 |
|
| 1030 |
// Reuse step_idx and prev_token tensors by updating their data
|
| 1031 |
cache.stepIdxData[0] = 0n;
|
| 1032 |
cache.prevTokenData[0] = 0n;
|
| 1033 |
|
|
|
|
|
|
|
|
|
|
| 1034 |
for (let i = 0; i < numCodebooks; i++) {
|
| 1035 |
// Update mutable tensor data (tensor objects reuse the underlying data arrays)
|
| 1036 |
cache.stepIdxData[0] = BigInt(i);
|
| 1037 |
cache.prevTokenData[0] = BigInt(prevToken);
|
|
|
|
|
|
|
| 1038 |
|
| 1039 |
const feeds = {
|
| 1040 |
hidden_states: hiddenTensor,
|
|
|
|
| 1041 |
step_idx: cache.stepIdxTensor,
|
| 1042 |
prev_token: cache.prevTokenTensor,
|
| 1043 |
past_keys: pastKeys,
|
| 1044 |
past_values: pastValues,
|
|
|
|
|
|
|
| 1045 |
};
|
| 1046 |
|
| 1047 |
const outputs = await this.vocoderSession.run(feeds);
|
|
|
|
| 1048 |
const logits = outputs.logits.data;
|
| 1049 |
const vocabSize = logits.length;
|
| 1050 |
|
|
|
|
| 507 |
// On WebGPU: keep KV cache on GPU to avoid GPU→CPU→GPU roundtrips between steps
|
| 508 |
try {
|
| 509 |
const vocoderOpts = device === 'webgpu'
|
| 510 |
+
? { preferredOutputLocation: { new_keys: 'gpu-buffer', new_values: 'gpu-buffer', depth_slices: 'gpu-buffer' } }
|
| 511 |
: {};
|
| 512 |
this.vocoderSession = await loadOnnxWithExternalData('vocoder_depthformer', 95, quantConfig.vocoder, vocoderOpts);
|
| 513 |
} catch (e) {
|
|
|
|
| 964 |
// Pre-allocate data arrays
|
| 965 |
const stepIdxData = new BigInt64Array(1);
|
| 966 |
const prevTokenData = new BigInt64Array(1);
|
| 967 |
+
const seqlensKData = new Int32Array(1);
|
| 968 |
+
const totalSeqLenData = new Int32Array(1);
|
| 969 |
|
| 970 |
// Pre-allocate tensors that can be reused
|
| 971 |
this._vocoderCache = {
|
| 972 |
hiddenTensor: null, // Created per-call since hiddenState changes
|
| 973 |
stepIdxData,
|
| 974 |
prevTokenData,
|
| 975 |
+
seqlensKData,
|
| 976 |
+
totalSeqLenData,
|
| 977 |
// Pre-create reusable tensors (ONNX Runtime reads from the data array)
|
| 978 |
stepIdxTensor: new ort.Tensor('int64', stepIdxData, []),
|
| 979 |
prevTokenTensor: new ort.Tensor('int64', prevTokenData, [1]),
|
| 980 |
+
seqlensKTensor: new ort.Tensor('int32', seqlensKData, [1]),
|
| 981 |
+
totalSeqLenTensor: new ort.Tensor('int32', totalSeqLenData, []),
|
| 982 |
emptyKeysData: new Float32Array(0),
|
| 983 |
emptyValuesData: new Float32Array(0),
|
| 984 |
+
emptyDepthSlicesData: new Float32Array(8 * 1024), // zeros for step 0
|
| 985 |
// Reusable sampling arrays
|
| 986 |
scaledLogits: new Float32Array(2049), // codebook vocab size
|
| 987 |
indices: new Uint16Array(2049), // Use typed array for faster reset
|
|
|
|
| 1026 |
let pastKeys = new ort.Tensor(
|
| 1027 |
'float32',
|
| 1028 |
cache.emptyKeysData,
|
| 1029 |
+
[numLayers, 1, numKvHeads, 0, headDim]
|
| 1030 |
);
|
| 1031 |
let pastValues = new ort.Tensor(
|
| 1032 |
'float32',
|
| 1033 |
cache.emptyValuesData,
|
| 1034 |
+
[numLayers, 1, numKvHeads, 0, headDim]
|
| 1035 |
);
|
| 1036 |
|
| 1037 |
// Reuse step_idx and prev_token tensors by updating their data
|
| 1038 |
cache.stepIdxData[0] = 0n;
|
| 1039 |
cache.prevTokenData[0] = 0n;
|
| 1040 |
|
| 1041 |
+
// depth_slices_in: zeros at step 0 (model ignores it), then fed back from output
|
| 1042 |
+
let depthSlicesIn = new ort.Tensor('float32', cache.emptyDepthSlicesData, [1, 8, 1024]);
|
| 1043 |
+
|
| 1044 |
for (let i = 0; i < numCodebooks; i++) {
|
| 1045 |
// Update mutable tensor data (tensor objects reuse the underlying data arrays)
|
| 1046 |
cache.stepIdxData[0] = BigInt(i);
|
| 1047 |
cache.prevTokenData[0] = BigInt(prevToken);
|
| 1048 |
+
cache.seqlensKData[0] = i;
|
| 1049 |
+
cache.totalSeqLenData[0] = i + 1;
|
| 1050 |
|
| 1051 |
const feeds = {
|
| 1052 |
hidden_states: hiddenTensor,
|
| 1053 |
+
depth_slices_in: depthSlicesIn,
|
| 1054 |
step_idx: cache.stepIdxTensor,
|
| 1055 |
prev_token: cache.prevTokenTensor,
|
| 1056 |
past_keys: pastKeys,
|
| 1057 |
past_values: pastValues,
|
| 1058 |
+
seqlens_k: cache.seqlensKTensor,
|
| 1059 |
+
total_seq_len: cache.totalSeqLenTensor,
|
| 1060 |
};
|
| 1061 |
|
| 1062 |
const outputs = await this.vocoderSession.run(feeds);
|
| 1063 |
+
depthSlicesIn = outputs.depth_slices;
|
| 1064 |
const logits = outputs.logits.data;
|
| 1065 |
const vocabSize = logits.length;
|
| 1066 |
|