{ "numMelBins": 128, "sampleRate": 16000, "nFFT": 512, "hopLength": 160, "winLength": 400, "preEmphasis": 0.97, "encoderHidden": 1024, "encoderLayers": 24, "subsamplingFactor": 8, "decoderHidden": 640, "decoderLayers": 2, "vocabSize": 8192, "blankTokenId": 8192, "numDurationBins": 5, "durationBins": [ 0, 1, 2, 3, 4 ] }