Add files using upload-large-folder tool
Browse files- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/__init__.py +0 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/__pycache__/__init__.cpython-310.pyc +0 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/__pycache__/activation1d.cpython-310.pyc +0 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/__pycache__/load.cpython-310.pyc +0 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/activation1d.py +77 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/build/.ninja_log +4 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/build/build.ninja +34 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/compat.h +29 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/load.py +86 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/type_shim.h +92 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/__init__.py +6 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/__pycache__/__init__.cpython-310.pyc +0 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/__pycache__/act.cpython-310.pyc +0 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/__pycache__/filter.cpython-310.pyc +0 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/__pycache__/resample.cpython-310.pyc +0 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/act.py +30 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/filter.py +101 -0
- r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/resample.py +58 -0
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/__init__.py
ADDED
|
File without changes
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (230 Bytes). View file
|
|
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/__pycache__/activation1d.cpython-310.pyc
ADDED
|
Binary file (2.41 kB). View file
|
|
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/__pycache__/load.cpython-310.pyc
ADDED
|
Binary file (2.06 kB). View file
|
|
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/activation1d.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2024 NVIDIA CORPORATION.
|
| 2 |
+
# Licensed under the MIT license.
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
from ..torch.resample import UpSample1d, DownSample1d
|
| 7 |
+
|
| 8 |
+
# load fused CUDA kernel: this enables importing anti_alias_activation_cuda
|
| 9 |
+
from . import load
|
| 10 |
+
|
| 11 |
+
# Runs at import time: load.load() builds/loads the extension through
# torch.utils.cpp_extension.load, so importing this module requires nvcc.
anti_alias_activation_cuda = load.load()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class FusedAntiAliasActivation(torch.autograd.Function):
    """
    Forward-only autograd wrapper around the fused anti-alias activation CUDA kernel.

    Assumes filter size 12, replication padding on upsampling/downsampling, and logscale alpha/beta parameters as inputs.
    The hyperparameters are hard-coded in the kernel to maximize speed.
    NOTE: The fused kernel is incorrect for Activation1d with different hyperparameters.
    """

    @staticmethod
    def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta):
        """Run the fused upsample -> activation -> downsample kernel.

        Args:
            ctx: autograd context; unused because nothing is saved for backward.
            inputs: activations handed to the kernel (read there as [batches, channels, seq_len]).
            up_ftr: upsampling filter taps.
            down_ftr: downsampling filter taps.
            alpha: per-channel log-scale alpha (the kernel applies exp).
            beta: per-channel log-scale beta (the kernel applies exp).

        Returns:
            The tensor produced by ``anti_alias_activation_cuda.forward``.
        """
        return anti_alias_activation_cuda.forward(
            inputs, up_ftr, down_ftr, alpha, beta
        )

    @staticmethod
    def backward(ctx, output_grads):
        """Always raises: the fused kernel has no gradient implementation.

        Raises:
            NotImplementedError: unconditionally.
        """
        # The original had an unreachable `return output_grads, None, None`
        # after the raise; it also returned 3 gradients for 5 forward inputs.
        raise NotImplementedError(
            "FusedAntiAliasActivation is forward-only: backward is not "
            "implemented for the fused CUDA kernel"
        )
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class Activation1d(nn.Module):
    """Anti-aliased activation: upsample, apply ``activation``, then downsample.

    With ``fused=True`` the three steps are delegated to
    ``FusedAntiAliasActivation``; otherwise they run as separate torch modules.
    """

    def __init__(
        self,
        activation,
        up_ratio: int = 2,
        down_ratio: int = 2,
        up_kernel_size: int = 12,
        down_kernel_size: int = 12,
        fused: bool = True,
    ):
        """Store the activation and build the up/down resampling modules.

        Args:
            activation: module applied between resampling steps; the fused
                path reads its ``alpha``, ``alpha_logscale`` and (unless it is
                a ``Snake``) ``beta`` attributes.
            up_ratio: upsampling factor passed to ``UpSample1d``.
            down_ratio: downsampling factor passed to ``DownSample1d``.
            up_kernel_size: filter length passed to ``UpSample1d``.
            down_kernel_size: filter length passed to ``DownSample1d``.
            fused: whether ``forward`` uses the fused CUDA kernel.
        """
        super().__init__()
        self.up_ratio = up_ratio
        self.down_ratio = down_ratio
        self.act = activation
        self.upsample = UpSample1d(up_ratio, up_kernel_size)
        self.downsample = DownSample1d(down_ratio, down_kernel_size)

        # Whether to use fused CUDA kernel or not
        self.fused = fused

    def forward(self, x):
        """Apply the anti-aliased activation to ``x`` and return the result."""
        if not self.fused:
            # Reference path: three separate torch ops.
            return self.downsample(self.act(self.upsample(x)))

        act = self.act
        # Snake uses same params for alpha and beta; Snakebeta uses different ones.
        beta = act.alpha.data if act.__class__.__name__ == "Snake" else act.beta.data
        alpha = act.alpha.data
        if not act.alpha_logscale:
            # Exp baked into cuda kernel, cancel it out with a log
            alpha, beta = torch.log(alpha), torch.log(beta)

        return FusedAntiAliasActivation.apply(
            x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta
        )
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/anti_alias_activation.cpp
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* coding=utf-8
|
| 2 |
+
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
* you may not use this file except in compliance with the License.
|
| 6 |
+
* You may obtain a copy of the License at
|
| 7 |
+
*
|
| 8 |
+
* http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
*
|
| 10 |
+
* Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
* See the License for the specific language governing permissions and
|
| 14 |
+
* limitations under the License.
|
| 15 |
+
*/
|
| 16 |
+
|
| 17 |
+
#include <torch/extension.h>
|
| 18 |
+
|
| 19 |
+
extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta);
|
| 20 |
+
|
| 21 |
+
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
| 22 |
+
m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)");
|
| 23 |
+
}
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/anti_alias_activation_cuda.cu
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* coding=utf-8
|
| 2 |
+
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
* you may not use this file except in compliance with the License.
|
| 6 |
+
* You may obtain a copy of the License at
|
| 7 |
+
*
|
| 8 |
+
* http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
*
|
| 10 |
+
* Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
* See the License for the specific language governing permissions and
|
| 14 |
+
* limitations under the License.
|
| 15 |
+
*/
|
| 16 |
+
|
| 17 |
+
#include <ATen/ATen.h>
|
| 18 |
+
#include <cuda.h>
|
| 19 |
+
#include <cuda_runtime.h>
|
| 20 |
+
#include <cuda_fp16.h>
|
| 21 |
+
#include <cuda_profiler_api.h>
|
| 22 |
+
#include <ATen/cuda/CUDAContext.h>
|
| 23 |
+
#include <torch/extension.h>
|
| 24 |
+
#include "type_shim.h"
|
| 25 |
+
#include <assert.h>
|
| 26 |
+
#include <cfloat>
|
| 27 |
+
#include <limits>
|
| 28 |
+
#include <stdint.h>
|
| 29 |
+
#include <c10/macros/Macros.h>
|
| 30 |
+
|
| 31 |
+
namespace
|
| 32 |
+
{
|
| 33 |
+
// Hard-coded hyperparameters of the fused kernel.
// Step of the final store loop (one element per iteration).
constexpr int ELEMENTS_PER_LDG_STG = 1;
// Number of consecutive output samples produced by each thread.
constexpr int BUFFER_SIZE = 32;
// Number of taps in both the upsampling and the downsampling filter.
constexpr int FILTER_SIZE = 12;
constexpr int HALF_FILTER_SIZE = FILTER_SIZE / 2;
// 5 on each side, matching torch impl
constexpr int UPSAMPLE_REPLICATION_PAD = 5;
// Asymmetric downsampling padding, matching torch impl
constexpr int DOWNSAMPLE_REPLICATION_PAD_LEFT = 5;
constexpr int DOWNSAMPLE_REPLICATION_PAD_RIGHT = 6;
|
| 42 |
+
|
| 43 |
+
// Fused forward pass for one (batch, channel) row segment:
//   2x zero-stuffed upsampling FIR -> snake activation -> stride-2 downsampling FIR.
// Launch layout assumed by the index math below:
//   blockDim/threadIdx = (128, 1, 1)
//   gridDim/blockIdx   = (seq_blocks, channels, batches)
// Each thread produces BUFFER_SIZE consecutive output samples.
// alpha/beta are read per channel (blockIdx.y) and passed through expf, so callers supply them in log scale.
// NOTE: batch_size, channels and the acc_t template parameter are not used in this body.
template <typename input_t, typename output_t, typename acc_t>
__global__ void anti_alias_activation_forward(
    output_t *dst,
    const input_t *src,
    const input_t *up_ftr,
    const input_t *down_ftr,
    const input_t *alpha,
    const input_t *beta,
    int batch_size,
    int channels,
    int seq_len)
{
    constexpr int ELEMENTS_LEN = 2 * FILTER_SIZE + 2 * BUFFER_SIZE + 2 * UPSAMPLE_REPLICATION_PAD;
    constexpr int UPSAMPLED_LEN = 2 * BUFFER_SIZE + 2 * FILTER_SIZE;
    constexpr int INTERMEDIATES_LEN = UPSAMPLED_LEN + DOWNSAMPLE_REPLICATION_PAD_LEFT + DOWNSAMPLE_REPLICATION_PAD_RIGHT;

    // Per-thread copies of the up and downsample filters
    input_t up_filter[FILTER_SIZE];
    input_t down_filter[FILTER_SIZE];

    // Zero-initialised input window; only even slots are written below (zero-stuffing by 2),
    // with extra room reserved for the replication padding.
    input_t elements[ELEMENTS_LEN] = {0};
    input_t intermediates[INTERMEDIATES_LEN] = {0};

    // Output stores downsampled output before writing to dst
    output_t output[BUFFER_SIZE];

    int block_offset = (blockIdx.x * 128 * BUFFER_SIZE + seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
    int local_offset = threadIdx.x * BUFFER_SIZE;
    int seq_offset = blockIdx.x * 128 * BUFFER_SIZE + local_offset;

    // The intermediate (upsampled) signal has double the seq_len
    int intermediate_local_offset = threadIdx.x * BUFFER_SIZE * 2;
    int intermediate_seq_offset = blockIdx.x * 128 * BUFFER_SIZE * 2 + intermediate_local_offset;

    // Get the first/last value of this row for replication padding before moving the pointer
    const input_t *row_start = src + (seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
    input_t seq_left_most_value = row_start[0];
    input_t seq_right_most_value = row_start[seq_len - 1];

    // Move src and dst pointers to this thread's first sample
    src += block_offset + local_offset;
    dst += block_offset + local_offset;

    // Alpha and beta values for snake activations. Applies exp by default
    input_t alpha_val = expf(alpha[blockIdx.y]);
    input_t beta_val = expf(beta[blockIdx.y]);

    #pragma unroll
    for (int tap = 0; tap < FILTER_SIZE; tap += 1)
    {
        up_filter[tap] = up_ftr[tap];
        down_filter[tap] = down_ftr[tap];
    }

    // Load the input window and apply replication padding for upsampling, matching torch impl.
    // The three cases are mutually exclusive; samples outside all of them stay zero.
    #pragma unroll
    for (int i = -HALF_FILTER_SIZE; i < BUFFER_SIZE + HALF_FILTER_SIZE; i += 1)
    {
        const int sample_index = seq_offset + i; // index of the sample within the row
        const int slot = 2 * (HALF_FILTER_SIZE + i);
        if ((sample_index >= 0) && (sample_index < seq_len))
        {
            elements[slot] = 2 * src[i];
        }
        else if ((sample_index < 0) && (sample_index >= -UPSAMPLE_REPLICATION_PAD))
        {
            elements[slot] = 2 * seq_left_most_value;
        }
        else if ((sample_index >= seq_len) && (sample_index < seq_len + UPSAMPLE_REPLICATION_PAD))
        {
            elements[slot] = 2 * seq_right_most_value;
        }
    }

    // Apply upsampling strided convolution and write to intermediates. It reserves
    // DOWNSAMPLE_REPLICATION_PAD_LEFT for replication padding of the downsampling conv later.
    #pragma unroll
    for (int i = 0; i < UPSAMPLED_LEN; i += 1)
    {
        input_t acc = 0.0;
        const int upsampled_index = intermediate_seq_offset + i; // index within the upsampled row
        #pragma unroll
        for (int tap = 0; tap < FILTER_SIZE; tap += 1)
        {
            // The (i + tap) bound keeps the last tap of the last iteration from reading one element
            // past the end of `elements`; that iteration's result is not consumed by the
            // downsampling stage, which reads intermediates only up to index
            // 2 * (BUFFER_SIZE - 1) + (FILTER_SIZE - 1) + DOWNSAMPLE_REPLICATION_PAD_RIGHT.
            if (((upsampled_index + tap) >= 0) && ((i + tap) < ELEMENTS_LEN))
            {
                acc += up_filter[tap] * elements[i + tap];
            }
        }
        intermediates[i + DOWNSAMPLE_REPLICATION_PAD_LEFT] = acc;
    }

    // Apply activation function: x += 1 / (beta + eps) * sin(alpha * x)^2.
    // no_div_by_zero is a double, so the scale factor is evaluated in double precision.
    double no_div_by_zero = 0.000000001;
    const double inv_beta = 1.0 / (beta_val + no_div_by_zero);
    #pragma unroll
    for (int i = 0; i < UPSAMPLED_LEN; i += 1)
    {
        const int idx = i + DOWNSAMPLE_REPLICATION_PAD_LEFT;
        const float s = sinf(intermediates[idx] * alpha_val);
        intermediates[idx] += inv_beta * s * s;
    }

    // Apply replication padding before downsampling conv from intermediates
    #pragma unroll
    for (int i = 0; i < DOWNSAMPLE_REPLICATION_PAD_LEFT; i += 1)
    {
        intermediates[i] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT];
    }
    #pragma unroll
    for (int i = DOWNSAMPLE_REPLICATION_PAD_LEFT + UPSAMPLED_LEN; i < INTERMEDIATES_LEN; i += 1)
    {
        intermediates[i] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT + UPSAMPLED_LEN - 1];
    }

    // Apply downsample strided convolution (assuming stride=2) from intermediates
    #pragma unroll
    for (int i = 0; i < BUFFER_SIZE; i += 1)
    {
        input_t acc = 0.0;
        #pragma unroll
        for (int tap = 0; tap < FILTER_SIZE; tap += 1)
        {
            // Add constant DOWNSAMPLE_REPLICATION_PAD_RIGHT to match torch implementation
            acc += down_filter[tap] * intermediates[i * 2 + tap + DOWNSAMPLE_REPLICATION_PAD_RIGHT];
        }
        output[i] = acc;
    }

    // Write output to dst, skipping samples past the end of the row
    #pragma unroll
    for (int i = 0; i < BUFFER_SIZE; i += ELEMENTS_PER_LDG_STG)
    {
        const int sample_index = seq_offset + i;
        if (sample_index < seq_len)
        {
            dst[i] = output[i];
        }
    }
}
|
| 180 |
+
|
| 181 |
+
// Host-side launcher: picks the grid for a [batch_size, channels, seq_len] problem and
// enqueues anti_alias_activation_forward on the current CUDA stream.
template <typename input_t, typename output_t, typename acc_t>
void dispatch_anti_alias_activation_forward(
    output_t *dst,
    const input_t *src,
    const input_t *up_ftr,
    const input_t *down_ftr,
    const input_t *alpha,
    const input_t *beta,
    int batch_size,
    int channels,
    int seq_len)
{
    // Nothing to launch for an empty sequence.
    if (seq_len == 0)
    {
        return;
    }

    // Use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;
    // 4096 = 128 threads x 32 samples per thread (BUFFER_SIZE)
    constexpr int seq_len_per_block = 4096;
    // Ceil-divide so a trailing partial chunk still gets a block.
    const int blocks_per_seq_len = (seq_len + seq_len_per_block - 1) / seq_len_per_block;

    const dim3 blocks(blocks_per_seq_len, channels, batch_size);
    const dim3 threads(threads_per_block, 1, 1);

    anti_alias_activation_forward<input_t, output_t, acc_t>
        <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
            dst, src, up_ftr, down_ftr, alpha, beta, batch_size, channels, seq_len);
}
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
// Python-facing entry point of the fused anti-alias activation.
//
// input:       3d CUDA tensor with dimensions [batches, channels, seq_len]
// up_filter:   upsampling filter, at least FILTER_SIZE elements
// down_filter: downsampling filter, at least FILTER_SIZE elements
// alpha, beta: per-channel log-scale activation parameters, at least `channels` elements
//
// Returns a new tensor with the same shape, dtype and device as `input`.
// Raises (via TORCH_CHECK) when the arguments do not satisfy what the kernel assumes,
// instead of letting it read reinterpreted or out-of-range memory.
extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta)
{
    TORCH_CHECK(input.dim() == 3,
                "anti_alias_activation: expected a 3-D input [batches, channels, seq_len], got ", input.dim(), "-D");
    TORCH_CHECK(input.is_cuda() && up_filter.is_cuda() && down_filter.is_cuda() && alpha.is_cuda() && beta.is_cuda(),
                "anti_alias_activation: input, filters, alpha and beta must all be CUDA tensors");
    // The kernel reinterprets every pointer as the input's scalar type.
    TORCH_CHECK(up_filter.scalar_type() == input.scalar_type() &&
                    down_filter.scalar_type() == input.scalar_type() &&
                    alpha.scalar_type() == input.scalar_type() &&
                    beta.scalar_type() == input.scalar_type(),
                "anti_alias_activation: filters, alpha and beta must have the same dtype as input");

    // Input is a 3d tensor with dimensions [batches, channels, seq_len]
    const int batches = input.size(0);
    const int channels = input.size(1);
    const int seq_len = input.size(2);

    // The kernel reads FILTER_SIZE taps from each filter and one alpha/beta per channel.
    TORCH_CHECK(up_filter.numel() >= FILTER_SIZE && down_filter.numel() >= FILTER_SIZE,
                "anti_alias_activation: filters must contain at least ", FILTER_SIZE, " elements");
    TORCH_CHECK(alpha.numel() >= channels && beta.numel() >= channels,
                "anti_alias_activation: alpha and beta must contain at least one element per channel (", channels, ")");

    // Output
    auto act_options = input.options().requires_grad(false);

    torch::Tensor anti_alias_activation_results =
        torch::empty({batches, channels, seq_len}, act_options);

    // A zero-sized batch/channel dimension would produce an invalid (zero-sized) launch grid.
    if (anti_alias_activation_results.numel() == 0)
    {
        return anti_alias_activation_results;
    }

    // The kernel indexes raw pointers assuming a dense row-major layout;
    // contiguous() is a no-op for tensors that already are.
    const torch::Tensor input_c = input.contiguous();
    const torch::Tensor up_filter_c = up_filter.contiguous();
    const torch::Tensor down_filter_c = down_filter.contiguous();
    const torch::Tensor alpha_c = alpha.contiguous();
    const torch::Tensor beta_c = beta.contiguous();

    DISPATCH_FLOAT_HALF_AND_BFLOAT(
        input.scalar_type(),
        "dispatch anti alias activation_forward",
        dispatch_anti_alias_activation_forward<scalar_t, scalar_t, float>(
            anti_alias_activation_results.data_ptr<scalar_t>(),
            input_c.data_ptr<scalar_t>(),
            up_filter_c.data_ptr<scalar_t>(),
            down_filter_c.data_ptr<scalar_t>(),
            alpha_c.data_ptr<scalar_t>(),
            beta_c.data_ptr<scalar_t>(),
            batches,
            channels,
            seq_len););
    return anti_alias_activation_results;
}
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/build/.ninja_log
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ninja log v5
|
| 2 |
+
1 18844 1746082901714339268 anti_alias_activation.o ccf9e0a9270893a3
|
| 3 |
+
1 81958 1746082964731126970 anti_alias_activation_cuda.cuda.o f0202ef288c19af8
|
| 4 |
+
81968 82271 1746082965135131828 anti_alias_activation_cuda.so b69cef9a3c6cbf35
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/build/build.ninja
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ninja_required_version = 1.3
|
| 2 |
+
cxx = c++
|
| 3 |
+
nvcc = /usr/local/cuda/bin/nvcc
|
| 4 |
+
|
| 5 |
+
cflags = -DTORCH_EXTENSION_NAME=anti_alias_activation_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/chenyifu/miniconda3/envs/kimi/lib/python3.10/site-packages/torch/include -isystem /home/chenyifu/miniconda3/envs/kimi/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/chenyifu/miniconda3/envs/kimi/lib/python3.10/site-packages/torch/include/TH -isystem /home/chenyifu/miniconda3/envs/kimi/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/chenyifu/miniconda3/envs/kimi/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -O3
|
| 6 |
+
post_cflags =
|
| 7 |
+
cuda_cflags = -DTORCH_EXTENSION_NAME=anti_alias_activation_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/chenyifu/miniconda3/envs/kimi/lib/python3.10/site-packages/torch/include -isystem /home/chenyifu/miniconda3/envs/kimi/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/chenyifu/miniconda3/envs/kimi/lib/python3.10/site-packages/torch/include/TH -isystem /home/chenyifu/miniconda3/envs/kimi/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/chenyifu/miniconda3/envs/kimi/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options '-fPIC' -O3 -gencode arch=compute_70,code=sm_70 --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda -gencode arch=compute_80,code=sm_80 -std=c++17
|
| 8 |
+
cuda_post_cflags =
|
| 9 |
+
cuda_dlink_post_cflags =
|
| 10 |
+
ldflags = -shared -L/home/chenyifu/miniconda3/envs/kimi/lib/python3.10/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart
|
| 11 |
+
|
| 12 |
+
rule compile
|
| 13 |
+
command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags
|
| 14 |
+
depfile = $out.d
|
| 15 |
+
deps = gcc
|
| 16 |
+
|
| 17 |
+
rule cuda_compile
|
| 18 |
+
depfile = $out.d
|
| 19 |
+
deps = gcc
|
| 20 |
+
command = $nvcc --generate-dependencies-with-compile --dependency-output $out.d $cuda_cflags -c $in -o $out $cuda_post_cflags
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
rule link
|
| 25 |
+
command = $cxx $in $ldflags -o $out
|
| 26 |
+
|
| 27 |
+
build anti_alias_activation.o: compile /home/chenyifu/audio-r1/r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/anti_alias_activation.cpp
|
| 28 |
+
build anti_alias_activation_cuda.cuda.o: cuda_compile /home/chenyifu/audio-r1/r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/anti_alias_activation_cuda.cu
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
build anti_alias_activation_cuda.so: link anti_alias_activation.o anti_alias_activation_cuda.cuda.o
|
| 33 |
+
|
| 34 |
+
default anti_alias_activation_cuda.so
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/compat.h
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* coding=utf-8
|
| 2 |
+
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
* you may not use this file except in compliance with the License.
|
| 6 |
+
* You may obtain a copy of the License at
|
| 7 |
+
*
|
| 8 |
+
* http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
*
|
| 10 |
+
* Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
* See the License for the specific language governing permissions and
|
| 14 |
+
* limitations under the License.
|
| 15 |
+
*/
|
| 16 |
+
|
| 17 |
+
/*This code is copied from NVIDIA apex:
|
| 18 |
+
* https://github.com/NVIDIA/apex
|
| 19 |
+
* with minor changes. */
|
| 20 |
+
|
| 21 |
+
// Fallback: when TORCH_CHECK is not already defined, alias it to AT_CHECK.
#if !defined(TORCH_CHECK)
#define TORCH_CHECK AT_CHECK
#endif

// DATA_PTR expands to the accessor name to use: `data_ptr` when
// VERSION_GE_1_3 is defined, `data` otherwise.
#if defined(VERSION_GE_1_3)
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/load.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2024 NVIDIA CORPORATION.
|
| 2 |
+
# Licensed under the MIT license.
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
import pathlib
|
| 6 |
+
import subprocess
|
| 7 |
+
|
| 8 |
+
from torch.utils import cpp_extension
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
Setting this param to a list has a problem of generating different compilation commands (with different order of architectures) and leading to recompilation of fused kernels.
|
| 12 |
+
Set it to an empty string to avoid recompilation and assign arch flags explicitly in extra_cuda_cflags below
|
| 13 |
+
"""
|
| 14 |
+
os.environ["TORCH_CUDA_ARCH_LIST"] = ""
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def load():
    """Build and load the fused anti-alias activation extension.

    Compiles ``anti_alias_activation.cpp`` and ``anti_alias_activation_cuda.cu``
    (located next to this file) through ``torch.utils.cpp_extension.load``,
    using a ``build`` directory beside the sources.

    Returns:
        The module object returned by ``cpp_extension.load``.
    """
    # Only add an sm_80 target when the local toolkit is CUDA 11 or newer.
    _, cuda_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
    sm80_flags = (
        ["-gencode", "arch=compute_80,code=sm_80"] if int(cuda_major) >= 11 else []
    )

    # Sources live beside this file; build output goes to ./build.
    source_dir = pathlib.Path(__file__).parent.absolute()
    build_dir = source_dir / "build"
    _create_build_dir(build_dir)

    # Flag order is kept stable so the generated compile command does not change.
    nvcc_flags = [
        "-O3",
        "-gencode",
        "arch=compute_70,code=sm_70",
        "--use_fast_math",
        "-U__CUDA_NO_HALF_OPERATORS__",
        "-U__CUDA_NO_HALF_CONVERSIONS__",
        "--expt-relaxed-constexpr",
        "--expt-extended-lambda",
    ] + sm80_flags

    return cpp_extension.load(
        name="anti_alias_activation_cuda",
        sources=[
            source_dir / "anti_alias_activation.cpp",
            source_dir / "anti_alias_activation_cuda.cu",
        ],
        build_directory=build_dir,
        extra_cflags=["-O3"],
        extra_cuda_cflags=nvcc_flags,
        verbose=True,
    )
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _get_cuda_bare_metal_version(cuda_dir):
    """Query ``nvcc -V`` under ``cuda_dir`` and parse the toolkit release.

    Args:
        cuda_dir: root of the CUDA installation (the directory that contains
            ``bin/nvcc``), as a string or path-like object.

    Returns:
        A ``(raw_output, major, minor)`` tuple where ``raw_output`` is the
        full text printed by ``nvcc -V`` and ``major``/``minor`` are the
        release components as strings (e.g. ``"12"`` and ``"10"``).

    Raises:
        RuntimeError: if ``cuda_dir`` is ``None`` (no CUDA toolkit located).
        subprocess.CalledProcessError / OSError: if ``nvcc`` cannot be run.
        ValueError: if the output has no ``release`` token.
    """
    if cuda_dir is None:
        raise RuntimeError(
            "CUDA toolkit not found: CUDA_HOME is not set, so nvcc cannot be "
            "located to build the fused anti-alias activation kernel"
        )

    nvcc_path = os.path.join(cuda_dir, "bin", "nvcc")
    raw_output = subprocess.check_output([nvcc_path, "-V"], universal_newlines=True)

    # The version follows the word "release", e.g. "release 11.8, V11.8.89".
    output = raw_output.split()
    release = output[output.index("release") + 1].rstrip(",").split(".")
    bare_metal_major = release[0]
    # Keep the whole minor component; taking only its first character would
    # turn "12.10" into minor "1".
    bare_metal_minor = release[1]

    return raw_output, bare_metal_major, bare_metal_minor
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _create_build_dir(buildpath):
    """Create ``buildpath`` if needed; report (but do not raise) on failure.

    Best effort: an already existing directory is silently accepted, and any
    other failure of ``os.mkdir`` only prints a message.
    """
    try:
        os.mkdir(buildpath)
    except OSError:
        if os.path.isdir(buildpath):
            # Already there: nothing to do.
            return
        print(f"Creation of the build directory {buildpath} failed")
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/type_shim.h
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* coding=utf-8
|
| 2 |
+
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
* you may not use this file except in compliance with the License.
|
| 6 |
+
* You may obtain a copy of the License at
|
| 7 |
+
*
|
| 8 |
+
* http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
*
|
| 10 |
+
* Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
* See the License for the specific language governing permissions and
|
| 14 |
+
* limitations under the License.
|
| 15 |
+
*/
|
| 16 |
+
|
| 17 |
+
#include <ATen/ATen.h>
|
| 18 |
+
#include "compat.h"
|
| 19 |
+
|
| 20 |
+
/*
 * DISPATCH_FLOAT_HALF_AND_BFLOAT(TYPE, NAME, ...)
 *
 * Expands to a `switch` over TYPE (compared against at::ScalarType values).
 * For Float, Half and BFloat16 the alias `scalar_t` is bound to float,
 * at::Half or at::BFloat16 respectively, and the statement(s) passed as the
 * variadic arguments are executed inside that case's scope.
 * Any other TYPE falls through to `default`, which calls AT_ERROR with the
 * stringified NAME and toString(TYPE).
 *
 * Note: the expansion is a bare `switch` statement, and TYPE is evaluated in
 * the switch head and again in the error branch.
 */
#define DISPATCH_FLOAT_HALF_AND_BFLOAT(TYPE, NAME, ...) \
    switch (TYPE) { \
    case at::ScalarType::Float: { \
        using scalar_t = float; \
        __VA_ARGS__; \
        break; \
    } \
    case at::ScalarType::Half: { \
        using scalar_t = at::Half; \
        __VA_ARGS__; \
        break; \
    } \
    case at::ScalarType::BFloat16: { \
        using scalar_t = at::BFloat16; \
        __VA_ARGS__; \
        break; \
    } \
    default: \
        AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
    }
|
| 44 |
+
|
| 45 |
+
/*
 * DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...)
 *
 * Expands to a `switch` over TYPEIN that binds two aliases, `scalar_t_in`
 * and `scalar_t_out`, before executing the variadic statement(s):
 *
 *   - TYPEIN == Float:    scalar_t_in is float and a nested switch over
 *                         TYPEOUT selects scalar_t_out among float, at::Half
 *                         and at::BFloat16; any other TYPEOUT calls AT_ERROR.
 *   - TYPEIN == Half:     both aliases are at::Half (TYPEOUT is not consulted).
 *   - TYPEIN == BFloat16: both aliases are at::BFloat16 (TYPEOUT is not
 *                         consulted).
 *   - anything else:      AT_ERROR with the stringified NAME and
 *                         toString(TYPEIN).
 */
#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
    switch (TYPEIN) { \
    case at::ScalarType::Float: { \
        using scalar_t_in = float; \
        switch (TYPEOUT) { \
        case at::ScalarType::Float: { \
            using scalar_t_out = float; \
            __VA_ARGS__; \
            break; \
        } \
        case at::ScalarType::Half: { \
            using scalar_t_out = at::Half; \
            __VA_ARGS__; \
            break; \
        } \
        case at::ScalarType::BFloat16: { \
            using scalar_t_out = at::BFloat16; \
            __VA_ARGS__; \
            break; \
        } \
        default: \
            AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
        } \
        break; \
    } \
    case at::ScalarType::Half: { \
        using scalar_t_in = at::Half; \
        using scalar_t_out = at::Half; \
        __VA_ARGS__; \
        break; \
    } \
    case at::ScalarType::BFloat16: { \
        using scalar_t_in = at::BFloat16; \
        using scalar_t_out = at::BFloat16; \
        __VA_ARGS__; \
        break; \
    } \
    default: \
        AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \
    }
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
|
| 2 |
+
# LICENSE is in incl_licenses directory.
|
| 3 |
+
|
| 4 |
+
from .filter import *
|
| 5 |
+
from .resample import *
|
| 6 |
+
from .act import *
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (289 Bytes). View file
|
|
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/__pycache__/act.cpython-310.pyc
ADDED
|
Binary file (1.12 kB). View file
|
|
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/__pycache__/filter.cpython-310.pyc
ADDED
|
Binary file (2.83 kB). View file
|
|
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/__pycache__/resample.cpython-310.pyc
ADDED
|
Binary file (1.98 kB). View file
|
|
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/act.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
|
| 2 |
+
# LICENSE is in incl_licenses directory.
|
| 3 |
+
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
from .resample import UpSample1d, DownSample1d
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Activation1d(nn.Module):
    """Apply an activation between an upsampling and a downsampling stage.

    Args:
        activation: callable applied to the upsampled signal.
        up_ratio: ratio handed to ``UpSample1d``.
        down_ratio: ratio handed to ``DownSample1d``.
        up_kernel_size: kernel size handed to ``UpSample1d``.
        down_kernel_size: kernel size handed to ``DownSample1d``.
    """

    def __init__(
        self,
        activation,
        up_ratio: int = 2,
        down_ratio: int = 2,
        up_kernel_size: int = 12,
        down_kernel_size: int = 12,
    ):
        super().__init__()
        self.up_ratio, self.down_ratio = up_ratio, down_ratio
        # Keep the original assignment order: act, upsample, downsample.
        self.act = activation
        self.upsample = UpSample1d(up_ratio, up_kernel_size)
        self.downsample = DownSample1d(down_ratio, down_kernel_size)

    # x: [B,C,T]
    def forward(self, x):
        """Return ``downsample(act(upsample(x)))``."""
        return self.downsample(self.act(self.upsample(x)))
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/filter.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
|
| 2 |
+
# LICENSE is in incl_licenses directory.
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
import math
|
| 8 |
+
|
| 9 |
+
if hasattr(torch, "sinc"):
    # Prefer the native implementation when this torch build provides one.
    sinc = torch.sinc
else:
    # This code is adopted from adefossez's julius.core.sinc under the MIT License
    # https://adefossez.github.io/julius/julius/core.html
    # LICENSE is in incl_licenses directory.
    def sinc(x: torch.Tensor):
        """
        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
        __Warning__: Different to julius.sinc, the input is multiplied by `pi`!

        Returns 1 where ``x == 0``.
        """
        is_zero = x == 0
        # Substitute a harmless denominator at x == 0 so the branch that
        # torch.where discards never evaluates 0/0 (which yields NaN and
        # poisons gradients even though the value itself is not selected).
        safe_x = torch.where(is_zero, torch.ones_like(x), x)
        return torch.where(
            is_zero,
            torch.ones_like(x),
            torch.sin(math.pi * safe_x) / math.pi / safe_x,
        )
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
|
| 28 |
+
# https://adefossez.github.io/julius/julius/lowpass.html
|
| 29 |
+
# LICENSE is in incl_licenses directory.
|
| 30 |
+
def kaiser_sinc_filter1d(
    cutoff, half_width, kernel_size
):  # return filter [1,1,kernel_size]
    """Build a Kaiser-windowed sinc filter of shape ``[1, 1, kernel_size]``.

    Args:
        cutoff: cutoff value; the sinc is evaluated at ``2 * cutoff * time``.
        half_width: used to derive the Kaiser window ``beta``
            (``delta_f = 4 * half_width``).
        kernel_size: number of taps; both even and odd sizes are handled.

    Returns:
        A tensor viewed as ``[1, 1, kernel_size]``. For a non-zero cutoff the
        taps are normalized to sum to 1. For ``cutoff == 0`` an all-zero
        filter in the default float dtype is returned.
    """
    if cutoff == 0:
        # A zero filter cannot be normalized (its sum is 0), so return it
        # directly instead of running into 0/0 or an unbound result.
        return torch.zeros(1, 1, kernel_size)

    even = kernel_size % 2 == 0
    half_size = kernel_size // 2

    # For kaiser window
    delta_f = 4 * half_width
    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
    if A > 50.0:
        beta = 0.1102 * (A - 8.7)
    elif A >= 21.0:
        beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
    else:
        beta = 0.0
    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)

    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
    if even:
        # Even kernels are centred between two samples, hence the +0.5 offset.
        time = torch.arange(-half_size, half_size) + 0.5
    else:
        time = torch.arange(kernel_size) - half_size

    filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
    # Normalize filter to have sum = 1, otherwise we will have a small leakage
    # of the constant component in the input signal.
    filter_ /= filter_.sum()

    return filter_.view(1, 1, kernel_size)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class LowPassFilter1d(nn.Module):
    """Depthwise 1-D convolution with a fixed Kaiser-windowed sinc kernel.

    The kernel is produced once by ``kaiser_sinc_filter1d`` and stored as the
    buffer ``filter``; ``forward`` applies it to every channel independently.
    """

    def __init__(
        self,
        cutoff=0.5,
        half_width=0.6,
        stride: int = 1,
        padding: bool = True,
        padding_mode: str = "replicate",
        kernel_size: int = 12,
    ):
        """
        kernel_size should be even number for stylegan3 setup, in this implementation, odd number is also possible.

        Raises:
            ValueError: if ``cutoff`` is negative or greater than 0.5.
        """
        super().__init__()
        if cutoff < -0.0:
            raise ValueError("Minimum cutoff must be larger than zero.")
        if cutoff > 0.5:
            raise ValueError("A cutoff above 0.5 does not make sense.")

        is_even = kernel_size % 2 == 0
        half = kernel_size // 2

        self.kernel_size = kernel_size
        self.even = is_even
        # Even kernels pad one sample less on the left than on the right.
        self.pad_left = half - int(is_even)
        self.pad_right = half
        self.stride = stride
        self.padding = padding
        self.padding_mode = padding_mode
        self.register_buffer(
            "filter", kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
        )

    # Input [B, C, T]
    def forward(self, x):
        """Optionally pad ``x`` and convolve each channel with the stored kernel."""
        _batch, channels, _length = x.shape

        if self.padding:
            x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)

        # One copy of the kernel per channel; groups=channels keeps channels separate.
        per_channel_kernel = self.filter.expand(channels, -1, -1)
        return F.conv1d(x, per_channel_kernel, stride=self.stride, groups=channels)
|
r1-a/response_generation/Kimi-Audio/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/resample.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
|
| 2 |
+
# LICENSE is in incl_licenses directory.
|
| 3 |
+
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
from torch.nn import functional as F
|
| 6 |
+
from .filter import LowPassFilter1d
|
| 7 |
+
from .filter import kaiser_sinc_filter1d
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class UpSample1d(nn.Module):
    """Upsample along the last axis with a transposed depthwise convolution.

    Args:
        ratio: stride of the transposed convolution and output gain.
        kernel_size: number of filter taps; when ``None`` it defaults to
            ``int(6 * ratio // 2) * 2``.
    """

    def __init__(self, ratio=2, kernel_size=None):
        super().__init__()
        if kernel_size is None:
            kernel_size = int(6 * ratio // 2) * 2

        self.ratio = ratio
        self.kernel_size = kernel_size
        self.stride = ratio
        self.pad = kernel_size // ratio - 1

        # Samples trimmed from each side of the transposed-convolution output.
        trim_base = self.pad * self.stride
        surplus = kernel_size - self.stride
        self.pad_left = trim_base + surplus // 2
        self.pad_right = trim_base + (surplus + 1) // 2

        self.register_buffer(
            "filter",
            kaiser_sinc_filter1d(
                cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=kernel_size
            ),
        )

    # x: [B, C, T]
    def forward(self, x):
        """Replicate-pad, transposed-convolve per channel, scale by ``ratio``, trim."""
        _batch, channels, _length = x.shape

        padded = F.pad(x, (self.pad, self.pad), mode="replicate")
        kernel = self.filter.expand(channels, -1, -1)
        upsampled = self.ratio * F.conv_transpose1d(
            padded, kernel, stride=self.stride, groups=channels
        )
        return upsampled[..., self.pad_left : -self.pad_right]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class DownSample1d(nn.Module):
    """Downsample by delegating to a strided ``LowPassFilter1d``.

    Args:
        ratio: stride handed to the low-pass filter; also scales its cutoff
            (``0.5 / ratio``) and half width (``0.6 / ratio``).
        kernel_size: number of filter taps; when ``None`` it defaults to
            ``int(6 * ratio // 2) * 2``.
    """

    def __init__(self, ratio=2, kernel_size=None):
        super().__init__()
        if kernel_size is None:
            kernel_size = int(6 * ratio // 2) * 2

        self.ratio = ratio
        self.kernel_size = kernel_size
        self.lowpass = LowPassFilter1d(
            cutoff=0.5 / ratio,
            half_width=0.6 / ratio,
            stride=ratio,
            kernel_size=kernel_size,
        )

    def forward(self, x):
        """Return the output of the strided low-pass filter applied to ``x``."""
        return self.lowpass(x)
|