CLAP (code, models, paper)
This view is limited to 50 files because it contains too many changes.
- .gitattributes +3 -0
- Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation.pdf +3 -0
- code/CLAP.zip +3 -0
- models/onnx/ailia-models/LAION-CLAP/CLAP_audio_LAION-Audio-630K_with_fusion.onnx +3 -0
- models/onnx/ailia-models/LAION-CLAP/CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt +0 -0
- models/onnx/ailia-models/LAION-CLAP/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx +3 -0
- models/onnx/ailia-models/LAION-CLAP/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt +173 -0
- models/onnx/ailia-models/LAION-CLAP/CLAP_text_text_branch_RobertaModel_roberta-base.onnx +3 -0
- models/onnx/ailia-models/LAION-CLAP/CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt +0 -0
- models/onnx/ailia-models/LAION-CLAP/code/LICENSE +121 -0
- models/onnx/ailia-models/LAION-CLAP/code/README.md +64 -0
- models/onnx/ailia-models/LAION-CLAP/code/clap.py +203 -0
- models/onnx/ailia-models/LAION-CLAP/code/clap_utils.py +170 -0
- models/onnx/ailia-models/LAION-CLAP/code/input.wav +3 -0
- models/onnx/ailia-models/LAION-CLAP/code/tokenizer/merges.txt +0 -0
- models/onnx/ailia-models/LAION-CLAP/code/tokenizer/vocab.json +0 -0
- models/onnx/ailia-models/LAION-CLAP/source.txt +10 -0
- models/onnx/ailia-models/Microsoft-CLAP/code/LICENSE +21 -0
- models/onnx/ailia-models/Microsoft-CLAP/code/README.md +72 -0
- models/onnx/ailia-models/Microsoft-CLAP/code/captions.txt +6 -0
- models/onnx/ailia-models/Microsoft-CLAP/code/input.wav +3 -0
- models/onnx/ailia-models/Microsoft-CLAP/code/msclap.py +270 -0
- models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_bert/tokenizer_config.json +1 -0
- models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_bert/vocab.txt +0 -0
- models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/merges.txt +0 -0
- models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/tokenizer_config.json +1 -0
- models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/vocab.json +0 -0
- models/onnx/ailia-models/Microsoft-CLAP/source.txt +1 -0
- models/onnx/clap-htsat-unfused (Xenova)/.gitattributes +35 -0
- models/onnx/clap-htsat-unfused (Xenova)/README.md +79 -0
- models/onnx/clap-htsat-unfused (Xenova)/config.json +27 -0
- models/onnx/clap-htsat-unfused (Xenova)/merges.txt +0 -0
- models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model.onnx +3 -0
- models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model_fp16.onnx +3 -0
- models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model_quantized.onnx +3 -0
- models/onnx/clap-htsat-unfused (Xenova)/onnx/model.onnx +3 -0
- models/onnx/clap-htsat-unfused (Xenova)/onnx/model_fp16.onnx +3 -0
- models/onnx/clap-htsat-unfused (Xenova)/onnx/model_quantized.onnx +3 -0
- models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model.onnx +3 -0
- models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model_fp16.onnx +3 -0
- models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model_quantized.onnx +3 -0
- models/onnx/clap-htsat-unfused (Xenova)/preprocessor_config.json +22 -0
- models/onnx/clap-htsat-unfused (Xenova)/quantize_config.json +122 -0
- models/onnx/clap-htsat-unfused (Xenova)/source.txt +1 -0
- models/onnx/clap-htsat-unfused (Xenova)/special_tokens_map.json +15 -0
- models/onnx/clap-htsat-unfused (Xenova)/tokenizer.json +0 -0
- models/onnx/clap-htsat-unfused (Xenova)/tokenizer_config.json +63 -0
- models/onnx/clap-htsat-unfused (Xenova)/vocab.json +0 -0
- models/onnx/larger_clap_general (Xenova)/.gitattributes +35 -0
- models/onnx/larger_clap_general (Xenova)/README.md +80 -0
.gitattributes CHANGED

```diff
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Large-scale[[:space:]]Contrastive[[:space:]]Language-Audio[[:space:]]Pretraining[[:space:]]with[[:space:]]Feature[[:space:]]Fusion[[:space:]]and[[:space:]]Keyword-to-Caption[[:space:]]Augmentation.pdf filter=lfs diff=lfs merge=lfs -text
+models/onnx/ailia-models/LAION-CLAP/code/input.wav filter=lfs diff=lfs merge=lfs -text
+models/onnx/ailia-models/Microsoft-CLAP/code/input.wav filter=lfs diff=lfs merge=lfs -text
```
Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation.pdf ADDED

```
version https://git-lfs.github.com/spec/v1
oid sha256:c223105503d6f5c173479b84bcc6648c0df9f12a8493492617616c75049e7d31
size 695271
```
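Every binary in this commit is checked in as a Git LFS pointer like the one above: three lines giving the spec version, the SHA-256 of the real file, and its size in bytes. A minimal parser sketch (a hypothetical helper, not part of this repo):

```python
def parse_lfs_pointer(text: str) -> dict:
    """Split a Git LFS pointer file into its key/value fields."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:c223105503d6f5c173479b84bcc6648c0df9f12a8493492617616c75049e7d31
size 695271"""

assert parse_lfs_pointer(pointer)["size"] == "695271"
```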
code/CLAP.zip ADDED

```
version https://git-lfs.github.com/spec/v1
oid sha256:0e5ae04cc75b3acc7568a968365c28eebaba7e148a77a801e48a0e480595b30d
size 11947642
```
models/onnx/ailia-models/LAION-CLAP/CLAP_audio_LAION-Audio-630K_with_fusion.onnx ADDED

```
version https://git-lfs.github.com/spec/v1
oid sha256:9901a76d952d2a9be4d8e0a1e790bde6a86867f382c6aa991c4c6fcdfde0afeb
size 117413819
```
models/onnx/ailia-models/LAION-CLAP/CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt ADDED

The diff for this file is too large to render.
models/onnx/ailia-models/LAION-CLAP/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx ADDED

```
version https://git-lfs.github.com/spec/v1
oid sha256:8c88525ca12a6e42b62ede1a086340411abd65bb7305dc963301eb1647825150
size 2626946
```
models/onnx/ailia-models/LAION-CLAP/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt ADDED

```
ir_version: 6
producer_name: "pytorch"
producer_version: "1.13.0"
model_version: 0
graph {
  name: "torch_jit"
  node {
    input: "x"
    input: "text_projection.0.weight"
    input: "text_projection.0.bias"
    output: "/text_projection/text_projection.0/Gemm_output_0"
    name: "/text_projection/text_projection.0/Gemm"
    op_type: "Gemm"
    attribute {
      name: "alpha"
      f: 1.0
      type: FLOAT
    }
    attribute {
      name: "beta"
      f: 1.0
      type: FLOAT
    }
    attribute {
      name: "transB"
      i: 1
      type: INT
    }
  }
  node {
    input: "/text_projection/text_projection.0/Gemm_output_0"
    output: "/text_projection/text_projection.1/Relu_output_0"
    name: "/text_projection/text_projection.1/Relu"
    op_type: "Relu"
  }
  node {
    input: "/text_projection/text_projection.1/Relu_output_0"
    input: "text_projection.2.weight"
    input: "text_projection.2.bias"
    output: "/text_projection/text_projection.2/Gemm_output_0"
    name: "/text_projection/text_projection.2/Gemm"
    op_type: "Gemm"
    attribute {
      name: "alpha"
      f: 1.0
      type: FLOAT
    }
    attribute {
      name: "beta"
      f: 1.0
      type: FLOAT
    }
    attribute {
      name: "transB"
      i: 1
      type: INT
    }
  }
  node {
    input: "/text_projection/text_projection.2/Gemm_output_0"
    output: "/ReduceL2_output_0"
    name: "/ReduceL2"
    op_type: "ReduceL2"
    attribute {
      name: "axes"
      ints: -1
      type: INTS
    }
    attribute {
      name: "keepdims"
      i: 1
      type: INT
    }
  }
  node {
    output: "/Constant_output_0"
    name: "/Constant"
    op_type: "Constant"
    attribute {
      name: "value"
      t {
        data_type: 1
        raw_data: "\314\274\214+"
      }
      type: TENSOR
    }
  }
  node {
    input: "/ReduceL2_output_0"
    input: "/Constant_output_0"
    input: ""
    output: "/Clip_output_0"
    name: "/Clip"
    op_type: "Clip"
  }
  node {
    input: "/text_projection/text_projection.2/Gemm_output_0"
    output: "/Shape_output_0"
    name: "/Shape"
    op_type: "Shape"
  }
  node {
    input: "/Clip_output_0"
    input: "/Shape_output_0"
    output: "/Expand_output_0"
    name: "/Expand"
    op_type: "Expand"
  }
  node {
    input: "/text_projection/text_projection.2/Gemm_output_0"
    input: "/Expand_output_0"
    output: "text_embed"
    name: "/Div"
    op_type: "Div"
  }
  initializer {
    dims: 512
    dims: 768
    data_type: 1
    name: "text_projection.0.weight"
  }
  initializer {
    dims: 512
    data_type: 1
    name: "text_projection.0.bias"
  }
  initializer {
    dims: 512
    dims: 512
    data_type: 1
    name: "text_projection.2.weight"
  }
  initializer {
    dims: 512
    data_type: 1
    name: "text_projection.2.bias"
  }
  input {
    name: "x"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_param: "batch_size"
          }
          dim {
            dim_value: 768
          }
        }
      }
    }
  }
  output {
    name: "text_embed"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_param: "Divtext_embed_dim_0"
          }
          dim {
            dim_param: "Divtext_embed_dim_1"
          }
        }
      }
    }
  }
}
opset_import {
  version: 11
}
```
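This graph is a two-layer MLP projection head followed by L2 normalization: Gemm (768 to 512), ReLU, Gemm (512 to 512), then ReduceL2/Clip/Expand/Div, where the Constant's `raw_data: "\314\274\214+"` decodes to a float of about 1e-12, the clamp that PyTorch's `F.normalize` applies. A minimal PyTorch sketch that would export to a graph of this shape (a reconstruction from the prototxt, not the authors' original source):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# text_projection.0 / .1 / .2 match the initializer names in the graph.
text_projection = nn.Sequential(
    nn.Linear(768, 512),
    nn.ReLU(),
    nn.Linear(512, 512),
)

x = torch.randn(1, 768)  # pooled output of the RoBERTa text branch
# F.normalize exports as the ReduceL2 -> Clip(~1e-12) -> Expand -> Div subgraph.
text_embed = F.normalize(text_projection(x), dim=-1)
```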
models/onnx/ailia-models/LAION-CLAP/CLAP_text_text_branch_RobertaModel_roberta-base.onnx ADDED

```
version https://git-lfs.github.com/spec/v1
oid sha256:1ae0bea2a8147289e8d23fb2d25af3a7fdccdaa7730e69c4c52f68c9428749ee
size 498829921
```
models/onnx/ailia-models/LAION-CLAP/CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt ADDED

The diff for this file is too large to render.
models/onnx/ailia-models/LAION-CLAP/code/LICENSE ADDED

```
Creative Commons Legal Code

CC0 1.0 Universal

CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
HEREUNDER.

Statement of Purpose

The laws of most jurisdictions throughout the world automatically confer
exclusive Copyright and Related Rights (defined below) upon the creator
and subsequent owner(s) (each and all, an "owner") of an original work of
authorship and/or a database (each, a "Work").

Certain owners wish to permanently relinquish those rights to a Work for
the purpose of contributing to a commons of creative, cultural and
scientific works ("Commons") that the public can reliably and without fear
of later claims of infringement build upon, modify, incorporate in other
works, reuse and redistribute as freely as possible in any form whatsoever
and for any purposes, including without limitation commercial purposes.
These owners may contribute to the Commons to promote the ideal of a free
culture and the further production of creative, cultural and scientific
works, or to gain reputation or greater distribution for their Work in
part through the use and efforts of others.

For these and/or other purposes and motivations, and without any
expectation of additional consideration or compensation, the person
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
is an owner of Copyright and Related Rights in the Work, voluntarily
elects to apply CC0 to the Work and publicly distribute the Work under its
terms, with knowledge of his or her Copyright and Related Rights in the
Work and the meaning and intended legal effect of CC0 on those rights.

1. Copyright and Related Rights. A Work made available under CC0 may be
protected by copyright and related or neighboring rights ("Copyright and
Related Rights"). Copyright and Related Rights include, but are not
limited to, the following:

  i. the right to reproduce, adapt, distribute, perform, display,
     communicate, and translate a Work;
 ii. moral rights retained by the original author(s) and/or performer(s);
iii. publicity and privacy rights pertaining to a person's image or
     likeness depicted in a Work;
 iv. rights protecting against unfair competition in regards to a Work,
     subject to the limitations in paragraph 4(a), below;
  v. rights protecting the extraction, dissemination, use and reuse of data
     in a Work;
 vi. database rights (such as those arising under Directive 96/9/EC of the
     European Parliament and of the Council of 11 March 1996 on the legal
     protection of databases, and under any national implementation
     thereof, including any amended or successor version of such
     directive); and
vii. other similar, equivalent or corresponding rights throughout the
     world based on applicable law or treaty, and any national
     implementations thereof.

2. Waiver. To the greatest extent permitted by, but not in contravention
of, applicable law, Affirmer hereby overtly, fully, permanently,
irrevocably and unconditionally waives, abandons, and surrenders all of
Affirmer's Copyright and Related Rights and associated claims and causes
of action, whether now known or unknown (including existing as well as
future claims and causes of action), in the Work (i) in all territories
worldwide, (ii) for the maximum duration provided by applicable law or
treaty (including future time extensions), (iii) in any current or future
medium and for any number of copies, and (iv) for any purpose whatsoever,
including without limitation commercial, advertising or promotional
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
member of the public at large and to the detriment of Affirmer's heirs and
successors, fully intending that such Waiver shall not be subject to
revocation, rescission, cancellation, termination, or any other legal or
equitable action to disrupt the quiet enjoyment of the Work by the public
as contemplated by Affirmer's express Statement of Purpose.

3. Public License Fallback. Should any part of the Waiver for any reason
be judged legally invalid or ineffective under applicable law, then the
Waiver shall be preserved to the maximum extent permitted taking into
account Affirmer's express Statement of Purpose. In addition, to the
extent the Waiver is so judged Affirmer hereby grants to each affected
person a royalty-free, non transferable, non sublicensable, non exclusive,
irrevocable and unconditional license to exercise Affirmer's Copyright and
Related Rights in the Work (i) in all territories worldwide, (ii) for the
maximum duration provided by applicable law or treaty (including future
time extensions), (iii) in any current or future medium and for any number
of copies, and (iv) for any purpose whatsoever, including without
limitation commercial, advertising or promotional purposes (the
"License"). The License shall be deemed effective as of the date CC0 was
applied by Affirmer to the Work. Should any part of the License for any
reason be judged legally invalid or ineffective under applicable law, such
partial invalidity or ineffectiveness shall not invalidate the remainder
of the License, and in such case Affirmer hereby affirms that he or she
will not (i) exercise any of his or her remaining Copyright and Related
Rights in the Work or (ii) assert any associated claims and causes of
action with respect to the Work, in either case contrary to Affirmer's
express Statement of Purpose.

4. Limitations and Disclaimers.

 a. No trademark or patent rights held by Affirmer are waived, abandoned,
    surrendered, licensed or otherwise affected by this document.
 b. Affirmer offers the Work as-is and makes no representations or
    warranties of any kind concerning the Work, express, implied,
    statutory or otherwise, including without limitation warranties of
    title, merchantability, fitness for a particular purpose, non
    infringement, or the absence of latent or other defects, accuracy, or
    the present or absence of errors, whether or not discoverable, all to
    the greatest extent permissible under applicable law.
 c. Affirmer disclaims responsibility for clearing rights of other persons
    that may apply to the Work or any use thereof, including without
    limitation any person's Copyright and Related Rights in the Work.
    Further, Affirmer disclaims responsibility for obtaining any necessary
    consents, permissions or other rights required for any use of the
    Work.
 d. Affirmer understands and acknowledges that Creative Commons is not a
    party to this document and has no duty or obligation with respect to
    this CC0 or use of the Work.
```
models/onnx/ailia-models/LAION-CLAP/code/README.md ADDED

````markdown
# CLAP

Contrastive Language-Audio Pretraining, known as CLAP. The CLAP architecture follows the CLIP architecture, applied to audio-text pairs.

## Input

Audio file
```
24965__www-bonson-ca__bigdogbarking-02.wav
Attribution 3.0 Unported (CC BY 3.0)
https://freesound.org/people/www.bonson.ca/sounds/24965/
```

## Output

Outputs the cosine similarity between the pre-prepared text embeddings and the input audio file embedding. The higher the cosine similarity, the closer the given text and audio are in meaning.
```
===== cosine similarity between text and audio =====
cossim=0.1514, word=applause applaud clap
cossim=0.2942, word=The crowd is clapping.
cossim=0.0391, word=I love the contrastive learning
cossim=0.0755, word=bell
cossim=-0.0926, word=soccer
cossim=0.0309, word=open the door.
cossim=0.0849, word=applause
cossim=0.4183, word=dog
cossim=0.3819, word=dog barking
```

## Usage
The ONNX and prototxt files are downloaded automatically on the first run.
An internet connection is required while downloading.

For the sample wav:
```bash
$ python3 clap.py
```

To run with ONNX Runtime instead of the ailia SDK, specify the `--onnx` option:
```bash
$ python3 clap.py --onnx
```

You can run with another wav file by adding the `--input` option:
```bash
$ python3 clap.py --input [wav_file]
```

## Reference

[CLAP](https://github.com/LAION-AI/CLAP)

## Framework

Pytorch

## Model Format

ONNX opset=11

## Netron

[CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/clap/CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt)
[CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/clap/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt)
[CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/clap/CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt)
````
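The ranking in the README's Output section is plain cosine similarity between the text and audio embeddings. For reference, a tiny numpy illustration of the metric, using the same formula as `cos_sim` in clap.py below (the 2-d vectors are hypothetical):

```python
import numpy as np

def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

text = np.array([0.6, 0.8])
audio = np.array([0.8, 0.6])
print(f"cossim={cos_sim(text, audio):.4f}")  # cossim=0.9600
```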
models/onnx/ailia-models/LAION-CLAP/code/clap.py ADDED

```python
import time
import sys

import numpy as np

import ailia  # noqa: E402
sys.path.append('../../util')
from arg_utils import get_base_parser, update_parser  # noqa: E402
from model_utils import check_and_download_models, check_and_download_file  # noqa: E402

# logger
from logging import getLogger  # noqa: E402
logger = getLogger(__name__)

# for clap
import librosa
from clap_utils import *


# ======================
# Argument Parser Config
# ======================
AUDIO_PATH = 'input.wav'
parser = get_base_parser('CLAP', AUDIO_PATH, None)
parser.add_argument(
    '--onnx',
    action='store_true',
    help='By default, the ailia SDK is used, but with this option, you can switch to using ONNX Runtime'
)
parser.add_argument(
    '--disable_ailia_tokenizer',
    action='store_true',
    help='disable ailia tokenizer.'
)
parser.add_argument(
    '--disable_ailia_audio',
    action='store_true',
    help='disable ailia audio and use librosa to get spectrogram feature'
)
args = update_parser(parser)

# ======================
# PARAMETERS
# ======================
CLAP_AUDIO_WEIGHT_PATH = "CLAP_audio_LAION-Audio-630K_with_fusion.onnx"
CLAP_AUDIO_MODEL_PATH = "CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt"
CLAP_TEXT_ROBERTAMODEL_WEIGHT_PATH = "CLAP_text_text_branch_RobertaModel_roberta-base.onnx"
CLAP_TEXT_ROBERTAMODEL_MODEL_PATH = "CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt"
CLAP_TEXT_PROJECTION_WEIGHT_PATH = "CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx"
CLAP_TEXT_PROJECTION_MODEL_PATH = "CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt"
REMOTE_PATH = "https://storage.googleapis.com/ailia-models/clap/"


# ======================
# Utils
# ======================
def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


# ======================
# Main function
# ======================
def infer_text(net_text_branch, net_text_projection, text_data):
    # tokenizer
    if args.disable_ailia_tokenizer:
        from transformers import RobertaTokenizer
        tokenize = RobertaTokenizer.from_pretrained('roberta-base')
        result = tokenize(
            text_data,
            padding="max_length",
            truncation=True,
            max_length=77,
            return_tensors="pt",
        )
        data = {k: v.squeeze(0) for k, v in result.items()}
        data["input_ids"] = data["input_ids"].to('cpu').detach().numpy().copy()
        data["attention_mask"] = data["attention_mask"].to('cpu').detach().numpy().copy()
    else:
        from ailia_tokenizer import RobertaTokenizer
        tokenize = RobertaTokenizer.from_pretrained("./tokenizer/")
        result = tokenize(
            text_data,
            padding="max_length",
            truncation=True,
            max_length=77,
            return_tensors="np",
        )
        data = {k: v for k, v in result.items()}

    # print("input_ids", data["input_ids"])
    # print("attention_mask", data["attention_mask"])

    # predict
    input_data = {
        'input_ids': data["input_ids"],
        'attention_mask': data["attention_mask"]
    }
    if not args.onnx:
        output = net_text_branch.predict(input_data)  # text_branch
        _, x = output[0], output[1]  # last_hidden_state, pooler_output
        text_embeds = net_text_projection.predict(x)  # projection
    else:
        output = net_text_branch.run(None, input_data)  # text_branch
        _, x = output[0], output[1]  # last_hidden_state, pooler_output
        text_embeds = net_text_projection.run(None, {'x': x})[0]  # projection

    return text_embeds


def infer_audio(net_audio, audio_src):
    # load the waveform of the shape (T,), should resample to 48000
    audio_waveform, sr = librosa.load(audio_src, sr=48000)

    # quantize
    audio_waveform = int16_to_float32(float32_to_int16(audio_waveform))

    # get audio features
    _, mel_fusion, _ = get_audio_features(
        {}, audio_waveform, 480000,
        data_truncating='fusion',
        data_filling='repeatpad',
        audio_cfg={
            'audio_length': 1024,
            'clip_samples': 480000,
            'mel_bins': 64,
            'sample_rate': 48000,
            'window_size': 1024,
            'hop_size': 480,
            'fmin': 50,
            'fmax': 14000,
            'class_num': 527,
            'model_type': 'HTSAT',
            'model_name': 'tiny'
        },
        b_use_ailia=not args.disable_ailia_audio
    )
    input_dict = {
        'longer': [[True]],  # An error occurs when the longer value is "False".
        'mel_fusion': mel_fusion[np.newaxis, :, :, :]
    }

    # predict
    if not args.onnx:
        input_dict["longer"] = np.array(input_dict["longer"])
        audio_embed = net_audio.predict(input_dict)[0]
    else:
        audio_embed = net_audio.run(None, input_dict)[0]

    return audio_embed


def main():
    # model files check and download
    check_and_download_models(CLAP_AUDIO_WEIGHT_PATH, CLAP_AUDIO_MODEL_PATH, REMOTE_PATH)
    check_and_download_models(CLAP_TEXT_PROJECTION_WEIGHT_PATH, CLAP_TEXT_PROJECTION_MODEL_PATH, REMOTE_PATH)
    check_and_download_models(CLAP_TEXT_ROBERTAMODEL_WEIGHT_PATH, CLAP_TEXT_ROBERTAMODEL_MODEL_PATH, REMOTE_PATH)

    # net initialize
    if not args.onnx:
        net_text_branch = \
            ailia.Net(CLAP_TEXT_ROBERTAMODEL_MODEL_PATH, CLAP_TEXT_ROBERTAMODEL_WEIGHT_PATH, env_id=args.env_id)
        net_text_projection = \
            ailia.Net(CLAP_TEXT_PROJECTION_MODEL_PATH, CLAP_TEXT_PROJECTION_WEIGHT_PATH, env_id=args.env_id)
        net_audio = \
            ailia.Net(CLAP_AUDIO_MODEL_PATH, CLAP_AUDIO_WEIGHT_PATH, env_id=args.env_id)
    else:
        import onnxruntime
        net_text_branch = \
            onnxruntime.InferenceSession(CLAP_TEXT_ROBERTAMODEL_WEIGHT_PATH)
        net_text_projection = \
            onnxruntime.InferenceSession(CLAP_TEXT_PROJECTION_WEIGHT_PATH)
        net_audio = \
            onnxruntime.InferenceSession(CLAP_AUDIO_WEIGHT_PATH)

    # text predict
    text_inputs = [
        "applause applaud clap",
        "The crowd is clapping.",
        "I love the contrastive learning",
        "bell",
        "soccer",
        "open the door.",
        "applause",
        "dog",
        "dog barking"
    ]
    text_embedding = infer_text(net_text_branch, net_text_projection, text_inputs)

    # audio predict
    for audio_path in args.input:
        audio_embedding = infer_audio(net_audio, audio_path)
        # show result
        print('===== cosine similarity between text and audio =====')
        print('audio: {}'.format(audio_path))
        for i in range(text_embedding.shape[0]):
            print('cossim={:.04f}, word={}'.format(cos_sim(text_embedding[i], audio_embedding[0]), text_inputs[i]))

    logger.info('Script finished successfully.')


if __name__ == "__main__":
    main()
```
models/onnx/ailia-models/LAION-CLAP/code/clap_utils.py ADDED

```python
import numpy as np
import librosa
import ailia.audio
from skimage.transform import resize


def int16_to_float32(x):
    return (x / 32767.0).astype(np.float32)


def float32_to_int16(x):
    x = np.clip(x, a_min=-1., a_max=1.)
    return (x * 32767.).astype(np.int16)


def get_mel(audio_data, audio_cfg):
    """
    # mel shape: (n_mels, T)
    mel_torch = torchaudio.transforms.MelSpectrogram(
        sample_rate=audio_cfg['sample_rate'],
        n_fft=audio_cfg['window_size'],
        win_length=audio_cfg['window_size'],
        hop_length=audio_cfg['hop_size'],
        center=True,
        pad_mode="reflect",
        power=2.0,
        norm=None,
        onesided=True,
        n_mels=64,
        f_min=audio_cfg['fmin'],
        f_max=audio_cfg['fmax']
    )(audio_data)

    # we use log mel spectrogram as input
    mel_torch = torchaudio.transforms.AmplitudeToDB(top_db=None)(mel_torch)
    mel_torch = mel_torch.T  # (T, n_mels)
    mel_torch = mel_torch.to('cpu').detach().numpy().copy()
    """

    # Align to librosa:
    mel_librosa = librosa.feature.melspectrogram(
        y=audio_data,
        sr=audio_cfg['sample_rate'],
        n_fft=audio_cfg['window_size'],
        hop_length=audio_cfg['hop_size'],
        win_length=audio_cfg['window_size'],
        center=True,
        pad_mode="reflect",
        power=2.0,
        n_mels=64,
        norm=None,
        htk=True,
        fmin=audio_cfg['fmin'],
        fmax=audio_cfg['fmax']
    )
    mel_librosa = librosa.amplitude_to_db(mel_librosa, top_db=None)
    mel_librosa = mel_librosa.transpose(1, 0)

    return mel_librosa


def get_mel_ailia(audio_data, audio_cfg):
    mel = ailia.audio.mel_spectrogram(
        audio_data,
        sample_rate=audio_cfg['sample_rate'],
        fft_n=audio_cfg['window_size'],
        hop_n=audio_cfg['hop_size'],
        win_n=audio_cfg['window_size'],
        win_type=1,  # hann
        center_mode=1,
        power=2.0,
        fft_norm_type=None,
        f_min=audio_cfg['fmin'],
        f_max=audio_cfg['fmax'],
        mel_n=64,
        mel_norm=False,
        htk=True
    )

    def power_to_db(S, ref=1.0, amin=1e-10, top_db=80.0):
        S[(S >= 0) & (S < amin)] = amin
        S[(S < 0) & (S > -amin)] = -amin
        return 10 * np.log10(S / ref)

    mel_db = power_to_db(np.square(mel), top_db=None)
    mel_db = mel_db.transpose(1, 0)

    return mel_db


def get_audio_features(sample, audio_data, max_len, data_truncating, data_filling, audio_cfg, b_use_ailia=False):
    """
    Calculate and add audio features to sample.
    Sample: a dict containing all the data of current sample.
    audio_data: a tensor of shape (T) containing audio data.
    max_len: the maximum length of audio data.
    data_truncating: the method of truncating data.
    data_filling: the method of filling data.
    audio_cfg: a dict containing audio configuration. Comes from model_cfg['audio_cfg'].
    """
    mel_func = get_mel_ailia if b_use_ailia else get_mel
    if len(audio_data) > max_len:
        if data_truncating == "fusion":
            # fusion
            mel = mel_func(audio_data, audio_cfg)
            # split to three parts
            chunk_frames = max_len // audio_cfg['hop_size'] + 1  # the +1 is related to how the spectrogram is computed
            total_frames = mel.shape[0]
            if chunk_frames == total_frames:
                # there is a corner case where the audio length is
                # larger than max_len but smaller than max_len+hop_size.
                # In this case, we just use the whole audio.
                mel_fusion = np.stack([mel, mel, mel, mel], axis=0)
                longer = [[False]]
            else:
                ranges = np.array_split(list(range(0, total_frames - chunk_frames + 1)), 3)
                # print('total_frames-chunk_frames:', total_frames-chunk_frames,
                #       'len(audio_data):', len(audio_data),
                #       'chunk_frames:', chunk_frames,
                #       'total_frames:', total_frames)
                if len(ranges[1]) == 0:
                    # if the audio is too short, we just use the first chunk
                    ranges[1] = [0]
                if len(ranges[2]) == 0:
                    # if the audio is too short, we just use the first chunk
                    ranges[2] = [0]
                # randomly choose index for each part
                idx_front = np.random.choice(ranges[0])
                idx_middle = np.random.choice(ranges[1])
                idx_back = np.random.choice(ranges[2])
                # select mel
                mel_chunk_front = mel[idx_front:idx_front + chunk_frames, :]
                mel_chunk_middle = mel[idx_middle:idx_middle + chunk_frames, :]
                mel_chunk_back = mel[idx_back:idx_back + chunk_frames, :]

                # shrink the mel
                # Output may differ between torchvision.transforms.Resize and skimage.transform.resize.
                # mel_shrink_torch = torch.from_numpy(mel[None])
                # mel_shrink_torch = torchvision.transforms.Resize(size=[chunk_frames, 64])(mel_shrink_torch)[0]
                # mel_shrink_torch = mel_shrink_torch.to('cpu').detach().numpy().copy()
                mel_shrink_numpy = resize(mel, (chunk_frames, 64), preserve_range=True, anti_aliasing=True, mode='edge')
                # logging.info(f"mel_shrink.shape: {mel_shrink.shape}")

                # stack
                mel_fusion = np.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back, mel_shrink_numpy], axis=0)
                longer = [[True]]
        # random crop to max_len (for compatibility)
        overflow = len(audio_data) - max_len
        idx = np.random.randint(0, overflow + 1)
        audio_data = audio_data[idx: idx + max_len]

    else:  # padding if too short
        if len(audio_data) < max_len:  # do nothing if equal
            if data_filling == "repeatpad":
                n_repeat = int(max_len / len(audio_data))
                audio_data = np.tile(audio_data, n_repeat)
                # audio_data = audio_data.unsqueeze(0).unsqueeze(0).unsqueeze(0)
                # audio_data = F.interpolate(audio_data, size=max_len, mode="bicubic")[0, 0, 0]
                audio_data = np.pad(audio_data, [(0, max_len - len(audio_data))], "constant")
            elif data_filling == "pad":
                audio_data = np.pad(audio_data, [(0, max_len - len(audio_data))], "constant")
            elif data_filling == "repeat":
                n_repeat = int(max_len / len(audio_data))
                audio_data = np.tile(audio_data, n_repeat + 1)[:max_len]

        if data_truncating == 'fusion':
            mel = mel_func(audio_data, audio_cfg)
            mel_fusion = np.stack([mel, mel, mel, mel], axis=0)
            longer = [[False]]

    return longer, mel_fusion, audio_data
```
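For intuition about the fusion split, here is the chunk arithmetic with the configuration clap.py passes in (max_len=480000 samples, i.e. 10 s at 48 kHz, hop_size=480); a worked sketch, not part of the repo:

```python
# Chunk arithmetic from get_audio_features with clap.py's audio_cfg.
hop_size = 480
max_len = 480000                          # 10 s at 48 kHz
chunk_frames = max_len // hop_size + 1    # 1001 frames per chunk

# A 15 s clip has 720000 samples, giving a centered mel of 1501 frames,
# so the three chunk start offsets are split across range(0, 501).
total_frames = 720000 // hop_size + 1     # 1501
print(chunk_frames, total_frames - chunk_frames + 1)  # 1001 501
```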
models/onnx/ailia-models/LAION-CLAP/code/input.wav ADDED

```
version https://git-lfs.github.com/spec/v1
oid sha256:f20a2c45b4da238c377d048d2d5aa2e76fb5ea38c7f273553e38d28f502dce12
size 543122
```
models/onnx/ailia-models/LAION-CLAP/code/tokenizer/merges.txt ADDED

The diff for this file is too large to render.

models/onnx/ailia-models/LAION-CLAP/code/tokenizer/vocab.json ADDED

The diff for this file is too large to render.
models/onnx/ailia-models/LAION-CLAP/source.txt ADDED

```
https://github.com/axinc-ai/ailia-models/tree/master/audio_processing/clap

https://storage.googleapis.com/ailia-models/clap/CLAP_text_text_branch_RobertaModel_roberta-base.onnx
https://storage.googleapis.com/ailia-models/clap/CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt

https://storage.googleapis.com/ailia-models/clap/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx
https://storage.googleapis.com/ailia-models/clap/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt

https://storage.googleapis.com/ailia-models/clap/CLAP_audio_LAION-Audio-630K_with_fusion.onnx
https://storage.googleapis.com/ailia-models/clap/CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt
```
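These are the same URLs that `check_and_download_models` in clap.py pulls from on first run; to fetch one manually, a standard-library sketch:

```python
import urllib.request

url = ("https://storage.googleapis.com/ailia-models/clap/"
       "CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx")
urllib.request.urlretrieve(url, url.rsplit("/", 1)[-1])
```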
models/onnx/ailia-models/Microsoft-CLAP/code/LICENSE ADDED

```
MIT License

Copyright (c) Microsoft Corporation.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE
```
models/onnx/ailia-models/Microsoft-CLAP/code/README.md ADDED

````markdown
# Microsoft CLAP

## Input

**audio file**

Audio file in wav format to use as the model's input.
The default file name is [input.wav](./input.wav)
(source: https://freesound.org/people/InspectorJ/sounds/456440/)

**text file**

A text file containing sentences separated by new lines.
The default file name is [captions.txt](./captions.txt)

## Output

**Cosine similarities**

Cosine similarity between the input audio and the sentences in the text file.

## Usage
An internet connection is required when running the script for the first time, as the model files are downloaded automatically.

Running the script computes the cosine similarities between the audio and the captions, using audio and language encoder models trained by contrastive training.

You can switch between the versions of the encoder model weights (2022 or 2023) by specifying the version with the argument `-v` or `--version`.
For more information on arguments, try running `python3 msclap.py --help`
```bash
$ python3 msclap.py -t captions.txt -a input.wav -v 2023
 INFO arg_utils.py (13) : Start!
 INFO arg_utils.py (158) : env_id updated to 0
 INFO arg_utils.py (163) : env_id: 0
 INFO arg_utils.py (166) : CPU
 INFO msclap.py (167) : input_text: ['Dog barking.', 'Birds whistling.', 'Car passing by.', 'Wind blowing.', 'Water flowing.', 'People talking.']
 INFO msclap.py (170) : inference has started...
Similarity:
Birds whistling.: 0.41247469186782837
Wind blowing.: 0.2643369734287262
Water flowing.: 0.23884761333465576
Car passing by.: 0.22803542017936707
People talking.: 0.17387858033180237
Dog barking.: 0.11309497803449631
 INFO msclap.py (192) : Script finished successfully.
```

## Reference

* [CLAP](https://github.com/microsoft/CLAP)

## Framework

Pytorch

## Model Format

ONNX opset=11

## Netron

[caption_model_2023.onnx.prototxt]()

[audio_model_2023.onnx.prototxt]()

[caption_model_2022.onnx.prototxt]()

[audio_model_2022.onnx.prototxt]()
````
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Dog barking.
|
| 2 |
+
Birds whistling.
|
| 3 |
+
Car passing by.
|
| 4 |
+
Wind blowing.
|
| 5 |
+
Water flowing.
|
| 6 |
+
People talking.
|
models/onnx/ailia-models/Microsoft-CLAP/code/input.wav ADDED

```
version https://git-lfs.github.com/spec/v1
oid sha256:2bbef488f5578b96843791d83cc7ad946c99e72c8b69bd56b351e1e0d80ac142
size 714238
```
models/onnx/ailia-models/Microsoft-CLAP/code/msclap.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import time
|
| 3 |
+
from logging import getLogger
|
| 4 |
+
import json
|
| 5 |
+
|
| 6 |
+
import random
|
| 7 |
+
|
| 8 |
+
import librosa
|
| 9 |
+
import numpy as np
|
| 10 |
+
|
| 11 |
+
import ailia
|
| 12 |
+
|
| 13 |
+
# import original modules
|
| 14 |
+
sys.path.append('../../util')
|
| 15 |
+
from arg_utils import get_base_parser, update_parser, get_savepath # noqa
|
| 16 |
+
from model_utils import check_and_download_models # noqa
|
| 17 |
+
|
| 18 |
+
logger = getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
# ======================
|
| 21 |
+
# Parameters
|
| 22 |
+
# ======================
|
| 23 |
+
|
| 24 |
+
CAPTION_WEIGHT_PATH_2023 = 'msclap_2023_caption.onnx'
|
| 25 |
+
AUDIO_WEIGHT_PATH_2023 = 'msclap_2023_audio.onnx'
|
| 26 |
+
|
| 27 |
+
CAPTION_MODEL_PATH_2023 = 'msclap_2023_caption.onnx.prototxt'
|
| 28 |
+
AUDIO_MODEL_PATH_2023 = 'msclap_2023_audio.onnx.prototxt'
|
| 29 |
+
|
| 30 |
+
CAPTION_WEIGHT_PATH_2022 = 'msclap_2022_caption.onnx'
|
| 31 |
+
AUDIO_WEIGHT_PATH_2022 = 'msclap_2022_audio.onnx'
|
| 32 |
+
|
| 33 |
+
CAPTION_MODEL_PATH_2022 = 'msclap_2022_caption.onnx.prototxt'
|
| 34 |
+
AUDIO_MODEL_PATH_2022 = 'msclap_2022_audio.onnx.prototxt'
|
| 35 |
+
|
| 36 |
+
REMOTE_PATH = "https://storage.googleapis.com/ailia-models/msclap/"
|
| 37 |
+
|
| 38 |
+
# ======================
|
| 39 |
+
# Arguemnt Parser Config
|
| 40 |
+
# ======================
|
| 41 |
+
|
| 42 |
+
parser = get_base_parser(
|
| 43 |
+
'msclap', None, None
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
parser.add_argument(
|
| 47 |
+
"-a", "--audio", type=str,
|
| 48 |
+
default="input.wav",
|
| 49 |
+
help="Input audio file path."
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
parser.add_argument(
|
| 53 |
+
"-t", "--text", type=str,
|
| 54 |
+
default="captions.txt",
|
| 55 |
+
help="Input text caption file path"
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
parser.add_argument(
|
| 59 |
+
"-v", "--version", type=str,
|
| 60 |
+
default="2023",
|
| 61 |
+
help="Version of the CLAP model (2022 or 2023)."
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
parser.add_argument(
|
| 65 |
+
'-w', '--write_json',
|
| 66 |
+
action='store_true',
|
| 67 |
+
help='Flag to output results to json file.'
|
| 68 |
+
)
|
| 69 |
+
parser.add_argument(
|
| 70 |
+
'--disable_ailia_tokenizer',
|
| 71 |
+
action='store_true',
|
| 72 |
+
help='disable ailia tokenizer.'
|
| 73 |
+
)
|
| 74 |
+
args = update_parser(parser, check_input_type=False)
|
| 75 |
+
|
| 76 |
+
# ======================
|
| 77 |
+
# Helper functions
|
| 78 |
+
# ======================
|
| 79 |
+
|
| 80 |
+
def read_audio(audio_path):
|
| 81 |
+
r"""Loads audio file or array and returns a numpy tensor"""
|
| 82 |
+
# Randomly sample a segment of audio_duration from the clip or pad to match duration
|
| 83 |
+
audio_time_series, sample_rate = librosa.load(audio_path, sr=None)
|
| 84 |
+
return audio_time_series, sample_rate
|
| 85 |
+
|
| 86 |
+
def resample_audio(audio_time_series, sample_rate, resample_rate):
|
| 87 |
+
resample_rate = 44100
|
| 88 |
+
if resample_rate != sample_rate:
|
| 89 |
+
audio_time_series = librosa.resample(
|
| 90 |
+
audio_time_series,
|
| 91 |
+
orig_sr=sample_rate,
|
| 92 |
+
target_sr=resample_rate,
|
| 93 |
+
res_type = 'sinc_best'
|
| 94 |
+
)
|
| 95 |
+
return audio_time_series, resample_rate
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def resize_audio(audio_time_series, sample_rate, audio_duration, resample=False):
|
| 99 |
+
r"""Loads audio file and returns raw audio."""
|
| 100 |
+
# Randomly sample a segment of audio_duration from the clip or pad to match duration
|
| 101 |
+
audio_time_series = audio_time_series.reshape(-1)
|
| 102 |
+
# audio_time_series is shorter than predefined audio duration,
|
| 103 |
+
# so audio_time_series is extended
|
| 104 |
+
if audio_duration*sample_rate >= audio_time_series.shape[0]:
|
| 105 |
+
repeat_factor = int(np.ceil((audio_duration*sample_rate) /
|
| 106 |
+
audio_time_series.shape[0]))
|
| 107 |
+
# Repeat audio_time_series by repeat_factor to match audio_duration
|
| 108 |
+
audio_time_series = np.tile(audio_time_series,repeat_factor)
|
| 109 |
+
# remove excess part of audio_time_series
|
| 110 |
+
audio_time_series = audio_time_series[0:audio_duration*sample_rate]
|
| 111 |
+
else:
|
| 112 |
+
# audio_time_series is longer than predefined audio duration,
|
| 113 |
+
# so audio_time_series is trimmed
|
| 114 |
+
start_index = random.randrange(
|
| 115 |
+
audio_time_series.shape[0] - audio_duration*sample_rate)
|
| 116 |
+
audio_time_series = audio_time_series[start_index:start_index +
|
| 117 |
+
audio_duration*sample_rate]
|
| 118 |
+
return audio_time_series
|
| 119 |
+
|
| 120 |
+
def get_audio_embeddings(wav_input, sample_rate, model, version="2023"):
|
| 121 |
+
if version in ('2023', '2022'):
|
| 122 |
+
wav_input = resample_audio(wav_input, sample_rate, 44100)[0]
|
| 123 |
+
wav_input = resize_audio(wav_input, 44100, 7)[None]
|
| 124 |
+
return model['audio_model'].predict(wav_input)
|
| 125 |
+
|
| 126 |
+
def get_caption_embeddings(text_input, model, version="2023"):
|
| 127 |
+
|
| 128 |
+
# preprocesing
|
| 129 |
+
if version == '2023':
|
| 130 |
+
text_input = [t + ' <|endoftext|>' for t in text_input]
|
| 131 |
+
tokenized = dict(model['tokenizer'](text_input, padding = True, return_tensors = 'np'))
|
| 132 |
+
|
| 133 |
+
# inference
|
| 134 |
+
model_input = (tokenized['input_ids'], tokenized['attention_mask'])
|
| 135 |
+
return model['caption_model'].predict(model_input)[0]
|
| 136 |
+
|
| 137 |
+
def cossim(v1, v2):
|
| 138 |
+
return np.sum(v1 * v2, axis = -1) / (np.sum(v1 ** 2, axis = -1) ** 0.5 * np.sum(v2 ** 2, axis = -1) ** 0.5)
|
| 139 |
+
|
| 140 |
+
def print_sorted_dict(d):
|
| 141 |
+
m_len = max([len(k) for k in d.keys()])
|
| 142 |
+
for k, v in sorted(d.items(), key=lambda x: x[1], reverse=True):
|
| 143 |
+
pad = ' ' * (m_len - len(k) + 4)
|
| 144 |
+
print(f'{pad + k}: {v}')
|
| 145 |
+
|
| 146 |
+
def save_sorted_dict_as_json(d):
    result = []
    for k, v in sorted(d.items(), key=lambda x: x[1], reverse=True):
        result.append({"caption": k, "similarity": float(v)})
    with open('output.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)

# ======================
# Main functions
# ======================

def inference(model, input_text, input_wav, sample_rate, version):
    # get embeddings
    audio_embeddings = get_audio_embeddings(input_wav, sample_rate, model, version)
    caption_embeddings = get_caption_embeddings(input_text, model, version)

    return cossim(audio_embeddings, caption_embeddings)

def estimate_best_caption(model):
    # load inputs
    #input_text = CAPTIONS
    with open(args.text, 'r') as f:
        input_text = f.read().splitlines()
    #input_text = args.input.split('.')

    input_wav, sample_rate = read_audio(args.audio)
    input_wav = input_wav[None]

    logger.info("input_text: %s", input_text)

    # inference
    logger.info('inference has started...')
    if args.benchmark:
        logger.info('BENCHMARK mode')
        total_time_estimation = 0
        for i in range(args.benchmark_count):
            start = int(round(time.time() * 1000))
            output = inference(model, input_text, input_wav, sample_rate, args.version)
            end = int(round(time.time() * 1000))
            estimation_time = (end - start)

            # Logging (the first run is treated as warm-up and excluded from the average)
            logger.info(f'\tailia processing estimation time {estimation_time} ms')
            if i != 0:
                total_time_estimation = total_time_estimation + estimation_time

        logger.info(f'\taverage time estimation {total_time_estimation / (args.benchmark_count - 1)} ms')
    else:
        output = inference(model, input_text, input_wav, sample_rate, args.version)

    print("Similarity:")
    print_sorted_dict(dict(zip(input_text, output)))

    if args.write_json:
        save_sorted_dict_as_json(dict(zip(input_text, output)))

    logger.info('Script finished successfully.')


def main():
    # model files check and download
    if args.version == '2023':
        check_and_download_models(
            CAPTION_WEIGHT_PATH_2023,
            CAPTION_MODEL_PATH_2023,
            REMOTE_PATH
        )
        check_and_download_models(
            AUDIO_WEIGHT_PATH_2023,
            AUDIO_MODEL_PATH_2023,
            REMOTE_PATH
        )
    elif args.version == '2022':
        check_and_download_models(
            CAPTION_WEIGHT_PATH_2022,
            CAPTION_MODEL_PATH_2022,
            REMOTE_PATH
        )
        check_and_download_models(
            AUDIO_WEIGHT_PATH_2022,
            AUDIO_MODEL_PATH_2022,
            REMOTE_PATH
        )

    env_id = args.env_id

    # disable FP16 (fall back to CPU on FP16 environments and on macOS)
    if "FP16" in ailia.get_environment(args.env_id).props or sys.platform == 'darwin':
        logger.warning('This model does not work on FP16, so CPU mode is used.')
        env_id = 0

    # initialize
    if args.version == '2023':
        caption_model = ailia.Net(CAPTION_MODEL_PATH_2023, CAPTION_WEIGHT_PATH_2023, env_id=env_id)
        audio_model = ailia.Net(AUDIO_MODEL_PATH_2023, AUDIO_WEIGHT_PATH_2023, env_id=env_id)
        if args.disable_ailia_tokenizer:
            from transformers import AutoTokenizer
            tokenizer = AutoTokenizer.from_pretrained('gpt2')
            tokenizer.add_special_tokens({'pad_token': '!'})
        else:
            from ailia_tokenizer import GPT2Tokenizer
            tokenizer = GPT2Tokenizer.from_pretrained('./tokenizer_gpt2/')
            tokenizer.add_special_tokens({'pad_token': '!'})
            #tokenizer._pad_token_id = 0
    elif args.version == '2022':
        caption_model = ailia.Net(CAPTION_MODEL_PATH_2022, CAPTION_WEIGHT_PATH_2022, env_id=env_id)
        audio_model = ailia.Net(AUDIO_MODEL_PATH_2022, AUDIO_WEIGHT_PATH_2022, env_id=env_id)
        if args.disable_ailia_tokenizer:
            from transformers import AutoTokenizer
            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        else:
            from ailia_tokenizer import BertTokenizer
            tokenizer = BertTokenizer.from_pretrained('./tokenizer_bert/')

    model = {
        'caption_model': caption_model,
        'audio_model': audio_model,
        'tokenizer': tokenizer
    }

    estimate_best_caption(model)


if __name__ == '__main__':
    main()

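Taken together, the helpers above repeat-pad or randomly crop audio to a fixed duration and then score captions by cosine similarity. A minimal, self-contained sketch of that logic on synthetic data (numpy only; the real script also resamples with librosa and runs the ONNX models):

```python
import numpy as np

def fit_to_duration(x, sample_rate, seconds):
    """Repeat-pad short clips or randomly crop long ones to `seconds`."""
    target = seconds * sample_rate
    if target >= x.shape[0]:
        x = np.tile(x, int(np.ceil(target / x.shape[0])))[:target]
    else:
        start = np.random.randint(x.shape[0] - target)
        x = x[start:start + target]
    return x

def cosine(v1, v2):
    return np.sum(v1 * v2, axis=-1) / (
        np.linalg.norm(v1, axis=-1) * np.linalg.norm(v2, axis=-1))

wav = np.random.randn(3 * 44100)                 # 3-second clip
assert fit_to_duration(wav, 44100, 7).shape[0] == 7 * 44100

audio_emb = np.random.randn(1, 1024)             # stand-ins for the model outputs
caption_embs = np.random.randn(4, 1024)          # one row per candidate caption
print(cosine(audio_emb, caption_embs))           # one similarity score per caption
```
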
models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_bert/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"do_lower_case": true, "model_max_length": 512}

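As a quick illustration of `do_lower_case`: the 2022 text branch uses a case-insensitive BERT tokenizer, so differently cased inputs tokenize identically. A sketch, assuming `transformers` is installed:

```python
from transformers import AutoTokenizer

# bert-base-uncased applies the same lowercasing configured above
tok = AutoTokenizer.from_pretrained("bert-base-uncased")
assert tok.tokenize("A Dog BARKING") == tok.tokenize("a dog barking")
```
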
models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_bert/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff

models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff

models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"model_max_length": 1024}

models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff

models/onnx/ailia-models/Microsoft-CLAP/source.txt
ADDED
@@ -0,0 +1 @@
https://github.com/axinc-ai/ailia-models/tree/master/audio_processing/msclap

models/onnx/clap-htsat-unfused (Xenova)/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

models/onnx/clap-htsat-unfused (Xenova)/README.md
ADDED
@@ -0,0 +1,79 @@
---
base_model: laion/clap-htsat-unfused
library_name: transformers.js
tags:
- zero-shot-audio-classification
---

https://huggingface.co/laion/clap-htsat-unfused with ONNX weights to be compatible with Transformers.js.

## Usage (Transformers.js)

If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@xenova/transformers) using:
```bash
npm i @xenova/transformers
```

**Example:** Perform zero-shot audio classification with `Xenova/clap-htsat-unfused`.
```js
import { pipeline } from '@xenova/transformers';

const classifier = await pipeline('zero-shot-audio-classification', 'Xenova/clap-htsat-unfused');

const audio = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/dog_barking.wav';
const candidate_labels = ['dog', 'vacuum cleaner'];
const scores = await classifier(audio, candidate_labels);
// [
//   { score: 0.9993992447853088, label: 'dog' },
//   { score: 0.0006007603369653225, label: 'vacuum cleaner' }
// ]
```

**Example:** Compute text embeddings with `ClapTextModelWithProjection`.

```js
import { AutoTokenizer, ClapTextModelWithProjection } from '@xenova/transformers';

// Load tokenizer and text model
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clap-htsat-unfused');
const text_model = await ClapTextModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused');

// Run tokenization
const texts = ['a sound of a cat', 'a sound of a dog'];
const text_inputs = tokenizer(texts, { padding: true, truncation: true });

// Compute embeddings
const { text_embeds } = await text_model(text_inputs);
// Tensor {
//   dims: [ 2, 512 ],
//   type: 'float32',
//   data: Float32Array(1024) [ ... ],
//   size: 1024
// }
```

**Example:** Compute audio embeddings with `ClapAudioModelWithProjection`.
```js
import { AutoProcessor, ClapAudioModelWithProjection, read_audio } from '@xenova/transformers';

// Load processor and audio model
const processor = await AutoProcessor.from_pretrained('Xenova/clap-htsat-unfused');
const audio_model = await ClapAudioModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused');

// Read audio and run processor
const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cat_meow.wav');
const audio_inputs = await processor(audio);

// Compute embeddings
const { audio_embeds } = await audio_model(audio_inputs);
// Tensor {
//   dims: [ 1, 512 ],
//   type: 'float32',
//   data: Float32Array(512) [ ... ],
//   size: 512
// }
```

---

Note: Having a separate repo for ONNX weights is intended to be a temporary solution until WebML gains more traction. If you would like to make your models web-ready, we recommend converting to ONNX using [🤗 Optimum](https://huggingface.co/docs/optimum/index) and structuring your repo like this one (with ONNX weights located in a subfolder named `onnx`).

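For reference, the conversion the note above describes can also be scripted; a sketch, assuming a recent `optimum` with the ONNX Runtime extras installed (equivalent to `optimum-cli export onnx`):

```python
from optimum.exporters.onnx import main_export

# Export the PyTorch checkpoint to ONNX; files land in the given output folder
main_export("laion/clap-htsat-unfused", output="onnx")
```
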
models/onnx/clap-htsat-unfused (Xenova)/config.json
ADDED
@@ -0,0 +1,27 @@
{
  "_name_or_path": "laion/clap-htsat-unfused",
  "architectures": [
    "ClapModel"
  ],
  "audio_config": {
    "fusion_num_hidden_layers": 2,
    "model_type": "clap_audio_model",
    "projection_hidden_size": 768
  },
  "hidden_size": 768,
  "initializer_factor": 1.0,
  "logit_scale_init_value": 14.285714285714285,
  "model_type": "clap",
  "num_hidden_layers": 16,
  "projection_dim": 512,
  "projection_hidden_act": "relu",
  "text_config": {
    "classifier_dropout": null,
    "fusion_hidden_size": 768,
    "fusion_num_hidden_layers": 2,
    "initializer_range": 0.02,
    "model_type": "clap_text_model",
    "projection_hidden_size": 768
  },
  "transformers_version": "4.36.0.dev0"
}

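Two values here connect directly to the README examples above: `projection_dim: 512` is the length of the text/audio embeddings, and `logit_scale_init_value` is exactly 1/0.07, the usual CLIP-style temperature. A sketch of how such a scale sharpens cosine similarities into zero-shot scores (the similarity values are hypothetical):

```python
import numpy as np

logit_scale = 1 / 0.07                  # = 14.285714285714286, as in the config
sims = np.array([0.31, 0.12])           # hypothetical audio-text cosine similarities
probs = np.exp(logit_scale * sims)
probs /= probs.sum()
print(probs)                            # sharply peaked, like the README's scores
```
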
models/onnx/clap-htsat-unfused (Xenova)/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff

models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a1c2b43c44f71e0fa841a4b86700886c199bf87699ea45632c4d831bc6c88957
size 117528416

models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model_fp16.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:65963cfa5e903e0a8df475137252d618663df88ecf0b55a3fb327e6c1ca63a97
size 60414065

models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model_quantized.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3fcff2c8824e7bcb83a983f2a49edab3b60cbcf4872ac70efee517355173bd1f
size 34301667

models/onnx/clap-htsat-unfused (Xenova)/onnx/model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a8cbfb96dda10259964e678c1557466f925001f66b8cd1b24c84bd88b0f84345
size 619128635

models/onnx/clap-htsat-unfused (Xenova)/onnx/model_fp16.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9ec27913c473d8ce6367cc40376c27b79899248009d8f6f2ad62110abdcf6124
size 311602756

models/onnx/clap-htsat-unfused (Xenova)/onnx/model_quantized.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3f559313ec268518193101007fc0569ee4b2cfac03e369091c50adb4795f5c5d
size 161024085

models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6df7c51f2d78e236a03f2b816e613d538c815639d13644fe7c2124f439da9648
size 501513769

models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model_fp16.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c1fd1e4cdd02acbcacaafe7a8e608dcfc8f84a00bfea3e2ca7df710957b4c3a5
size 251029088

models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model_quantized.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1a3df8b197e249816e08415fd040434c44762b2eea7eb7bf8a48a0f0bf3c14e5
size 126603263

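The LFS pointers above record each artifact's SHA-256 and byte size, which double as an integrity check after download. A sketch, using the values from `audio_model.onnx` (the local path is illustrative):

```python
import hashlib
import os

path = "onnx/audio_model.onnx"   # illustrative local path after download
expected_oid = "a1c2b43c44f71e0fa841a4b86700886c199bf87699ea45632c4d831bc6c88957"
expected_size = 117528416

assert os.path.getsize(path) == expected_size
sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
assert sha.hexdigest() == expected_oid
```
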
models/onnx/clap-htsat-unfused (Xenova)/preprocessor_config.json
ADDED
@@ -0,0 +1,22 @@
{
  "chunk_length_s": 10,
  "feature_extractor_type": "ClapFeatureExtractor",
  "feature_size": 64,
  "fft_window_size": 1024,
  "frequency_max": 14000,
  "frequency_min": 50,
  "hop_length": 480,
  "max_length_s": 10,
  "n_fft": 1024,
  "nb_frequency_bins": 513,
  "nb_max_frames": 1000,
  "nb_max_samples": 480000,
  "padding": "repeatpad",
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "ClapProcessor",
  "return_attention_mask": false,
  "sampling_rate": 48000,
  "top_db": null,
  "truncation": "rand_trunc"
}

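The derived fields in this config follow from the base parameters (10 s of 48 kHz audio, a 1024-point FFT, hop length 480); a quick consistency check:

```python
sampling_rate, max_length_s = 48000, 10
n_fft, hop_length = 1024, 480

nb_max_samples = sampling_rate * max_length_s   # 480000
nb_frequency_bins = n_fft // 2 + 1              # 513
nb_max_frames = nb_max_samples // hop_length    # 1000
assert (nb_max_samples, nb_frequency_bins, nb_max_frames) == (480000, 513, 1000)
```
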
models/onnx/clap-htsat-unfused (Xenova)/quantize_config.json
ADDED
@@ -0,0 +1,122 @@
{
  "per_channel": true,
  "reduce_range": true,
  "per_model_config": {
    "model": {
      "op_types": [
        "Expand",
        "ScatterND",
        "Pad",
        "Abs",
        "Unsqueeze",
        "ReduceSum",
        "Not",
        "CumSum",
        "Constant",
        "Exp",
        "Sub",
        "MatMul",
        "Cast",
        "Reshape",
        "Flatten",
        "Resize",
        "Conv",
        "ConstantOfShape",
        "Gather",
        "Relu",
        "Div",
        "Mul",
        "GlobalAveragePool",
        "Range",
        "Erf",
        "Where",
        "ReduceMean",
        "Pow",
        "Shape",
        "Concat",
        "Slice",
        "Softmax",
        "Tanh",
        "Sqrt",
        "BatchNormalization",
        "Add",
        "Transpose",
        "Gemm",
        "Equal"
      ],
      "weight_type": "QUInt8"
    },
    "text_model": {
      "op_types": [
        "ReduceMean",
        "Reshape",
        "Softmax",
        "Pow",
        "Erf",
        "Tanh",
        "Concat",
        "Sub",
        "Not",
        "Expand",
        "Mul",
        "Transpose",
        "Div",
        "Constant",
        "Equal",
        "Unsqueeze",
        "Slice",
        "MatMul",
        "Gather",
        "ConstantOfShape",
        "Shape",
        "Cast",
        "Where",
        "Sqrt",
        "Add",
        "Gemm",
        "CumSum",
        "Relu"
      ],
      "weight_type": "QInt8"
    },
    "audio_model": {
      "op_types": [
        "BatchNormalization",
        "ScatterND",
        "ReduceMean",
        "Reshape",
        "Softmax",
        "Pow",
        "Erf",
        "GlobalAveragePool",
        "Concat",
        "Sub",
        "Not",
        "Expand",
        "Mul",
        "Transpose",
        "Div",
        "Constant",
        "Equal",
        "Unsqueeze",
        "Pad",
        "Slice",
        "Resize",
        "Range",
        "MatMul",
        "Gather",
        "ConstantOfShape",
        "Shape",
        "Cast",
        "Sqrt",
        "Where",
        "Add",
        "Conv",
        "Flatten",
        "Gemm",
        "Relu"
      ],
      "weight_type": "QUInt8"
    }
  }
}

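The `*_quantized.onnx` variants above pair with this config: `per_channel`, `reduce_range`, and the per-model `weight_type` mirror the arguments of ONNX Runtime's dynamic quantizer. A minimal sketch of an equivalent invocation (paths illustrative; the exact pipeline used to produce these files is not documented here):

```python
from onnxruntime.quantization import QuantType, quantize_dynamic

quantize_dynamic(
    model_input="onnx/text_model.onnx",
    model_output="onnx/text_model_quantized.onnx",
    per_channel=True,
    reduce_range=True,
    weight_type=QuantType.QInt8,  # the audio and full models above use QUInt8
)
```
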
models/onnx/clap-htsat-unfused (Xenova)/source.txt
ADDED
@@ -0,0 +1 @@
https://huggingface.co/Xenova/clap-htsat-unfused

models/onnx/clap-htsat-unfused (Xenova)/special_tokens_map.json
ADDED
@@ -0,0 +1,15 @@
{
  "bos_token": "<s>",
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "unk_token": "<unk>"
}

models/onnx/clap-htsat-unfused (Xenova)/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff

models/onnx/clap-htsat-unfused (Xenova)/tokenizer_config.json
ADDED
@@ -0,0 +1,63 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "50264": {
      "content": "<mask>",
      "lstrip": true,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "errors": "replace",
  "mask_token": "<mask>",
  "max_length": null,
  "model_max_length": 512,
  "pad_to_multiple_of": null,
  "pad_token": "<pad>",
  "pad_token_type_id": 0,
  "padding_side": "right",
  "processor_class": "ClapProcessor",
  "sep_token": "</s>",
  "tokenizer_class": "RobertaTokenizer",
  "trim_offsets": true,
  "trust_remote_code": false,
  "unk_token": "<unk>"
}

models/onnx/clap-htsat-unfused (Xenova)/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff

models/onnx/larger_clap_general (Xenova)/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

models/onnx/larger_clap_general (Xenova)/README.md
ADDED
@@ -0,0 +1,80 @@
---
base_model: laion/larger_clap_general
library_name: transformers.js
tags:
- zero-shot-audio-classification
---

https://huggingface.co/laion/larger_clap_general with ONNX weights to be compatible with Transformers.js.

## Usage (Transformers.js)

If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@xenova/transformers) using:
```bash
npm i @xenova/transformers
```

**Example:** Perform zero-shot audio classification with `Xenova/larger_clap_general`.
```js
import { pipeline } from '@xenova/transformers';

const classifier = await pipeline('zero-shot-audio-classification', 'Xenova/larger_clap_general');

const audio = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/piano.wav';
const candidate_labels = ['calm piano music', 'heavy metal music'];
const scores = await classifier(audio, candidate_labels);
// [
//   { score: 0.9829504489898682, label: 'calm piano music' },
//   { score: 0.017049523070454597, label: 'heavy metal music' }
// ]
```

**Example:** Compute text embeddings with `ClapTextModelWithProjection`.

```js
import { AutoTokenizer, ClapTextModelWithProjection } from '@xenova/transformers';

// Load tokenizer and text model
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/larger_clap_general');
const text_model = await ClapTextModelWithProjection.from_pretrained('Xenova/larger_clap_general');

// Run tokenization
const texts = ['calm piano music', 'heavy metal music'];
const text_inputs = tokenizer(texts, { padding: true, truncation: true });

// Compute embeddings
const { text_embeds } = await text_model(text_inputs);
// Tensor {
//   dims: [ 2, 512 ],
//   type: 'float32',
//   data: Float32Array(1024) [ ... ],
//   size: 1024
// }
```

**Example:** Compute audio embeddings with `ClapAudioModelWithProjection`.
```js
import { AutoProcessor, ClapAudioModelWithProjection, read_audio } from '@xenova/transformers';

// Load processor and audio model
const processor = await AutoProcessor.from_pretrained('Xenova/larger_clap_general');
const audio_model = await ClapAudioModelWithProjection.from_pretrained('Xenova/larger_clap_general');

// Read audio and run processor
const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/piano.wav');
const audio_inputs = await processor(audio);

// Compute embeddings
const { audio_embeds } = await audio_model(audio_inputs);
// Tensor {
//   dims: [ 1, 512 ],
//   type: 'float32',
//   data: Float32Array(512) [ ... ],
//   size: 512
// }
```

---

Note: Having a separate repo for ONNX weights is intended to be a temporary solution until WebML gains more traction. If you would like to make your models web-ready, we recommend converting to ONNX using [🤗 Optimum](https://huggingface.co/docs/optimum/index) and structuring your repo like this one (with ONNX weights located in a subfolder named `onnx`).