niobures committed
Commit f8ffad7 · verified · Parent(s): 87d6098

CLAP (code, models, paper)

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +3 -0
  2. Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation.pdf +3 -0
  3. code/CLAP.zip +3 -0
  4. models/onnx/ailia-models/LAION-CLAP/CLAP_audio_LAION-Audio-630K_with_fusion.onnx +3 -0
  5. models/onnx/ailia-models/LAION-CLAP/CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt +0 -0
  6. models/onnx/ailia-models/LAION-CLAP/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx +3 -0
  7. models/onnx/ailia-models/LAION-CLAP/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt +173 -0
  8. models/onnx/ailia-models/LAION-CLAP/CLAP_text_text_branch_RobertaModel_roberta-base.onnx +3 -0
  9. models/onnx/ailia-models/LAION-CLAP/CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt +0 -0
  10. models/onnx/ailia-models/LAION-CLAP/code/LICENSE +121 -0
  11. models/onnx/ailia-models/LAION-CLAP/code/README.md +64 -0
  12. models/onnx/ailia-models/LAION-CLAP/code/clap.py +203 -0
  13. models/onnx/ailia-models/LAION-CLAP/code/clap_utils.py +170 -0
  14. models/onnx/ailia-models/LAION-CLAP/code/input.wav +3 -0
  15. models/onnx/ailia-models/LAION-CLAP/code/tokenizer/merges.txt +0 -0
  16. models/onnx/ailia-models/LAION-CLAP/code/tokenizer/vocab.json +0 -0
  17. models/onnx/ailia-models/LAION-CLAP/source.txt +10 -0
  18. models/onnx/ailia-models/Microsoft-CLAP/code/LICENSE +21 -0
  19. models/onnx/ailia-models/Microsoft-CLAP/code/README.md +72 -0
  20. models/onnx/ailia-models/Microsoft-CLAP/code/captions.txt +6 -0
  21. models/onnx/ailia-models/Microsoft-CLAP/code/input.wav +3 -0
  22. models/onnx/ailia-models/Microsoft-CLAP/code/msclap.py +270 -0
  23. models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_bert/tokenizer_config.json +1 -0
  24. models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_bert/vocab.txt +0 -0
  25. models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/merges.txt +0 -0
  26. models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/tokenizer_config.json +1 -0
  27. models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/vocab.json +0 -0
  28. models/onnx/ailia-models/Microsoft-CLAP/source.txt +1 -0
  29. models/onnx/clap-htsat-unfused (Xenova)/.gitattributes +35 -0
  30. models/onnx/clap-htsat-unfused (Xenova)/README.md +79 -0
  31. models/onnx/clap-htsat-unfused (Xenova)/config.json +27 -0
  32. models/onnx/clap-htsat-unfused (Xenova)/merges.txt +0 -0
  33. models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model.onnx +3 -0
  34. models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model_fp16.onnx +3 -0
  35. models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model_quantized.onnx +3 -0
  36. models/onnx/clap-htsat-unfused (Xenova)/onnx/model.onnx +3 -0
  37. models/onnx/clap-htsat-unfused (Xenova)/onnx/model_fp16.onnx +3 -0
  38. models/onnx/clap-htsat-unfused (Xenova)/onnx/model_quantized.onnx +3 -0
  39. models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model.onnx +3 -0
  40. models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model_fp16.onnx +3 -0
  41. models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model_quantized.onnx +3 -0
  42. models/onnx/clap-htsat-unfused (Xenova)/preprocessor_config.json +22 -0
  43. models/onnx/clap-htsat-unfused (Xenova)/quantize_config.json +122 -0
  44. models/onnx/clap-htsat-unfused (Xenova)/source.txt +1 -0
  45. models/onnx/clap-htsat-unfused (Xenova)/special_tokens_map.json +15 -0
  46. models/onnx/clap-htsat-unfused (Xenova)/tokenizer.json +0 -0
  47. models/onnx/clap-htsat-unfused (Xenova)/tokenizer_config.json +63 -0
  48. models/onnx/clap-htsat-unfused (Xenova)/vocab.json +0 -0
  49. models/onnx/larger_clap_general (Xenova)/.gitattributes +35 -0
  50. models/onnx/larger_clap_general (Xenova)/README.md +80 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Large-scale[[:space:]]Contrastive[[:space:]]Language-Audio[[:space:]]Pretraining[[:space:]]with[[:space:]]Feature[[:space:]]Fusion[[:space:]]and[[:space:]]Keyword-to-Caption[[:space:]]Augmentation.pdf filter=lfs diff=lfs merge=lfs -text
+ models/onnx/ailia-models/LAION-CLAP/code/input.wav filter=lfs diff=lfs merge=lfs -text
+ models/onnx/ailia-models/Microsoft-CLAP/code/input.wav filter=lfs diff=lfs merge=lfs -text
Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c223105503d6f5c173479b84bcc6648c0df9f12a8493492617616c75049e7d31
+ size 695271
code/CLAP.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0e5ae04cc75b3acc7568a968365c28eebaba7e148a77a801e48a0e480595b30d
+ size 11947642
models/onnx/ailia-models/LAION-CLAP/CLAP_audio_LAION-Audio-630K_with_fusion.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9901a76d952d2a9be4d8e0a1e790bde6a86867f382c6aa991c4c6fcdfde0afeb
+ size 117413819
models/onnx/ailia-models/LAION-CLAP/CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/ailia-models/LAION-CLAP/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c88525ca12a6e42b62ede1a086340411abd65bb7305dc963301eb1647825150
+ size 2626946
models/onnx/ailia-models/LAION-CLAP/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt ADDED
@@ -0,0 +1,173 @@
+ ir_version: 6
+ producer_name: "pytorch"
+ producer_version: "1.13.0"
+ model_version: 0
+ graph {
+   name: "torch_jit"
+   node {
+     input: "x"
+     input: "text_projection.0.weight"
+     input: "text_projection.0.bias"
+     output: "/text_projection/text_projection.0/Gemm_output_0"
+     name: "/text_projection/text_projection.0/Gemm"
+     op_type: "Gemm"
+     attribute {
+       name: "alpha"
+       f: 1.0
+       type: FLOAT
+     }
+     attribute {
+       name: "beta"
+       f: 1.0
+       type: FLOAT
+     }
+     attribute {
+       name: "transB"
+       i: 1
+       type: INT
+     }
+   }
+   node {
+     input: "/text_projection/text_projection.0/Gemm_output_0"
+     output: "/text_projection/text_projection.1/Relu_output_0"
+     name: "/text_projection/text_projection.1/Relu"
+     op_type: "Relu"
+   }
+   node {
+     input: "/text_projection/text_projection.1/Relu_output_0"
+     input: "text_projection.2.weight"
+     input: "text_projection.2.bias"
+     output: "/text_projection/text_projection.2/Gemm_output_0"
+     name: "/text_projection/text_projection.2/Gemm"
+     op_type: "Gemm"
+     attribute {
+       name: "alpha"
+       f: 1.0
+       type: FLOAT
+     }
+     attribute {
+       name: "beta"
+       f: 1.0
+       type: FLOAT
+     }
+     attribute {
+       name: "transB"
+       i: 1
+       type: INT
+     }
+   }
+   node {
+     input: "/text_projection/text_projection.2/Gemm_output_0"
+     output: "/ReduceL2_output_0"
+     name: "/ReduceL2"
+     op_type: "ReduceL2"
+     attribute {
+       name: "axes"
+       ints: -1
+       type: INTS
+     }
+     attribute {
+       name: "keepdims"
+       i: 1
+       type: INT
+     }
+   }
+   node {
+     output: "/Constant_output_0"
+     name: "/Constant"
+     op_type: "Constant"
+     attribute {
+       name: "value"
+       t {
+         data_type: 1
+         raw_data: "\314\274\214+"
+       }
+       type: TENSOR
+     }
+   }
+   node {
+     input: "/ReduceL2_output_0"
+     input: "/Constant_output_0"
+     input: ""
+     output: "/Clip_output_0"
+     name: "/Clip"
+     op_type: "Clip"
+   }
+   node {
+     input: "/text_projection/text_projection.2/Gemm_output_0"
+     output: "/Shape_output_0"
+     name: "/Shape"
+     op_type: "Shape"
+   }
+   node {
+     input: "/Clip_output_0"
+     input: "/Shape_output_0"
+     output: "/Expand_output_0"
+     name: "/Expand"
+     op_type: "Expand"
+   }
+   node {
+     input: "/text_projection/text_projection.2/Gemm_output_0"
+     input: "/Expand_output_0"
+     output: "text_embed"
+     name: "/Div"
+     op_type: "Div"
+   }
+   initializer {
+     dims: 512
+     dims: 768
+     data_type: 1
+     name: "text_projection.0.weight"
+   }
+   initializer {
+     dims: 512
+     data_type: 1
+     name: "text_projection.0.bias"
+   }
+   initializer {
+     dims: 512
+     dims: 512
+     data_type: 1
+     name: "text_projection.2.weight"
+   }
+   initializer {
+     dims: 512
+     data_type: 1
+     name: "text_projection.2.bias"
+   }
+   input {
+     name: "x"
+     type {
+       tensor_type {
+         elem_type: 1
+         shape {
+           dim {
+             dim_param: "batch_size"
+           }
+           dim {
+             dim_value: 768
+           }
+         }
+       }
+     }
+   }
+   output {
+     name: "text_embed"
+     type {
+       tensor_type {
+         elem_type: 1
+         shape {
+           dim {
+             dim_param: "Divtext_embed_dim_0"
+           }
+           dim {
+             dim_param: "Divtext_embed_dim_1"
+           }
+         }
+       }
+     }
+   }
+ }
+ opset_import {
+   version: 11
+ }
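The graph above is simply a two-layer MLP head followed by L2 normalization: Gemm (768→512, transB=1), Relu, Gemm (512→512), then ReduceL2 → Clip → Expand → Div to unit-normalize. The Constant feeding Clip appears to decode to ≈1e-12, the usual epsilon floor. A minimal NumPy sketch of the same forward pass, with hypothetical random weights standing in for the .onnx initializers:

```python
import numpy as np

# Hypothetical weights standing in for the .onnx initializers.
rng = np.random.default_rng(0)
W0 = rng.standard_normal((512, 768)).astype(np.float32)  # text_projection.0.weight
b0 = np.zeros(512, dtype=np.float32)                     # text_projection.0.bias
W2 = rng.standard_normal((512, 512)).astype(np.float32)  # text_projection.2.weight
b2 = np.zeros(512, dtype=np.float32)                     # text_projection.2.bias

def text_projection(x):
    """x: (batch, 768) pooled RoBERTa output -> (batch, 512) unit-norm text_embed."""
    h = np.maximum(x @ W0.T + b0, 0.0)  # Gemm (transB=1) + Relu
    y = h @ W2.T + b2                   # second Gemm
    # ReduceL2 -> Clip (min ~1e-12) -> Expand -> Div
    norm = np.clip(np.linalg.norm(y, axis=-1, keepdims=True), 1e-12, None)
    return y / norm

emb = text_projection(rng.standard_normal((2, 768)).astype(np.float32))
print(emb.shape, np.linalg.norm(emb, axis=-1))  # (2, 512), norms ~[1. 1.]
```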
models/onnx/ailia-models/LAION-CLAP/CLAP_text_text_branch_RobertaModel_roberta-base.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1ae0bea2a8147289e8d23fb2d25af3a7fdccdaa7730e69c4c52f68c9428749ee
+ size 498829921
models/onnx/ailia-models/LAION-CLAP/CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/ailia-models/LAION-CLAP/code/LICENSE ADDED
@@ -0,0 +1,121 @@
+ Creative Commons Legal Code
+
+ CC0 1.0 Universal
+
+ CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+ LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+ ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+ INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+ REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+ PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+ THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+ HEREUNDER.
+
+ Statement of Purpose
+
+ The laws of most jurisdictions throughout the world automatically confer
+ exclusive Copyright and Related Rights (defined below) upon the creator
+ and subsequent owner(s) (each and all, an "owner") of an original work of
+ authorship and/or a database (each, a "Work").
+
+ Certain owners wish to permanently relinquish those rights to a Work for
+ the purpose of contributing to a commons of creative, cultural and
+ scientific works ("Commons") that the public can reliably and without fear
+ of later claims of infringement build upon, modify, incorporate in other
+ works, reuse and redistribute as freely as possible in any form whatsoever
+ and for any purposes, including without limitation commercial purposes.
+ These owners may contribute to the Commons to promote the ideal of a free
+ culture and the further production of creative, cultural and scientific
+ works, or to gain reputation or greater distribution for their Work in
+ part through the use and efforts of others.
+
+ For these and/or other purposes and motivations, and without any
+ expectation of additional consideration or compensation, the person
+ associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+ is an owner of Copyright and Related Rights in the Work, voluntarily
+ elects to apply CC0 to the Work and publicly distribute the Work under its
+ terms, with knowledge of his or her Copyright and Related Rights in the
+ Work and the meaning and intended legal effect of CC0 on those rights.
+
+ 1. Copyright and Related Rights. A Work made available under CC0 may be
+ protected by copyright and related or neighboring rights ("Copyright and
+ Related Rights"). Copyright and Related Rights include, but are not
+ limited to, the following:
+
+   i. the right to reproduce, adapt, distribute, perform, display,
+      communicate, and translate a Work;
+  ii. moral rights retained by the original author(s) and/or performer(s);
+ iii. publicity and privacy rights pertaining to a person's image or
+      likeness depicted in a Work;
+  iv. rights protecting against unfair competition in regards to a Work,
+      subject to the limitations in paragraph 4(a), below;
+   v. rights protecting the extraction, dissemination, use and reuse of data
+      in a Work;
+  vi. database rights (such as those arising under Directive 96/9/EC of the
+      European Parliament and of the Council of 11 March 1996 on the legal
+      protection of databases, and under any national implementation
+      thereof, including any amended or successor version of such
+      directive); and
+ vii. other similar, equivalent or corresponding rights throughout the
+      world based on applicable law or treaty, and any national
+      implementations thereof.
+
+ 2. Waiver. To the greatest extent permitted by, but not in contravention
+ of, applicable law, Affirmer hereby overtly, fully, permanently,
+ irrevocably and unconditionally waives, abandons, and surrenders all of
+ Affirmer's Copyright and Related Rights and associated claims and causes
+ of action, whether now known or unknown (including existing as well as
+ future claims and causes of action), in the Work (i) in all territories
+ worldwide, (ii) for the maximum duration provided by applicable law or
+ treaty (including future time extensions), (iii) in any current or future
+ medium and for any number of copies, and (iv) for any purpose whatsoever,
+ including without limitation commercial, advertising or promotional
+ purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+ member of the public at large and to the detriment of Affirmer's heirs and
+ successors, fully intending that such Waiver shall not be subject to
+ revocation, rescission, cancellation, termination, or any other legal or
+ equitable action to disrupt the quiet enjoyment of the Work by the public
+ as contemplated by Affirmer's express Statement of Purpose.
+
+ 3. Public License Fallback. Should any part of the Waiver for any reason
+ be judged legally invalid or ineffective under applicable law, then the
+ Waiver shall be preserved to the maximum extent permitted taking into
+ account Affirmer's express Statement of Purpose. In addition, to the
+ extent the Waiver is so judged Affirmer hereby grants to each affected
+ person a royalty-free, non transferable, non sublicensable, non exclusive,
+ irrevocable and unconditional license to exercise Affirmer's Copyright and
+ Related Rights in the Work (i) in all territories worldwide, (ii) for the
+ maximum duration provided by applicable law or treaty (including future
+ time extensions), (iii) in any current or future medium and for any number
+ of copies, and (iv) for any purpose whatsoever, including without
+ limitation commercial, advertising or promotional purposes (the
+ "License"). The License shall be deemed effective as of the date CC0 was
+ applied by Affirmer to the Work. Should any part of the License for any
+ reason be judged legally invalid or ineffective under applicable law, such
+ partial invalidity or ineffectiveness shall not invalidate the remainder
+ of the License, and in such case Affirmer hereby affirms that he or she
+ will not (i) exercise any of his or her remaining Copyright and Related
+ Rights in the Work or (ii) assert any associated claims and causes of
+ action with respect to the Work, in either case contrary to Affirmer's
+ express Statement of Purpose.
+
+ 4. Limitations and Disclaimers.
+
+  a. No trademark or patent rights held by Affirmer are waived, abandoned,
+     surrendered, licensed or otherwise affected by this document.
+  b. Affirmer offers the Work as-is and makes no representations or
+     warranties of any kind concerning the Work, express, implied,
+     statutory or otherwise, including without limitation warranties of
+     title, merchantability, fitness for a particular purpose, non
+     infringement, or the absence of latent or other defects, accuracy, or
+     the present or absence of errors, whether or not discoverable, all to
+     the greatest extent permissible under applicable law.
+  c. Affirmer disclaims responsibility for clearing rights of other persons
+     that may apply to the Work or any use thereof, including without
+     limitation any person's Copyright and Related Rights in the Work.
+     Further, Affirmer disclaims responsibility for obtaining any necessary
+     consents, permissions or other rights required for any use of the
+     Work.
+  d. Affirmer understands and acknowledges that Creative Commons is not a
+     party to this document and has no duty or obligation with respect to
+     this CC0 or use of the Work.
models/onnx/ailia-models/LAION-CLAP/code/README.md ADDED
@@ -0,0 +1,64 @@
+ # CLAP
+
+ Contrastive Language-Audio Pretraining, known as CLAP. The CLAP architecture mirrors the CLIP architecture, pairing an audio encoder with a text encoder trained contrastively.
+
+ ## Input
+
+ Audio file
+ ```
+ 24965__www-bonson-ca__bigdogbarking-02.wav
+ Attribution 3.0 Unported (CC BY 3.0)
+ https://freesound.org/people/www.bonson.ca/sounds/24965/
+ ```
+ ## Output
+
+ Outputs the cosine similarity between each pre-prepared text embedding and the input audio file's embedding. The higher the cosine similarity, the closer the given text and audio are in meaning.
+ ```
+ ===== cosine similarity between text and audio =====
+ cossim=0.1514, word=applause applaud clap
+ cossim=0.2942, word=The crowd is clapping.
+ cossim=0.0391, word=I love the contrastive learning
+ cossim=0.0755, word=bell
+ cossim=-0.0926, word=soccer
+ cossim=0.0309, word=open the door.
+ cossim=0.0849, word=applause
+ cossim=0.4183, word=dog
+ cossim=0.3819, word=dog barking
+ ```
+
+ ## Usage
+ Automatically downloads the onnx and prototxt files on the first run.
+ An internet connection is required while downloading.
+
+ For the sample wav:
+ ```bash
+ $ python3 clap.py
+ ```
+
+ If you want to run in onnx mode, specify the `--onnx` option as below.
+ ```bash
+ $ python3 clap.py --onnx
+ ```
+
+ You can run with another wav file by adding the `--input` option.
+ ```bash
+ $ python3 clap.py --input [wav_file]
+ ```
+
+ ## Reference
+
+ [CLAP](https://github.com/LAION-AI/CLAP)
+
+ ## Framework
+
+ Pytorch
+
+ ## Model Format
+
+ ONNX opset=11
+
+ ## Netron
+
+ [CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/clap/CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt)
+ [CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/clap/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt)
+ [CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/clap/CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt)
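The `cossim` values in the README above are plain cosine similarities between the 512-dimensional audio and text embeddings. A minimal NumPy sketch matching the `cos_sim` helper in `clap.py` below:

```python
import numpy as np

def cos_sim(v1, v2):
    # dot product normalized by the two vector lengths; range [-1, 1]
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

print(round(cos_sim(np.array([1.0, 0.0]), np.array([1.0, 1.0])), 4))  # 0.7071
```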
models/onnx/ailia-models/LAION-CLAP/code/clap.py ADDED
@@ -0,0 +1,203 @@
+ import time
+ import sys
+
+ import numpy as np
+
+ import ailia  # noqa: E402
+ sys.path.append('../../util')
+ from arg_utils import get_base_parser, update_parser  # noqa: E402
+ from model_utils import check_and_download_models, check_and_download_file  # noqa: E402
+
+ # logger
+ from logging import getLogger  # noqa: E402
+ logger = getLogger(__name__)
+
+ # for clap
+ import librosa
+ from clap_utils import *
+
+
+ # ======================
+ # Argument Parser Config
+ # ======================
+ AUDIO_PATH = 'input.wav'
+ parser = get_base_parser('CLAP', AUDIO_PATH, None)
+ parser.add_argument(
+     '--onnx',
+     action='store_true',
+     help='By default, the ailia SDK is used, but with this option, you can switch to using ONNX Runtime'
+ )
+ parser.add_argument(
+     '--disable_ailia_tokenizer',
+     action='store_true',
+     help='disable ailia tokenizer.'
+ )
+ parser.add_argument(
+     '--disable_ailia_audio',
+     action='store_true',
+     help='disable ailia audio and use librosa to get spectrogram feature'
+ )
+ args = update_parser(parser)
+
+ # ======================
+ # PARAMETERS
+ # ======================
+ CLAP_AUDIO_WEIGHT_PATH = "CLAP_audio_LAION-Audio-630K_with_fusion.onnx"
+ CLAP_AUDIO_MODEL_PATH = "CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt"
+ CLAP_TEXT_ROBERTAMODEL_WEIGHT_PATH = "CLAP_text_text_branch_RobertaModel_roberta-base.onnx"
+ CLAP_TEXT_ROBERTAMODEL_MODEL_PATH = "CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt"
+ CLAP_TEXT_PROJECTION_WEIGHT_PATH = "CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx"
+ CLAP_TEXT_PROJECTION_MODEL_PATH = "CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt"
+ REMOTE_PATH = "https://storage.googleapis.com/ailia-models/clap/"
+
+
+ # ======================
+ # Utils
+ # ======================
+ def cos_sim(v1, v2):
+     return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
+
+
+ # ======================
+ # Main function
+ # ======================
+ def infer_text(net_text_branch, net_text_projection, text_data):
+     # tokenizer
+     if args.disable_ailia_tokenizer:
+         from transformers import RobertaTokenizer
+         tokenize = RobertaTokenizer.from_pretrained('roberta-base')
+         result = tokenize(
+             text_data,
+             padding="max_length",
+             truncation=True,
+             max_length=77,
+             return_tensors="pt",
+         )
+         data = {k: v.squeeze(0) for k, v in result.items()}
+         data["input_ids"] = data["input_ids"].to('cpu').detach().numpy().copy()
+         data["attention_mask"] = data["attention_mask"].to('cpu').detach().numpy().copy()
+     else:
+         from ailia_tokenizer import RobertaTokenizer
+         tokenize = RobertaTokenizer.from_pretrained("./tokenizer/")
+         result = tokenize(
+             text_data,
+             padding="max_length",
+             truncation=True,
+             max_length=77,
+             return_tensors="np",
+         )
+         data = {k: v for k, v in result.items()}
+
+     #print("input_ids", data["input_ids"])
+     #print("attention_mask", data["attention_mask"])
+
+     # predict
+     input_data = {
+         'input_ids': data["input_ids"],
+         'attention_mask': data["attention_mask"]
+     }
+     if not args.onnx:
+         output = net_text_branch.predict(input_data)  # text_branch
+         _, x = output[0], output[1]  # last_hidden_state, pooler_output
+         text_embeds = net_text_projection.predict(x)  # projection
+     else:
+         output = net_text_branch.run(None, input_data)  # text_branch
+         _, x = output[0], output[1]  # last_hidden_state, pooler_output
+         text_embeds = net_text_projection.run(None, {'x': x})[0]  # projection
+
+     return text_embeds
+
+
+ def infer_audio(net_audio, audio_src):
+     # load the waveform of shape (T,); resampled to 48000 Hz
+     audio_waveform, sr = librosa.load(audio_src, sr=48000)
+
+     # quantize
+     audio_waveform = int16_to_float32(float32_to_int16(audio_waveform))
+
+     # get audio features
+     _, mel_fusion, _ = get_audio_features(
+         {}, audio_waveform, 480000,
+         data_truncating='fusion',
+         data_filling='repeatpad',
+         audio_cfg={
+             'audio_length': 1024,
+             'clip_samples': 480000,
+             'mel_bins': 64,
+             'sample_rate': 48000,
+             'window_size': 1024,
+             'hop_size': 480,
+             'fmin': 50,
+             'fmax': 14000,
+             'class_num': 527,
+             'model_type': 'HTSAT',
+             'model_name': 'tiny'
+         },
+         b_use_ailia=not args.disable_ailia_audio
+     )
+     input_dict = {
+         'longer': [[True]],  # An error occurs when the longer value is False.
+         'mel_fusion': mel_fusion[np.newaxis, :, :, :]
+     }
+
+     # predict
+     if not args.onnx:
+         input_dict["longer"] = np.array(input_dict["longer"])
+         audio_embed = net_audio.predict(input_dict)[0]
+     else:
+         audio_embed = net_audio.run(None, input_dict)[0]
+
+     return audio_embed
+
+
+ def main():
+     # model files check and download
+     check_and_download_models(CLAP_AUDIO_WEIGHT_PATH, CLAP_AUDIO_MODEL_PATH, REMOTE_PATH)
+     check_and_download_models(CLAP_TEXT_PROJECTION_WEIGHT_PATH, CLAP_TEXT_PROJECTION_MODEL_PATH, REMOTE_PATH)
+     check_and_download_models(CLAP_TEXT_ROBERTAMODEL_WEIGHT_PATH, CLAP_TEXT_ROBERTAMODEL_MODEL_PATH, REMOTE_PATH)
+
+     # net initialize
+     if not args.onnx:
+         net_text_branch = \
+             ailia.Net(CLAP_TEXT_ROBERTAMODEL_MODEL_PATH, CLAP_TEXT_ROBERTAMODEL_WEIGHT_PATH, env_id=args.env_id)
+         net_text_projection = \
+             ailia.Net(CLAP_TEXT_PROJECTION_MODEL_PATH, CLAP_TEXT_PROJECTION_WEIGHT_PATH, env_id=args.env_id)
+         net_audio = \
+             ailia.Net(CLAP_AUDIO_MODEL_PATH, CLAP_AUDIO_WEIGHT_PATH, env_id=args.env_id)
+     else:
+         import onnxruntime
+         net_text_branch = \
+             onnxruntime.InferenceSession(CLAP_TEXT_ROBERTAMODEL_WEIGHT_PATH)
+         net_text_projection = \
+             onnxruntime.InferenceSession(CLAP_TEXT_PROJECTION_WEIGHT_PATH)
+         net_audio = \
+             onnxruntime.InferenceSession(CLAP_AUDIO_WEIGHT_PATH)
+
+     # text predict
+     text_inputs = [
+         "applause applaud clap",
+         "The crowd is clapping.",
+         "I love the contrastive learning",
+         "bell",
+         "soccer",
+         "open the door.",
+         "applause",
+         "dog",
+         "dog barking"
+     ]
+     text_embedding = infer_text(net_text_branch, net_text_projection, text_inputs)
+
+     # audio predict
+     for audio_path in args.input:
+         audio_embedding = infer_audio(net_audio, audio_path)
+         # show result
+         print('===== cosine similarity between text and audio =====')
+         print('audio: {}'.format(audio_path))
+         for i in range(text_embedding.shape[0]):
+             print('cossim={:.04f}, word={}'.format(cos_sim(text_embedding[i], audio_embedding[0]), text_inputs[i]))
+
+     logger.info('Script finished successfully.')
+
+
+ if __name__ == "__main__":
+     main()
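`clap.py` stops at raw cosine similarities. To turn them into zero-shot class probabilities (the form the Transformers.js pipeline later in this commit reports), one can apply a temperature-scaled softmax. A sketch, assuming the scale of 100/7 ≈ 14.29 taken from `logit_scale_init_value` in the `clap-htsat-unfused` config.json included later in this commit; the learned scale in a trained checkpoint differs:

```python
import numpy as np

def zero_shot_probs(cossims, logit_scale=14.285714285714285):
    # softmax over temperature-scaled cosine similarities
    logits = logit_scale * np.asarray(cossims)
    e = np.exp(logits - logits.max())
    return e / e.sum()

# cossim values for "dog", "dog barking", "soccer" from the README output above
print(zero_shot_probs([0.4183, 0.3819, -0.0926]))  # "dog" wins; "soccer" ~0
```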
models/onnx/ailia-models/LAION-CLAP/code/clap_utils.py ADDED
@@ -0,0 +1,170 @@
+ import numpy as np
+ import librosa
+ import ailia.audio
+ from skimage.transform import resize
+
+
+ def int16_to_float32(x):
+     return (x / 32767.0).astype(np.float32)
+
+
+ def float32_to_int16(x):
+     x = np.clip(x, a_min=-1., a_max=1.)
+     return (x * 32767.).astype(np.int16)
+
+
+ def get_mel(audio_data, audio_cfg):
+     """
+     # mel shape: (n_mels, T)
+     mel_torch = torchaudio.transforms.MelSpectrogram(
+         sample_rate=audio_cfg['sample_rate'],
+         n_fft=audio_cfg['window_size'],
+         win_length=audio_cfg['window_size'],
+         hop_length=audio_cfg['hop_size'],
+         center=True,
+         pad_mode="reflect",
+         power=2.0,
+         norm=None,
+         onesided=True,
+         n_mels=64,
+         f_min=audio_cfg['fmin'],
+         f_max=audio_cfg['fmax']
+     )(audio_data)
+
+     # we use log mel spectrogram as input
+     mel_torch = torchaudio.transforms.AmplitudeToDB(top_db=None)(mel_torch)
+     mel_torch = mel_torch.T  # (T, n_mels)
+     mel_torch = mel_torch.to('cpu').detach().numpy().copy()
+     """
+
+     # Align to librosa:
+     mel_librosa = librosa.feature.melspectrogram(
+         y=audio_data,
+         sr=audio_cfg['sample_rate'],
+         n_fft=audio_cfg['window_size'],
+         hop_length=audio_cfg['hop_size'],
+         win_length=audio_cfg['window_size'],
+         center=True,
+         pad_mode="reflect",
+         power=2.0,
+         n_mels=64,
+         norm=None,
+         htk=True,
+         fmin=audio_cfg['fmin'],
+         fmax=audio_cfg['fmax']
+     )
+     mel_librosa = librosa.amplitude_to_db(mel_librosa, top_db=None)
+     mel_librosa = mel_librosa.transpose(1, 0)
+
+     return mel_librosa
+
+
+ def get_mel_ailia(audio_data, audio_cfg):
+     mel = ailia.audio.mel_spectrogram(
+         audio_data,
+         sample_rate=audio_cfg['sample_rate'],
+         fft_n=audio_cfg['window_size'],
+         hop_n=audio_cfg['hop_size'],
+         win_n=audio_cfg['window_size'],
+         win_type=1,  # hann
+         center_mode=1,
+         power=2.0,
+         fft_norm_type=None,
+         f_min=audio_cfg['fmin'],
+         f_max=audio_cfg['fmax'],
+         mel_n=64,
+         mel_norm=False,
+         htk=True
+     )
+
+     def power_to_db(S, ref=1.0, amin=1e-10, top_db=80.0):
+         S[(S >= 0) & (S < amin)] = amin
+         S[(S < 0) & (S > -amin)] = -amin
+         return 10 * np.log10(S / ref)
+
+     mel_db = power_to_db(np.square(mel), top_db=None)
+     mel_db = mel_db.transpose(1, 0)
+
+     return mel_db
+
+
+ def get_audio_features(sample, audio_data, max_len, data_truncating, data_filling, audio_cfg, b_use_ailia=False):
+     """
+     Calculate and add audio features to sample.
+     sample: a dict containing all the data of current sample.
+     audio_data: a tensor of shape (T) containing audio data.
+     max_len: the maximum length of audio data.
+     data_truncating: the method of truncating data.
+     data_filling: the method of filling data.
+     audio_cfg: a dict containing audio configuration. Comes from model_cfg['audio_cfg'].
+     """
+     mel_func = get_mel_ailia if b_use_ailia else get_mel
+     if len(audio_data) > max_len:
+         if data_truncating == "fusion":
+             # fusion
+             mel = mel_func(audio_data, audio_cfg)
+             # split to three parts
+             chunk_frames = max_len // audio_cfg['hop_size'] + 1  # the +1 is related to how the spectrogram is computed
+             total_frames = mel.shape[0]
+             if chunk_frames == total_frames:
+                 # there is a corner case where the audio length is
+                 # larger than max_len but smaller than max_len+hop_size.
+                 # In this case, we just use the whole audio.
+                 mel_fusion = np.stack([mel, mel, mel, mel], axis=0)
+                 longer = [[False]]
+             else:
+                 ranges = np.array_split(list(range(0, total_frames - chunk_frames + 1)), 3)
+                 # print('total_frames-chunk_frames:', total_frames - chunk_frames,
+                 #       'len(audio_data):', len(audio_data),
+                 #       'chunk_frames:', chunk_frames,
+                 #       'total_frames:', total_frames)
+                 if len(ranges[1]) == 0:
+                     # if the audio is too short, we just use the first chunk
+                     ranges[1] = [0]
+                 if len(ranges[2]) == 0:
+                     # if the audio is too short, we just use the first chunk
+                     ranges[2] = [0]
+                 # randomly choose index for each part
+                 idx_front = np.random.choice(ranges[0])
+                 idx_middle = np.random.choice(ranges[1])
+                 idx_back = np.random.choice(ranges[2])
+                 # select mel
+                 mel_chunk_front = mel[idx_front:idx_front + chunk_frames, :]
+                 mel_chunk_middle = mel[idx_middle:idx_middle + chunk_frames, :]
+                 mel_chunk_back = mel[idx_back:idx_back + chunk_frames, :]
+
+                 # shrink the mel
+                 # Output may differ between torchvision.transforms.Resize and skimage.transform.resize.
+                 #mel_shrink_torch = torch.from_numpy(mel[None])
+                 #mel_shrink_torch = torchvision.transforms.Resize(size=[chunk_frames, 64])(mel_shrink_torch)[0]
+                 #mel_shrink_torch = mel_shrink_torch.to('cpu').detach().numpy().copy()
+                 mel_shrink_numpy = resize(mel, (chunk_frames, 64), preserve_range=True, anti_aliasing=True, mode='edge')
+                 # logging.info(f"mel_shrink.shape: {mel_shrink.shape}")
+
+                 # stack
+                 mel_fusion = np.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back, mel_shrink_numpy], axis=0)
+                 longer = [[True]]
+         # random crop to max_len (for compatibility)
+         overflow = len(audio_data) - max_len
+         idx = np.random.randint(0, overflow + 1)
+         audio_data = audio_data[idx: idx + max_len]
+
+     else:  # padding if too short
+         if len(audio_data) < max_len:  # do nothing if equal
+             if data_filling == "repeatpad":
+                 n_repeat = int(max_len / len(audio_data))
+                 audio_data = np.tile(audio_data, n_repeat)
+                 # audio_data = audio_data.unsqueeze(0).unsqueeze(0).unsqueeze(0)
+                 # audio_data = F.interpolate(audio_data, size=max_len, mode="bicubic")[0, 0, 0]
+                 audio_data = np.pad(audio_data, [(0, max_len - len(audio_data))], "constant")
+             elif data_filling == "pad":
+                 audio_data = np.pad(audio_data, [(0, max_len - len(audio_data))], "constant")
+             elif data_filling == "repeat":
+                 n_repeat = int(max_len / len(audio_data))
+                 audio_data = np.tile(audio_data, n_repeat + 1)[:max_len]
+
+         if data_truncating == 'fusion':
+             mel = mel_func(audio_data, audio_cfg)
+             mel_fusion = np.stack([mel, mel, mel, mel], axis=0)
+             longer = [[False]]
+
+     return longer, mel_fusion, audio_data
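To make the fusion path in `get_audio_features` concrete, here is the frame arithmetic implied by the `audio_cfg` that `clap.py` passes in (a worked sketch, not part of the shipped code):

```python
# 10 s at 48 kHz is the clip budget clap.py passes as max_len
max_len, hop_size = 480000, 480
chunk_frames = max_len // hop_size + 1  # 1001 frames; the +1 is the centered STFT frame
# For a 20 s input (960000 samples), librosa yields total_frames = 960000 // 480 + 1 = 2001,
# so one 1001-frame chunk is sampled from each third of the mel, the full mel is
# resized to (1001, 64), and mel_fusion is stacked to shape (4, 1001, 64).
print(chunk_frames)  # 1001
```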
models/onnx/ailia-models/LAION-CLAP/code/input.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f20a2c45b4da238c377d048d2d5aa2e76fb5ea38c7f273553e38d28f502dce12
+ size 543122
models/onnx/ailia-models/LAION-CLAP/code/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/ailia-models/LAION-CLAP/code/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/ailia-models/LAION-CLAP/source.txt ADDED
@@ -0,0 +1,10 @@
+ https://github.com/axinc-ai/ailia-models/tree/master/audio_processing/clap
+
+ https://storage.googleapis.com/ailia-models/clap/CLAP_text_text_branch_RobertaModel_roberta-base.onnx
+ https://storage.googleapis.com/ailia-models/clap/CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/clap/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx
+ https://storage.googleapis.com/ailia-models/clap/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/clap/CLAP_audio_LAION-Audio-630K_with_fusion.onnx
+ https://storage.googleapis.com/ailia-models/clap/CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt
models/onnx/ailia-models/Microsoft-CLAP/code/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) Microsoft Corporation.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE
models/onnx/ailia-models/Microsoft-CLAP/code/README.md ADDED
@@ -0,0 +1,72 @@
+ # Microsoft CLAP
+
+ ## Input
+
+ **audio file**
+
+ Audio file in wav format to use as the model's input.
+ Default file name is [input.wav](./input.wav)
+ (source: https://freesound.org/people/InspectorJ/sounds/456440/)
+
+ **text file**
+
+ A text file containing sentences separated by new lines.
+ Default file name is [captions.txt](./captions.txt)
+
+ ## Output
+
+ **Cosine similarities**
+
+ Cosine similarity between the input audio and the sentences in the text file.
+
+ ## Usage
+ An internet connection is required when running the script for the first time, as the model files will be automatically downloaded.
+
+ Running the script computes the cosine similarities between the audio and the captions, using audio and language encoder models trained by contrastive training.
+
+ You can switch the version of the encoder model's weights (2022 or 2023) by specifying it with the argument ```-v``` or ```--version```.
+ For more information on arguments, try running ```python3 msclap.py --help```
+ ```bash
+ $ python3 msclap.py -t captions.txt -a input.wav -v 2023
+ INFO arg_utils.py (13) : Start!
+ INFO arg_utils.py (158) : env_id updated to 0
+ INFO arg_utils.py (163) : env_id: 0
+ INFO arg_utils.py (166) : CPU
+ INFO msclap.py (167) : input_text: ['Dog barking.', 'Birds whistling.', 'Car passing by.', 'Wind blowing.', 'Water flowing.', 'People talking.']
+ INFO msclap.py (170) : inference has started...
+ Similarity:
+     Birds whistling.: 0.41247469186782837
+     Wind blowing.: 0.2643369734287262
+     Water flowing.: 0.23884761333465576
+     Car passing by.: 0.22803542017936707
+     People talking.: 0.17387858033180237
+     Dog barking.: 0.11309497803449631
+ INFO msclap.py (192) : Script finished successfully.
+ ```
+
+ ## Reference
+
+ * [CLAP](https://github.com/microsoft/CLAP)
+
+ ## Framework
+
+ Pytorch
+
+ ## Model Format
+
+ ONNX opset=11
+
+ ## Netron
+
+ [caption_model_2023.onnx.prototxt]()
+
+ [audio_model_2023.onnx.prototxt]()
+
+ [caption_model_2022.onnx.prototxt]()
+
+ [audio_model_2022.onnx.prototxt]()
models/onnx/ailia-models/Microsoft-CLAP/code/captions.txt ADDED
@@ -0,0 +1,6 @@
+ Dog barking.
+ Birds whistling.
+ Car passing by.
+ Wind blowing.
+ Water flowing.
+ People talking.
models/onnx/ailia-models/Microsoft-CLAP/code/input.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2bbef488f5578b96843791d83cc7ad946c99e72c8b69bd56b351e1e0d80ac142
+ size 714238
models/onnx/ailia-models/Microsoft-CLAP/code/msclap.py ADDED
@@ -0,0 +1,270 @@
+ import sys
+ import time
+ from logging import getLogger
+ import json
+
+ import random
+
+ import librosa
+ import numpy as np
+
+ import ailia
+
+ # import original modules
+ sys.path.append('../../util')
+ from arg_utils import get_base_parser, update_parser, get_savepath  # noqa
+ from model_utils import check_and_download_models  # noqa
+
+ logger = getLogger(__name__)
+
+ # ======================
+ # Parameters
+ # ======================
+
+ CAPTION_WEIGHT_PATH_2023 = 'msclap_2023_caption.onnx'
+ AUDIO_WEIGHT_PATH_2023 = 'msclap_2023_audio.onnx'
+
+ CAPTION_MODEL_PATH_2023 = 'msclap_2023_caption.onnx.prototxt'
+ AUDIO_MODEL_PATH_2023 = 'msclap_2023_audio.onnx.prototxt'
+
+ CAPTION_WEIGHT_PATH_2022 = 'msclap_2022_caption.onnx'
+ AUDIO_WEIGHT_PATH_2022 = 'msclap_2022_audio.onnx'
+
+ CAPTION_MODEL_PATH_2022 = 'msclap_2022_caption.onnx.prototxt'
+ AUDIO_MODEL_PATH_2022 = 'msclap_2022_audio.onnx.prototxt'
+
+ REMOTE_PATH = "https://storage.googleapis.com/ailia-models/msclap/"
+
+ # ======================
+ # Argument Parser Config
+ # ======================
+
+ parser = get_base_parser(
+     'msclap', None, None
+ )
+
+ parser.add_argument(
+     "-a", "--audio", type=str,
+     default="input.wav",
+     help="Input audio file path."
+ )
+
+ parser.add_argument(
+     "-t", "--text", type=str,
+     default="captions.txt",
+     help="Input text caption file path"
+ )
+
+ parser.add_argument(
+     "-v", "--version", type=str,
+     default="2023",
+     help="Version of the CLAP model (2022 or 2023)."
+ )
+
+ parser.add_argument(
+     '-w', '--write_json',
+     action='store_true',
+     help='Flag to output results to json file.'
+ )
+ parser.add_argument(
+     '--disable_ailia_tokenizer',
+     action='store_true',
+     help='disable ailia tokenizer.'
+ )
+ args = update_parser(parser, check_input_type=False)
+
+ # ======================
+ # Helper functions
+ # ======================
+
+ def read_audio(audio_path):
+     r"""Loads an audio file and returns a numpy array plus its sample rate."""
+     audio_time_series, sample_rate = librosa.load(audio_path, sr=None)
+     return audio_time_series, sample_rate
+
+ def resample_audio(audio_time_series, sample_rate, resample_rate):
+     if resample_rate != sample_rate:
+         audio_time_series = librosa.resample(
+             audio_time_series,
+             orig_sr=sample_rate,
+             target_sr=resample_rate,
+             res_type='sinc_best'
+         )
+     return audio_time_series, resample_rate
+
+
+ def resize_audio(audio_time_series, sample_rate, audio_duration, resample=False):
+     r"""Returns raw audio resized to audio_duration seconds."""
+     # Randomly sample a segment of audio_duration from the clip or pad to match duration
+     audio_time_series = audio_time_series.reshape(-1)
+     # audio_time_series is shorter than predefined audio duration,
+     # so audio_time_series is extended
+     if audio_duration * sample_rate >= audio_time_series.shape[0]:
+         repeat_factor = int(np.ceil((audio_duration * sample_rate) /
+                                     audio_time_series.shape[0]))
+         # Repeat audio_time_series by repeat_factor to match audio_duration
+         audio_time_series = np.tile(audio_time_series, repeat_factor)
+         # remove excess part of audio_time_series
+         audio_time_series = audio_time_series[0:audio_duration * sample_rate]
+     else:
+         # audio_time_series is longer than predefined audio duration,
+         # so audio_time_series is trimmed
+         start_index = random.randrange(
+             audio_time_series.shape[0] - audio_duration * sample_rate)
+         audio_time_series = audio_time_series[start_index:start_index +
+                                               audio_duration * sample_rate]
+     return audio_time_series
+
+ def get_audio_embeddings(wav_input, sample_rate, model, version="2023"):
+     if version in ('2023', '2022'):
+         wav_input = resample_audio(wav_input, sample_rate, 44100)[0]
+         wav_input = resize_audio(wav_input, 44100, 7)[None]
+     return model['audio_model'].predict(wav_input)
+
+ def get_caption_embeddings(text_input, model, version="2023"):
+
+     # preprocessing
+     if version == '2023':
+         text_input = [t + ' <|endoftext|>' for t in text_input]
+     tokenized = dict(model['tokenizer'](text_input, padding=True, return_tensors='np'))
+
+     # inference
+     model_input = (tokenized['input_ids'], tokenized['attention_mask'])
+     return model['caption_model'].predict(model_input)[0]
+
+ def cossim(v1, v2):
+     return np.sum(v1 * v2, axis=-1) / (np.sum(v1 ** 2, axis=-1) ** 0.5 * np.sum(v2 ** 2, axis=-1) ** 0.5)
+
+ def print_sorted_dict(d):
+     m_len = max([len(k) for k in d.keys()])
+     for k, v in sorted(d.items(), key=lambda x: x[1], reverse=True):
+         pad = ' ' * (m_len - len(k) + 4)
+         print(f'{pad + k}: {v}')
+
+ def save_sorted_dict_as_json(d):
+     result = []
+     for k, v in sorted(d.items(), key=lambda x: x[1], reverse=True):
+         result.append({"caption": k, "similarity": float(v)})
+     with open('output.json', 'w', encoding='utf-8') as f:
+         json.dump(result, f, indent=2)
+
+ # ======================
+ # Main functions
+ # ======================
+
+ def inference(model, input_text, input_wav, sample_rate, version):
+     # get embeddings
+     audio_embeddings = get_audio_embeddings(input_wav, sample_rate, model, version)
+     caption_embeddings = get_caption_embeddings(input_text, model, version)
+
+     return cossim(audio_embeddings, caption_embeddings)
+
+ def estimate_best_caption(model):
+     # load inputs
+     with open(args.text, 'r') as f:
+         input_text = f.read().splitlines()
+
+     input_wav, sample_rate = read_audio(args.audio)
+     input_wav = input_wav[None]
+
+     logger.info("input_text: %s" % input_text)
+
+     # inference
+     logger.info('inference has started...')
+     if args.benchmark:
+         logger.info('BENCHMARK mode')
+         total_time_estimation = 0
+         for i in range(args.benchmark_count):
+             start = int(round(time.time() * 1000))
+             output = inference(model, input_text, input_wav, sample_rate, args.version)
+             end = int(round(time.time() * 1000))
+             estimation_time = (end - start)
+
+             # Logging
+             logger.info(f'\tailia processing estimation time {estimation_time} ms')
+             if i != 0:
+                 total_time_estimation = total_time_estimation + estimation_time
+
+         logger.info(f'\taverage time estimation {total_time_estimation / (args.benchmark_count - 1)} ms')
+     else:
+         output = inference(model, input_text, input_wav, sample_rate, args.version)
+
+     print('Similarity:')
+     print_sorted_dict(dict(zip(input_text, output)))
+
+     if args.write_json:
+         save_sorted_dict_as_json(dict(zip(input_text, output)))
+
+     logger.info('Script finished successfully.')
+
+
+ def main():
+     # model files check and download
+     if args.version == '2023':
+         check_and_download_models(
+             CAPTION_WEIGHT_PATH_2023,
+             CAPTION_MODEL_PATH_2023,
+             REMOTE_PATH
+         )
+         check_and_download_models(
+             AUDIO_WEIGHT_PATH_2023,
+             AUDIO_MODEL_PATH_2023,
+             REMOTE_PATH
+         )
+     elif args.version == '2022':
+         check_and_download_models(
+             CAPTION_WEIGHT_PATH_2022,
+             CAPTION_MODEL_PATH_2022,
+             REMOTE_PATH
+         )
+         check_and_download_models(
+             AUDIO_WEIGHT_PATH_2022,
+             AUDIO_MODEL_PATH_2022,
+             REMOTE_PATH
+         )
+
+     env_id = args.env_id
+
+     # disable FP16
+     if "FP16" in ailia.get_environment(args.env_id).props or sys.platform == 'Darwin':
+         logger.warning('This model does not work on FP16, so CPU mode is used.')
+         env_id = 0
+
+     # initialize
+     if args.version == '2023':
+         caption_model = ailia.Net(CAPTION_MODEL_PATH_2023, CAPTION_WEIGHT_PATH_2023, env_id=env_id)
+         audio_model = ailia.Net(AUDIO_MODEL_PATH_2023, AUDIO_WEIGHT_PATH_2023, env_id=env_id)
+         if args.disable_ailia_tokenizer:
+             from transformers import AutoTokenizer
+             tokenizer = AutoTokenizer.from_pretrained('gpt2')
+             tokenizer.add_special_tokens({'pad_token': '!'})
+         else:
+             from ailia_tokenizer import GPT2Tokenizer
+             tokenizer = GPT2Tokenizer.from_pretrained('./tokenizer_gpt2/')
+             tokenizer.add_special_tokens({'pad_token': '!'})
+             #tokenizer._pad_token_id = 0
+     elif args.version == '2022':
+         caption_model = ailia.Net(CAPTION_MODEL_PATH_2022, CAPTION_WEIGHT_PATH_2022, env_id=env_id)
+         audio_model = ailia.Net(AUDIO_MODEL_PATH_2022, AUDIO_WEIGHT_PATH_2022, env_id=env_id)
+         if args.disable_ailia_tokenizer:
+             from transformers import AutoTokenizer
+             tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+         else:
+             from ailia_tokenizer import BertTokenizer
+             tokenizer = BertTokenizer.from_pretrained('./tokenizer_bert/')
+
+     model = {
+         'caption_model': caption_model,
+         'audio_model': audio_model,
+         'tokenizer': tokenizer
+     }
+
+     estimate_best_caption(model)
+
+ if __name__ == '__main__':
+     main()
models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_bert/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "model_max_length": 512}
models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_bert/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"model_max_length": 1024}
models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/ailia-models/Microsoft-CLAP/source.txt ADDED
@@ -0,0 +1 @@
+ https://github.com/axinc-ai/ailia-models/tree/master/audio_processing/msclap
models/onnx/clap-htsat-unfused (Xenova)/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/onnx/clap-htsat-unfused (Xenova)/README.md ADDED
@@ -0,0 +1,79 @@
+ ---
+ base_model: laion/clap-htsat-unfused
+ library_name: transformers.js
+ tags:
+ - zero-shot-audio-classification
+ ---
+
+ https://huggingface.co/laion/clap-htsat-unfused with ONNX weights to be compatible with Transformers.js.
+
+ ## Usage (Transformers.js)
+
+ If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@xenova/transformers) using:
+ ```bash
+ npm i @xenova/transformers
+ ```
+
+ **Example:** Perform zero-shot audio classification with `Xenova/clap-htsat-unfused`.
+ ```js
+ import { pipeline } from '@xenova/transformers';
+
+ const classifier = await pipeline('zero-shot-audio-classification', 'Xenova/clap-htsat-unfused');
+
+ const audio = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/dog_barking.wav';
+ const candidate_labels = ['dog', 'vacuum cleaner'];
+ const scores = await classifier(audio, candidate_labels);
+ // [
+ //   { score: 0.9993992447853088, label: 'dog' },
+ //   { score: 0.0006007603369653225, label: 'vacuum cleaner' }
+ // ]
+ ```
+
+ **Example:** Compute text embeddings with `ClapTextModelWithProjection`.
+
+ ```js
+ import { AutoTokenizer, ClapTextModelWithProjection } from '@xenova/transformers';
+
+ // Load tokenizer and text model
+ const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clap-htsat-unfused');
+ const text_model = await ClapTextModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused');
+
+ // Run tokenization
+ const texts = ['a sound of a cat', 'a sound of a dog'];
+ const text_inputs = tokenizer(texts, { padding: true, truncation: true });
+
+ // Compute embeddings
+ const { text_embeds } = await text_model(text_inputs);
+ // Tensor {
+ //   dims: [ 2, 512 ],
+ //   type: 'float32',
+ //   data: Float32Array(1024) [ ... ],
+ //   size: 1024
+ // }
+ ```
+
+ **Example:** Compute audio embeddings with `ClapAudioModelWithProjection`.
+ ```js
+ import { AutoProcessor, ClapAudioModelWithProjection, read_audio } from '@xenova/transformers';
+
+ // Load processor and audio model
+ const processor = await AutoProcessor.from_pretrained('Xenova/clap-htsat-unfused');
+ const audio_model = await ClapAudioModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused');
+
+ // Read audio and run processor
+ const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cat_meow.wav');
+ const audio_inputs = await processor(audio);
+
+ // Compute embeddings
+ const { audio_embeds } = await audio_model(audio_inputs);
+ // Tensor {
+ //   dims: [ 1, 512 ],
+ //   type: 'float32',
+ //   data: Float32Array(512) [ ... ],
+ //   size: 512
+ // }
+ ```
+
+ ---
+
+ Note: Having a separate repo for ONNX weights is intended to be a temporary solution until WebML gains more traction. If you would like to make your models web-ready, we recommend converting to ONNX using [🤗 Optimum](https://huggingface.co/docs/optimum/index) and structuring your repo like this one (with ONNX weights located in a subfolder named `onnx`).
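The closing note above recommends 🤗 Optimum for the conversion. A minimal sketch of how that export can be scripted, assuming Optimum's `main_export` helper (the `optimum-cli export onnx` command is the equivalent CLI); the fp16 and quantized variants in this repo's `onnx/` folder are produced in separate post-processing steps:

```python
# Sketch: export laion/clap-htsat-unfused to ONNX with Hugging Face Optimum.
# Assumes `pip install optimum[exporters]`; API details may vary by version.
from optimum.exporters.onnx import main_export

main_export(
    "laion/clap-htsat-unfused",  # source PyTorch checkpoint on the Hub
    output="clap_onnx",          # directory that receives the .onnx files
)
```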
models/onnx/clap-htsat-unfused (Xenova)/config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "_name_or_path": "laion/clap-htsat-unfused",
+   "architectures": [
+     "ClapModel"
+   ],
+   "audio_config": {
+     "fusion_num_hidden_layers": 2,
+     "model_type": "clap_audio_model",
+     "projection_hidden_size": 768
+   },
+   "hidden_size": 768,
+   "initializer_factor": 1.0,
+   "logit_scale_init_value": 14.285714285714285,
+   "model_type": "clap",
+   "num_hidden_layers": 16,
+   "projection_dim": 512,
+   "projection_hidden_act": "relu",
+   "text_config": {
+     "classifier_dropout": null,
+     "fusion_hidden_size": 768,
+     "fusion_num_hidden_layers": 2,
+     "initializer_range": 0.02,
+     "model_type": "clap_text_model",
+     "projection_hidden_size": 768
+   },
+   "transformers_version": "4.36.0.dev0"
+ }
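Note that `projection_dim` is 512, matching the 512-dimensional `text_embeds` and `audio_embeds` tensors shown in the usage examples above.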
models/onnx/clap-htsat-unfused (Xenova)/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1c2b43c44f71e0fa841a4b86700886c199bf87699ea45632c4d831bc6c88957
+ size 117528416
models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model_fp16.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:65963cfa5e903e0a8df475137252d618663df88ecf0b55a3fb327e6c1ca63a97
+ size 60414065
models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3fcff2c8824e7bcb83a983f2a49edab3b60cbcf4872ac70efee517355173bd1f
+ size 34301667
models/onnx/clap-htsat-unfused (Xenova)/onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8cbfb96dda10259964e678c1557466f925001f66b8cd1b24c84bd88b0f84345
+ size 619128635
models/onnx/clap-htsat-unfused (Xenova)/onnx/model_fp16.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ec27913c473d8ce6367cc40376c27b79899248009d8f6f2ad62110abdcf6124
+ size 311602756
models/onnx/clap-htsat-unfused (Xenova)/onnx/model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f559313ec268518193101007fc0569ee4b2cfac03e369091c50adb4795f5c5d
+ size 161024085
models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6df7c51f2d78e236a03f2b816e613d538c815639d13644fe7c2124f439da9648
+ size 501513769
models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model_fp16.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c1fd1e4cdd02acbcacaafe7a8e608dcfc8f84a00bfea3e2ca7df710957b4c3a5
+ size 251029088
models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a3df8b197e249816e08415fd040434c44762b2eea7eb7bf8a48a0f0bf3c14e5
+ size 126603263
models/onnx/clap-htsat-unfused (Xenova)/preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "chunk_length_s": 10,
+   "feature_extractor_type": "ClapFeatureExtractor",
+   "feature_size": 64,
+   "fft_window_size": 1024,
+   "frequency_max": 14000,
+   "frequency_min": 50,
+   "hop_length": 480,
+   "max_length_s": 10,
+   "n_fft": 1024,
+   "nb_frequency_bins": 513,
+   "nb_max_frames": 1000,
+   "nb_max_samples": 480000,
+   "padding": "repeatpad",
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "processor_class": "ClapProcessor",
+   "return_attention_mask": false,
+   "sampling_rate": 48000,
+   "top_db": null,
+   "truncation": "rand_trunc"
+ }
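The feature-extraction values above are mutually consistent: at the 48 kHz `sampling_rate`, a `max_length_s` of 10 seconds gives `nb_max_samples` = 48000 × 10 = 480000, and with `hop_length` = 480 this yields `nb_max_frames` = 480000 / 480 = 1000 spectrogram frames per clip.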
models/onnx/clap-htsat-unfused (Xenova)/quantize_config.json ADDED
@@ -0,0 +1,122 @@
+ {
+   "per_channel": true,
+   "reduce_range": true,
+   "per_model_config": {
+     "model": {
+       "op_types": [
+         "Expand",
+         "ScatterND",
+         "Pad",
+         "Abs",
+         "Unsqueeze",
+         "ReduceSum",
+         "Not",
+         "CumSum",
+         "Constant",
+         "Exp",
+         "Sub",
+         "MatMul",
+         "Cast",
+         "Reshape",
+         "Flatten",
+         "Resize",
+         "Conv",
+         "ConstantOfShape",
+         "Gather",
+         "Relu",
+         "Div",
+         "Mul",
+         "GlobalAveragePool",
+         "Range",
+         "Erf",
+         "Where",
+         "ReduceMean",
+         "Pow",
+         "Shape",
+         "Concat",
+         "Slice",
+         "Softmax",
+         "Tanh",
+         "Sqrt",
+         "BatchNormalization",
+         "Add",
+         "Transpose",
+         "Gemm",
+         "Equal"
+       ],
+       "weight_type": "QUInt8"
+     },
+     "text_model": {
+       "op_types": [
+         "ReduceMean",
+         "Reshape",
+         "Softmax",
+         "Pow",
+         "Erf",
+         "Tanh",
+         "Concat",
+         "Sub",
+         "Not",
+         "Expand",
+         "Mul",
+         "Transpose",
+         "Div",
+         "Constant",
+         "Equal",
+         "Unsqueeze",
+         "Slice",
+         "MatMul",
+         "Gather",
+         "ConstantOfShape",
+         "Shape",
+         "Cast",
+         "Where",
+         "Sqrt",
+         "Add",
+         "Gemm",
+         "CumSum",
+         "Relu"
+       ],
+       "weight_type": "QInt8"
+     },
+     "audio_model": {
+       "op_types": [
+         "BatchNormalization",
+         "ScatterND",
+         "ReduceMean",
+         "Reshape",
+         "Softmax",
+         "Pow",
+         "Erf",
+         "GlobalAveragePool",
+         "Concat",
+         "Sub",
+         "Not",
+         "Expand",
+         "Mul",
+         "Transpose",
+         "Div",
+         "Constant",
+         "Equal",
+         "Unsqueeze",
+         "Pad",
+         "Slice",
+         "Resize",
+         "Range",
+         "MatMul",
+         "Gather",
+         "ConstantOfShape",
+         "Shape",
+         "Cast",
+         "Sqrt",
+         "Where",
+         "Add",
+         "Conv",
+         "Flatten",
+         "Gemm",
+         "Relu"
+       ],
+       "weight_type": "QUInt8"
+     }
+   }
+ }
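These settings record the ONNX Runtime dynamic-quantization configuration for the `*_quantized.onnx` files above: weights are quantized per channel with `reduce_range` enabled, to unsigned 8-bit integers (`QUInt8`) for the combined and audio models and signed 8-bit integers (`QInt8`) for the text model.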
models/onnx/clap-htsat-unfused (Xenova)/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/Xenova/clap-htsat-unfused
models/onnx/clap-htsat-unfused (Xenova)/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "bos_token": "<s>",
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "unk_token": "<unk>"
+ }
models/onnx/clap-htsat-unfused (Xenova)/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/clap-htsat-unfused (Xenova)/tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50264": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "errors": "replace",
+   "mask_token": "<mask>",
+   "max_length": null,
+   "model_max_length": 512,
+   "pad_to_multiple_of": null,
+   "pad_token": "<pad>",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "processor_class": "ClapProcessor",
+   "sep_token": "</s>",
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "trust_remote_code": false,
+   "unk_token": "<unk>"
+ }
models/onnx/clap-htsat-unfused (Xenova)/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/larger_clap_general (Xenova)/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/onnx/larger_clap_general (Xenova)/README.md ADDED
@@ -0,0 +1,80 @@
+ ---
+ base_model: laion/larger_clap_general
+ library_name: transformers.js
+ tags:
+ - zero-shot-audio-classification
+ ---
+
+ https://huggingface.co/laion/larger_clap_general with ONNX weights to be compatible with Transformers.js.
+
+ ## Usage (Transformers.js)
+
+ If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@xenova/transformers) using:
+ ```bash
+ npm i @xenova/transformers
+ ```
+
+ **Example:** Perform zero-shot audio classification with `Xenova/larger_clap_general`.
+ ```js
+ import { pipeline } from '@xenova/transformers';
+
+ const classifier = await pipeline('zero-shot-audio-classification', 'Xenova/larger_clap_general');
+
+ const audio = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/piano.wav';
+ const candidate_labels = ['calm piano music', 'heavy metal music'];
+ const scores = await classifier(audio, candidate_labels);
+ // [
+ //   { score: 0.9829504489898682, label: 'calm piano music' },
+ //   { score: 0.017049523070454597, label: 'heavy metal music' }
+ // ]
+ ```
+
+ **Example:** Compute text embeddings with `ClapTextModelWithProjection`.
+
+ ```js
+ import { AutoTokenizer, ClapTextModelWithProjection } from '@xenova/transformers';
+
+ // Load tokenizer and text model
+ const tokenizer = await AutoTokenizer.from_pretrained('Xenova/larger_clap_general');
+ const text_model = await ClapTextModelWithProjection.from_pretrained('Xenova/larger_clap_general');
+
+ // Run tokenization
+ const texts = ['calm piano music', 'heavy metal music'];
+ const text_inputs = tokenizer(texts, { padding: true, truncation: true });
+
+ // Compute embeddings
+ const { text_embeds } = await text_model(text_inputs);
+ // Tensor {
+ //   dims: [ 2, 512 ],
+ //   type: 'float32',
+ //   data: Float32Array(1024) [ ... ],
+ //   size: 1024
+ // }
+ ```
+
+ **Example:** Compute audio embeddings with `ClapAudioModelWithProjection`.
+ ```js
+ import { AutoProcessor, ClapAudioModelWithProjection, read_audio } from '@xenova/transformers';
+
+ // Load processor and audio model
+ const processor = await AutoProcessor.from_pretrained('Xenova/larger_clap_general');
+ const audio_model = await ClapAudioModelWithProjection.from_pretrained('Xenova/larger_clap_general');
+
+ // Read audio and run processor
+ const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/piano.wav');
+ const audio_inputs = await processor(audio);
+
+ // Compute embeddings
+ const { audio_embeds } = await audio_model(audio_inputs);
+ // Tensor {
+ //   dims: [ 1, 512 ],
+ //   type: 'float32',
+ //   data: Float32Array(512) [ ... ],
+ //   size: 512
+ // }
+ ```
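+
+ **Example:** Turn the embeddings into zero-shot scores (a minimal sketch, assuming the `text_embeds` and `audio_embeds` tensors from the examples above; the full `ClapModel` additionally applies a learned logit scale before the softmax, so the pipeline's scores will differ slightly).
+ ```js
+ import { cos_sim } from '@xenova/transformers';
+
+ // Cosine similarity between each text embedding and the audio embedding
+ const [num_texts, dim] = text_embeds.dims;
+ const sims = Array.from({ length: num_texts }, (_, i) =>
+     cos_sim(text_embeds.data.slice(i * dim, (i + 1) * dim), audio_embeds.data));
+
+ // Softmax over the similarities to get per-label probabilities
+ const exps = sims.map(Math.exp);
+ const sum = exps.reduce((a, b) => a + b, 0);
+ console.log(texts.map((t, i) => ({ label: t, score: exps[i] / sum })));
+ ```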
+
+ ---
+
+ Note: Having a separate repo for ONNX weights is intended to be a temporary solution until WebML gains more traction. If you would like to make your models web-ready, we recommend converting to ONNX using [🤗 Optimum](https://huggingface.co/docs/optimum/index) and structuring your repo like this one (with ONNX weights located in a subfolder named `onnx`).