niobures committed
Commit f8ffad7 · verified · Parent(s): 87d6098

CLAP (code, models, paper)

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +3 -0
  2. Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation.pdf +3 -0
  3. code/CLAP.zip +3 -0
  4. models/onnx/ailia-models/LAION-CLAP/CLAP_audio_LAION-Audio-630K_with_fusion.onnx +3 -0
  5. models/onnx/ailia-models/LAION-CLAP/CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt +0 -0
  6. models/onnx/ailia-models/LAION-CLAP/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx +3 -0
  7. models/onnx/ailia-models/LAION-CLAP/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt +173 -0
  8. models/onnx/ailia-models/LAION-CLAP/CLAP_text_text_branch_RobertaModel_roberta-base.onnx +3 -0
  9. models/onnx/ailia-models/LAION-CLAP/CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt +0 -0
  10. models/onnx/ailia-models/LAION-CLAP/code/LICENSE +121 -0
  11. models/onnx/ailia-models/LAION-CLAP/code/README.md +64 -0
  12. models/onnx/ailia-models/LAION-CLAP/code/clap.py +203 -0
  13. models/onnx/ailia-models/LAION-CLAP/code/clap_utils.py +170 -0
  14. models/onnx/ailia-models/LAION-CLAP/code/input.wav +3 -0
  15. models/onnx/ailia-models/LAION-CLAP/code/tokenizer/merges.txt +0 -0
  16. models/onnx/ailia-models/LAION-CLAP/code/tokenizer/vocab.json +0 -0
  17. models/onnx/ailia-models/LAION-CLAP/source.txt +10 -0
  18. models/onnx/ailia-models/Microsoft-CLAP/code/LICENSE +21 -0
  19. models/onnx/ailia-models/Microsoft-CLAP/code/README.md +72 -0
  20. models/onnx/ailia-models/Microsoft-CLAP/code/captions.txt +6 -0
  21. models/onnx/ailia-models/Microsoft-CLAP/code/input.wav +3 -0
  22. models/onnx/ailia-models/Microsoft-CLAP/code/msclap.py +270 -0
  23. models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_bert/tokenizer_config.json +1 -0
  24. models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_bert/vocab.txt +0 -0
  25. models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/merges.txt +0 -0
  26. models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/tokenizer_config.json +1 -0
  27. models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/vocab.json +0 -0
  28. models/onnx/ailia-models/Microsoft-CLAP/source.txt +1 -0
  29. models/onnx/clap-htsat-unfused (Xenova)/.gitattributes +35 -0
  30. models/onnx/clap-htsat-unfused (Xenova)/README.md +79 -0
  31. models/onnx/clap-htsat-unfused (Xenova)/config.json +27 -0
  32. models/onnx/clap-htsat-unfused (Xenova)/merges.txt +0 -0
  33. models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model.onnx +3 -0
  34. models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model_fp16.onnx +3 -0
  35. models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model_quantized.onnx +3 -0
  36. models/onnx/clap-htsat-unfused (Xenova)/onnx/model.onnx +3 -0
  37. models/onnx/clap-htsat-unfused (Xenova)/onnx/model_fp16.onnx +3 -0
  38. models/onnx/clap-htsat-unfused (Xenova)/onnx/model_quantized.onnx +3 -0
  39. models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model.onnx +3 -0
  40. models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model_fp16.onnx +3 -0
  41. models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model_quantized.onnx +3 -0
  42. models/onnx/clap-htsat-unfused (Xenova)/preprocessor_config.json +22 -0
  43. models/onnx/clap-htsat-unfused (Xenova)/quantize_config.json +122 -0
  44. models/onnx/clap-htsat-unfused (Xenova)/source.txt +1 -0
  45. models/onnx/clap-htsat-unfused (Xenova)/special_tokens_map.json +15 -0
  46. models/onnx/clap-htsat-unfused (Xenova)/tokenizer.json +0 -0
  47. models/onnx/clap-htsat-unfused (Xenova)/tokenizer_config.json +63 -0
  48. models/onnx/clap-htsat-unfused (Xenova)/vocab.json +0 -0
  49. models/onnx/larger_clap_general (Xenova)/.gitattributes +35 -0
  50. models/onnx/larger_clap_general (Xenova)/README.md +80 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Large-scale[[:space:]]Contrastive[[:space:]]Language-Audio[[:space:]]Pretraining[[:space:]]with[[:space:]]Feature[[:space:]]Fusion[[:space:]]and[[:space:]]Keyword-to-Caption[[:space:]]Augmentation.pdf filter=lfs diff=lfs merge=lfs -text
+ models/onnx/ailia-models/LAION-CLAP/code/input.wav filter=lfs diff=lfs merge=lfs -text
+ models/onnx/ailia-models/Microsoft-CLAP/code/input.wav filter=lfs diff=lfs merge=lfs -text
Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c223105503d6f5c173479b84bcc6648c0df9f12a8493492617616c75049e7d31
+ size 695271
code/CLAP.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0e5ae04cc75b3acc7568a968365c28eebaba7e148a77a801e48a0e480595b30d
+ size 11947642
models/onnx/ailia-models/LAION-CLAP/CLAP_audio_LAION-Audio-630K_with_fusion.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9901a76d952d2a9be4d8e0a1e790bde6a86867f382c6aa991c4c6fcdfde0afeb
+ size 117413819
models/onnx/ailia-models/LAION-CLAP/CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/ailia-models/LAION-CLAP/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c88525ca12a6e42b62ede1a086340411abd65bb7305dc963301eb1647825150
+ size 2626946
models/onnx/ailia-models/LAION-CLAP/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt ADDED
@@ -0,0 +1,173 @@
+ ir_version: 6
+ producer_name: "pytorch"
+ producer_version: "1.13.0"
+ model_version: 0
+ graph {
+   name: "torch_jit"
+   node {
+     input: "x"
+     input: "text_projection.0.weight"
+     input: "text_projection.0.bias"
+     output: "/text_projection/text_projection.0/Gemm_output_0"
+     name: "/text_projection/text_projection.0/Gemm"
+     op_type: "Gemm"
+     attribute {
+       name: "alpha"
+       f: 1.0
+       type: FLOAT
+     }
+     attribute {
+       name: "beta"
+       f: 1.0
+       type: FLOAT
+     }
+     attribute {
+       name: "transB"
+       i: 1
+       type: INT
+     }
+   }
+   node {
+     input: "/text_projection/text_projection.0/Gemm_output_0"
+     output: "/text_projection/text_projection.1/Relu_output_0"
+     name: "/text_projection/text_projection.1/Relu"
+     op_type: "Relu"
+   }
+   node {
+     input: "/text_projection/text_projection.1/Relu_output_0"
+     input: "text_projection.2.weight"
+     input: "text_projection.2.bias"
+     output: "/text_projection/text_projection.2/Gemm_output_0"
+     name: "/text_projection/text_projection.2/Gemm"
+     op_type: "Gemm"
+     attribute {
+       name: "alpha"
+       f: 1.0
+       type: FLOAT
+     }
+     attribute {
+       name: "beta"
+       f: 1.0
+       type: FLOAT
+     }
+     attribute {
+       name: "transB"
+       i: 1
+       type: INT
+     }
+   }
+   node {
+     input: "/text_projection/text_projection.2/Gemm_output_0"
+     output: "/ReduceL2_output_0"
+     name: "/ReduceL2"
+     op_type: "ReduceL2"
+     attribute {
+       name: "axes"
+       ints: -1
+       type: INTS
+     }
+     attribute {
+       name: "keepdims"
+       i: 1
+       type: INT
+     }
+   }
+   node {
+     output: "/Constant_output_0"
+     name: "/Constant"
+     op_type: "Constant"
+     attribute {
+       name: "value"
+       t {
+         data_type: 1
+         raw_data: "\314\274\214+"
+       }
+       type: TENSOR
+     }
+   }
+   node {
+     input: "/ReduceL2_output_0"
+     input: "/Constant_output_0"
+     input: ""
+     output: "/Clip_output_0"
+     name: "/Clip"
+     op_type: "Clip"
+   }
+   node {
+     input: "/text_projection/text_projection.2/Gemm_output_0"
+     output: "/Shape_output_0"
+     name: "/Shape"
+     op_type: "Shape"
+   }
+   node {
+     input: "/Clip_output_0"
+     input: "/Shape_output_0"
+     output: "/Expand_output_0"
+     name: "/Expand"
+     op_type: "Expand"
+   }
+   node {
+     input: "/text_projection/text_projection.2/Gemm_output_0"
+     input: "/Expand_output_0"
+     output: "text_embed"
+     name: "/Div"
+     op_type: "Div"
+   }
+   initializer {
+     dims: 512
+     dims: 768
+     data_type: 1
+     name: "text_projection.0.weight"
+   }
+   initializer {
+     dims: 512
+     data_type: 1
+     name: "text_projection.0.bias"
+   }
+   initializer {
+     dims: 512
+     dims: 512
+     data_type: 1
+     name: "text_projection.2.weight"
+   }
+   initializer {
+     dims: 512
+     data_type: 1
+     name: "text_projection.2.bias"
+   }
+   input {
+     name: "x"
+     type {
+       tensor_type {
+         elem_type: 1
+         shape {
+           dim {
+             dim_param: "batch_size"
+           }
+           dim {
+             dim_value: 768
+           }
+         }
+       }
+     }
+   }
+   output {
+     name: "text_embed"
+     type {
+       tensor_type {
+         elem_type: 1
+         shape {
+           dim {
+             dim_param: "Divtext_embed_dim_0"
+           }
+           dim {
+             dim_param: "Divtext_embed_dim_1"
+           }
+         }
+       }
+     }
+   }
+ }
+ opset_import {
+   version: 11
+ }
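The graph above is simply a two-layer MLP head followed by L2 normalization: Gemm (768→512, transB=1), Relu, Gemm (512→512), then ReduceL2 → Clip → Expand → Div to unit-normalize. The Constant feeding Clip appears to decode to ≈1e-12, the usual epsilon floor. A minimal NumPy sketch of the same forward pass, with hypothetical random weights standing in for the .onnx initializers:

```python
import numpy as np

# Hypothetical weights standing in for the .onnx initializers.
rng = np.random.default_rng(0)
W0 = rng.standard_normal((512, 768)).astype(np.float32)  # text_projection.0.weight
b0 = np.zeros(512, dtype=np.float32)                     # text_projection.0.bias
W2 = rng.standard_normal((512, 512)).astype(np.float32)  # text_projection.2.weight
b2 = np.zeros(512, dtype=np.float32)                     # text_projection.2.bias

def text_projection(x):
    """x: (batch, 768) pooled RoBERTa output -> (batch, 512) unit-norm text_embed."""
    h = np.maximum(x @ W0.T + b0, 0.0)  # Gemm (transB=1) + Relu
    y = h @ W2.T + b2                   # second Gemm
    # ReduceL2 -> Clip (min ~1e-12) -> Expand -> Div
    norm = np.clip(np.linalg.norm(y, axis=-1, keepdims=True), 1e-12, None)
    return y / norm

emb = text_projection(rng.standard_normal((2, 768)).astype(np.float32))
print(emb.shape, np.linalg.norm(emb, axis=-1))  # (2, 512), norms ~[1. 1.]
```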
models/onnx/ailia-models/LAION-CLAP/CLAP_text_text_branch_RobertaModel_roberta-base.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1ae0bea2a8147289e8d23fb2d25af3a7fdccdaa7730e69c4c52f68c9428749ee
+ size 498829921
models/onnx/ailia-models/LAION-CLAP/CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/ailia-models/LAION-CLAP/code/LICENSE ADDED
@@ -0,0 +1,121 @@
+ Creative Commons Legal Code
+
+ CC0 1.0 Universal
+
+ CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+ LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+ ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+ INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+ REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+ PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+ THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+ HEREUNDER.
+
+ Statement of Purpose
+
+ The laws of most jurisdictions throughout the world automatically confer
+ exclusive Copyright and Related Rights (defined below) upon the creator
+ and subsequent owner(s) (each and all, an "owner") of an original work of
+ authorship and/or a database (each, a "Work").
+
+ Certain owners wish to permanently relinquish those rights to a Work for
+ the purpose of contributing to a commons of creative, cultural and
+ scientific works ("Commons") that the public can reliably and without fear
+ of later claims of infringement build upon, modify, incorporate in other
+ works, reuse and redistribute as freely as possible in any form whatsoever
+ and for any purposes, including without limitation commercial purposes.
+ These owners may contribute to the Commons to promote the ideal of a free
+ culture and the further production of creative, cultural and scientific
+ works, or to gain reputation or greater distribution for their Work in
+ part through the use and efforts of others.
+
+ For these and/or other purposes and motivations, and without any
+ expectation of additional consideration or compensation, the person
+ associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+ is an owner of Copyright and Related Rights in the Work, voluntarily
+ elects to apply CC0 to the Work and publicly distribute the Work under its
+ terms, with knowledge of his or her Copyright and Related Rights in the
+ Work and the meaning and intended legal effect of CC0 on those rights.
+
+ 1. Copyright and Related Rights. A Work made available under CC0 may be
+ protected by copyright and related or neighboring rights ("Copyright and
+ Related Rights"). Copyright and Related Rights include, but are not
+ limited to, the following:
+
+   i. the right to reproduce, adapt, distribute, perform, display,
+      communicate, and translate a Work;
+  ii. moral rights retained by the original author(s) and/or performer(s);
+ iii. publicity and privacy rights pertaining to a person's image or
+      likeness depicted in a Work;
+  iv. rights protecting against unfair competition in regards to a Work,
+      subject to the limitations in paragraph 4(a), below;
+   v. rights protecting the extraction, dissemination, use and reuse of data
+      in a Work;
+  vi. database rights (such as those arising under Directive 96/9/EC of the
+      European Parliament and of the Council of 11 March 1996 on the legal
+      protection of databases, and under any national implementation
+      thereof, including any amended or successor version of such
+      directive); and
+ vii. other similar, equivalent or corresponding rights throughout the
+      world based on applicable law or treaty, and any national
+      implementations thereof.
+
+ 2. Waiver. To the greatest extent permitted by, but not in contravention
+ of, applicable law, Affirmer hereby overtly, fully, permanently,
+ irrevocably and unconditionally waives, abandons, and surrenders all of
+ Affirmer's Copyright and Related Rights and associated claims and causes
+ of action, whether now known or unknown (including existing as well as
+ future claims and causes of action), in the Work (i) in all territories
+ worldwide, (ii) for the maximum duration provided by applicable law or
+ treaty (including future time extensions), (iii) in any current or future
+ medium and for any number of copies, and (iv) for any purpose whatsoever,
+ including without limitation commercial, advertising or promotional
+ purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+ member of the public at large and to the detriment of Affirmer's heirs and
+ successors, fully intending that such Waiver shall not be subject to
+ revocation, rescission, cancellation, termination, or any other legal or
+ equitable action to disrupt the quiet enjoyment of the Work by the public
+ as contemplated by Affirmer's express Statement of Purpose.
+
+ 3. Public License Fallback. Should any part of the Waiver for any reason
+ be judged legally invalid or ineffective under applicable law, then the
+ Waiver shall be preserved to the maximum extent permitted taking into
+ account Affirmer's express Statement of Purpose. In addition, to the
+ extent the Waiver is so judged Affirmer hereby grants to each affected
+ person a royalty-free, non transferable, non sublicensable, non exclusive,
+ irrevocable and unconditional license to exercise Affirmer's Copyright and
+ Related Rights in the Work (i) in all territories worldwide, (ii) for the
+ maximum duration provided by applicable law or treaty (including future
+ time extensions), (iii) in any current or future medium and for any number
+ of copies, and (iv) for any purpose whatsoever, including without
+ limitation commercial, advertising or promotional purposes (the
+ "License"). The License shall be deemed effective as of the date CC0 was
+ applied by Affirmer to the Work. Should any part of the License for any
+ reason be judged legally invalid or ineffective under applicable law, such
+ partial invalidity or ineffectiveness shall not invalidate the remainder
+ of the License, and in such case Affirmer hereby affirms that he or she
+ will not (i) exercise any of his or her remaining Copyright and Related
+ Rights in the Work or (ii) assert any associated claims and causes of
+ action with respect to the Work, in either case contrary to Affirmer's
+ express Statement of Purpose.
+
+ 4. Limitations and Disclaimers.
+
+  a. No trademark or patent rights held by Affirmer are waived, abandoned,
+     surrendered, licensed or otherwise affected by this document.
+  b. Affirmer offers the Work as-is and makes no representations or
+     warranties of any kind concerning the Work, express, implied,
+     statutory or otherwise, including without limitation warranties of
+     title, merchantability, fitness for a particular purpose, non
+     infringement, or the absence of latent or other defects, accuracy, or
+     the present or absence of errors, whether or not discoverable, all to
+     the greatest extent permissible under applicable law.
+  c. Affirmer disclaims responsibility for clearing rights of other persons
+     that may apply to the Work or any use thereof, including without
+     limitation any person's Copyright and Related Rights in the Work.
+     Further, Affirmer disclaims responsibility for obtaining any necessary
+     consents, permissions or other rights required for any use of the
+     Work.
+  d. Affirmer understands and acknowledges that Creative Commons is not a
+     party to this document and has no duty or obligation with respect to
+     this CC0 or use of the Work.
models/onnx/ailia-models/LAION-CLAP/code/README.md ADDED
@@ -0,0 +1,64 @@
+ # CLAP
+
+ Contrastive Language-Audio Pretraining, known as CLAP. The CLAP architecture mirrors the CLIP architecture, pairing an audio encoder with a text encoder trained contrastively.
+
+ ## Input
+
+ Audio file
+ ```
+ 24965__www-bonson-ca__bigdogbarking-02.wav
+ Attribution 3.0 Unported (CC BY 3.0)
+ https://freesound.org/people/www.bonson.ca/sounds/24965/
+ ```
+ ## Output
+
+ Outputs the cosine similarity between each pre-prepared text embedding and the input audio file's embedding. The higher the cosine similarity, the closer the given text and audio are in meaning.
+ ```
+ ===== cosine similarity between text and audio =====
+ cossim=0.1514, word=applause applaud clap
+ cossim=0.2942, word=The crowd is clapping.
+ cossim=0.0391, word=I love the contrastive learning
+ cossim=0.0755, word=bell
+ cossim=-0.0926, word=soccer
+ cossim=0.0309, word=open the door.
+ cossim=0.0849, word=applause
+ cossim=0.4183, word=dog
+ cossim=0.3819, word=dog barking
+ ```
+
+ ## Usage
+ Automatically downloads the onnx and prototxt files on the first run.
+ An internet connection is required while downloading.
+
+ For the sample wav:
+ ```bash
+ $ python3 clap.py
+ ```
+
+ If you want to run in onnx mode, specify the `--onnx` option as below.
+ ```bash
+ $ python3 clap.py --onnx
+ ```
+
+ You can run with another wav file by adding the `--input` option.
+ ```bash
+ $ python3 clap.py --input [wav_file]
+ ```
+
+ ## Reference
+
+ [CLAP](https://github.com/LAION-AI/CLAP)
+
+ ## Framework
+
+ Pytorch
+
+ ## Model Format
+
+ ONNX opset=11
+
+ ## Netron
+
+ [CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/clap/CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt)
+ [CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/clap/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt)
+ [CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/clap/CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt)
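The `cossim` values in the README above are plain cosine similarities between the 512-dimensional audio and text embeddings. A minimal NumPy sketch matching the `cos_sim` helper in `clap.py` below:

```python
import numpy as np

def cos_sim(v1, v2):
    # dot product normalized by the two vector lengths; range [-1, 1]
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

print(round(cos_sim(np.array([1.0, 0.0]), np.array([1.0, 1.0])), 4))  # 0.7071
```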
models/onnx/ailia-models/LAION-CLAP/code/clap.py ADDED
@@ -0,0 +1,203 @@
+ import time
+ import sys
+
+ import numpy as np
+
+ import ailia  # noqa: E402
+ sys.path.append('../../util')
+ from arg_utils import get_base_parser, update_parser  # noqa: E402
+ from model_utils import check_and_download_models, check_and_download_file  # noqa: E402
+
+ # logger
+ from logging import getLogger  # noqa: E402
+ logger = getLogger(__name__)
+
+ # for clap
+ import librosa
+ from clap_utils import *
+
+
+ # ======================
+ # Argument Parser Config
+ # ======================
+ AUDIO_PATH = 'input.wav'
+ parser = get_base_parser('CLAP', AUDIO_PATH, None)
+ parser.add_argument(
+     '--onnx',
+     action='store_true',
+     help='By default, the ailia SDK is used, but with this option, you can switch to using ONNX Runtime'
+ )
+ parser.add_argument(
+     '--disable_ailia_tokenizer',
+     action='store_true',
+     help='disable ailia tokenizer.'
+ )
+ parser.add_argument(
+     '--disable_ailia_audio',
+     action='store_true',
+     help='disable ailia audio and use librosa to get spectrogram feature'
+ )
+ args = update_parser(parser)
+
+ # ======================
+ # PARAMETERS
+ # ======================
+ CLAP_AUDIO_WEIGHT_PATH = "CLAP_audio_LAION-Audio-630K_with_fusion.onnx"
+ CLAP_AUDIO_MODEL_PATH = "CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt"
+ CLAP_TEXT_ROBERTAMODEL_WEIGHT_PATH = "CLAP_text_text_branch_RobertaModel_roberta-base.onnx"
+ CLAP_TEXT_ROBERTAMODEL_MODEL_PATH = "CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt"
+ CLAP_TEXT_PROJECTION_WEIGHT_PATH = "CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx"
+ CLAP_TEXT_PROJECTION_MODEL_PATH = "CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt"
+ REMOTE_PATH = "https://storage.googleapis.com/ailia-models/clap/"
+
+
+ # ======================
+ # Utils
+ # ======================
+ def cos_sim(v1, v2):
+     return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
+
+
+ # ======================
+ # Main function
+ # ======================
+ def infer_text(net_text_branch, net_text_projection, text_data):
+     # tokenizer
+     if args.disable_ailia_tokenizer:
+         from transformers import RobertaTokenizer
+         tokenize = RobertaTokenizer.from_pretrained('roberta-base')
+         result = tokenize(
+             text_data,
+             padding="max_length",
+             truncation=True,
+             max_length=77,
+             return_tensors="pt",
+         )
+         data = {k: v.squeeze(0) for k, v in result.items()}
+         data["input_ids"] = data["input_ids"].to('cpu').detach().numpy().copy()
+         data["attention_mask"] = data["attention_mask"].to('cpu').detach().numpy().copy()
+     else:
+         from ailia_tokenizer import RobertaTokenizer
+         tokenize = RobertaTokenizer.from_pretrained("./tokenizer/")
+         result = tokenize(
+             text_data,
+             padding="max_length",
+             truncation=True,
+             max_length=77,
+             return_tensors="np",
+         )
+         data = {k: v for k, v in result.items()}
+
+     #print("input_ids", data["input_ids"])
+     #print("attention_mask", data["attention_mask"])
+
+     # predict
+     input_data = {
+         'input_ids': data["input_ids"],
+         'attention_mask': data["attention_mask"]
+     }
+     if not args.onnx:
+         output = net_text_branch.predict(input_data)  # text_branch
+         _, x = output[0], output[1]  # last_hidden_state, pooler_output
+         text_embeds = net_text_projection.predict(x)  # projection
+     else:
+         output = net_text_branch.run(None, input_data)  # text_branch
+         _, x = output[0], output[1]  # last_hidden_state, pooler_output
+         text_embeds = net_text_projection.run(None, {'x': x})[0]  # projection
+
+     return text_embeds
+
+
+ def infer_audio(net_audio, audio_src):
+     # load the waveform of shape (T,); resampled to 48000 Hz
+     audio_waveform, sr = librosa.load(audio_src, sr=48000)
+
+     # quantize
+     audio_waveform = int16_to_float32(float32_to_int16(audio_waveform))
+
+     # get audio features
+     _, mel_fusion, _ = get_audio_features(
+         {}, audio_waveform, 480000,
+         data_truncating='fusion',
+         data_filling='repeatpad',
+         audio_cfg={
+             'audio_length': 1024,
+             'clip_samples': 480000,
+             'mel_bins': 64,
+             'sample_rate': 48000,
+             'window_size': 1024,
+             'hop_size': 480,
+             'fmin': 50,
+             'fmax': 14000,
+             'class_num': 527,
+             'model_type': 'HTSAT',
+             'model_name': 'tiny'
+         },
+         b_use_ailia=not args.disable_ailia_audio
+     )
+     input_dict = {
+         'longer': [[True]],  # An error occurs when the longer value is False.
+         'mel_fusion': mel_fusion[np.newaxis, :, :, :]
+     }
+
+     # predict
+     if not args.onnx:
+         input_dict["longer"] = np.array(input_dict["longer"])
+         audio_embed = net_audio.predict(input_dict)[0]
+     else:
+         audio_embed = net_audio.run(None, input_dict)[0]
+
+     return audio_embed
+
+
+ def main():
+     # model files check and download
+     check_and_download_models(CLAP_AUDIO_WEIGHT_PATH, CLAP_AUDIO_MODEL_PATH, REMOTE_PATH)
+     check_and_download_models(CLAP_TEXT_PROJECTION_WEIGHT_PATH, CLAP_TEXT_PROJECTION_MODEL_PATH, REMOTE_PATH)
+     check_and_download_models(CLAP_TEXT_ROBERTAMODEL_WEIGHT_PATH, CLAP_TEXT_ROBERTAMODEL_MODEL_PATH, REMOTE_PATH)
+
+     # net initialize
+     if not args.onnx:
+         net_text_branch = \
+             ailia.Net(CLAP_TEXT_ROBERTAMODEL_MODEL_PATH, CLAP_TEXT_ROBERTAMODEL_WEIGHT_PATH, env_id=args.env_id)
+         net_text_projection = \
+             ailia.Net(CLAP_TEXT_PROJECTION_MODEL_PATH, CLAP_TEXT_PROJECTION_WEIGHT_PATH, env_id=args.env_id)
+         net_audio = \
+             ailia.Net(CLAP_AUDIO_MODEL_PATH, CLAP_AUDIO_WEIGHT_PATH, env_id=args.env_id)
+     else:
+         import onnxruntime
+         net_text_branch = \
+             onnxruntime.InferenceSession(CLAP_TEXT_ROBERTAMODEL_WEIGHT_PATH)
+         net_text_projection = \
+             onnxruntime.InferenceSession(CLAP_TEXT_PROJECTION_WEIGHT_PATH)
+         net_audio = \
+             onnxruntime.InferenceSession(CLAP_AUDIO_WEIGHT_PATH)
+
+     # text predict
+     text_inputs = [
+         "applause applaud clap",
+         "The crowd is clapping.",
+         "I love the contrastive learning",
+         "bell",
+         "soccer",
+         "open the door.",
+         "applause",
+         "dog",
+         "dog barking"
+     ]
+     text_embedding = infer_text(net_text_branch, net_text_projection, text_inputs)
+
+     # audio predict
+     for audio_path in args.input:
+         audio_embedding = infer_audio(net_audio, audio_path)
+         # show result
+         print('===== cosine similarity between text and audio =====')
+         print('audio: {}'.format(audio_path))
+         for i in range(text_embedding.shape[0]):
+             print('cossim={:.04f}, word={}'.format(cos_sim(text_embedding[i], audio_embedding[0]), text_inputs[i]))
+
+     logger.info('Script finished successfully.')
+
+
+ if __name__ == "__main__":
+     main()
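`clap.py` stops at raw cosine similarities. To turn them into zero-shot class probabilities (the form the Transformers.js pipeline later in this commit reports), one can apply a temperature-scaled softmax. A sketch, assuming the scale of 100/7 ≈ 14.29 taken from `logit_scale_init_value` in the `clap-htsat-unfused` config.json included later in this commit; the learned scale in a trained checkpoint differs:

```python
import numpy as np

def zero_shot_probs(cossims, logit_scale=14.285714285714285):
    # softmax over temperature-scaled cosine similarities
    logits = logit_scale * np.asarray(cossims)
    e = np.exp(logits - logits.max())
    return e / e.sum()

# cossim values for "dog", "dog barking", "soccer" from the README output above
print(zero_shot_probs([0.4183, 0.3819, -0.0926]))  # "dog" wins; "soccer" ~0
```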
models/onnx/ailia-models/LAION-CLAP/code/clap_utils.py ADDED
@@ -0,0 +1,170 @@
+ import numpy as np
+ import librosa
+ import ailia.audio
+ from skimage.transform import resize
+
+
+ def int16_to_float32(x):
+     return (x / 32767.0).astype(np.float32)
+
+
+ def float32_to_int16(x):
+     x = np.clip(x, a_min=-1., a_max=1.)
+     return (x * 32767.).astype(np.int16)
+
+
+ def get_mel(audio_data, audio_cfg):
+     """
+     # mel shape: (n_mels, T)
+     mel_torch = torchaudio.transforms.MelSpectrogram(
+         sample_rate=audio_cfg['sample_rate'],
+         n_fft=audio_cfg['window_size'],
+         win_length=audio_cfg['window_size'],
+         hop_length=audio_cfg['hop_size'],
+         center=True,
+         pad_mode="reflect",
+         power=2.0,
+         norm=None,
+         onesided=True,
+         n_mels=64,
+         f_min=audio_cfg['fmin'],
+         f_max=audio_cfg['fmax']
+     )(audio_data)
+
+     # we use log mel spectrogram as input
+     mel_torch = torchaudio.transforms.AmplitudeToDB(top_db=None)(mel_torch)
+     mel_torch = mel_torch.T  # (T, n_mels)
+     mel_torch = mel_torch.to('cpu').detach().numpy().copy()
+     """
+
+     # Align to librosa:
+     mel_librosa = librosa.feature.melspectrogram(
+         y=audio_data,
+         sr=audio_cfg['sample_rate'],
+         n_fft=audio_cfg['window_size'],
+         hop_length=audio_cfg['hop_size'],
+         win_length=audio_cfg['window_size'],
+         center=True,
+         pad_mode="reflect",
+         power=2.0,
+         n_mels=64,
+         norm=None,
+         htk=True,
+         fmin=audio_cfg['fmin'],
+         fmax=audio_cfg['fmax']
+     )
+     mel_librosa = librosa.amplitude_to_db(mel_librosa, top_db=None)
+     mel_librosa = mel_librosa.transpose(1, 0)
+
+     return mel_librosa
+
+
+ def get_mel_ailia(audio_data, audio_cfg):
+     mel = ailia.audio.mel_spectrogram(
+         audio_data,
+         sample_rate=audio_cfg['sample_rate'],
+         fft_n=audio_cfg['window_size'],
+         hop_n=audio_cfg['hop_size'],
+         win_n=audio_cfg['window_size'],
+         win_type=1,  # hann
+         center_mode=1,
+         power=2.0,
+         fft_norm_type=None,
+         f_min=audio_cfg['fmin'],
+         f_max=audio_cfg['fmax'],
+         mel_n=64,
+         mel_norm=False,
+         htk=True
+     )
+
+     def power_to_db(S, ref=1.0, amin=1e-10, top_db=80.0):
+         S[(S >= 0) & (S < amin)] = amin
+         S[(S < 0) & (S > -amin)] = -amin
+         return 10 * np.log10(S / ref)
+
+     mel_db = power_to_db(np.square(mel), top_db=None)
+     mel_db = mel_db.transpose(1, 0)
+
+     return mel_db
+
+
+ def get_audio_features(sample, audio_data, max_len, data_truncating, data_filling, audio_cfg, b_use_ailia=False):
+     """
+     Calculate and add audio features to sample.
+     sample: a dict containing all the data of current sample.
+     audio_data: a tensor of shape (T) containing audio data.
+     max_len: the maximum length of audio data.
+     data_truncating: the method of truncating data.
+     data_filling: the method of filling data.
+     audio_cfg: a dict containing audio configuration. Comes from model_cfg['audio_cfg'].
+     """
+     mel_func = get_mel_ailia if b_use_ailia else get_mel
+     if len(audio_data) > max_len:
+         if data_truncating == "fusion":
+             # fusion
+             mel = mel_func(audio_data, audio_cfg)
+             # split to three parts
+             chunk_frames = max_len // audio_cfg['hop_size'] + 1  # the +1 is related to how the spectrogram is computed
+             total_frames = mel.shape[0]
+             if chunk_frames == total_frames:
+                 # there is a corner case where the audio length is
+                 # larger than max_len but smaller than max_len+hop_size.
+                 # In this case, we just use the whole audio.
+                 mel_fusion = np.stack([mel, mel, mel, mel], axis=0)
+                 longer = [[False]]
+             else:
+                 ranges = np.array_split(list(range(0, total_frames - chunk_frames + 1)), 3)
+                 # print('total_frames-chunk_frames:', total_frames - chunk_frames,
+                 #       'len(audio_data):', len(audio_data),
+                 #       'chunk_frames:', chunk_frames,
+                 #       'total_frames:', total_frames)
+                 if len(ranges[1]) == 0:
+                     # if the audio is too short, we just use the first chunk
+                     ranges[1] = [0]
+                 if len(ranges[2]) == 0:
+                     # if the audio is too short, we just use the first chunk
+                     ranges[2] = [0]
+                 # randomly choose index for each part
+                 idx_front = np.random.choice(ranges[0])
+                 idx_middle = np.random.choice(ranges[1])
+                 idx_back = np.random.choice(ranges[2])
+                 # select mel
+                 mel_chunk_front = mel[idx_front:idx_front + chunk_frames, :]
+                 mel_chunk_middle = mel[idx_middle:idx_middle + chunk_frames, :]
+                 mel_chunk_back = mel[idx_back:idx_back + chunk_frames, :]
+
+                 # shrink the mel
+                 # Output may differ between torchvision.transforms.Resize and skimage.transform.resize.
+                 #mel_shrink_torch = torch.from_numpy(mel[None])
+                 #mel_shrink_torch = torchvision.transforms.Resize(size=[chunk_frames, 64])(mel_shrink_torch)[0]
+                 #mel_shrink_torch = mel_shrink_torch.to('cpu').detach().numpy().copy()
+                 mel_shrink_numpy = resize(mel, (chunk_frames, 64), preserve_range=True, anti_aliasing=True, mode='edge')
+                 # logging.info(f"mel_shrink.shape: {mel_shrink.shape}")
+
+                 # stack
+                 mel_fusion = np.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back, mel_shrink_numpy], axis=0)
+                 longer = [[True]]
+         # random crop to max_len (for compatibility)
+         overflow = len(audio_data) - max_len
+         idx = np.random.randint(0, overflow + 1)
+         audio_data = audio_data[idx: idx + max_len]
+
+     else:  # padding if too short
+         if len(audio_data) < max_len:  # do nothing if equal
+             if data_filling == "repeatpad":
+                 n_repeat = int(max_len / len(audio_data))
+                 audio_data = np.tile(audio_data, n_repeat)
+                 # audio_data = audio_data.unsqueeze(0).unsqueeze(0).unsqueeze(0)
+                 # audio_data = F.interpolate(audio_data, size=max_len, mode="bicubic")[0, 0, 0]
+                 audio_data = np.pad(audio_data, [(0, max_len - len(audio_data))], "constant")
+             elif data_filling == "pad":
+                 audio_data = np.pad(audio_data, [(0, max_len - len(audio_data))], "constant")
+             elif data_filling == "repeat":
+                 n_repeat = int(max_len / len(audio_data))
+                 audio_data = np.tile(audio_data, n_repeat + 1)[:max_len]
+
+         if data_truncating == 'fusion':
+             mel = mel_func(audio_data, audio_cfg)
+             mel_fusion = np.stack([mel, mel, mel, mel], axis=0)
+             longer = [[False]]
+
+     return longer, mel_fusion, audio_data
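To make the fusion path in `get_audio_features` concrete, here is the frame arithmetic implied by the `audio_cfg` that `clap.py` passes in (a worked sketch, not part of the shipped code):

```python
# 10 s at 48 kHz is the clip budget clap.py passes as max_len
max_len, hop_size = 480000, 480
chunk_frames = max_len // hop_size + 1  # 1001 frames; the +1 is the centered STFT frame
# For a 20 s input (960000 samples), librosa yields total_frames = 960000 // 480 + 1 = 2001,
# so one 1001-frame chunk is sampled from each third of the mel, the full mel is
# resized to (1001, 64), and mel_fusion is stacked to shape (4, 1001, 64).
print(chunk_frames)  # 1001
```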
models/onnx/ailia-models/LAION-CLAP/code/input.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f20a2c45b4da238c377d048d2d5aa2e76fb5ea38c7f273553e38d28f502dce12
+ size 543122
models/onnx/ailia-models/LAION-CLAP/code/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/ailia-models/LAION-CLAP/code/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/ailia-models/LAION-CLAP/source.txt ADDED
@@ -0,0 +1,10 @@
+ https://github.com/axinc-ai/ailia-models/tree/master/audio_processing/clap
+
+ https://storage.googleapis.com/ailia-models/clap/CLAP_text_text_branch_RobertaModel_roberta-base.onnx
+ https://storage.googleapis.com/ailia-models/clap/CLAP_text_text_branch_RobertaModel_roberta-base.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/clap/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx
+ https://storage.googleapis.com/ailia-models/clap/CLAP_text_projection_LAION-Audio-630K_with_fusion.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/clap/CLAP_audio_LAION-Audio-630K_with_fusion.onnx
+ https://storage.googleapis.com/ailia-models/clap/CLAP_audio_LAION-Audio-630K_with_fusion.onnx.prototxt
models/onnx/ailia-models/Microsoft-CLAP/code/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) Microsoft Corporation.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE
models/onnx/ailia-models/Microsoft-CLAP/code/README.md ADDED
@@ -0,0 +1,72 @@
+ # Microsoft CLAP
+
+ ## Input
+
+ **audio file**
+
+ Audio file in wav format to use as the model's input.
+ Default file name is [input.wav](./input.wav)
+ (source: https://freesound.org/people/InspectorJ/sounds/456440/)
+
+ **text file**
+
+ A text file containing sentences separated by new lines.
+ Default file name is [captions.txt](./captions.txt)
+
+ ## Output
+
+ **Cosine similarities**
+
+ Cosine similarity between the input audio and the sentences in the text file.
+
+ ## Usage
+ An internet connection is required when running the script for the first time, as the model files will be automatically downloaded.
+
+ Running the script computes the cosine similarities between the audio and the captions, using audio and language encoder models trained by contrastive training.
+
+ You can switch the version of the encoder model's weights (2022 or 2023) by specifying it with the argument ```-v``` or ```--version```.
+ For more information on arguments, try running ```python3 msclap.py --help```
+ ```bash
+ $ python3 msclap.py -t captions.txt -a input.wav -v 2023
+ INFO arg_utils.py (13) : Start!
+ INFO arg_utils.py (158) : env_id updated to 0
+ INFO arg_utils.py (163) : env_id: 0
+ INFO arg_utils.py (166) : CPU
+ INFO msclap.py (167) : input_text: ['Dog barking.', 'Birds whistling.', 'Car passing by.', 'Wind blowing.', 'Water flowing.', 'People talking.']
+ INFO msclap.py (170) : inference has started...
+ Similarity:
+     Birds whistling.: 0.41247469186782837
+     Wind blowing.: 0.2643369734287262
+     Water flowing.: 0.23884761333465576
+     Car passing by.: 0.22803542017936707
+     People talking.: 0.17387858033180237
+     Dog barking.: 0.11309497803449631
+ INFO msclap.py (192) : Script finished successfully.
+ ```
+
+ ## Reference
+
+ * [CLAP](https://github.com/microsoft/CLAP)
+
+ ## Framework
+
+ Pytorch
+
+ ## Model Format
+
+ ONNX opset=11
+
+ ## Netron
+
+ [caption_model_2023.onnx.prototxt]()
+
+ [audio_model_2023.onnx.prototxt]()
+
+ [caption_model_2022.onnx.prototxt]()
+
+ [audio_model_2022.onnx.prototxt]()
models/onnx/ailia-models/Microsoft-CLAP/code/captions.txt ADDED
@@ -0,0 +1,6 @@
+ Dog barking.
+ Birds whistling.
+ Car passing by.
+ Wind blowing.
+ Water flowing.
+ People talking.
models/onnx/ailia-models/Microsoft-CLAP/code/input.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2bbef488f5578b96843791d83cc7ad946c99e72c8b69bd56b351e1e0d80ac142
+ size 714238
models/onnx/ailia-models/Microsoft-CLAP/code/msclap.py ADDED
@@ -0,0 +1,270 @@
+ import sys
+ import time
+ from logging import getLogger
+ import json
+
+ import random
+
+ import librosa
+ import numpy as np
+
+ import ailia
+
+ # import original modules
+ sys.path.append('../../util')
+ from arg_utils import get_base_parser, update_parser, get_savepath  # noqa
+ from model_utils import check_and_download_models  # noqa
+
+ logger = getLogger(__name__)
+
+ # ======================
+ # Parameters
+ # ======================
+
+ CAPTION_WEIGHT_PATH_2023 = 'msclap_2023_caption.onnx'
+ AUDIO_WEIGHT_PATH_2023 = 'msclap_2023_audio.onnx'
+
+ CAPTION_MODEL_PATH_2023 = 'msclap_2023_caption.onnx.prototxt'
+ AUDIO_MODEL_PATH_2023 = 'msclap_2023_audio.onnx.prototxt'
+
+ CAPTION_WEIGHT_PATH_2022 = 'msclap_2022_caption.onnx'
+ AUDIO_WEIGHT_PATH_2022 = 'msclap_2022_audio.onnx'
+
+ CAPTION_MODEL_PATH_2022 = 'msclap_2022_caption.onnx.prototxt'
+ AUDIO_MODEL_PATH_2022 = 'msclap_2022_audio.onnx.prototxt'
+
+ REMOTE_PATH = "https://storage.googleapis.com/ailia-models/msclap/"
+
+ # ======================
+ # Argument Parser Config
+ # ======================
+
+ parser = get_base_parser(
+     'msclap', None, None
+ )
+
+ parser.add_argument(
+     "-a", "--audio", type=str,
+     default="input.wav",
+     help="Input audio file path."
+ )
+
+ parser.add_argument(
+     "-t", "--text", type=str,
+     default="captions.txt",
+     help="Input text caption file path"
+ )
+
+ parser.add_argument(
+     "-v", "--version", type=str,
+     default="2023",
+     help="Version of the CLAP model (2022 or 2023)."
+ )
+
+ parser.add_argument(
+     '-w', '--write_json',
+     action='store_true',
+     help='Flag to output results to json file.'
+ )
+ parser.add_argument(
+     '--disable_ailia_tokenizer',
+     action='store_true',
+     help='disable ailia tokenizer.'
+ )
+ args = update_parser(parser, check_input_type=False)
+
+ # ======================
+ # Helper functions
+ # ======================
+
+ def read_audio(audio_path):
+     r"""Loads an audio file and returns a numpy array plus its sample rate."""
+     audio_time_series, sample_rate = librosa.load(audio_path, sr=None)
+     return audio_time_series, sample_rate
+
+ def resample_audio(audio_time_series, sample_rate, resample_rate):
+     if resample_rate != sample_rate:
+         audio_time_series = librosa.resample(
+             audio_time_series,
+             orig_sr=sample_rate,
+             target_sr=resample_rate,
+             res_type='sinc_best'
+         )
+     return audio_time_series, resample_rate
+
+
+ def resize_audio(audio_time_series, sample_rate, audio_duration, resample=False):
+     r"""Returns raw audio resized to audio_duration seconds."""
+     # Randomly sample a segment of audio_duration from the clip or pad to match duration
+     audio_time_series = audio_time_series.reshape(-1)
+     # audio_time_series is shorter than predefined audio duration,
+     # so audio_time_series is extended
+     if audio_duration * sample_rate >= audio_time_series.shape[0]:
+         repeat_factor = int(np.ceil((audio_duration * sample_rate) /
+                                     audio_time_series.shape[0]))
+         # Repeat audio_time_series by repeat_factor to match audio_duration
+         audio_time_series = np.tile(audio_time_series, repeat_factor)
+         # remove excess part of audio_time_series
+         audio_time_series = audio_time_series[0:audio_duration * sample_rate]
+     else:
+         # audio_time_series is longer than predefined audio duration,
+         # so audio_time_series is trimmed
+         start_index = random.randrange(
+             audio_time_series.shape[0] - audio_duration * sample_rate)
+         audio_time_series = audio_time_series[start_index:start_index +
+                                               audio_duration * sample_rate]
+     return audio_time_series
+
+ def get_audio_embeddings(wav_input, sample_rate, model, version="2023"):
+     if version in ('2023', '2022'):
+         wav_input = resample_audio(wav_input, sample_rate, 44100)[0]
+         wav_input = resize_audio(wav_input, 44100, 7)[None]
+     return model['audio_model'].predict(wav_input)
+
+ def get_caption_embeddings(text_input, model, version="2023"):
+
+     # preprocessing
+     if version == '2023':
+         text_input = [t + ' <|endoftext|>' for t in text_input]
+     tokenized = dict(model['tokenizer'](text_input, padding=True, return_tensors='np'))
+
+     # inference
+     model_input = (tokenized['input_ids'], tokenized['attention_mask'])
+     return model['caption_model'].predict(model_input)[0]
+
+ def cossim(v1, v2):
+     return np.sum(v1 * v2, axis=-1) / (np.sum(v1 ** 2, axis=-1) ** 0.5 * np.sum(v2 ** 2, axis=-1) ** 0.5)
+
+ def print_sorted_dict(d):
+     m_len = max([len(k) for k in d.keys()])
+     for k, v in sorted(d.items(), key=lambda x: x[1], reverse=True):
+         pad = ' ' * (m_len - len(k) + 4)
+         print(f'{pad + k}: {v}')
+
+ def save_sorted_dict_as_json(d):
+     result = []
+     for k, v in sorted(d.items(), key=lambda x: x[1], reverse=True):
+         result.append({"caption": k, "similarity": float(v)})
+     with open('output.json', 'w', encoding='utf-8') as f:
+         json.dump(result, f, indent=2)
+
+ # ======================
+ # Main functions
+ # ======================
+
+ def inference(model, input_text, input_wav, sample_rate, version):
+     # get embeddings
+     audio_embeddings = get_audio_embeddings(input_wav, sample_rate, model, version)
+     caption_embeddings = get_caption_embeddings(input_text, model, version)
+
+     return cossim(audio_embeddings, caption_embeddings)
+
+ def estimate_best_caption(model):
+     # load inputs
+     with open(args.text, 'r') as f:
+         input_text = f.read().splitlines()
+
+     input_wav, sample_rate = read_audio(args.audio)
+     input_wav = input_wav[None]
+
+     logger.info("input_text: %s" % input_text)
+
+     # inference
+     logger.info('inference has started...')
+     if args.benchmark:
+         logger.info('BENCHMARK mode')
+         total_time_estimation = 0
+         for i in range(args.benchmark_count):
+             start = int(round(time.time() * 1000))
+             output = inference(model, input_text, input_wav, sample_rate, args.version)
+             end = int(round(time.time() * 1000))
+             estimation_time = (end - start)
+
+             # Logging
+             logger.info(f'\tailia processing estimation time {estimation_time} ms')
+             if i != 0:
+                 total_time_estimation = total_time_estimation + estimation_time
+
+         logger.info(f'\taverage time estimation {total_time_estimation / (args.benchmark_count - 1)} ms')
+     else:
+         output = inference(model, input_text, input_wav, sample_rate, args.version)
+
+     print('Similarity:')
+     print_sorted_dict(dict(zip(input_text, output)))
+
+     if args.write_json:
+         save_sorted_dict_as_json(dict(zip(input_text, output)))
+
+     logger.info('Script finished successfully.')
+
+
+ def main():
+     # model files check and download
+     if args.version == '2023':
+         check_and_download_models(
+             CAPTION_WEIGHT_PATH_2023,
+             CAPTION_MODEL_PATH_2023,
+             REMOTE_PATH
+         )
+         check_and_download_models(
+             AUDIO_WEIGHT_PATH_2023,
+             AUDIO_MODEL_PATH_2023,
+             REMOTE_PATH
+         )
+     elif args.version == '2022':
+         check_and_download_models(
+             CAPTION_WEIGHT_PATH_2022,
+             CAPTION_MODEL_PATH_2022,
+             REMOTE_PATH
+         )
+         check_and_download_models(
+             AUDIO_WEIGHT_PATH_2022,
+             AUDIO_MODEL_PATH_2022,
+             REMOTE_PATH
+         )
+
+     env_id = args.env_id
+
+     # disable FP16
+     if "FP16" in ailia.get_environment(args.env_id).props or sys.platform == 'Darwin':
+         logger.warning('This model does not work on FP16, so CPU mode is used.')
+         env_id = 0
+
+     # initialize
+     if args.version == '2023':
+         caption_model = ailia.Net(CAPTION_MODEL_PATH_2023, CAPTION_WEIGHT_PATH_2023, env_id=env_id)
+         audio_model = ailia.Net(AUDIO_MODEL_PATH_2023, AUDIO_WEIGHT_PATH_2023, env_id=env_id)
+         if args.disable_ailia_tokenizer:
+             from transformers import AutoTokenizer
+             tokenizer = AutoTokenizer.from_pretrained('gpt2')
+             tokenizer.add_special_tokens({'pad_token': '!'})
+         else:
+             from ailia_tokenizer import GPT2Tokenizer
+             tokenizer = GPT2Tokenizer.from_pretrained('./tokenizer_gpt2/')
+             tokenizer.add_special_tokens({'pad_token': '!'})
+             #tokenizer._pad_token_id = 0
+     elif args.version == '2022':
+         caption_model = ailia.Net(CAPTION_MODEL_PATH_2022, CAPTION_WEIGHT_PATH_2022, env_id=env_id)
+         audio_model = ailia.Net(AUDIO_MODEL_PATH_2022, AUDIO_WEIGHT_PATH_2022, env_id=env_id)
+         if args.disable_ailia_tokenizer:
+             from transformers import AutoTokenizer
+             tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+         else:
+             from ailia_tokenizer import BertTokenizer
+             tokenizer = BertTokenizer.from_pretrained('./tokenizer_bert/')
+
+     model = {
+         'caption_model': caption_model,
+         'audio_model': audio_model,
+         'tokenizer': tokenizer
+     }
+
+     estimate_best_caption(model)
+
+ if __name__ == '__main__':
+     main()
models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_bert/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "model_max_length": 512}
models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_bert/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"model_max_length": 1024}
models/onnx/ailia-models/Microsoft-CLAP/code/tokenizer_gpt2/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/ailia-models/Microsoft-CLAP/source.txt ADDED
@@ -0,0 +1 @@
+ https://github.com/axinc-ai/ailia-models/tree/master/audio_processing/msclap
models/onnx/clap-htsat-unfused (Xenova)/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/onnx/clap-htsat-unfused (Xenova)/README.md ADDED
@@ -0,0 +1,79 @@
+ ---
+ base_model: laion/clap-htsat-unfused
+ library_name: transformers.js
+ tags:
+ - zero-shot-audio-classification
+ ---
+
+ https://huggingface.co/laion/clap-htsat-unfused with ONNX weights to be compatible with Transformers.js.
+
+ ## Usage (Transformers.js)
+
+ If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@xenova/transformers) using:
+ ```bash
+ npm i @xenova/transformers
+ ```
+
+ **Example:** Perform zero-shot audio classification with `Xenova/clap-htsat-unfused`.
+ ```js
+ import { pipeline } from '@xenova/transformers';
+
+ const classifier = await pipeline('zero-shot-audio-classification', 'Xenova/clap-htsat-unfused');
+
+ const audio = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/dog_barking.wav';
+ const candidate_labels = ['dog', 'vacuum cleaner'];
+ const scores = await classifier(audio, candidate_labels);
+ // [
+ //   { score: 0.9993992447853088, label: 'dog' },
+ //   { score: 0.0006007603369653225, label: 'vacuum cleaner' }
+ // ]
+ ```
+
+ **Example:** Compute text embeddings with `ClapTextModelWithProjection`.
+
+ ```js
+ import { AutoTokenizer, ClapTextModelWithProjection } from '@xenova/transformers';
+
+ // Load tokenizer and text model
+ const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clap-htsat-unfused');
+ const text_model = await ClapTextModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused');
+
+ // Run tokenization
+ const texts = ['a sound of a cat', 'a sound of a dog'];
+ const text_inputs = tokenizer(texts, { padding: true, truncation: true });
+
+ // Compute embeddings
+ const { text_embeds } = await text_model(text_inputs);
+ // Tensor {
+ //   dims: [ 2, 512 ],
+ //   type: 'float32',
+ //   data: Float32Array(1024) [ ... ],
+ //   size: 1024
+ // }
+ ```
+
+ **Example:** Compute audio embeddings with `ClapAudioModelWithProjection`.
+ ```js
+ import { AutoProcessor, ClapAudioModelWithProjection, read_audio } from '@xenova/transformers';
+
+ // Load processor and audio model
+ const processor = await AutoProcessor.from_pretrained('Xenova/clap-htsat-unfused');
+ const audio_model = await ClapAudioModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused');
+
+ // Read audio and run processor
+ const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cat_meow.wav');
+ const audio_inputs = await processor(audio);
+
+ // Compute embeddings
+ const { audio_embeds } = await audio_model(audio_inputs);
+ // Tensor {
+ //   dims: [ 1, 512 ],
+ //   type: 'float32',
+ //   data: Float32Array(512) [ ... ],
+ //   size: 512
+ // }
+ ```
+
+ ---
+
+ Note: Having a separate repo for ONNX weights is intended to be a temporary solution until WebML gains more traction. If you would like to make your models web-ready, we recommend converting to ONNX using [🤗 Optimum](https://huggingface.co/docs/optimum/index) and structuring your repo like this one (with ONNX weights located in a subfolder named `onnx`).
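The closing note above recommends 🤗 Optimum for the conversion. A minimal sketch of how that export can be scripted, assuming Optimum's `main_export` helper (the `optimum-cli export onnx` command is the equivalent CLI); the fp16 and quantized variants in this repo's `onnx/` folder are produced in separate post-processing steps:

```python
# Sketch: export laion/clap-htsat-unfused to ONNX with Hugging Face Optimum.
# Assumes `pip install optimum[exporters]`; API details may vary by version.
from optimum.exporters.onnx import main_export

main_export(
    "laion/clap-htsat-unfused",  # source PyTorch checkpoint on the Hub
    output="clap_onnx",          # directory that receives the .onnx files
)
```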
models/onnx/clap-htsat-unfused (Xenova)/config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "_name_or_path": "laion/clap-htsat-unfused",
+   "architectures": [
+     "ClapModel"
+   ],
+   "audio_config": {
+     "fusion_num_hidden_layers": 2,
+     "model_type": "clap_audio_model",
+     "projection_hidden_size": 768
+   },
+   "hidden_size": 768,
+   "initializer_factor": 1.0,
+   "logit_scale_init_value": 14.285714285714285,
+   "model_type": "clap",
+   "num_hidden_layers": 16,
+   "projection_dim": 512,
+   "projection_hidden_act": "relu",
+   "text_config": {
+     "classifier_dropout": null,
+     "fusion_hidden_size": 768,
+     "fusion_num_hidden_layers": 2,
+     "initializer_range": 0.02,
+     "model_type": "clap_text_model",
+     "projection_hidden_size": 768
+   },
+   "transformers_version": "4.36.0.dev0"
+ }
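Note that `projection_dim` is 512, matching the 512-dimensional `text_embeds` and `audio_embeds` tensors shown in the usage examples above.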
models/onnx/clap-htsat-unfused (Xenova)/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1c2b43c44f71e0fa841a4b86700886c199bf87699ea45632c4d831bc6c88957
+ size 117528416
models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model_fp16.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:65963cfa5e903e0a8df475137252d618663df88ecf0b55a3fb327e6c1ca63a97
+ size 60414065
models/onnx/clap-htsat-unfused (Xenova)/onnx/audio_model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3fcff2c8824e7bcb83a983f2a49edab3b60cbcf4872ac70efee517355173bd1f
+ size 34301667
models/onnx/clap-htsat-unfused (Xenova)/onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8cbfb96dda10259964e678c1557466f925001f66b8cd1b24c84bd88b0f84345
+ size 619128635
models/onnx/clap-htsat-unfused (Xenova)/onnx/model_fp16.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ec27913c473d8ce6367cc40376c27b79899248009d8f6f2ad62110abdcf6124
+ size 311602756
models/onnx/clap-htsat-unfused (Xenova)/onnx/model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f559313ec268518193101007fc0569ee4b2cfac03e369091c50adb4795f5c5d
+ size 161024085
models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6df7c51f2d78e236a03f2b816e613d538c815639d13644fe7c2124f439da9648
+ size 501513769
models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model_fp16.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c1fd1e4cdd02acbcacaafe7a8e608dcfc8f84a00bfea3e2ca7df710957b4c3a5
+ size 251029088
models/onnx/clap-htsat-unfused (Xenova)/onnx/text_model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a3df8b197e249816e08415fd040434c44762b2eea7eb7bf8a48a0f0bf3c14e5
+ size 126603263
models/onnx/clap-htsat-unfused (Xenova)/preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "chunk_length_s": 10,
+   "feature_extractor_type": "ClapFeatureExtractor",
+   "feature_size": 64,
+   "fft_window_size": 1024,
+   "frequency_max": 14000,
+   "frequency_min": 50,
+   "hop_length": 480,
+   "max_length_s": 10,
+   "n_fft": 1024,
+   "nb_frequency_bins": 513,
+   "nb_max_frames": 1000,
+   "nb_max_samples": 480000,
+   "padding": "repeatpad",
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "processor_class": "ClapProcessor",
+   "return_attention_mask": false,
+   "sampling_rate": 48000,
+   "top_db": null,
+   "truncation": "rand_trunc"
+ }
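The feature-extraction values above are mutually consistent: at the 48 kHz `sampling_rate`, a `max_length_s` of 10 seconds gives `nb_max_samples` = 48000 × 10 = 480000, and with `hop_length` = 480 this yields `nb_max_frames` = 480000 / 480 = 1000 spectrogram frames per clip.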
models/onnx/clap-htsat-unfused (Xenova)/quantize_config.json ADDED
@@ -0,0 +1,122 @@
+ {
+   "per_channel": true,
+   "reduce_range": true,
+   "per_model_config": {
+     "model": {
+       "op_types": [
+         "Expand",
+         "ScatterND",
+         "Pad",
+         "Abs",
+         "Unsqueeze",
+         "ReduceSum",
+         "Not",
+         "CumSum",
+         "Constant",
+         "Exp",
+         "Sub",
+         "MatMul",
+         "Cast",
+         "Reshape",
+         "Flatten",
+         "Resize",
+         "Conv",
+         "ConstantOfShape",
+         "Gather",
+         "Relu",
+         "Div",
+         "Mul",
+         "GlobalAveragePool",
+         "Range",
+         "Erf",
+         "Where",
+         "ReduceMean",
+         "Pow",
+         "Shape",
+         "Concat",
+         "Slice",
+         "Softmax",
+         "Tanh",
+         "Sqrt",
+         "BatchNormalization",
+         "Add",
+         "Transpose",
+         "Gemm",
+         "Equal"
+       ],
+       "weight_type": "QUInt8"
+     },
+     "text_model": {
+       "op_types": [
+         "ReduceMean",
+         "Reshape",
+         "Softmax",
+         "Pow",
+         "Erf",
+         "Tanh",
+         "Concat",
+         "Sub",
+         "Not",
+         "Expand",
+         "Mul",
+         "Transpose",
+         "Div",
+         "Constant",
+         "Equal",
+         "Unsqueeze",
+         "Slice",
+         "MatMul",
+         "Gather",
+         "ConstantOfShape",
+         "Shape",
+         "Cast",
+         "Where",
+         "Sqrt",
+         "Add",
+         "Gemm",
+         "CumSum",
+         "Relu"
+       ],
+       "weight_type": "QInt8"
+     },
+     "audio_model": {
+       "op_types": [
+         "BatchNormalization",
+         "ScatterND",
+         "ReduceMean",
+         "Reshape",
+         "Softmax",
+         "Pow",
+         "Erf",
+         "GlobalAveragePool",
+         "Concat",
+         "Sub",
+         "Not",
+         "Expand",
+         "Mul",
+         "Transpose",
+         "Div",
+         "Constant",
+         "Equal",
+         "Unsqueeze",
+         "Pad",
+         "Slice",
+         "Resize",
+         "Range",
+         "MatMul",
+         "Gather",
+         "ConstantOfShape",
+         "Shape",
+         "Cast",
+         "Sqrt",
+         "Where",
+         "Add",
+         "Conv",
+         "Flatten",
+         "Gemm",
+         "Relu"
+       ],
+       "weight_type": "QUInt8"
+     }
+   }
+ }
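These settings record the ONNX Runtime dynamic-quantization configuration for the `*_quantized.onnx` files above: weights are quantized per channel with `reduce_range` enabled, to unsigned 8-bit integers (`QUInt8`) for the combined and audio models and signed 8-bit integers (`QInt8`) for the text model.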
models/onnx/clap-htsat-unfused (Xenova)/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/Xenova/clap-htsat-unfused
models/onnx/clap-htsat-unfused (Xenova)/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "bos_token": "<s>",
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "unk_token": "<unk>"
+ }
models/onnx/clap-htsat-unfused (Xenova)/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/clap-htsat-unfused (Xenova)/tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50264": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "errors": "replace",
+   "mask_token": "<mask>",
+   "max_length": null,
+   "model_max_length": 512,
+   "pad_to_multiple_of": null,
+   "pad_token": "<pad>",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "processor_class": "ClapProcessor",
+   "sep_token": "</s>",
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "trust_remote_code": false,
+   "unk_token": "<unk>"
+ }
models/onnx/clap-htsat-unfused (Xenova)/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
models/onnx/larger_clap_general (Xenova)/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/onnx/larger_clap_general (Xenova)/README.md ADDED
@@ -0,0 +1,80 @@
+ ---
+ base_model: laion/larger_clap_general
+ library_name: transformers.js
+ tags:
+ - zero-shot-audio-classification
+ ---
+
+ https://huggingface.co/laion/larger_clap_general with ONNX weights to be compatible with Transformers.js.
+
+ ## Usage (Transformers.js)
+
+ If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@xenova/transformers) using:
+ ```bash
+ npm i @xenova/transformers
+ ```
+
+ **Example:** Perform zero-shot audio classification with `Xenova/larger_clap_general`.
+ ```js
+ import { pipeline } from '@xenova/transformers';
+
+ const classifier = await pipeline('zero-shot-audio-classification', 'Xenova/larger_clap_general');
+
+ const audio = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/piano.wav';
+ const candidate_labels = ['calm piano music', 'heavy metal music'];
+ const scores = await classifier(audio, candidate_labels);
+ // [
+ //   { score: 0.9829504489898682, label: 'calm piano music' },
+ //   { score: 0.017049523070454597, label: 'heavy metal music' }
+ // ]
+ ```
+
+ **Example:** Compute text embeddings with `ClapTextModelWithProjection`.
+
+ ```js
+ import { AutoTokenizer, ClapTextModelWithProjection } from '@xenova/transformers';
+
+ // Load tokenizer and text model
+ const tokenizer = await AutoTokenizer.from_pretrained('Xenova/larger_clap_general');
+ const text_model = await ClapTextModelWithProjection.from_pretrained('Xenova/larger_clap_general');
+
+ // Run tokenization
+ const texts = ['calm piano music', 'heavy metal music'];
+ const text_inputs = tokenizer(texts, { padding: true, truncation: true });
+
+ // Compute embeddings
+ const { text_embeds } = await text_model(text_inputs);
+ // Tensor {
+ //   dims: [ 2, 512 ],
+ //   type: 'float32',
+ //   data: Float32Array(1024) [ ... ],
+ //   size: 1024
+ // }
+ ```
+
+ **Example:** Compute audio embeddings with `ClapAudioModelWithProjection`.
+ ```js
+ import { AutoProcessor, ClapAudioModelWithProjection, read_audio } from '@xenova/transformers';
+
+ // Load processor and audio model
+ const processor = await AutoProcessor.from_pretrained('Xenova/larger_clap_general');
+ const audio_model = await ClapAudioModelWithProjection.from_pretrained('Xenova/larger_clap_general');
+
+ // Read audio and run processor
+ const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/piano.wav');
+ const audio_inputs = await processor(audio);
+
+ // Compute embeddings
+ const { audio_embeds } = await audio_model(audio_inputs);
+ // Tensor {
+ //   dims: [ 1, 512 ],
+ //   type: 'float32',
+ //   data: Float32Array(512) [ ... ],
+ //   size: 512
+ // }
+ ```
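+
+ **Example:** Turn the embeddings into zero-shot scores (a minimal sketch, assuming the `text_embeds` and `audio_embeds` tensors from the examples above; the full `ClapModel` additionally applies a learned logit scale before the softmax, so the pipeline's scores will differ slightly).
+ ```js
+ import { cos_sim } from '@xenova/transformers';
+
+ // Cosine similarity between each text embedding and the audio embedding
+ const [num_texts, dim] = text_embeds.dims;
+ const sims = Array.from({ length: num_texts }, (_, i) =>
+     cos_sim(text_embeds.data.slice(i * dim, (i + 1) * dim), audio_embeds.data));
+
+ // Softmax over the similarities to get per-label probabilities
+ const exps = sims.map(Math.exp);
+ const sum = exps.reduce((a, b) => a + b, 0);
+ console.log(texts.map((t, i) => ({ label: t, score: exps[i] / sum })));
+ ```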
+
+ ---
+
+ Note: Having a separate repo for ONNX weights is intended to be a temporary solution until WebML gains more traction. If you would like to make your models web-ready, we recommend converting to ONNX using [🤗 Optimum](https://huggingface.co/docs/optimum/index) and structuring your repo like this one (with ONNX weights located in a subfolder named `onnx`).