Update CLIPCLAP model with trained audio projection
Browse files
- .gitattributes +6 -0
- audio_model.onnx +3 -0
- audio_model.onnx.data +3 -0
- audio_projection.onnx +3 -0
- audio_projection.onnx.data +3 -0
- clip_config.json +21 -0
- processor_config.json +30 -0
- projection_training_metadata.json +9 -0
- text_model.onnx +3 -0
- text_model.onnx.data +3 -0
- text_projection.onnx +3 -0
- text_projection.onnx.data +3 -0
- tokenizer.json +0 -0
- tokenizer_config.json +13 -0
- visual_model.onnx +3 -0
- visual_model.onnx.data +3 -0
- visual_projection.onnx +3 -0
- visual_projection.onnx.data +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
audio_model.onnx.data filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
audio_projection.onnx.data filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
text_model.onnx.data filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
text_projection.onnx.data filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
visual_model.onnx.data filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
visual_projection.onnx.data filter=lfs diff=lfs merge=lfs -text
|
audio_model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:faa9d5379ba1acb8ad6e21a448ab36ea0f662400bf6409e37c88495b48b25b4d
|
| 3 |
+
size 3320426
|
audio_model.onnx.data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:318f55059a9504380f3a6e8d1765a50bf1c6c726ee0ba849d014ec950fdac87a
|
| 3 |
+
size 277348352
|
audio_projection.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:956a052354210cdc8014adcbae668e9a4b637c1a7ad1f53d6e056065dca3be1d
|
| 3 |
+
size 340
|
audio_projection.onnx.data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cb2cddffad79e72642b59b1f4c86739575534f03a87ad881bb5e09dfc84586b4
|
| 3 |
+
size 1048576
|
clip_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "clipclap",
|
| 3 |
+
"vision_config": {
|
| 4 |
+
"hidden_size": 768,
|
| 5 |
+
"image_size": 224,
|
| 6 |
+
"patch_size": 32,
|
| 7 |
+
"projection_dim": 512
|
| 8 |
+
},
|
| 9 |
+
"text_config": {
|
| 10 |
+
"hidden_size": 512,
|
| 11 |
+
"max_position_embeddings": 77,
|
| 12 |
+
"projection_dim": 512
|
| 13 |
+
},
|
| 14 |
+
"audio_config": {
|
| 15 |
+
"hidden_size": 1024,
|
| 16 |
+
"sample_rate": 48000,
|
| 17 |
+
"max_length_s": 10,
|
| 18 |
+
"projection_dim": 512
|
| 19 |
+
},
|
| 20 |
+
"projection_dim": 512
|
| 21 |
+
}
|
processor_config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"image_processor": {
|
| 3 |
+
"crop_size": {
|
| 4 |
+
"height": 224,
|
| 5 |
+
"width": 224
|
| 6 |
+
},
|
| 7 |
+
"do_center_crop": true,
|
| 8 |
+
"do_convert_rgb": true,
|
| 9 |
+
"do_normalize": true,
|
| 10 |
+
"do_rescale": true,
|
| 11 |
+
"do_resize": true,
|
| 12 |
+
"image_mean": [
|
| 13 |
+
0.48145466,
|
| 14 |
+
0.4578275,
|
| 15 |
+
0.40821073
|
| 16 |
+
],
|
| 17 |
+
"image_processor_type": "CLIPImageProcessor",
|
| 18 |
+
"image_std": [
|
| 19 |
+
0.26862954,
|
| 20 |
+
0.26130258,
|
| 21 |
+
0.27577711
|
| 22 |
+
],
|
| 23 |
+
"resample": 3,
|
| 24 |
+
"rescale_factor": 0.00392156862745098,
|
| 25 |
+
"size": {
|
| 26 |
+
"shortest_edge": 224
|
| 27 |
+
}
|
| 28 |
+
},
|
| 29 |
+
"processor_class": "CLIPProcessor"
|
| 30 |
+
}
|
projection_training_metadata.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"clip_model": "openai/clip-vit-base-patch32",
|
| 3 |
+
"clap_model": "laion/larger_clap_music_and_speech",
|
| 4 |
+
"embed_dim": 512,
|
| 5 |
+
"num_captions": 1000,
|
| 6 |
+
"epochs": 20,
|
| 7 |
+
"batch_size": 256,
|
| 8 |
+
"lr": 0.001
|
| 9 |
+
}
|
text_model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5501d22847954f1a203268bd3c23b0a576d98ebb6e1d573970a03bec1169956
|
| 3 |
+
size 1238678
|
text_model.onnx.data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1da1df9bcae44e6b6450fa88ca6a21af8c19735fbba2f31834ae805a41d3a125
|
| 3 |
+
size 252706816
|
text_projection.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7dc464d70e63787590745b0ec820d361df09fc41f4bf3245856a7dd44b3c5a9
|
| 3 |
+
size 339
|
text_projection.onnx.data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:478d41294224ac5b677b45e1d5ff60b5c05cddb1c09227e8b5166ce1cb30fd51
|
| 3 |
+
size 1048576
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": "<|startoftext|>",
|
| 5 |
+
"do_lower_case": true,
|
| 6 |
+
"eos_token": "<|endoftext|>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"is_local": false,
|
| 9 |
+
"model_max_length": 77,
|
| 10 |
+
"pad_token": "<|endoftext|>",
|
| 11 |
+
"tokenizer_class": "CLIPTokenizer",
|
| 12 |
+
"unk_token": "<|endoftext|>"
|
| 13 |
+
}
|
visual_model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9947ea5e7826a3868800792149bebe37791b46bb56d9fcd283b48998f809cd2a
|
| 3 |
+
size 1137817
|
visual_model.onnx.data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:04ada8fe1125ce75c4629e0eaf047ec7ea0e5f43e2a518e52ba5343cea8c41c4
|
| 3 |
+
size 349831168
|
visual_projection.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42d814607233e9ac34b0df9dfba9d5da9e71d30c53e6d36cf7bd8024f8bd799e
|
| 3 |
+
size 341
|
visual_projection.onnx.data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:13295d3b6f85989c124f39c5c1581c9f6a1c0d393ea151402f757987d4327419
|
| 3 |
+
size 1572864
|