CarpenterAnt91 committed on
Commit
1dc9521
·
verified ·
1 Parent(s): 5767c39

Update CLIPCLAP model with trained audio projection

Browse files
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ audio_model.onnx.data filter=lfs diff=lfs merge=lfs -text
37
+ audio_projection.onnx.data filter=lfs diff=lfs merge=lfs -text
38
+ text_model.onnx.data filter=lfs diff=lfs merge=lfs -text
39
+ text_projection.onnx.data filter=lfs diff=lfs merge=lfs -text
40
+ visual_model.onnx.data filter=lfs diff=lfs merge=lfs -text
41
+ visual_projection.onnx.data filter=lfs diff=lfs merge=lfs -text
audio_model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faa9d5379ba1acb8ad6e21a448ab36ea0f662400bf6409e37c88495b48b25b4d
3
+ size 3320426
audio_model.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:318f55059a9504380f3a6e8d1765a50bf1c6c726ee0ba849d014ec950fdac87a
3
+ size 277348352
audio_projection.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:956a052354210cdc8014adcbae668e9a4b637c1a7ad1f53d6e056065dca3be1d
3
+ size 340
audio_projection.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb2cddffad79e72642b59b1f4c86739575534f03a87ad881bb5e09dfc84586b4
3
+ size 1048576
clip_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "clipclap",
3
+ "vision_config": {
4
+ "hidden_size": 768,
5
+ "image_size": 224,
6
+ "patch_size": 32,
7
+ "projection_dim": 512
8
+ },
9
+ "text_config": {
10
+ "hidden_size": 512,
11
+ "max_position_embeddings": 77,
12
+ "projection_dim": 512
13
+ },
14
+ "audio_config": {
15
+ "hidden_size": 1024,
16
+ "sample_rate": 48000,
17
+ "max_length_s": 10,
18
+ "projection_dim": 512
19
+ },
20
+ "projection_dim": 512
21
+ }
processor_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "image_processor": {
3
+ "crop_size": {
4
+ "height": 224,
5
+ "width": 224
6
+ },
7
+ "do_center_crop": true,
8
+ "do_convert_rgb": true,
9
+ "do_normalize": true,
10
+ "do_rescale": true,
11
+ "do_resize": true,
12
+ "image_mean": [
13
+ 0.48145466,
14
+ 0.4578275,
15
+ 0.40821073
16
+ ],
17
+ "image_processor_type": "CLIPImageProcessor",
18
+ "image_std": [
19
+ 0.26862954,
20
+ 0.26130258,
21
+ 0.27577711
22
+ ],
23
+ "resample": 3,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "shortest_edge": 224
27
+ }
28
+ },
29
+ "processor_class": "CLIPProcessor"
30
+ }
projection_training_metadata.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clip_model": "openai/clip-vit-base-patch32",
3
+ "clap_model": "laion/larger_clap_music_and_speech",
4
+ "embed_dim": 512,
5
+ "num_captions": 1000,
6
+ "epochs": 20,
7
+ "batch_size": 256,
8
+ "lr": 0.001
9
+ }
text_model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5501d22847954f1a203268bd3c23b0a576d98ebb6e1d573970a03bec1169956
3
+ size 1238678
text_model.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1da1df9bcae44e6b6450fa88ca6a21af8c19735fbba2f31834ae805a41d3a125
3
+ size 252706816
text_projection.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7dc464d70e63787590745b0ec820d361df09fc41f4bf3245856a7dd44b3c5a9
3
+ size 339
text_projection.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:478d41294224ac5b677b45e1d5ff60b5c05cddb1c09227e8b5166ce1cb30fd51
3
+ size 1048576
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|startoftext|>",
5
+ "do_lower_case": true,
6
+ "eos_token": "<|endoftext|>",
7
+ "errors": "replace",
8
+ "is_local": false,
9
+ "model_max_length": 77,
10
+ "pad_token": "<|endoftext|>",
11
+ "tokenizer_class": "CLIPTokenizer",
12
+ "unk_token": "<|endoftext|>"
13
+ }
visual_model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9947ea5e7826a3868800792149bebe37791b46bb56d9fcd283b48998f809cd2a
3
+ size 1137817
visual_model.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04ada8fe1125ce75c4629e0eaf047ec7ea0e5f43e2a518e52ba5343cea8c41c4
3
+ size 349831168
visual_projection.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42d814607233e9ac34b0df9dfba9d5da9e71d30c53e6d36cf7bd8024f8bd799e
3
+ size 341
visual_projection.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13295d3b6f85989c124f39c5c1581c9f6a1c0d393ea151402f757987d4327419
3
+ size 1572864