{
"in_channels": 768,
"audio_codec": {
"encoder_dim": 64,
"encoder_rates": [
2,
8,
10,
12
],
"latent_dim": 1024,
"decoder_dim": 1536,
"decoder_rates": [
12,
10,
8,
2
],
"n_codebooks": 16,
"codebook_size": 1024,
"codebook_dim": 128,
"quantizer_dropout": false,
"sample_rate": 48000,
"mean": 0.0,
"std": 1.0
},
"text_encoder": {
"dim": 768,
"name": "t5-base",
"max_length": 512,
"pad_mode": "longest"
},
"vision_encoder": {
"dim": 1024,
"batch_size": 300,
"name": "PE-Core-L14-336",
"normalize_feature": true,
"interpolation_mode": "BICUBIC",
"image_size": 336
},
"transformer": {
"dim": 1536,
"n_heads": 12,
"n_layers": 12,
"dropout": 0.1,
"norm_eps": 1e-05,
"qk_norm": true,
"fc_bias": false,
"ffn_exp": 4,
"ffn_dim_multiplier": 1,
"multiple_of": 64,
"non_linearity": "swiglu",
"use_rope": true,
"max_positions": 10000,
"frequency_embedding_dim": 256,
"timestep_non_linearity": "swiglu",
"t_block_non_linearity": "silu",
"t_block_bias": true,
"context_dim": 1536,
"context_non_linearity": "swiglu",
"context_embedder_dropout": 0.0,
"context_norm": false,
"out_channels": 256,
"in_channels": null
},
"num_anchors": 3,
"anchor_embedding_dim": 128,
"visual_ranker": {
"checkpoint": null,
"kind": "imagebind"
},
"text_ranker": {
"rankers": {
"clap": [
{
"checkpoint": null,
"kind": "clap"
},
5.0
],
"judge": [
{
"checkpoint_or_model_id": "facebook/sam-audio-judge",
"kind": "judge"
},
1.0
]
},
"kind": "ensemble"
},
"span_predictor": "pe-a-frame-large"
}