| { |
| "in_channels": 768, |
| "audio_codec": { |
| "encoder_dim": 64, |
| "encoder_rates": [ |
| 2, |
| 8, |
| 10, |
| 12 |
| ], |
| "latent_dim": 1024, |
| "decoder_dim": 1536, |
| "decoder_rates": [ |
| 12, |
| 10, |
| 8, |
| 2 |
| ], |
| "n_codebooks": 16, |
| "codebook_size": 1024, |
| "codebook_dim": 128, |
| "quantizer_dropout": false, |
| "sample_rate": 48000, |
| "mean": 0.0, |
| "std": 1.0 |
| }, |
| "text_encoder": { |
| "dim": 768, |
| "name": "t5-base", |
| "max_length": 512, |
| "pad_mode": "longest" |
| }, |
| "vision_encoder": { |
| "dim": 1024, |
| "batch_size": 300, |
| "name": "PE-Core-L14-336", |
| "normalize_feature": true, |
| "interpolation_mode": "BICUBIC", |
| "image_size": 336 |
| }, |
| "transformer": { |
| "dim": 2816, |
| "n_heads": 22, |
| "n_layers": 22, |
| "dropout": 0.1, |
| "norm_eps": 1e-05, |
| "qk_norm": true, |
| "fc_bias": false, |
| "ffn_exp": 4, |
| "ffn_dim_multiplier": 1, |
| "multiple_of": 64, |
| "non_linearity": "swiglu", |
| "use_rope": true, |
| "max_positions": 10000, |
| "frequency_embedding_dim": 256, |
| "timestep_non_linearity": "swiglu", |
| "t_block_non_linearity": "silu", |
| "t_block_bias": true, |
| "context_dim": 2816, |
| "context_non_linearity": "swiglu", |
| "context_embedder_dropout": 0.0, |
| "context_norm": false, |
| "out_channels": 256, |
| "in_channels": null |
| }, |
| "num_anchors": 3, |
| "anchor_embedding_dim": 128, |
| "visual_ranker": { |
| "checkpoint": null, |
| "kind": "imagebind" |
| }, |
| "text_ranker": { |
| "rankers": { |
| "clap": [ |
| { |
| "checkpoint": null, |
| "kind": "clap" |
| }, |
| 5.0 |
| ], |
| "judge": [ |
| { |
| "checkpoint_or_model_id": "facebook/sam-audio-judge", |
| "kind": "judge" |
| }, |
| 1.0 |
| ] |
| }, |
| "kind": "ensemble" |
| }, |
| "span_predictor": "pe-a-frame-large" |
| } |
|
|