{
  "dim": 3072,
  "n_layers": 30,
  "head_dim": 128,
  "hidden_dim": 8192,
  "n_heads": 32,
  "n_kv_heads": 8,
  "rope_theta": 100000000.0,
  "norm_eps": 1e-05,
  "vocab_size": 131072,
  "max_position_embeddings": 32768,
  "multimodal": {
    "whisper_model_args": {
      "encoder_args": {
        "dim": 1280,
        "n_layers": 32,
        "head_dim": 64,
        "hidden_dim": 5120,
        "n_heads": 20,
        "vocab_size": 51866,
        "max_source_positions": 1500,
        "audio_encoding_args": {
          "sampling_rate": 16000,
          "num_mel_bins": 128,
          "hop_length": 160,
          "window_size": 400
        }
      },
      "downsample_args": {
        "downsample_factor": 4
      }
    }
  }
}