{
  "_name_or_path": "suno/bark",
  "architectures": [
    "BarkModel"
  ],
  "coarse_acoustics_config": {
    "_attn_implementation_autoset": true,
    "architectures": [
      "BarkCoarseModel"
    ],
    "bias": false,
    "hidden_size": 1024,
    "input_vocab_size": 12096,
    "model_type": "coarse_acoustics",
    "num_heads": 16,
    "num_layers": 24,
    "output_vocab_size": 12096,
    "torch_dtype": "float32"
  },
  "codec_config": {
    "_name_or_path": "facebook/encodec_24khz",
    "architectures": [
      "EncodecModel"
    ],
    "model_type": "encodec",
    "torch_dtype": "float32"
  },
  "fine_acoustics_config": {
    "_attn_implementation_autoset": true,
    "architectures": [
      "BarkFineModel"
    ],
    "bias": false,
    "hidden_size": 1024,
    "input_vocab_size": 1056,
    "model_type": "fine_acoustics",
    "num_heads": 16,
    "num_layers": 24,
    "output_vocab_size": 1056,
    "torch_dtype": "float32"
  },
  "flash_attention_version": "2.1.0",
  "initializer_range": 0.02,
  "model_type": "bark",
  "semantic_config": {
    "_attn_implementation_autoset": true,
    "architectures": [
      "BarkSemanticModel"
    ],
    "bias": false,
    "hidden_size": 1024,
    "input_vocab_size": 129600,
    "model_type": "semantic",
    "num_heads": 16,
    "num_layers": 24,
    "torch_dtype": "float32"
  },
  "torch_dtype": "float16",
  "transformers_version": "4.48.2",
  "use_flash_attention_2": true
}
|