{
  "architectures": [
    "BarkModel"
  ],
  "coarse_acoustics_config": {
    "block_size": 256,
    "bos_token_id": 32,
    "eos_token_id": 32,
    "hidden_size": 16,
    "input_vocab_size": 300,
    "model_type": "coarse_acoustics",
    "num_heads": 2,
    "num_layers": 2,
    "output_vocab_size": 300,
    "pad_token_id": 32
  },
  "codec_config": {
    "audio_channels": 2,
    "chunk_in_sec": null,
    "codebook_dim": 32,
    "codebook_size": 64,
    "hidden_size": 32,
    "model_type": "encodec",
    "num_filters": 8,
    "num_lstm_layers": 1,
    "upsampling_ratios": [
      8,
      4
    ]
  },
  "fine_acoustics_config": {
    "block_size": 256,
    "bos_token_id": 32,
    "eos_token_id": 32,
    "hidden_size": 16,
    "input_vocab_size": 300,
    "model_type": "fine_acoustics",
    "num_heads": 2,
    "num_layers": 2,
    "output_vocab_size": 300,
    "pad_token_id": 32
  },
  "initializer_range": 0.02,
  "model_type": "bark",
  "semantic_config": {
    "block_size": 256,
    "bos_token_id": 32,
    "eos_token_id": 32,
    "hidden_size": 16,
    "input_vocab_size": 300,
    "model_type": "semantic",
    "num_heads": 2,
    "num_layers": 2,
    "output_vocab_size": 300,
    "pad_token_id": 32
  },
  "torch_dtype": "float32",
  "transformers_version": "4.32.0.dev0"
}