{ "architectures": [ "FlowMatchingWithBigVGan" ], "dtype": "float32", "model_config": { "attention_dropout": 0.0, "cfg_dropout": 0.2, "cfg_strength": 0.7, "dt": 0.1, "embedding_dim": 768, "hidden_size": 512, "intermediate_size": 1024, "max_position_embeddings": null, "mean": -5.8843, "model_type": "", "num_attention_heads": 2, "num_encoder_layers": 2, "num_hidden_layers": 4, "num_mel_bins": 80, "rope_parameters": { "rope_theta": 10000.0, "rope_type": "default" }, "rope_theta": 10000.0, "std": 2.2615, "vocab_size": 8192 }, "model_type": "flow_matching_with_bigvgan", "transformers_version": "5.8.0.dev0", "vocoder_config": { "architectures": [ "Qwen2_5OmniToken2WavBigVGANModel" ], "dtype": "float32", "mel_dim": 80, "model_type": "qwen2_5_omni_bigvgan", "resblock_dilation_sizes": [ [ 1, 3, 5 ], [ 1, 3, 5 ], [ 1, 3, 5 ] ], "resblock_kernel_sizes": [ 3, 7, 11 ], "upsample_initial_channel": 512, "upsample_kernel_sizes": [ 10, 9, 8, 4, 4 ], "upsample_rates": [ 5, 4, 4, 2, 2 ] } }