{
  "architectures": ["FlowMatchingWithBigVGan"],
  "dtype": "float32",
  "model_config": {
    "architectures": ["FlowMatchingModel"],
    "attention_dropout": 0.0,
    "cfg_dropout": 0.2,
    "cfg_strength": 0.7,
    "dt": 0.1,
    "dtype": "float32",
    "embedding_dim": 768,
    "hidden_size": 512,
    "intermediate_size": 1024,
    "max_position_embeddings": null,
    "mean": -5.8843,
    "model_type": "",
    "num_attention_heads": 2,
    "num_hidden_layers": 4,
    "num_mel_bins": 80,
    "predict_duration": true,
    "rope_theta": 10000.0,
    "std": 2.2615,
    "vocab_size": 8192
  },
  "model_type": "flow_matching_with_bigvgan",
  "transformers_version": "4.56.1",
  "vocoder_config": {
    "_name_or_path": "models/bigvgan",
    "activation": "snakebeta",
    "architectures": ["BigVGAN"],
    "dtype": "float32",
    "model_in_dim": 80,
    "model_type": "bigvgan",
    "resblock_dilation_sizes": [
      [1, 3, 5],
      [1, 3, 5],
      [1, 3, 5]
    ],
    "resblock_kernel_sizes": [3, 7, 11],
    "snake_logscale": true,
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [10, 9, 8, 4, 4],
    "upsample_rates": [5, 4, 4, 2, 2],
    "use_bias_at_final": false,
    "use_tanh_at_final": false
  }
}