hik63382
/

TTS-Multi_language_High_Quality

+{
+  "acoustic_model_config": {
+    "codebook_dim": 8,
+    "codebook_loss_weight": 1.0,
+    "codebook_size": 1024,
+    "commitment_loss_weight": 0.25,
+    "decoder_hidden_size": 1024,
+    "downsampling_ratios": [
+      8,
+      5,
+      4,
+      2,
+      3
+    ],
+    "encoder_hidden_size": 64,
+    "hidden_size": 256,
+    "hop_length": 960,
+    "model_type": "dac",
+    "n_codebooks": 9,
+    "quantizer_dropout": 0,
+    "sampling_rate": 16000,
+    "upsampling_ratios": [
+      8,
+      5,
+      4,
+      2,
+      3
+    ]
+  },
+  "architectures": [
+    "HiggsAudioV2TokenizerModel"
+  ],
+  "block_dilations": [
+    1,
+    1
+  ],
+  "channel_ratios": [
+    1,
+    1
+  ],
+  "codebook_dim": 64,
+  "codebook_size": 1024,
+  "downsample_factor": 320,
+  "dtype": "float32",
+  "initializer_range": 0.02,
+  "kernel_size": 3,
+  "model_type": "higgs_audio_v2_tokenizer",
+  "sample_rate": 24000,
+  "semantic_model_config": {
+    "activation_dropout": 0.1,
+    "apply_spec_augment": true,
+    "attention_dropout": 0.1,
+    "bos_token_id": 1,
+    "classifier_proj_size": 256,
+    "conv_bias": false,
+    "conv_dim": [
+      512,
+      512,
+      512,
+      512,
+      512,
+      512,
+      512
+    ],
+    "conv_kernel": [
+      10,
+      3,
+      3,
+      3,
+      3,
+      2,
+      2
+    ],
+    "conv_pos_batch_norm": false,
+    "conv_stride": [
+      5,
+      2,
+      2,
+      2,
+      2,
+      2,
+      2
+    ],
+    "ctc_loss_reduction": "sum",
+    "ctc_zero_infinity": false,
+    "do_stable_layer_norm": false,
+    "eos_token_id": 2,
+    "feat_extract_activation": "gelu",
+    "feat_extract_norm": "group",
+    "feat_proj_dropout": 0.0,
+    "feat_proj_layer_norm": true,
+    "final_dropout": 0.1,
+    "hidden_act": "gelu",
+    "hidden_dropout": 0.1,
+    "hidden_size": 768,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-05,
+    "layerdrop": 0.1,
+    "mask_feature_length": 10,
+    "mask_feature_min_masks": 0,
+    "mask_feature_prob": 0.0,
+    "mask_time_length": 10,
+    "mask_time_min_masks": 2,
+    "mask_time_prob": 0.0,
+    "model_type": "hubert",
+    "num_attention_heads": 12,
+    "num_conv_pos_embedding_groups": 16,
+    "num_conv_pos_embeddings": 128,
+    "num_feat_extract_layers": 7,
+    "num_hidden_layers": 12,
+    "pad_token_id": 0,
+    "use_weighted_layer_sum": false,
+    "vocab_size": 32
+  },
+  "semantic_sample_rate": 16000,
+  "strides": [
+    1,
+    1
+  ],
+  "target_bandwidths": [
+    0.5,
+    1,
+    1.5,
+    2
+  ],
+  "transformers_version": "5.3.0.dev0",
+  "unit_kernel_size": 3
+}