Spaces:
Running on Zero
Running on Zero
| { | |
| "semantic_llm": { | |
| "start_text_token": 32000, | |
| "stop_text_token": 32001, | |
| "num_text_tokens": 32002, | |
| "start_audio_token": 16384, | |
| "stop_audio_token": 16385, | |
| "num_audio_tokens": 16386, | |
| "llm_hidden_size": 1024, | |
| "llm_intermediate_size": 4096, | |
| "llm_num_layers": 30, | |
| "llm_num_heads": 16, | |
| "llm_max_audio_seq_len": 630, | |
| "llm_max_text_seq_len": 402, | |
| "llm_max_prompt_len": 250, | |
| "code_stride_len": 640, | |
| "EOS_TOKEN": 16385 | |
| }, | |
| "acoustic_llm": { | |
| "n_stacks": 1, | |
| "layers": 24, | |
| "model_dim": 1536, | |
| "heads": 16, | |
| "max_text_tokens": 2048, | |
| "max_speech_tokens": 2048, | |
| "max_conditioning_inputs": 1, | |
| "number_text_tokens": 16386, | |
| "start_text_token": 16384, | |
| "stop_text_token": 16385, | |
| "n_frames_per_step": 1, | |
| "n_heads_per_frame": 8, | |
| "delay_prediction": 1, | |
| "upsample_factors": 1, | |
| "streaming_delayed_frames": 8, | |
| "number_speech_tokens": 16386, | |
| "start_speech_token": 16384, | |
| "stop_speech_token": 16385, | |
| "speaker_embedding_pretrained": true, | |
| "speaker_embedding_ckpt": null, | |
| "speaker_embedding_dim": 512, | |
| "temperature": 0.5, | |
| "repetition_penalty": 2.0, | |
| "top_p": 0.5, | |
| "top_k": 25 | |
| }, | |
| "acoustic_codec": { | |
| "n_model_size": 1024, | |
| "encoder_config": { | |
| "ngf": 48, | |
| "up_ratios": [ | |
| 2, | |
| 4, | |
| 4, | |
| 4, | |
| 5 | |
| ], | |
| "causal": true | |
| }, | |
| "decoder_config": { | |
| "upsample_initial_channel": 1536, | |
| "ngf": 48, | |
| "up_ratios": [ | |
| 6, | |
| 5, | |
| 4, | |
| 4, | |
| 2 | |
| ], | |
| "causal": true | |
| }, | |
| "vq_config": { | |
| "n_groups": 8, | |
| "ordered": true, | |
| "codebook_size": [ | |
| 128, | |
| 128, | |
| 128, | |
| 128, | |
| 128, | |
| 128, | |
| 128, | |
| 128, | |
| 128, | |
| 128, | |
| 128, | |
| 128, | |
| 128, | |
| 128, | |
| 128, | |
| 128 | |
| ], | |
| "codebook_dim": [ | |
| 8, | |
| 8, | |
| 8, | |
| 8, | |
| 8, | |
| 8, | |
| 8, | |
| 8, | |
| 8, | |
| 8, | |
| 8, | |
| 8, | |
| 8, | |
| 8, | |
| 8, | |
| 8 | |
| ], | |
| "requires_projection": true, | |
| "decay": 0.99, | |
| "threshold_ema_dead_code": 0, | |
| "commitment_weight": 0.01 | |
| }, | |
| "resampler_config": { | |
| "source_sr": 16000, | |
| "target_sr": 16000 | |
| } | |
| }, | |
| "semantic_tokenizer": { | |
| "in_dim": 1024, | |
| "out_dim": 80, | |
| "n_model_size": 512, | |
| "downsample_scales": [ | |
| 1, | |
| 1, | |
| 1, | |
| 2 | |
| ], | |
| "upsample_scales": [ | |
| [ | |
| 2, | |
| 1 | |
| ], | |
| [ | |
| 2, | |
| 1, | |
| 1, | |
| 1 | |
| ] | |
| ], | |
| "mel_config": { | |
| "style": "BigVGAN", | |
| "filter_length": 1024, | |
| "hop_length": 160, | |
| "win_length": 640, | |
| "n_mel_channels": 80, | |
| "sampling_rate": 16000 | |
| }, | |
| "vq_config": { | |
| "codebook_size": [ | |
| 128, | |
| 128 | |
| ], | |
| "codebook_dim": [ | |
| 128, | |
| 128 | |
| ], | |
| "requires_projection": true | |
| }, | |
| "tree_config": [ | |
| { | |
| "downsample_rate": 1, | |
| "n_groups": 1, | |
| "dropout": 0 | |
| } | |
| ], | |
| "n_samples_per_token": 640, | |
| "checkpointing": true | |
| } | |
| } |