{
  "architectures": [
    "CFM"
  ],
  "attn_implementation": "chunk_attn",
  "audio_drop_prob": 0.3,
  "chunk_size": 2048,
  "cond_drop_prob": 0.2,
  "conv_layers": 4,
  "depth": 18,
  "frac_lengths_mask": [
    0.7,
    1.0
  ],
  "hidden_size": 768,
  "hop_length": 256,
  "intermediate_scale": 2,
  "local_window": 384,
  "max_position_embeddings": 131072,
  "mel_spec_type": "vocos",
  "model_type": "f5_tts",
  "n_fft": 1024,
  "n_mel_channels": 100,
  "num_attention_heads": 12,
  "num_key_value_heads": 4,
  "odeint_kwargs": {
    "method": "euler"
  },
  "sigma": 0.0,
  "target_sample_rate": 24000,
  "text_hidden_size": 512,
  "torch_dtype": "float32",
  "transformers_version": "4.47.1",
  "vocab_size": 54,
  "win_length": 1024
}