Text-to-Speech
Transformers
Safetensors
omnivoice
tts
singing
emotion
expressive-tts
multilingual
voice-cloning
Instructions to use ModelsLab/omnivoice-singing with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModelsLab/omnivoice-singing with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-to-speech", model="ModelsLab/omnivoice-singing")
```
```python
# Load model directly
from transformers import OmniVoice

model = OmniVoice.from_pretrained("ModelsLab/omnivoice-singing", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
File size: 2,531 Bytes
2c17b79
{
"acoustic_model_config": {
"codebook_dim": 8,
"codebook_loss_weight": 1.0,
"codebook_size": 1024,
"commitment_loss_weight": 0.25,
"decoder_hidden_size": 1024,
"downsampling_ratios": [
8,
5,
4,
2,
3
],
"encoder_hidden_size": 64,
"hidden_size": 256,
"hop_length": 960,
"model_type": "dac",
"n_codebooks": 9,
"quantizer_dropout": 0,
"sampling_rate": 16000,
"upsampling_ratios": [
8,
5,
4,
2,
3
]
},
"architectures": [
"HiggsAudioV2TokenizerModel"
],
"block_dilations": [
1,
1
],
"channel_ratios": [
1,
1
],
"codebook_dim": 64,
"codebook_size": 1024,
"downsample_factor": 320,
"dtype": "float32",
"initializer_range": 0.02,
"kernel_size": 3,
"model_type": "higgs_audio_v2_tokenizer",
"sample_rate": 24000,
"semantic_model_config": {
"activation_dropout": 0.1,
"apply_spec_augment": true,
"attention_dropout": 0.1,
"bos_token_id": 1,
"classifier_proj_size": 256,
"conv_bias": false,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_pos_batch_norm": false,
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "sum",
"ctc_zero_infinity": false,
"do_stable_layer_norm": false,
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_norm": "group",
"feat_proj_dropout": 0.0,
"feat_proj_layer_norm": true,
"final_dropout": 0.1,
"hidden_act": "gelu",
"hidden_dropout": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"layerdrop": 0.1,
"mask_feature_length": 10,
"mask_feature_min_masks": 0,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_min_masks": 2,
"mask_time_prob": 0.0,
"model_type": "hubert",
"num_attention_heads": 12,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_feat_extract_layers": 7,
"num_hidden_layers": 12,
"pad_token_id": 0,
"use_weighted_layer_sum": false,
"vocab_size": 32
},
"semantic_sample_rate": 16000,
"strides": [
1,
1
],
"target_bandwidths": [
0.5,
1,
1.5,
2
],
"transformers_version": "5.3.0.dev0",
"unit_kernel_size": 3
}