Instructions to use mlx-community/audiogen-medium-mlx with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- MLX
How to use mlx-community/audiogen-medium-mlx with MLX:
```bash
# Download the model from the Hub
pip install huggingface_hub[hf_xet]
huggingface-cli download --local-dir audiogen-medium-mlx mlx-community/audiogen-medium-mlx
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- LM Studio
```json
{
  "model_type": "audiogen",
  "nQ": 4,
  "card": 2048,
  "dim": 1536,
  "numHeads": 24,
  "hiddenScale": 4,
  "numLayers": 48,
  "causal": true,
  "crossAttention": true,
  "dropout": 0.0,
  "activation": "gelu",
  "norm": "layer_norm",
  "normFirst": true,
  "biasFF": false,
  "biasAttn": false,
  "layerScale": null,
  "context": 4096,
  "maxPeriod": 10000,
  "positionalEmbedding": "sin",
  "positionalScale": 1.0,
  "xPos": false,
  "weight": 1.0,
  "conditionProvider": "t5",
  "twoStepCFG": false,
  "kvRepeat": 1,
  "qkLayerNorm": false,
  "emptyLikeInit": false,
  "emptyLikeInitDetokenized": false,
  "zeroHypothesisRate": 0.0,
  "quantize": false,
  "weightsPerStep": null,
  "normalize": true,
  "frameRate": 50.0,
  "sampleRate": 16000,
  "duration": 10.0,
  "numSamples": 1,
  "specialToken": 2048,
  "tokenizer": "t5-large",
  "t5_model_name": "t5-large",
  "clsToken": 2048,
  "padToken": 2048,
  "encodec": {
    "model_type": "encodec",
    "audio_channels": 1,
    "num_filters": 32,
    "kernel_size": 7,
    "num_residual_layers": 1,
    "dilation_growth_rate": 2,
    "codebook_size": 2048,
    "codebook_dim": 128,
    "hidden_size": 128,
    "num_lstm_layers": 2,
    "residual_kernel_size": 3,
    "use_causal_conv": true,
    "normalize": false,
    "pad_mode": "reflect",
    "norm_type": "time_group_norm",
    "last_kernel_size": 7,
    "trim_right_ratio": 1.0,
    "compress": 2,
    "upsampling_ratios": [8, 5, 4, 2],
    "target_bandwidths": [1.5, 3.0, 6.0, 12.0, 24.0],
    "sampling_rate": 16000,
    "chunk_length_s": null,
    "overlap": null,
    "use_conv_shortcut": false
  },
  "t5": {
    "model_name": "t5-large",
    "d_model": 1024,
    "d_kv": 64,
    "d_ff": 4096,
    "num_layers": 24,
    "num_heads": 16,
    "relative_attention_num_buckets": 32,
    "relative_attention_max_distance": 128,
    "dropout_rate": 0.0,
    "layer_norm_epsilon": 1e-06,
    "feed_forward_proj": "relu",
    "vocab_size": 32128,
    "tie_word_embeddings": true
  }
}
```