Instructions to use mlx-community/audiogen-medium-mlx with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- MLX
How to use mlx-community/audiogen-medium-mlx with MLX:
# Download the model from the Hub
pip install "huggingface_hub[hf_xet]"
huggingface-cli download --local-dir audiogen-medium-mlx mlx-community/audiogen-medium-mlx
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- LM Studio
Initial upload: AudioGen Medium MLX-native port
Browse files
- .gitattributes +1 -34
- README.md +51 -0
- config.json +82 -0
- model.safetensors +3 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,2 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -1,3 +1,54 @@
|
|
| 1 |
---
|
| 2 |
license: cc-by-nc-4.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
license: cc-by-nc-4.0
|
| 3 |
+
library_name: mlx
|
| 4 |
+
pipeline_tag: text-to-audio
|
| 5 |
+
base_model: facebook/audiogen-medium
|
| 6 |
+
tags:
|
| 7 |
+
- audio-generation
|
| 8 |
+
- text-to-audio
|
| 9 |
+
- audiogen
|
| 10 |
+
- mlx
|
| 11 |
+
- encodec
|
| 12 |
---
|
| 13 |
+
|
| 14 |
+
# AudioGen Medium (MLX)
|
| 15 |
+
|
| 16 |
+
This is the MLX-native port of [facebook/audiogen-medium](https://huggingface.co/facebook/audiogen-medium), a 1.5B parameter autoregressive transformer for text-to-audio generation.
|
| 17 |
+
|
| 18 |
+
## Model Details
|
| 19 |
+
|
| 20 |
+
- **Architecture**: Autoregressive Transformer LM over EnCodec discrete tokens
|
| 21 |
+
- **Parameters**: ~1.5B (LM) + EnCodec compression model
|
| 22 |
+
- **Sampling rate**: 16 kHz
|
| 23 |
+
- **Frame rate**: 50 Hz (4 codebooks, delayed pattern)
|
| 24 |
+
- **Text encoder**: T5-small (loaded separately)
|
| 25 |
+
- **Max duration**: 10 seconds (configurable)
|
| 26 |
+
|
| 27 |
+
## Files
|
| 28 |
+
|
| 29 |
+
- `config.json` — Model configuration
|
| 30 |
+
- `model.safetensors` — LM + EnCodec weights
|
| 31 |
+
- `model.safetensors.index.json` — Weight index (present only in sharded variants; this upload ships a single `model.safetensors`)
|
| 32 |
+
- `tokenizer.json` / `tokenizer_config.json` — T5 tokenizer files
|
| 33 |
+
|
| 34 |
+
## Usage (Swift/MLX)
|
| 35 |
+
|
| 36 |
+
```swift
|
| 37 |
+
import MLXAudioGen
|
| 38 |
+
|
| 39 |
+
let model = try await AudioGenModel.fromPretrained(
|
| 40 |
+
modelFolder: modelURL,
|
| 41 |
+
t5Folder: t5URL
|
| 42 |
+
)
|
| 43 |
+
let audio = try await model.generateAudio(
|
| 44 |
+
description: "dog barking",
|
| 45 |
+
duration: 5.0,
|
| 46 |
+
cfgCoef: 3.0,
|
| 47 |
+
temperature: 1.0,
|
| 48 |
+
topK: 250
|
| 49 |
+
)
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
## License
|
| 53 |
+
|
| 54 |
+
This model is published under the [CC-BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) license (non-commercial use only), following the original [AudioGen license](https://huggingface.co/facebook/audiogen-medium).
|
config.json
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "audiogen",
|
| 3 |
+
"nQ": 4,
|
| 4 |
+
"card": 2048,
|
| 5 |
+
"dim": 1536,
|
| 6 |
+
"numHeads": 24,
|
| 7 |
+
"hiddenScale": 4,
|
| 8 |
+
"numLayers": 48,
|
| 9 |
+
"causal": true,
|
| 10 |
+
"crossAttention": true,
|
| 11 |
+
"dropout": 0.0,
|
| 12 |
+
"activation": "gelu",
|
| 13 |
+
"norm": "layer_norm",
|
| 14 |
+
"normFirst": true,
|
| 15 |
+
"biasFF": false,
|
| 16 |
+
"biasAttn": false,
|
| 17 |
+
"layerScale": null,
|
| 18 |
+
"context": 4096,
|
| 19 |
+
"maxPeriod": 10000,
|
| 20 |
+
"positionalEmbedding": "sin",
|
| 21 |
+
"positionalScale": 1.0,
|
| 22 |
+
"xPos": false,
|
| 23 |
+
"weight": 1.0,
|
| 24 |
+
"conditionProvider": "t5",
|
| 25 |
+
"twoStepCFG": false,
|
| 26 |
+
"kvRepeat": 1,
|
| 27 |
+
"qkLayerNorm": false,
|
| 28 |
+
"emptyLikeInit": false,
|
| 29 |
+
"emptyLikeInitDetokenized": false,
|
| 30 |
+
"zeroHypothesisRate": 0.0,
|
| 31 |
+
"quantize": false,
|
| 32 |
+
"weightsPerStep": null,
|
| 33 |
+
"frameRate": 50.0,
|
| 34 |
+
"sampleRate": 16000,
|
| 35 |
+
"duration": 10.0,
|
| 36 |
+
"numSamples": 1,
|
| 37 |
+
"specialToken": 2048,
|
| 38 |
+
"tokenizer": "t5-small",
|
| 39 |
+
"clsToken": 2048,
|
| 40 |
+
"padToken": 2048,
|
| 41 |
+
"encodec": {
|
| 42 |
+
"model_type": "encodec",
|
| 43 |
+
"audio_channels": 1,
|
| 44 |
+
"num_filters": 32,
|
| 45 |
+
"kernel_size": 7,
|
| 46 |
+
"num_residual_layers": 1,
|
| 47 |
+
"dilation_growth_rate": 2,
|
| 48 |
+
"codebook_size": 2048,
|
| 49 |
+
"codebook_dim": 128,
|
| 50 |
+
"hidden_size": 128,
|
| 51 |
+
"num_lstm_layers": 2,
|
| 52 |
+
"residual_kernel_size": 3,
|
| 53 |
+
"use_causal_conv": true,
|
| 54 |
+
"normalize": false,
|
| 55 |
+
"pad_mode": "reflect",
|
| 56 |
+
"norm_type": "time_group_norm",
|
| 57 |
+
"last_kernel_size": 7,
|
| 58 |
+
"trim_right_ratio": 1.0,
|
| 59 |
+
"compress": 2,
|
| 60 |
+
"upsampling_ratios": [8, 5, 4, 2],
|
| 61 |
+
"target_bandwidths": [1.5, 3.0, 6.0, 12.0, 24.0],
|
| 62 |
+
"sampling_rate": 16000,
|
| 63 |
+
"chunk_length_s": null,
|
| 64 |
+
"overlap": null,
|
| 65 |
+
"use_conv_shortcut": false
|
| 66 |
+
},
|
| 67 |
+
"t5": {
|
| 68 |
+
"model_name": "t5-small",
|
| 69 |
+
"d_model": 512,
|
| 70 |
+
"d_kv": 64,
|
| 71 |
+
"d_ff": 2048,
|
| 72 |
+
"num_layers": 6,
|
| 73 |
+
"num_heads": 8,
|
| 74 |
+
"relative_attention_num_buckets": 32,
|
| 75 |
+
"relative_attention_max_distance": 128,
|
| 76 |
+
"dropout_rate": 0.1,
|
| 77 |
+
"layer_norm_epsilon": 1e-06,
|
| 78 |
+
"feed_forward_proj": "relu",
|
| 79 |
+
"vocab_size": 32128,
|
| 80 |
+
"tie_word_embeddings": true
|
| 81 |
+
}
|
| 82 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b3d73edf9f74765df95239449aa6b6c9e6e7ff34b37c42e4c28ce8ef52db91a6
|
| 3 |
+
size 3910789388
|