ClementDuhamel committed on
Commit
bbf4412
·
verified ·
1 Parent(s): 8ece500

Initial upload: AudioGen Medium MLX-native port

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -34
  2. README.md +51 -0
  3. config.json +82 -0
  4. model.safetensors +3 -0
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,3 +1,54 @@
1
  ---
2
  license: cc-by-nc-4.0
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: cc-by-nc-4.0
3
+ library_name: mlx
4
+ pipeline_tag: text-to-audio
5
+ base_model: facebook/audiogen-medium
6
+ tags:
7
+ - audio-generation
8
+ - text-to-audio
9
+ - audiogen
10
+ - mlx
11
+ - encodec
12
  ---
13
+
14
+ # AudioGen Medium (MLX)
15
+
16
+ This is the MLX-native port of [facebook/audiogen-medium](https://huggingface.co/facebook/audiogen-medium), a 1.5B-parameter autoregressive transformer for text-to-audio generation.
17
+
18
+ ## Model Details
19
+
20
+ - **Architecture**: Autoregressive Transformer LM over EnCodec discrete tokens
21
+ - **Parameters**: ~1.5B (LM) + EnCodec compression model
22
+ - **Sampling rate**: 16 kHz
23
+ - **Frame rate**: 50 Hz (4 codebooks, delay pattern)
24
+ - **Text encoder**: T5-small (loaded separately)
25
+ - **Max duration**: 10 seconds (configurable)
26
+
27
+ ## Files
28
+
29
+ - `config.json` — Model configuration
30
+ - `model.safetensors` — LM + EnCodec weights
31
+ - `model.safetensors.index.json` — Weight index (only present in sharded variants; this upload ships a single unsharded `model.safetensors`)
32
+ - `tokenizer.json` / `tokenizer_config.json` — T5 tokenizer files (not included in this upload; expected alongside the separately loaded T5 model)
33
+
34
+ ## Usage (Swift/MLX)
35
+
36
+ ```swift
37
+ import MLXAudioGen
38
+
39
+ let model = try await AudioGenModel.fromPretrained(
40
+ modelFolder: modelURL,
41
+ t5Folder: t5URL
42
+ )
43
+ let audio = try await model.generateAudio(
44
+ description: "dog barking",
45
+ duration: 5.0,
46
+ cfgCoef: 3.0,
47
+ temperature: 1.0,
48
+ topK: 250
49
+ )
50
+ ```
51
+
52
+ ## License
53
+
54
+ This model is published under the [CC-BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) license (non-commercial use only), following the original [AudioGen license](https://huggingface.co/facebook/audiogen-medium).
config.json ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "audiogen",
3
+ "nQ": 4,
4
+ "card": 2048,
5
+ "dim": 1536,
6
+ "numHeads": 24,
7
+ "hiddenScale": 4,
8
+ "numLayers": 48,
9
+ "causal": true,
10
+ "crossAttention": true,
11
+ "dropout": 0.0,
12
+ "activation": "gelu",
13
+ "norm": "layer_norm",
14
+ "normFirst": true,
15
+ "biasFF": false,
16
+ "biasAttn": false,
17
+ "layerScale": null,
18
+ "context": 4096,
19
+ "maxPeriod": 10000,
20
+ "positionalEmbedding": "sin",
21
+ "positionalScale": 1.0,
22
+ "xPos": false,
23
+ "weight": 1.0,
24
+ "conditionProvider": "t5",
25
+ "twoStepCFG": false,
26
+ "kvRepeat": 1,
27
+ "qkLayerNorm": false,
28
+ "emptyLikeInit": false,
29
+ "emptyLikeInitDetokenized": false,
30
+ "zeroHypothesisRate": 0.0,
31
+ "quantize": false,
32
+ "weightsPerStep": null,
33
+ "frameRate": 50.0,
34
+ "sampleRate": 16000,
35
+ "duration": 10.0,
36
+ "numSamples": 1,
37
+ "specialToken": 2048,
38
+ "tokenizer": "t5-small",
39
+ "clsToken": 2048,
40
+ "padToken": 2048,
41
+ "encodec": {
42
+ "model_type": "encodec",
43
+ "audio_channels": 1,
44
+ "num_filters": 32,
45
+ "kernel_size": 7,
46
+ "num_residual_layers": 1,
47
+ "dilation_growth_rate": 2,
48
+ "codebook_size": 2048,
49
+ "codebook_dim": 128,
50
+ "hidden_size": 128,
51
+ "num_lstm_layers": 2,
52
+ "residual_kernel_size": 3,
53
+ "use_causal_conv": true,
54
+ "normalize": false,
55
+ "pad_mode": "reflect",
56
+ "norm_type": "time_group_norm",
57
+ "last_kernel_size": 7,
58
+ "trim_right_ratio": 1.0,
59
+ "compress": 2,
60
+ "upsampling_ratios": [8, 5, 4, 2],
61
+ "target_bandwidths": [1.5, 3.0, 6.0, 12.0, 24.0],
62
+ "sampling_rate": 16000,
63
+ "chunk_length_s": null,
64
+ "overlap": null,
65
+ "use_conv_shortcut": false
66
+ },
67
+ "t5": {
68
+ "model_name": "t5-small",
69
+ "d_model": 512,
70
+ "d_kv": 64,
71
+ "d_ff": 2048,
72
+ "num_layers": 6,
73
+ "num_heads": 8,
74
+ "relative_attention_num_buckets": 32,
75
+ "relative_attention_max_distance": 128,
76
+ "dropout_rate": 0.1,
77
+ "layer_norm_epsilon": 1e-06,
78
+ "feed_forward_proj": "relu",
79
+ "vocab_size": 32128,
80
+ "tie_word_embeddings": true
81
+ }
82
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3d73edf9f74765df95239449aa6b6c9e6e7ff34b37c42e4c28ce8ef52db91a6
3
+ size 3910789388