Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

README.md +46 -0
config.json +116 -0
model.safetensors +3 -0
model.safetensors.index.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,46 @@

+---
+license: mit
+language:
+- zh
+- en
+library_name: mlx-audio
+tags:
+- mlx
+- text-to-speech
+- speech
+- speech generation
+- voice cloning
+- tts
+- mlx-audio
+---
+# mlx-community/LongCat-AudioDiT-1B-4bit
+This model was converted to MLX format from [`meituan-longcat/LongCat-AudioDiT-1B`](https://huggingface.co/meituan-longcat/LongCat-AudioDiT-1B) using mlx-audio version **0.4.3**.
+Refer to the [original model card](https://huggingface.co/meituan-longcat/LongCat-AudioDiT-1B) for more details on the model.
+## Use with mlx-audio
+```bash
+pip install -U mlx-audio
+```
+### CLI Example:
+```bash
+python -m mlx_audio.tts.generate --model mlx-community/LongCat-AudioDiT-1B-4bit --text "Hello, this is a test."
+```
+### Python Example:
+```python
+from mlx_audio.tts.utils import load_model
+from mlx_audio.tts.generate import generate_audio
+model = load_model("mlx-community/LongCat-AudioDiT-1B-4bit")
+generate_audio(
+    model=model,
+    text="Hello, this is a test.",
+    ref_audio="path_to_audio.wav",  # placeholder path: point this at your own reference audio file
+    file_prefix="test_audio",
+)
+```

config.json ADDED Viewed

	@@ -0,0 +1,116 @@

+{
+    "dit_adaln_type": "global",
+    "dit_adaln_use_text_cond": true,
+    "dit_bias": true,
+    "dit_cross_attn": true,
+    "dit_cross_attn_norm": false,
+    "dit_depth": 24,
+    "dit_dim": 1536,
+    "dit_dropout": 0.0,
+    "dit_eps": 1e-06,
+    "dit_ff_mult": 4,
+    "dit_heads": 24,
+    "dit_long_skip": true,
+    "dit_qk_norm": true,
+    "dit_text_conv": true,
+    "dit_text_dim": 768,
+    "dit_use_latent_condition": true,
+    "latent_dim": 64,
+    "latent_hop": 2048,
+    "max_wav_duration": 30,
+    "model_type": "longcat_audiodit",
+    "quantization": {
+        "group_size": 64,
+        "bits": 4,
+        "mode": "affine"
+    },
+    "quantization_config": {
+        "group_size": 64,
+        "bits": 4,
+        "mode": "affine"
+    },
+    "repa_dit_layer": 8,
+    "sampling_rate": 24000,
+    "sigma": 0.0,
+    "text_add_embed": true,
+    "text_encoder_config": {
+        "_name_or_path": "ArthurZ/umt5-base",
+        "architectures": [
+            "UMT5ForConditionalGeneration"
+        ],
+        "chunk_size_feed_forward": 0,
+        "classifier_dropout": 0.0,
+        "d_ff": 2048,
+        "d_kv": 64,
+        "d_model": 768,
+        "decoder_start_token_id": 0,
+        "dense_act_fn": "gelu_new",
+        "dropout_rate": 0.1,
+        "dtype": "float32",
+        "eos_token_id": 1,
+        "feed_forward_proj": "gated-gelu",
+        "id2label": {
+            "0": "LABEL_0",
+            "1": "LABEL_1"
+        },
+        "initializer_factor": 1.0,
+        "is_decoder": false,
+        "is_encoder_decoder": true,
+        "is_gated_act": true,
+        "label2id": {
+            "LABEL_0": 0,
+            "LABEL_1": 1
+        },
+        "layer_norm_epsilon": 1e-06,
+        "model_type": "umt5",
+        "num_decoder_layers": 12,
+        "num_heads": 12,
+        "num_layers": 12,
+        "output_attentions": false,
+        "output_hidden_states": false,
+        "output_past": true,
+        "pad_token_id": 0,
+        "problem_type": null,
+        "relative_attention_max_distance": 128,
+        "relative_attention_num_buckets": 32,
+        "return_dict": true,
+        "scalable_attention": true,
+        "tie_word_embeddings": true,
+        "tokenizer_class": "T5Tokenizer",
+        "use_cache": true,
+        "vocab_size": 256384
+    },
+    "text_encoder_model": "google/umt5-base",
+    "text_norm_feat": true,
+    "transformers_version": "5.3.0",
+    "vae_config": {
+        "c_mults": [
+            1,
+            2,
+            4,
+            8,
+            16
+        ],
+        "channels": 128,
+        "downsample_shortcut": "averaging",
+        "downsampling_ratio": 2048,
+        "encoder_latent_dim": 128,
+        "final_tanh": false,
+        "in_channels": 1,
+        "in_shortcut": "duplicating",
+        "latent_dim": 64,
+        "model_type": "audiodit_vae",
+        "out_shortcut": "averaging",
+        "sample_rate": 24000,
+        "scale": 0.71,
+        "strides": [
+            2,
+            4,
+            4,
+            8,
+            8
+        ],
+        "upsample_shortcut": "duplicating",
+        "use_snake": true
+    }
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2f492cab2990823b7dbdd927e8faf87369bc993bbf145a48265fba9218b5eccf
+size 1417765849

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff