Update README.md
Browse files
README.md
CHANGED
|
@@ -62,6 +62,7 @@ from mlx_audio.sts.models.lfm_audio import (
|
|
| 62 |
ChatState,
|
| 63 |
LFMModality,
|
| 64 |
)
|
|
|
|
| 65 |
|
| 66 |
# Load model and processor
|
| 67 |
model = LFM2AudioModel.from_pretrained("mlx-community/LFM2.5-Audio-1.5B-4bit")
|
|
@@ -70,32 +71,34 @@ processor = LFM2AudioProcessor.from_pretrained("mlx-community/LFM2.5-Audio-1.5B-
|
|
| 70 |
# Create chat state
|
| 71 |
chat = ChatState(processor)
|
| 72 |
chat.new_turn("system")
|
| 73 |
-
chat.add_text("
|
| 74 |
chat.end_turn()
|
| 75 |
chat.new_turn("user")
|
| 76 |
-
chat.add_text("
|
| 77 |
chat.end_turn()
|
| 78 |
chat.new_turn("assistant")
|
| 79 |
|
| 80 |
# Generate with interleaved text and audio
|
| 81 |
-
|
| 82 |
-
for token, modality in model.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
mx.eval(token)
|
| 84 |
-
if modality == LFMModality.
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
audio_out.append(token)
|
| 89 |
|
| 90 |
-
# Decode audio
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
waveform = processor.decode_with_detokenizer(audio_codes)
|
| 94 |
-
# Or use Mimi codec: waveform = processor.decode_audio(audio_codes[0])
|
| 95 |
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
```
|
| 100 |
|
| 101 |
### Speech-to-Text (ASR)
|
|
|
|
| 62 |
ChatState,
|
| 63 |
LFMModality,
|
| 64 |
)
|
| 65 |
+
from mlx_audio.sts.models.lfm_audio.model import AUDIO_EOS_TOKEN
|
| 66 |
|
| 67 |
# Load model and processor
|
| 68 |
model = LFM2AudioModel.from_pretrained("mlx-community/LFM2.5-Audio-1.5B-4bit")
|
|
|
|
| 71 |
# Create chat state
|
| 72 |
chat = ChatState(processor)
|
| 73 |
chat.new_turn("system")
|
| 74 |
+
chat.add_text("Perform TTS. Use a UK male voice.")
|
| 75 |
chat.end_turn()
|
| 76 |
chat.new_turn("user")
|
| 77 |
+
chat.add_text("Hello, welcome to MLX Audio!")
|
| 78 |
chat.end_turn()
|
| 79 |
chat.new_turn("assistant")
|
| 80 |
|
| 81 |
# Generate sequentially, collecting audio tokens until the audio EOS token
|
| 82 |
+
audio_codes = []
|
| 83 |
+
for token, modality in model.generate_sequential(
|
| 84 |
+
**dict(chat),
|
| 85 |
+
max_new_tokens=2048,
|
| 86 |
+
temperature=0.8,
|
| 87 |
+
|
| 88 |
+
):
|
| 89 |
mx.eval(token)
|
| 90 |
+
if modality == LFMModality.AUDIO_OUT:
|
| 91 |
+
if token[0].item() == AUDIO_EOS_TOKEN:
|
| 92 |
+
break
|
| 93 |
+
audio_codes.append(token)
|
|
|
|
| 94 |
|
| 95 |
+
# Decode audio
|
| 96 |
+
audio_codes = mx.stack(audio_codes, axis=0)[None, :].transpose(0, 2, 1)
|
| 97 |
+
waveform = processor.decode_audio(audio_codes)
|
|
|
|
|
|
|
| 98 |
|
| 99 |
+
# Save audio (24kHz sample rate)
|
| 100 |
+
import soundfile as sf
|
| 101 |
+
sf.write("output.wav", waveform[0].tolist(), model.sample_rate)
|
| 102 |
```
|
| 103 |
|
| 104 |
### Speech-to-Text (ASR)
|