prince-canuma committed on
Commit
c875bcf
·
verified ·
1 Parent(s): 9e8b8d6

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +20 -17
README.md CHANGED
@@ -61,6 +61,7 @@ from mlx_audio.sts.models.lfm_audio import (
61
  ChatState,
62
  LFMModality,
63
  )
 
64
 
65
  # Load model and processor
66
  model = LFM2AudioModel.from_pretrained("mlx-community/LFM2.5-Audio-1.5B-6bit")
@@ -69,32 +70,34 @@ processor = LFM2AudioProcessor.from_pretrained("mlx-community/LFM2.5-Audio-1.5B-
69
  # Create chat state
70
  chat = ChatState(processor)
71
  chat.new_turn("system")
72
- chat.add_text("Respond with audio.")
73
  chat.end_turn()
74
  chat.new_turn("user")
75
- chat.add_text("Say: Hello, welcome to MLX Audio!")
76
  chat.end_turn()
77
  chat.new_turn("assistant")
78
 
79
  # Generate with interleaved text and audio
80
- text_out, audio_out = [], []
81
- for token, modality in model.generate_interleaved(**dict(chat), max_new_tokens=2048):
 
 
 
 
 
82
  mx.eval(token)
83
- if modality == LFMModality.TEXT:
84
- text_out.append(token)
85
- print(processor.decode_text(token[None]), end="", flush=True)
86
- else:
87
- audio_out.append(token)
88
 
89
- # Decode audio - each token is (8,) for all codebooks
90
- if audio_out:
91
- audio_codes = mx.stack(audio_out[:-1], axis=1)[None, :] # (1, 8, T)
92
- waveform = processor.decode_with_detokenizer(audio_codes)
93
- # Or use Mimi codec: waveform = processor.decode_audio(audio_codes[0])
94
 
95
- # Save audio (24kHz sample rate)
96
- import soundfile as sf
97
- sf.write("output.wav", waveform[0].tolist(), 24000)
98
  ```
99
 
100
  ### Speech-to-Text (ASR)
 
61
  ChatState,
62
  LFMModality,
63
  )
64
+ from mlx_audio.sts.models.lfm_audio.model import AUDIO_EOS_TOKEN
65
 
66
  # Load model and processor
67
  model = LFM2AudioModel.from_pretrained("mlx-community/LFM2.5-Audio-1.5B-6bit")
 
70
  # Create chat state
71
  chat = ChatState(processor)
72
  chat.new_turn("system")
73
+ chat.add_text("Perform TTS. Use a UK male voice.")
74
  chat.end_turn()
75
  chat.new_turn("user")
76
+ chat.add_text("Hello, welcome to MLX Audio!")
77
  chat.end_turn()
78
  chat.new_turn("assistant")
79
 
80
  # Generate with interleaved text and audio
81
+ audio_codes = []
82
+ for token, modality in model.generate_sequential(
83
+ **dict(chat),
84
+ max_new_tokens=2048,
85
+ temperature=0.8,
86
+
87
+ ):
88
  mx.eval(token)
89
+ if modality == LFMModality.AUDIO_OUT:
90
+ if token[0].item() == AUDIO_EOS_TOKEN:
91
+ break
92
+ audio_codes.append(token)
 
93
 
94
+ # Decode audio
95
+ audio_codes = mx.stack(audio_codes, axis=0)[None, :].transpose(0, 2, 1)
96
+ waveform = processor.decode_audio(audio_codes)
 
 
97
 
98
+ # Save audio (24kHz sample rate)
99
+ import soundfile as sf
100
+ sf.write("output.wav", waveform[0].tolist(), model.sample_rate)
101
  ```
102
 
103
  ### Speech-to-Text (ASR)