prince-canuma committed on
Commit
55d4de6
·
verified ·
1 Parent(s): 27220b7

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +20 -17
README.md CHANGED
@@ -62,6 +62,7 @@ from mlx_audio.sts.models.lfm_audio import (
62
  ChatState,
63
  LFMModality,
64
  )
 
65
 
66
  # Load model and processor
67
  model = LFM2AudioModel.from_pretrained("mlx-community/LFM2.5-Audio-1.5B-4bit")
@@ -70,32 +71,34 @@ processor = LFM2AudioProcessor.from_pretrained("mlx-community/LFM2.5-Audio-1.5B-
70
  # Create chat state
71
  chat = ChatState(processor)
72
  chat.new_turn("system")
73
- chat.add_text("Respond with audio.")
74
  chat.end_turn()
75
  chat.new_turn("user")
76
- chat.add_text("Say: Hello, welcome to MLX Audio!")
77
  chat.end_turn()
78
  chat.new_turn("assistant")
79
 
80
  # Generate with interleaved text and audio
81
- text_out, audio_out = [], []
82
- for token, modality in model.generate_interleaved(**dict(chat), max_new_tokens=2048):
 
 
 
 
 
83
  mx.eval(token)
84
- if modality == LFMModality.TEXT:
85
- text_out.append(token)
86
- print(processor.decode_text(token[None]), end="", flush=True)
87
- else:
88
- audio_out.append(token)
89
 
90
- # Decode audio - each token is (8,) for all codebooks
91
- if audio_out:
92
- audio_codes = mx.stack(audio_out[:-1], axis=1)[None, :] # (1, 8, T)
93
- waveform = processor.decode_with_detokenizer(audio_codes)
94
- # Or use Mimi codec: waveform = processor.decode_audio(audio_codes[0])
95
 
96
- # Save audio (24kHz sample rate)
97
- import soundfile as sf
98
- sf.write("output.wav", waveform[0].tolist(), 24000)
99
  ```
100
 
101
  ### Speech-to-Text (ASR)
 
62
  ChatState,
63
  LFMModality,
64
  )
65
+ from mlx_audio.sts.models.lfm_audio.model import AUDIO_EOS_TOKEN
66
 
67
  # Load model and processor
68
  model = LFM2AudioModel.from_pretrained("mlx-community/LFM2.5-Audio-1.5B-4bit")
 
71
  # Create chat state
72
  chat = ChatState(processor)
73
  chat.new_turn("system")
74
+ chat.add_text("Perform TTS. Use a UK male voice.")
75
  chat.end_turn()
76
  chat.new_turn("user")
77
+ chat.add_text("Hello, welcome to MLX Audio!")
78
  chat.end_turn()
79
  chat.new_turn("assistant")
80
 
81
  # Generate with interleaved text and audio
82
+ audio_codes = []
83
+ for token, modality in model.generate_sequential(
84
+ **dict(chat),
85
+ max_new_tokens=2048,
86
+ temperature=0.8,
87
+
88
+ ):
89
  mx.eval(token)
90
+ if modality == LFMModality.AUDIO_OUT:
91
+ if token[0].item() == AUDIO_EOS_TOKEN:
92
+ break
93
+ audio_codes.append(token)
 
94
 
95
+ # Decode audio
96
+ audio_codes = mx.stack(audio_codes, axis=0)[None, :].transpose(0, 2, 1)
97
+ waveform = processor.decode_audio(audio_codes)
 
 
98
 
99
+ # Save audio (24kHz sample rate)
100
+ import soundfile as sf
101
+ sf.write("output.wav", waveform[0].tolist(), model.sample_rate)
102
  ```
103
 
104
  ### Speech-to-Text (ASR)