Text-to-Speech
KimiAudio
Safetensors
English
Chinese
audio
audio-language-model
speech-recognition
audio-understanding
audio-generation
chat
custom_code
Instructions to use moonshotai/Kimi-Audio-7B-Instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- KimiAudio
How to use moonshotai/Kimi-Audio-7B-Instruct with KimiAudio:
```python
# Example usage for KimiAudio
# pip install git+https://github.com/MoonshotAI/Kimi-Audio.git
import soundfile as sf
from kimia_infer.api.kimia import KimiAudio

model = KimiAudio(model_path="moonshotai/Kimi-Audio-7B-Instruct", load_detokenizer=True)

sampling_params = {
    "audio_temperature": 0.8,
    "audio_top_k": 10,
    "text_temperature": 0.0,
    "text_top_k": 5,
}

# For ASR
asr_audio = "asr_example.wav"
messages_asr = [
    {"role": "user", "message_type": "text", "content": "Please transcribe the following audio:"},
    {"role": "user", "message_type": "audio", "content": asr_audio},
]
_, text = model.generate(messages_asr, **sampling_params, output_type="text")
print(text)

# For Q&A
qa_audio = "qa_example.wav"
messages_conv = [{"role": "user", "message_type": "audio", "content": qa_audio}]
wav, text = model.generate(messages_conv, **sampling_params, output_type="both")
sf.write("output_audio.wav", wav.cpu().view(-1).numpy(), 24000)
print(text)
```
- Notebooks
- Google Colab
- Kaggle
[Fix] Fix code bugs in modeling for multiturn inference
#17
by KamioMitsuzu - opened
modeling_moonshot_kimia.py
CHANGED
|
@@ -685,14 +685,13 @@ class MoonshotKimiaModel(Qwen2PreTrainedModel):
|
|
| 685 |
.to(torch.cuda.current_device())
|
| 686 |
.to(whisper_dtype)
|
| 687 |
)
|
| 688 |
-
|
|
|
|
| 689 |
media_start_idx, media_end_idx
|
| 690 |
-
):
|
| 691 |
-
# assert whisper_emb.shape[1] == end_idx - (start_idx + 1)
|
| 692 |
|
| 693 |
feat_len = end_idx - (start_idx + 1)
|
| 694 |
whisper_input_feature_i = whisper_input_feature[seg_idx].squeeze(0)
|
| 695 |
-
assert feat_len == is_continuous_mask[seg_idx].sum()
|
| 696 |
expanded_whisper[start_idx + 1 : end_idx, :] = (
|
| 697 |
whisper_input_feature_i[:feat_len, :]
|
| 698 |
)
|
|
|
|
| 685 |
.to(torch.cuda.current_device())
|
| 686 |
.to(whisper_dtype)
|
| 687 |
)
|
| 688 |
+
assert (media_end_idx - media_start_idx).sum() - media_start_idx.shape[0] == is_continuous_mask.sum()
|
| 689 |
+
for seg_idx, ((batch_idx, start_idx), (_, end_idx)) in enumerate(zip(
|
| 690 |
media_start_idx, media_end_idx
|
| 691 |
+
)):
|
|
|
|
| 692 |
|
| 693 |
feat_len = end_idx - (start_idx + 1)
|
| 694 |
whisper_input_feature_i = whisper_input_feature[seg_idx].squeeze(0)
|
|
|
|
| 695 |
expanded_whisper[start_idx + 1 : end_idx, :] = (
|
| 696 |
whisper_input_feature_i[:feat_len, :]
|
| 697 |
)
|