from transformers import Qwen3OmniMoeThinkerForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info
import torch

model_path = "./"

# Load the thinker-only model (text output, no speech synthesis) in bfloat16
# with SDPA attention, sharding across available devices.
model = Qwen3OmniMoeThinkerForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="sdpa",
)
processor = Qwen3OmniMoeProcessor.from_pretrained(model_path)

conversation = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are a speech recognition model."},
            # {"type": "text", "text": "You are a helpful assistant."},
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": "test.wav"},
            {"type": "text", "text": "Transcribe the audio into text."},
        ],
    },
]

# Whether to extract the audio track from video inputs (no videos here).
USE_AUDIO_IN_VIDEO = False

# Preparation for inference: render the chat template, then load the
# multimodal inputs referenced by the conversation.
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
# print("audios: ", len(audios[0]) / 16000, text)
inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)
inputs = inputs.to(model.device).to(model.dtype)

# Generation is greedy by default; note that in transformers, temperature
# only takes effect when do_sample=True is also passed.
text_ids = model.generate(**inputs, temperature=0.01)

# Strip the prompt tokens and decode only the newly generated text.
text = processor.batch_decode(
    text_ids[:, inputs["input_ids"].shape[1]:],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(text)