```python
from transformers import Qwen3OmniMoeThinkerForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info
import torch
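
# Load the thinker model and its processor from the checkpoint directory.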
model_path = "./"
model = Qwen3OmniMoeThinkerForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="sdpa",
)
processor = Qwen3OmniMoeProcessor.from_pretrained(model_path)
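
# Build the chat: a system prompt plus a user turn that pairs the audio
# file with a transcription instruction.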
conversation = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are a speech recognition model."}
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": "test.wav"},
            {"type": "text", "text": "Transcribe the audio into text."},
        ],
    },
]
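
# Whether to use the audio track of video inputs; it has no effect here
# since no video is provided.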
USE_AUDIO_IN_VIDEO = False
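
# Render the chat template and extract the multimodal inputs from the conversation.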
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)

inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)
inputs = inputs.to(model.device).to(model.dtype)
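
# Generate the transcription; a near-zero temperature keeps decoding
# close to deterministic.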
text_ids = model.generate(**inputs, temperature=0.01)
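
# Decode only the newly generated tokens, skipping the prompt.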
text = processor.batch_decode(
    text_ids[:, inputs["input_ids"].shape[1]:],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(text)
```
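
Note: in Hugging Face `transformers`, `temperature` only takes effect when `do_sample=True`; under the default greedy decoding the `temperature=0.01` argument is ignored, so it can be dropped or paired with an explicit `do_sample=True`.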