Update test.py
test.py CHANGED
@@ -1,52 +1,47 @@
-from transformers import Qwen3OmniMoeThinkerForConditionalGeneration, Qwen3OmniMoeProcessor
-from qwen_omni_utils import process_mm_info
-import torch
-
-model_path = "/
-model = Qwen3OmniMoeThinkerForConditionalGeneration.from_pretrained(
-    model_path,
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    attn_implementation="
-)
-processor = Qwen3OmniMoeProcessor.from_pretrained(model_path)
-
-
-conversation = [
-    {
-        "role": "system",
-        "content": [
-            {"type": "text", "text": "You are a speech recognition model."}
-            #{"type": "text", "text": "You are a helpful assistant."}
-        ],
-    },
-    {
-        "role": "user",
-        "content": [
-            {"type": "audio", "audio": "test.wav"},
-            {"type": "text", "text": "Transcribe the audio into text."},
-        ],
-    },
-]
-
-# set use audio in video
-USE_AUDIO_IN_VIDEO = False
-
-# Preparation for inference
-text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
-#print("audios: ", len(audios[0]) / 16000, text)
-inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=USE_AUDIO_IN_VIDEO)
-inputs = inputs.to(model.device).to(model.dtype)
-
-text_ids = model.generate(**inputs, temperature=0.01)
-
-
-
-
-
-    "pretrain",
-    safe_serialization=True,
-    max_shard_size="4GB"
-)
-processor.save_pretrained("pretrain")
+from transformers import Qwen3OmniMoeThinkerForConditionalGeneration, Qwen3OmniMoeProcessor
+from qwen_omni_utils import process_mm_info
+import torch
+
+model_path = "./"
+model = Qwen3OmniMoeThinkerForConditionalGeneration.from_pretrained(
+    model_path,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    attn_implementation="sdpa",
+)
+processor = Qwen3OmniMoeProcessor.from_pretrained(model_path)
+
+
+conversation = [
+    {
+        "role": "system",
+        "content": [
+            {"type": "text", "text": "You are a speech recognition model."}
+            #{"type": "text", "text": "You are a helpful assistant."}
+        ],
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "audio", "audio": "test.wav"},
+            {"type": "text", "text": "Transcribe the audio into text."},
+        ],
+    },
+]
+
+# set use audio in video
+USE_AUDIO_IN_VIDEO = False
+
+# Preparation for inference
+text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+#print("audios: ", len(audios[0]) / 16000, text)
+inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+inputs = inputs.to(model.device).to(model.dtype)
+
+text_ids = model.generate(**inputs, temperature=0.01)
+
+text = processor.batch_decode(text_ids[:, inputs["input_ids"].shape[1] :],
+                              skip_special_tokens=True,
+                              clean_up_tokenization_spaces=False)
+print(text)
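The decode step added at the end of the script follows the standard transformers pattern: generate() returns the prompt tokens and the newly generated tokens in a single tensor, so the first inputs["input_ids"].shape[1] tokens are sliced off before batch_decode, leaving only the transcription. A minimal sketch of the same pattern with a small text-only stand-in model (gpt2 is an arbitrary choice here, not part of this repo):

from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical stand-in; the script above uses the Qwen3-Omni thinker model.
tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("The transcription is:", return_tensors="pt")
ids = lm.generate(**inputs, max_new_tokens=20, pad_token_id=tok.eos_token_id)

# generate() returns prompt + continuation in one tensor;
# keep only the continuation before decoding.
new_tokens = ids[:, inputs["input_ids"].shape[1]:]
print(tok.batch_decode(new_tokens, skip_special_tokens=True))

One caveat: temperature only takes effect when do_sample=True, so with the default greedy decoding used here, recent transformers versions ignore temperature=0.01 and emit a warning.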