ReopenAI committed on
Commit
3acb53e
·
verified ·
1 Parent(s): 6c5ab5c

Update test.py

Browse files
Files changed (1) hide show
  1. test.py +47 -52
test.py CHANGED
@@ -1,52 +1,47 @@
"""Speech-recognition smoke test for a GPTQ-quantized Qwen3-Omni Thinker model.

Loads the quantized checkpoint, transcribes ``test.wav`` via the chat
template, prints the transcript, then re-saves the model/processor into
``pretrain`` with 4 GB safetensors shards.
"""
from transformers import Qwen3OmniMoeThinkerForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info
import torch

# Local directory holding the GPTQ W4A16-quantized Thinker checkpoint.
model_path = "/mnt/diskhd/Backup/DownloadModel/Qwen3-Omni-30B-A3B-Instruct/Qwen3-Omni-30B-A3B-Thinker-GPTQ-W4A16/pretrain/"
model = Qwen3OmniMoeThinkerForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
)
processor = Qwen3OmniMoeProcessor.from_pretrained(model_path)


conversation = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are a speech recognition model."}
            #{"type": "text", "text": "You are a helpful assistant."}
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": "test.wav"},
            {"type": "text", "text": "Transcribe the audio into text."},
        ],
    },
]

# set use audio in video
USE_AUDIO_IN_VIDEO = False

# Preparation for inference
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
#print("audios: ", len(audios[0]) / 16000, text)
inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=USE_AUDIO_IN_VIDEO)
# NOTE(review): BatchFeature.to(dtype) casts only floating-point tensors, so
# input_ids stay integral — confirm this holds for the installed transformers.
inputs = inputs.to(model.device).to(model.dtype)

# FIX: `temperature=0.01` was silently ignored (and warned about) because
# `do_sample` defaults to False; make the intended near-deterministic
# decoding explicit with plain greedy search.
text_ids = model.generate(**inputs, do_sample=False)
# FIX: the original `batch_decode(text_ids[0], ...)` fed a 1-D tensor to
# batch_decode, decoding each token id as its own "sequence" and returning
# one string per token. Strip the prompt tokens and decode the batch so the
# result is a single transcript string per sample.
text = processor.batch_decode(text_ids[:, inputs["input_ids"].shape[1]:],
                              skip_special_tokens=True,
                              clean_up_tokenization_spaces=False)
print(text)


# Re-export the (already quantized) weights and processor config.
model.save_pretrained(
    "pretrain",
    safe_serialization=True,
    max_shard_size="4GB"
)
processor.save_pretrained("pretrain")
 
"""Speech-recognition smoke test for a local Qwen3-Omni Thinker checkpoint.

Loads the model from the current directory, transcribes ``test.wav`` via
the chat template, and prints the decoded transcript (prompt tokens
stripped).
"""
from transformers import Qwen3OmniMoeThinkerForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info
import torch

# Checkpoint is expected in the current working directory.
model_path = "./"
model = Qwen3OmniMoeThinkerForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="sdpa",
)
processor = Qwen3OmniMoeProcessor.from_pretrained(model_path)


conversation = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are a speech recognition model."}
            #{"type": "text", "text": "You are a helpful assistant."}
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": "test.wav"},
            {"type": "text", "text": "Transcribe the audio into text."},
        ],
    },
]

# set use audio in video
USE_AUDIO_IN_VIDEO = False

# Preparation for inference
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
#print("audios: ", len(audios[0]) / 16000, text)
inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=USE_AUDIO_IN_VIDEO)
# NOTE(review): BatchFeature.to(dtype) casts only floating-point tensors, so
# input_ids stay integral — confirm this holds for the installed transformers.
inputs = inputs.to(model.device).to(model.dtype)

# FIX: `temperature=0.01` was silently ignored (and warned about) because
# `do_sample` defaults to False; make the intended near-deterministic
# decoding explicit with plain greedy search.
text_ids = model.generate(**inputs, do_sample=False)

# Drop the prompt tokens before decoding so only the generated transcript
# remains; batch_decode returns one string per batch element.
text = processor.batch_decode(text_ids[:, inputs["input_ids"].shape[1]:],
                              skip_special_tokens=True,
                              clean_up_tokenization_spaces=False)
print(text)