ASLP-lab committed on
Commit
a9d2dde
·
verified ·
1 Parent(s): 026f52a

Create infer.py

Browse files
Files changed (1) hide show
  1. infer.py +120 -0
infer.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import json
4
+ import torch
5
+ import warnings
6
+
7
+ # ========== 环境设置 ==========
8
# vLLM runtime configuration. These variables are set before `vllm` is
# imported further down in this file.
os.environ['VLLM_USE_V1'] = '0'
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
os.environ["VLLM_LOGGING_LEVEL"] = "ERROR"

# Default GPU list follows the Qwen3-Omni reference setup. setdefault keeps a
# CUDA_VISIBLE_DEVICES value exported by the caller (e.g. to pin the script to
# a subset of GPUs) instead of silently overwriting it.
os.environ.setdefault('CUDA_VISIBLE_DEVICES', "0,1,2,3,4,5,6,7")

# Suppress all Python warnings for the rest of the run.
warnings.filterwarnings('ignore')
14
+
15
+ from qwen_omni_utils import process_mm_info
16
+ from transformers import Qwen3OmniMoeProcessor
17
+ from vllm import LLM, SamplingParams
18
+
19
+ # ========== 模型加载函数 ==========
20
def load_model_processor(model_path):
    """Load the vLLM engine and the Qwen3-Omni processor for *model_path*.

    The engine is configured with ``tensor_parallel_size`` equal to the
    number of CUDA devices visible to this process.

    Args:
        model_path: Model location passed to both ``LLM`` and
            ``Qwen3OmniMoeProcessor.from_pretrained``.

    Returns:
        A ``(model, processor)`` tuple.

    Raises:
        RuntimeError: If no CUDA device is visible, since a tensor-parallel
            size of 0 is not a usable configuration.
    """
    num_gpus = torch.cuda.device_count()
    if num_gpus < 1:
        # Fail early with an actionable message instead of handing
        # tensor_parallel_size=0 to the engine.
        raise RuntimeError(
            "No CUDA device is visible; check the GPU driver and "
            "CUDA_VISIBLE_DEVICES before loading the model."
        )
    print(f"检测到 {num_gpus} 个 GPU,设置 tensor_parallel_size 为 {num_gpus}")

    # NOTE(review): the engine may additionally require the GPU count to be
    # compatible with the model's layout — confirm for odd GPU counts.
    model = LLM(
        model=model_path,
        trust_remote_code=True,
        gpu_memory_utilization=0.90,
        tensor_parallel_size=num_gpus,
        max_num_seqs=4,
        max_model_len=32768,
        seed=1234,
    )

    processor = Qwen3OmniMoeProcessor.from_pretrained(model_path)
    return model, processor
36
+
37
+ # ========== 单条音频推理函数 ==========
38
def single_inference(model, processor, audio_path):
    """Run the acoustic-attribute analysis prompt on a single audio input.

    Args:
        model: Engine object exposing ``generate(inputs, sampling_params=...)``.
        processor: Processor exposing ``apply_chat_template``.
        audio_path: Audio reference placed in the user message.

    Returns:
        The text of the first completion of the first request output.
    """
    # Instruction (in Chinese) asking for a JSON report with the fields named
    # in the text: transcript with paralinguistic tags, language, background
    # sound, environment, gender, age, pitch, speed, emotion, emotion level,
    # accent, tone, rhythm, texture, pronunciation, paralinguistic events,
    # contextual inference and an overall caption.
    instruction = "".join((
        "对这段音频进行多维度声学属性分析,以json格式输出text_and_paralanguage(带副语言标签的文本转录),",
        "language(语言),background_sound(背景音),environment(声学环境),gender(性别),age(年龄),",
        "pitch(音高),speed(语速),emotion(情绪),emotion_level(情绪强度),accent(口音),",
        "tone(语气),rhythm(节奏/韵律),texture(音质),pronunciation(发音),",
        "paralinguistic(副语言事件),contextual_inference(语境推理)和caption(音频的综合摘要)。",
    ))

    # Single-turn conversation: the audio followed by the text instruction.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio_path},
                {"type": "text", "text": instruction},
            ],
        }
    ]

    # Render the chat prompt and extract the multimodal payloads.
    chat_prompt = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    audio_items, image_items, video_items = process_mm_info(
        conversation, use_audio_in_video=True
    )

    # Forward only the modalities that were actually returned (not None).
    modalities = (
        ("audio", audio_items),
        ("image", image_items),
        ("video", video_items),
    )
    request = {
        'prompt': chat_prompt,
        'multi_modal_data': {
            kind: payload for kind, payload in modalities if payload is not None
        },
        "mm_processor_kwargs": {"use_audio_in_video": True},
    }

    decoding = SamplingParams(temperature=0.01, top_p=0.1, top_k=1, max_tokens=2048)

    results = model.generate(request, sampling_params=decoding)
    return results[0].outputs[0].text
84
+
85
+ # ========== 主入口 ==========
86
if __name__ == "__main__":
    import argparse
    import multiprocessing as mp

    # Use the same start method as VLLM_WORKER_MULTIPROC_METHOD set at the
    # top of this file.
    mp.set_start_method("spawn", force=True)

    # The model and audio paths can be given on the command line; the
    # defaults are the original placeholders, so editing the file still works.
    parser = argparse.ArgumentParser(
        description="Run multi-dimensional acoustic analysis on one audio file."
    )
    parser.add_argument("--model-path", default="xxxx",
                        help="path to the model directory")
    parser.add_argument("--audio-path", default="xxx.wav",
                        help="path to the audio file to analyse")
    args = parser.parse_args()
    MODEL_PATH = args.model_path
    AUDIO_PATH = args.audio_path

    # Abort early when either path is missing on disk.
    if not os.path.exists(MODEL_PATH):
        print(f"❌ 模型路径不存在: {MODEL_PATH}")
        sys.exit(1)

    if not os.path.exists(AUDIO_PATH):
        print(f"❌ 音频文件不存在: {AUDIO_PATH}")
        sys.exit(1)

    print("🚀 正在加载模型...")
    model, processor = load_model_processor(MODEL_PATH)

    print(f"🎤 正在对音频进行推理: {AUDIO_PATH}")
    response = single_inference(model, processor, AUDIO_PATH)

    print("\n" + "="*50)
    print("📝 模型输出:")
    print(response)
    print("="*50)

    # Optional: pretty-print the output when it is valid JSON. Only the parse
    # itself sits in the try body so unrelated errors are not masked.
    try:
        parsed = json.loads(response)
    except json.JSONDecodeError:
        print("\n⚠️ 模型输出并非合法 JSON,以上为原始文本。")
    else:
        print("\n✅ 解析后的 JSON 内容:")
        print(json.dumps(parsed, indent=2, ensure_ascii=False))