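"""Qwen3-Omni multimodal inference example.

Sends an image URL plus an audio clip to Qwen/Qwen3-Omni-30B-A3B-Instruct,
prints the text reply, and saves the generated speech to output.wav.
"""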
import soundfile as sf
import torch
from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
# Make sure qwen_omni_utils.py is in the same folder
from qwen_omni_utils import process_mm_info
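# (qwen_omni_utils is also published on PyPI; `pip install qwen-omni-utils -U`
# should work if you prefer not to vendor the file.)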

MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"

# 1. Load the model with its dedicated class. AutoModelForCausalLM does not
#    map to this architecture, and Qwen3-Omni is supported natively in recent
#    transformers, so trust_remote_code is not needed.
print("Loading model...")
model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    dtype="auto",
    device_map="auto",
    attn_implementation="flash_attention_2",  # requires flash-attn; drop or use "sdpa" if unavailable
)

# 2. Load the matching processor
print("Loading processor...")
processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cars.jpg"},
            {"type": "audio", "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav"},
            {"type": "text", "text": "What can you see and hear? Answer in one short sentence."}
        ],
    },
]
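# Each message's "content" is a list of typed parts; apply_chat_template below
# renders them into the model's multimodal chat format.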

USE_AUDIO_IN_VIDEO = True
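# Whether to use the audio track of video inputs. There is no video in this
# conversation, but the flag must be passed consistently to process_mm_info,
# the processor, and generate.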

# Preparation for inference
print("Processing inputs...")
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

# Make sure process_mm_info handles the URL inputs correctly
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)

inputs = processor(
    text=text, 
    audio=audios, 
    images=images, 
    videos=videos, 
    return_tensors="pt", 
    padding=True, 
    use_audio_in_video=USE_AUDIO_IN_VIDEO
)

inputs = inputs.to(model.device).to(model.dtype)
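# With device_map="auto" the weights may be sharded across devices;
# model.device is the device of the first parameter shard.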

# Inference
print("Generating...")
text_ids, audio = model.generate(
    **inputs, 
    speaker="Ethan", 
    thinker_return_dict_in_generate=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO
)
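# For this class, generate returns a (thinker_outputs, waveform) pair; audio
# is None if talker generation is disabled (e.g. after model.disable_talker()).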

# Decoding text
output_text = processor.batch_decode(
    text_ids.sequences[:, inputs["input_ids"].shape[1] :],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)
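# batch_decode returns a list with one string per sequence in the batch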

print(f"Output Text: {output_text}")

# Saving audio
if audio is not None:
    print("Saving audio to output.wav...")
    sf.write(
        "output.wav",
        audio.reshape(-1).detach().cpu().numpy(),
        samplerate=24000,
    )
    print("Done.")