import gradio as gr
import json
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer, TextIteratorStreamer
from threading import Thread

# --- Model configuration (unchanged; the startup logs show these load successfully) ---
MAIN_MODEL_ID = "OpenVINO/Qwen2.5-7B-Instruct-int4-ov" 
DRAFT_MODEL_ID = "hsuwill000/Qwen2.5-0.5B-Instruct-openvino-4bit"
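# Speculative decoding: the small draft model proposes several tokens at a
# time and the 7B main model verifies them in one forward pass, so both
# models must share the same tokenizer/vocabulary (both are Qwen2.5 here).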

print("🚀 启动引擎...")

# --- 1. Load models ---
try:
    tokenizer = AutoTokenizer.from_pretrained(MAIN_MODEL_ID)
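    # The LATENCY hint with a single stream tunes OpenVINO for lowest
    # per-request latency rather than batched throughput.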
    model = OVModelForCausalLM.from_pretrained(
        MAIN_MODEL_ID,
        ov_config={"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""},
    )
    
    try:
        draft_model = OVModelForCausalLM.from_pretrained(
            DRAFT_MODEL_ID,
            ov_config={"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""},
        )
        print("✅ 投机采样 (Speculative Decoding) 已激活")
    except:
        draft_model = None
        print("⚠️ 仅使用主模型推理")

except Exception as e:
    print(f"❌ 加载失败: {e}")
    model = None

# --- 2. Helper functions ---
def parse_system_prompt(mode, text_content, json_file):
    if mode == "文本模式":
        return text_content
    elif mode == "JSON模式":
        if json_file is None: return "You are a helpful assistant."
        try:
            with open(json_file.name, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, str): return data
            return data.get("system_prompt") or data.get("system") or data.get("prompt") or str(data)
        except Exception:
            return "Error parsing JSON."
    return "You are a helpful assistant."

# --- 3. Core logic (uses the legacy Gradio tuple history format) ---
def predict(message, history, mode, prompt_text, prompt_json):
    # history format: [[user_1, bot_1], [user_2, bot_2], ...]
    # message: the current user input (str)
    
    if model is None:
        yield history + [[message, "模型加载失败"]]
        return

    # 1. Resolve the system prompt
    sys_prompt = parse_system_prompt(mode, prompt_text, prompt_json)
    
    # 2. Convert the tuple history into the list-of-dicts format the chat template expects
    model_inputs = [{"role": "system", "content": sys_prompt}]
    
    for user_msg, bot_msg in history:
        model_inputs.append({"role": "user", "content": user_msg})
        model_inputs.append({"role": "assistant", "content": bot_msg})
        
    model_inputs.append({"role": "user", "content": message})
    
    # 3. Build the model inputs: render the chat template, then tokenize
    text = tokenizer.apply_chat_template(model_inputs, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt")

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
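    # skip_prompt=True keeps the echoed prompt out of the stream, so only
    # newly generated tokens reach the UI.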
    
    gen_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=512,
        temperature=0.6,
        do_sample=True,
        top_p=0.9,
    )
    
    if draft_model is not None:
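        # Passing assistant_model enables transformers' assisted generation,
        # which is how speculative decoding is wired up here.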
        gen_kwargs["assistant_model"] = draft_model

    # 4. Run generate() in a background thread so we can consume the streamer here
    t = Thread(target=model.generate, kwargs=gen_kwargs)
    t.start()

    # 5. Stream the output, adapted to the Chatbot's tuple format
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        # Each yield must be the full history list, i.e.:
        # [[old_u, old_b], ..., [current_u, current_partial_b]]
        yield history + [[message, partial_text]]
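
    # Small hygiene addition: the streamer is exhausted only once generate()
    # has finished, so joining the worker thread here is cheap.
    t.join()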

# --- 4. UI construction ---
with gr.Blocks(title="Qwen Extreme") as demo:
    gr.Markdown("## ⚡ Qwen OpenVINO + Speculative Decoding")
    
    with gr.Row():
        with gr.Column(scale=1):
            with gr.Accordion("设置", open=True):
                mode = gr.Radio(["文本模式", "JSON模式"], value="文本模式", label="Prompt模式")
                p_text = gr.Textbox(value="You are a helpful assistant.", lines=3, label="System Prompt")
                p_json = gr.File(label="JSON文件", file_types=[".json"], visible=False)

                def toggle(m):
                    return {p_text: gr.update(visible=m == "Text mode"), p_json: gr.update(visible=m == "JSON mode")}
                mode.change(toggle, mode, [p_text, p_json])

        with gr.Column(scale=3):
            # Key change: type="messages" was removed, so the Chatbot uses the default tuple format that predict() expects
            chatbot = gr.Chatbot(height=600, label="Qwen2.5-7B")
            msg = gr.Textbox(label="Input")
            with gr.Row():
                btn = gr.Button("发送", variant="primary")
                clear = gr.ClearButton([msg, chatbot])

    # Event wiring (deliberately simple). On submit/click:
    # 1. Call predict with msg and chatbot (i.e. the history).
    # 2. Route predict's output (the new history) back to chatbot.
    # 3. Clear msg.
    
    submit_event = msg.submit(
        predict, 
        inputs=[msg, chatbot, mode, p_text, p_json], 
        outputs=[chatbot]
    )
    msg.submit(lambda: "", None, msg)  # clear the input box

    btn_event = btn.click(
        predict, 
        inputs=[msg, chatbot, mode, p_text, p_json], 
        outputs=[chatbot]
    )
    btn.click(lambda: "", None, msg)  # clear the input box

if __name__ == "__main__":
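    # queue() enables Gradio's event queue, which is what lets a generator
    # handler like predict() stream partial outputs to the Chatbot.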
    demo.queue().launch()