File size: 5,245 Bytes
388f606 0e35293 388f606 b63da4e 388f606 a354de5 0e35293 388f606 a354de5 0e35293 a354de5 388f606 0e35293 388f606 0e35293 388f606 0e35293 b63da4e a354de5 0e35293 a354de5 0e35293 388f606 a354de5 388f606 a354de5 0e35293 b63da4e 0e35293 b63da4e a354de5 0e35293 a354de5 0e35293 bf18031 a354de5 0e35293 a354de5 388f606 a354de5 388f606 a354de5 b63da4e a354de5 b63da4e a354de5 b63da4e a354de5 0e35293 388f606 0e35293 388f606 3120b70 bf18031 3120b70 388f606 b63da4e 0e35293 a354de5 0e35293 a354de5 388f606 a354de5 bf18031 a354de5 bf18031 0e35293 a354de5 bf18031 a354de5 0e35293 a354de5 0e35293 a354de5 b63da4e a354de5 0e35293 a354de5 b63da4e a354de5 0e35293 a354de5 388f606 bf18031 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 | import gradio as gr
import json
import os
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer, TextIteratorStreamer
from threading import Thread
import time
# --- Model configuration (unchanged: logs show these IDs load successfully) ---
MAIN_MODEL_ID = "OpenVINO/Qwen2.5-7B-Instruct-int4-ov"
DRAFT_MODEL_ID = "hsuwill000/Qwen2.5-0.5B-Instruct-openvino-4bit"

print("🚀 启动引擎...")

# --- 1. Load models ---
# The main model and tokenizer are required; the draft model is optional and,
# when present, enables speculative decoding in predict() below.
# Pre-initialize all globals so a load failure leaves every name defined
# (previously a failed outer load left `tokenizer`/`draft_model` unbound).
tokenizer = None
draft_model = None
try:
    tokenizer = AutoTokenizer.from_pretrained(MAIN_MODEL_ID)
    model = OVModelForCausalLM.from_pretrained(
        MAIN_MODEL_ID,
        ov_config={"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""},
    )
    try:
        draft_model = OVModelForCausalLM.from_pretrained(
            DRAFT_MODEL_ID,
            ov_config={"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""},
        )
        print("✅ 投机采样 (Speculative Decoding) 已激活")
    except Exception:  # narrowed from bare except: keep best-effort fallback
        draft_model = None
        print("⚠️ 仅使用主模型推理")
except Exception as e:
    print(f"❌ 加载失败: {e}")
    model = None
# --- 2. 辅助函数 ---
def parse_system_prompt(mode, text_content, json_file):
    """Resolve the system prompt from the UI settings.

    Args:
        mode: Radio selection — "文本模式" (plain text) or "JSON模式" (JSON file).
        text_content: The textbox value, returned verbatim in text mode.
        json_file: Uploaded file object with a ``.name`` path attribute
            (Gradio ``File`` value), or ``None`` if nothing was uploaded.

    Returns:
        The system prompt string. In JSON mode, a string payload is used
        directly; a dict payload is probed for "system_prompt" / "system" /
        "prompt" keys (falling back to ``str(data)``). Unreadable or
        non-dict/non-str payloads yield "Error parsing JSON.", and a missing
        file or unknown mode yields the default assistant prompt.
    """
    default_prompt = "You are a helpful assistant."
    if mode == "文本模式":
        return text_content
    if mode == "JSON模式":
        if json_file is None:
            return default_prompt
        try:
            with open(json_file.name, "r", encoding="utf-8") as f:
                data = json.load(f)
        # Narrowed from a bare except: file/attribute errors and malformed
        # JSON map to the same sentinel message as before.
        except (OSError, AttributeError, ValueError):
            return "Error parsing JSON."
        if isinstance(data, str):
            return data
        if isinstance(data, dict):
            return (
                data.get("system_prompt")
                or data.get("system")
                or data.get("prompt")
                or str(data)
            )
        # Lists/numbers previously raised AttributeError into the bare
        # except — preserve that observable behavior explicitly.
        return "Error parsing JSON."
    return default_prompt
# --- 3. Core logic (tuple-format history, compatible with older Gradio) ---
def predict(message, history, mode, prompt_text, prompt_json):
    """Stream a chat completion for *message* on top of *history*.

    Args:
        message: Current user input (str).
        history: Tuple-format chat log: ``[[user1, bot1], [user2, bot2], ...]``.
        mode / prompt_text / prompt_json: System-prompt settings forwarded
            to ``parse_system_prompt``.

    Yields:
        The full updated history list — ``history + [[message, partial]]`` —
        on every streamed chunk, which is what the tuple-format Chatbot expects.
    """
    if model is None:
        yield history + [[message, "模型加载失败"]]
        return

    # 1. Resolve the system prompt from the sidebar settings.
    system_prompt = parse_system_prompt(mode, prompt_text, prompt_json)

    # 2. Rebuild the conversation as the role/content dicts the chat
    #    template expects, starting from the system message.
    conversation = [{"role": "system", "content": system_prompt}]
    for user_turn, bot_turn in history:
        conversation.append({"role": "user", "content": user_turn})
        conversation.append({"role": "assistant", "content": bot_turn})
    conversation.append({"role": "user", "content": message})

    # 3. Tokenize and set up a streamer so tokens surface as they decode.
    prompt = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    encoded = tokenizer(prompt, return_tensors="pt")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_args = {
        **encoded,
        "streamer": streamer,
        "max_new_tokens": 512,
        "temperature": 0.6,
        "do_sample": True,
        "top_p": 0.9,
    }
    if draft_model is not None:
        # Speculative decoding: the small draft model proposes tokens for
        # the main model to verify.
        generation_args["assistant_model"] = draft_model

    # 4. Run generation on a worker thread so this generator can consume
    #    the streamer concurrently.
    worker = Thread(target=model.generate, kwargs=generation_args)
    worker.start()

    # 5. Accumulate chunks and re-yield the whole history each time:
    #    [[old_u, old_b], ..., [message, partial_reply]]
    partial_reply = ""
    for chunk in streamer:
        partial_reply += chunk
        yield history + [[message, partial_reply]]
# --- 4. UI construction ---
with gr.Blocks(title="Qwen Extreme") as demo:
    gr.Markdown("## ⚡ Qwen OpenVINO + Speculative Decoding")

    with gr.Row():
        # Left column: prompt-mode settings.
        with gr.Column(scale=1):
            with gr.Accordion("设置", open=True):
                mode = gr.Radio(["文本模式", "JSON模式"], value="文本模式", label="Prompt模式")
                p_text = gr.Textbox(value="You are a helpful assistant.", lines=3, label="System Prompt")
                p_json = gr.File(label="JSON文件", file_types=[".json"], visible=False)

                def toggle(selected_mode):
                    # Show exactly one prompt input, matching the radio choice.
                    return {
                        p_text: gr.update(visible=selected_mode == "文本模式"),
                        p_json: gr.update(visible=selected_mode == "JSON模式"),
                    }

                mode.change(toggle, mode, [p_text, p_json])

        # Right column: chat area. NOTE: no type="messages" — the default
        # tuple format matches the history lists that predict() yields.
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(height=600, label="Qwen2.5-7B")
            msg = gr.Textbox(label="输入")
            with gr.Row():
                btn = gr.Button("发送", variant="primary")
                clear = gr.ClearButton([msg, chatbot])

    # Event wiring: both Enter-to-submit and the send button invoke predict
    # with (message, history, settings...) and stream the returned history
    # back into the chatbot; a second handler on the same event clears the
    # input textbox.
    submit_event = msg.submit(
        predict,
        inputs=[msg, chatbot, mode, p_text, p_json],
        outputs=[chatbot],
    )
    msg.submit(lambda: "", None, msg)  # clear input box

    btn_event = btn.click(
        predict,
        inputs=[msg, chatbot, mode, p_text, p_json],
        outputs=[chatbot],
    )
    btn.click(lambda: "", None, msg)  # clear input box

if __name__ == "__main__":
    demo.queue().launch()