wkplhc committed on
Commit
b63da4e
·
verified ·
1 Parent(s): bf18031

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -56
app.py CHANGED
@@ -4,81 +4,83 @@ import os
4
  from optimum.intel import OVModelForCausalLM
5
  from transformers import AutoTokenizer, TextIteratorStreamer
6
  from threading import Thread
7
- import gc
8
 
9
- # --- 配置区 ---
10
- # 8B 主模型
11
  MAIN_MODEL_ID = "OpenVINO/Qwen2.5-7B-Instruct-int4-ov"
12
- # 0.5B 助手模型 (投机采样核心)
13
  DRAFT_MODEL_ID = "hsuwill000/Qwen2.5-0.5B-Instruct-openvino-4bit"
14
 
15
- print("🚀 正在初始化极速推理引擎...")
16
 
17
- # --- 1. 加载模型 ---
18
  try:
19
  tokenizer = AutoTokenizer.from_pretrained(MAIN_MODEL_ID)
20
 
21
- print(f"Loading Main Model: {MAIN_MODEL_ID}...")
22
  model = OVModelForCausalLM.from_pretrained(
23
  MAIN_MODEL_ID,
24
  ov_config={"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""},
25
  )
26
 
27
- print(f"Loading Draft Model: {DRAFT_MODEL_ID}...")
28
  try:
29
  draft_model = OVModelForCausalLM.from_pretrained(
30
  DRAFT_MODEL_ID,
31
  ov_config={"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""},
32
  )
33
- print("✅ 投机采样加速已激活 (Main + Draft)")
34
  except Exception as e:
35
- print(f"⚠️ 助手模型加载失败,降级为普通推理: {e}")
36
  draft_model = None
37
 
38
  except Exception as e:
39
- print(f"❌ 致命错误: {e}")
40
  model = None
41
  tokenizer = None
42
 
43
- # --- 2. 辅助函数 ---
44
  def parse_system_prompt(mode, text_content, json_file):
45
- if mode == "文本模式 (Text)":
46
  return text_content
47
- elif mode == "JSON模式 (File)":
48
  if json_file is None:
49
  return "You are a helpful assistant."
50
  try:
51
  with open(json_file, 'r', encoding='utf-8') as f:
52
  data = json.load(f)
 
53
  if isinstance(data, str): return data
54
  return data.get("system_prompt") or data.get("system") or data.get("prompt") or str(data)
55
  except:
56
- return "Error parsing JSON"
57
  return "You are a helpful assistant."
58
 
59
- # --- 3. 核心生成逻辑 (适配 Tuple 历史格式) ---
60
- def generate_response(history, mode, prompt_text, prompt_json):
61
  if model is None:
62
- yield history + [["", "模型加载失败"]]
 
63
  return
64
 
65
- # 1. 提取当前问题和历史
66
- # Gradio Tuple 格式: [[q1, a1], [q2, a2], [curr_q, None]]
67
- user_message = history[-1][0]
68
- past_history = history[:-1]
69
-
70
- # 2. 构建 Prompt
71
- system_prompt = parse_system_prompt(mode, prompt_text, prompt_json)
72
- messages = [{"role": "system", "content": system_prompt}]
 
 
 
 
 
73
 
74
- for user_msg, bot_msg in past_history:
75
- messages.append({"role": "user", "content": user_msg})
76
- messages.append({"role": "assistant", "content": bot_msg})
77
- messages.append({"role": "user", "content": user_message})
78
-
79
  # 3. 准备推理
80
- text_input = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
81
- inputs = tokenizer(text_input, return_tensors="pt")
82
 
83
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
84
 
@@ -90,58 +92,68 @@ def generate_response(history, mode, prompt_text, prompt_json):
90
  do_sample=True,
91
  top_p=0.9,
92
  )
93
-
94
- # 投机采样注入
95
  if draft_model is not None:
96
  gen_kwargs["assistant_model"] = draft_model
97
 
 
98
  thread = Thread(target=model.generate, kwargs=gen_kwargs)
99
  thread.start()
100
 
101
- # 4. 流式更新
 
 
 
102
  partial_text = ""
103
  for new_text in streamer:
104
  partial_text += new_text
105
- # 更新 history 的最后一条记录
106
- history[-1][1] = partial_text
107
  yield history
108
 
109
- # --- 4. 构建 UI (修复版) ---
110
- # 移除 theme 参数放在 Blocks 初始化里,部分旧版本不支持
111
- with gr.Blocks(title="Qwen Turbo CPU") as demo:
112
  gr.Markdown("## ⚡ Qwen OpenVINO + Speculative Decoding")
113
- gr.Markdown("OpenVINO INT4 量化 + 投机采样 (Draft Model) 加速版")
114
 
115
  with gr.Row():
116
  with gr.Column(scale=1):
117
- with gr.Accordion("🛠️ 提示词设置", open=True):
118
- mode_radio = gr.Radio(["文本模式 (Text)", "JSON模式 (File)"], label="模式", value="文本模式 (Text)")
119
  sys_text = gr.Textbox(label="System Prompt", value="You are a helpful assistant.", lines=3)
120
  sys_json = gr.File(label="JSON Config", file_types=[".json"], visible=False)
121
 
122
  def update_vis(m):
123
- return {sys_text: gr.update(visible=(m=="文本模式 (Text)")), sys_json: gr.update(visible=(m!="文本模式 (Text)"))}
124
  mode_radio.change(update_vis, [mode_radio], [sys_text, sys_json])
125
 
126
  with gr.Column(scale=3):
127
- # 关键修复:移除 type="messages",使用默认的 Tuple 格式
128
- chatbot = gr.Chatbot(height=600, label="Qwen2.5-7B (Accel)")
129
  msg = gr.Textbox(label="输入消息", placeholder="Enter 发送...")
 
130
  with gr.Row():
131
  submit_btn = gr.Button("发送", variant="primary")
132
  clear_btn = gr.ClearButton([msg, chatbot])
133
 
134
- # 事件处理 logic (适配 Tuple)
135
- def user_fn(user_message, history):
136
- # 用户发言时,追加 [msg, None] 到历史
137
- return "", history + [[user_message, None]]
 
138
 
139
- # 绑定回车和点击
140
- msg.submit(user_fn, [msg, chatbot], [msg, chatbot], queue=False).then(
141
- generate_response, [chatbot, mode_radio, sys_text, sys_json], [chatbot]
 
 
 
 
142
  )
143
- submit_btn.click(user_fn, [msg, chatbot], [msg, chatbot], queue=False).then(
144
- generate_response, [chatbot, mode_radio, sys_text, sys_json], [chatbot]
 
 
 
145
  )
146
 
147
  if __name__ == "__main__":
 
4
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer, TextIteratorStreamer
from threading import Thread
import time

# --- Model configuration ---
# Main model: INT4-quantized Qwen2.5-7B exported for OpenVINO.
MAIN_MODEL_ID = "OpenVINO/Qwen2.5-7B-Instruct-int4-ov"
# Small 0.5B draft model used as the assistant for speculative decoding.
DRAFT_MODEL_ID = "hsuwill000/Qwen2.5-0.5B-Instruct-openvino-4bit"

print("🚀 初始化引擎中...")

# Shared OpenVINO compile options: optimize for latency on a single stream.
_OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}

# --- 1. Load models (OpenVINO + speculative decoding) ---
try:
    tokenizer = AutoTokenizer.from_pretrained(MAIN_MODEL_ID)

    print(f"Loading Main: {MAIN_MODEL_ID}...")
    model = OVModelForCausalLM.from_pretrained(
        MAIN_MODEL_ID,
        ov_config=dict(_OV_CONFIG),
    )

    print(f"Loading Draft: {DRAFT_MODEL_ID}...")
    try:
        draft_model = OVModelForCausalLM.from_pretrained(
            DRAFT_MODEL_ID,
            ov_config=dict(_OV_CONFIG),
        )
        print("✅ 投机采样 (Speculative Decoding) 已激活")
    except Exception as load_err:
        # Draft model is optional: fall back to plain (non-speculative) decoding.
        print(f"⚠️ 助手模型加载失败,将使用普通模式: {load_err}")
        draft_model = None

except Exception as load_err:
    # Without the main model the app cannot serve requests; downstream code
    # checks `model is None` and reports the failure to the user.
    print(f"❌ 模型加载严重错误: {load_err}")
    model = None
    tokenizer = None
42
 
43
# --- 2. Helper: resolve the system prompt ---
def parse_system_prompt(mode, text_content, json_file):
    """Resolve the system prompt from the UI controls.

    Args:
        mode: Radio selection; "文本模式" uses *text_content* directly,
            "JSON模式" reads the prompt from the uploaded *json_file*.
        text_content: Raw prompt text from the System Prompt textbox.
        json_file: Path to an uploaded JSON config file, or None.

    Returns:
        The system prompt string. Falls back to a generic assistant prompt
        when no file is provided or the mode is unrecognized, and to an
        error string when the file cannot be read or parsed.
    """
    if mode == "文本模式":
        return text_content
    elif mode == "JSON模式":
        if json_file is None:
            return "You are a helpful assistant."
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Accept several JSON layouts: a bare string, or a dict with
            # one of the common prompt keys.
            if isinstance(data, str):
                return data
            if isinstance(data, dict):
                return data.get("system_prompt") or data.get("system") or data.get("prompt") or str(data)
            # Lists/numbers used to raise AttributeError on .get() and were
            # swallowed by a bare except; stringify them instead.
            return str(data)
        except (OSError, ValueError):  # ValueError covers json.JSONDecodeError
            return "Error parsing JSON file."
    return "You are a helpful assistant."
59
 
60
+ # --- 3. 核心生成逻辑 (适配 Messages 格式) ---
61
+ def chat_response(history, mode, prompt_text, prompt_json):
62
  if model is None:
63
+ history.append({"role": "assistant", "content": "模型加载失败,请检查 Logs。"})
64
+ yield history
65
  return
66
 
67
+ # history 现在的格式是:
68
+ # [{'role': 'user', 'content': '你好'}, {'role': 'assistant', 'content': '...'}]
69
+
70
+ # 1. 获取用户最新的输入 (最后一条 user 消息)
71
+ # Gradio 的 type="messages" 会自动把用户输入加到 history 里传进来
72
+ # 所以我们不需要手动 history.append(user_input)
73
+
74
+ # 2. 构建推理用的 Prompt (在最前面插入 System Prompt)
75
+ system_prompt_content = parse_system_prompt(mode, prompt_text, prompt_json)
76
+
77
+ # 构建给模型看的 messages (临时列表,不影响 UI 显示)
78
+ model_messages = [{"role": "system", "content": system_prompt_content}]
79
+ model_messages.extend(history)
80
 
 
 
 
 
 
81
  # 3. 准备推理
82
+ input_text = tokenizer.apply_chat_template(model_messages, tokenize=False, add_generation_prompt=True)
83
+ inputs = tokenizer(input_text, return_tensors="pt")
84
 
85
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
86
 
 
92
  do_sample=True,
93
  top_p=0.9,
94
  )
95
+
 
96
  if draft_model is not None:
97
  gen_kwargs["assistant_model"] = draft_model
98
 
99
+ # 4. 启动生成
100
  thread = Thread(target=model.generate, kwargs=gen_kwargs)
101
  thread.start()
102
 
103
+ # 5. UI 更新 (流式)
104
+ # 先添加一个空的 assistant 消息占位
105
+ history.append({"role": "assistant", "content": ""})
106
+
107
  partial_text = ""
108
  for new_text in streamer:
109
  partial_text += new_text
110
+ # 更新 history 的最后一条消息
111
+ history[-1]['content'] = partial_text
112
  yield history
113
 
114
# --- 4. Build the Gradio interface ---
with gr.Blocks(title="Qwen Turbo") as demo:
    gr.Markdown("## ⚡ Qwen OpenVINO + Speculative Decoding")

    with gr.Row():
        with gr.Column(scale=1):
            with gr.Accordion("🛠️ 设置提示词", open=True):
                mode_radio = gr.Radio(["文本模式", "JSON模式"], label="模式", value="文本模式")
                sys_text = gr.Textbox(label="System Prompt", value="You are a helpful assistant.", lines=3)
                sys_json = gr.File(label="JSON Config", file_types=[".json"], visible=False)

                def update_vis(selected_mode):
                    # Show the textbox in text mode, the file picker otherwise.
                    text_on = selected_mode == "文本模式"
                    return {
                        sys_text: gr.update(visible=text_on),
                        sys_json: gr.update(visible=not text_on),
                    }

                mode_radio.change(update_vis, [mode_radio], [sys_text, sys_json])

        with gr.Column(scale=3):
            # The chatbot explicitly uses the "messages" history format
            # ([{"role": ..., "content": ...}, ...]).
            chatbot = gr.Chatbot(height=600, type="messages", label="Qwen2.5-7B (Accel)")
            msg = gr.Textbox(label="输入消息", placeholder="Enter 发送...")

            with gr.Row():
                submit_btn = gr.Button("发送", variant="primary")
                clear_btn = gr.ClearButton([msg, chatbot])

    # --- Event wiring ---

    def user_turn(user_message, history):
        # Record the user's message in the history and clear the input box.
        return "", history + [{"role": "user", "content": user_message}]

    # The Enter key and the send button share the same two-step pipeline:
    # first record the user turn, then stream the assistant reply.
    for trigger in (msg.submit, submit_btn.click):
        trigger(
            user_turn, [msg, chatbot], [msg, chatbot], queue=False
        ).then(
            chat_response, [chatbot, mode_radio, sys_text, sys_json], [chatbot]
        )
158
 
159
  if __name__ == "__main__":