Update app.py
app.py CHANGED
Removed (old version; lines truncated in the diff view end with "…"):

@@ -6,81 +6,73 @@ from transformers import AutoTokenizer, TextIteratorStreamer

-# 8B main model (INT4 quantized)
-# 0.5B assistant model (used to speed up speculative decoding)
-print("🚀 …
-# --- 1. Load the models …
-    print(f"Loading Main: {MAIN_MODEL_ID}...")
-    print(f"Loading Draft: {DRAFT_MODEL_ID}...")
-    except Exception as e:
-        print(f"⚠️ Assistant model failed to load; falling back to normal mode: {e}")
-    print(f"❌ …
-    tokenizer = None
-        if json_file is None:
-            return "You are a helpful assistant."
-            with open(json_file, 'r', encoding='utf-8') as f:
-            # Accept several JSON shapes
-            return "Error parsing JSON …
-def chat_response(history, mode, prompt_text, prompt_json):
-        history …
-        yield history
-    # … so we don't need to manually history.append(user_input)
-    # 3. Prepare the inference inputs
-    input_text = tokenizer.apply_chat_template(model_messages, tokenize=False, add_generation_prompt=True)
-    inputs = tokenizer(input_text, return_tensors="pt")

@@ -96,65 +88,60 @@ def chat_response(history, mode, prompt_text, prompt_json):

-    # 5. First append an empty assistant message as a placeholder
-    history.append({"role": "assistant", "content": ""})
-        yield history
-with gr.Blocks(title="Qwen …
-            with gr.Accordion(" …
-            chatbot = gr.Chatbot(height=600, …
-            msg = gr.Textbox(label=" …
-    # 2. Bot reply handling: call the generation function
-    # Note: generate_response yields the updated history
-    msg.submit( …
-    )
New version (unchanged lines between the two hunks are elided, as marked by the second @@ header):

from threading import Thread
import time

# --- Model config (unchanged, since the logs show it loaded successfully) ---
MAIN_MODEL_ID = "OpenVINO/Qwen2.5-7B-Instruct-int4-ov"
DRAFT_MODEL_ID = "hsuwill000/Qwen2.5-0.5B-Instruct-openvino-4bit"

print("🚀 Starting engine...")

# --- 1. Load the models ---
try:
    tokenizer = AutoTokenizer.from_pretrained(MAIN_MODEL_ID)
    model = OVModelForCausalLM.from_pretrained(
        MAIN_MODEL_ID,
        ov_config={"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""},
    )

    try:
        draft_model = OVModelForCausalLM.from_pretrained(
            DRAFT_MODEL_ID,
            ov_config={"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""},
        )
        print("✅ Speculative decoding activated")
    except Exception:
        draft_model = None
        print("⚠️ Using the main model only for inference")

except Exception as e:
    print(f"❌ Loading failed: {e}")
    model = None

# --- 2. Helper functions ---
def parse_system_prompt(mode, text_content, json_file):
    if mode == "Text mode":
        return text_content
    elif mode == "JSON mode":
        if json_file is None: return "You are a helpful assistant."
        try:
            with open(json_file.name, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, str): return data
            return data.get("system_prompt") or data.get("system") or data.get("prompt") or str(data)
        except Exception:
            return "Error parsing JSON."
    return "You are a helpful assistant."
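# Aside (illustrative, not part of the commit): parse_system_prompt accepts any
# of these JSON payloads from the uploaded file:
#   "You are a pirate."             -> returned as-is (a bare JSON string)
#   {"system_prompt": "Be terse."}  -> "Be terse."
#   {"system": "Be kind."}          -> "Be kind."
#   {"prompt": "Be brief."}         -> "Be brief."
#   {"foo": "bar"}                  -> falls back to str(data)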
# --- 3. Core logic (tuple format, for compatibility with older Gradio) ---
def predict(message, history, mode, prompt_text, prompt_json):
    # history format: [[User1, Bot1], [User2, Bot2]]
    # message: the current user input (str)

    if model is None:
        yield history + [[message, "Model failed to load"]]
        return

    # 1. Parse the system prompt
    sys_prompt = parse_system_prompt(mode, prompt_text, prompt_json)

    # 2. Convert the tuple history into the list of dicts the model expects
    model_inputs = [{"role": "system", "content": sys_prompt}]

    for user_msg, bot_msg in history:
        model_inputs.append({"role": "user", "content": user_msg})
        model_inputs.append({"role": "assistant", "content": bot_msg})

    model_inputs.append({"role": "user", "content": message})

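    # Worked example (hypothetical values, not part of the commit): with
    # history = [["Hi", "Hello!"]] and message = "How are you?",
    # model_inputs is now:
    #   [{"role": "system", "content": sys_prompt},
    #    {"role": "user", "content": "Hi"},
    #    {"role": "assistant", "content": "Hello!"},
    #    {"role": "user", "content": "How are you?"}]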
    # 3. Build the inputs
    text = tokenizer.apply_chat_template(model_inputs, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt")

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

@@ -96,65 +88,60 @@ def chat_response(history, mode, prompt_text, prompt_json):

    if draft_model is not None:
        gen_kwargs["assistant_model"] = draft_model
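    # Aside (not part of the commit): assistant_model is the assisted-generation
    # hook of transformers-style generate() — the 0.5B draft model proposes a few
    # tokens per step and the 7B main model verifies them in one forward pass,
    # e.g.: model.generate(**inputs, assistant_model=draft_model)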
    # 4. Generate on a background thread
    t = Thread(target=model.generate, kwargs=gen_kwargs)
    t.start()

    # 5. Stream the output, adapted to the Chatbot format
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        # every yield must be the full history list, i.e.:
        # [[old_u, old_b], ..., [current_u, current_partial_b]]
        yield history + [[message, partial_text]]

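    # Concretely (illustrative strings), the loop yields
    #   history + [[message, "The"]]
    #   history + [[message, "The quick"]]
    #   history + [[message, "The quick brown fox."]]
    # so the Chatbot re-renders the growing reply in place on each update.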
# --- 4. Build the UI ---
with gr.Blocks(title="Qwen Extreme") as demo:
    gr.Markdown("## ⚡ Qwen OpenVINO + Speculative Decoding")

    with gr.Row():
        with gr.Column(scale=1):
            with gr.Accordion("Settings", open=True):
                mode = gr.Radio(["Text mode", "JSON mode"], value="Text mode", label="Prompt mode")
                p_text = gr.Textbox(value="You are a helpful assistant.", lines=3, label="System Prompt")
                p_json = gr.File(label="JSON file", file_types=[".json"], visible=False)

            def toggle(m):
                return {p_text: gr.update(visible=m == "Text mode"), p_json: gr.update(visible=m == "JSON mode")}
            mode.change(toggle, mode, [p_text, p_json])

        with gr.Column(scale=3):
            # Key change: type="messages" removed; the default tuple format is absolutely safe
            chatbot = gr.Chatbot(height=600, label="Qwen2.5-7B")
            msg = gr.Textbox(label="Input")
            with gr.Row():
                btn = gr.Button("Send", variant="primary")
                clear = gr.ClearButton([msg, chatbot])

    # Event wiring (quick-and-dirty version)
    # On send:
    # 1. call predict with msg and chatbot (i.e. the history)
    # 2. write predict's output (the new history) back to chatbot
    # 3. clear msg

    submit_event = msg.submit(
        predict,
        inputs=[msg, chatbot, mode, p_text, p_json],
        outputs=[chatbot]
    )
    msg.submit(lambda: "", None, msg)  # clear the input box

    btn_event = btn.click(
        predict,
        inputs=[msg, chatbot, mode, p_text, p_json],
        outputs=[chatbot]
    )
    btn.click(lambda: "", None, msg)  # clear the input box
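    # Aside (sketch, not part of the commit): each pair of handlers above can
    # also be chained so the textbox clears only after predict finishes streaming:
    #   msg.submit(predict, [msg, chatbot, mode, p_text, p_json], [chatbot]).then(
    #       lambda: "", None, msg)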

if __name__ == "__main__":
    demo.queue().launch()