Update app.py
Browse files
app.py
CHANGED
|
@@ -1,102 +1,82 @@
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import
|
|
|
|
|
|
|
| 3 |
import os
|
| 4 |
-
import json
|
| 5 |
|
| 6 |
# --- 配置 ---
|
| 7 |
-
#
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# --- 核心对话函数 ---
|
| 13 |
def predict(message, history):
|
| 14 |
"""
|
| 15 |
-
|
| 16 |
-
:param message: 用户当前发送的消息 (str)
|
| 17 |
-
:param history: 对话历史 (list of lists),格式为 [[user_msg, assistant_msg], ...]
|
| 18 |
-
:return: 一个生成器 (generator),逐字(token)返回模型的响应
|
| 19 |
"""
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
headers = {
|
| 24 |
-
"Authorization": f"Bearer {HF_TOKEN}",
|
| 25 |
-
"Content-Type": "application/json"
|
| 26 |
-
}
|
| 27 |
-
|
| 28 |
-
# 1. 格式化对话历史以符合API要求
|
| 29 |
-
# API需要一个包含所有对话的列表,格式为 {"role": "user", "content": "..."} 或 {"role": "assistant", "content": "..."}
|
| 30 |
messages = []
|
| 31 |
for turn in history:
|
| 32 |
user_msg, assistant_msg = turn
|
| 33 |
messages.append({"role": "user", "content": user_msg})
|
| 34 |
messages.append({"role": "assistant", "content": assistant_msg})
|
| 35 |
-
|
| 36 |
-
# 添加当前用户消息
|
| 37 |
messages.append({"role": "user", "content": message})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
-
# 2.
|
| 40 |
-
|
| 41 |
-
payload = {
|
| 42 |
-
"inputs": messages,
|
| 43 |
-
"parameters": {
|
| 44 |
-
"max_new_tokens": 2048, # 根据需要调整
|
| 45 |
-
"temperature": 0.7,
|
| 46 |
-
"top_p": 0.95,
|
| 47 |
-
"repetition_penalty": 1.1,
|
| 48 |
-
"return_full_text": False,
|
| 49 |
-
},
|
| 50 |
-
"stream": True
|
| 51 |
-
}
|
| 52 |
|
| 53 |
-
# 3.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
full_response = ""
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
# 检查HTTP响应状态码
|
| 59 |
-
response.raise_for_status()
|
| 60 |
-
|
| 61 |
-
# 逐行读取流式响应
|
| 62 |
-
for line in response.iter_lines():
|
| 63 |
-
if line:
|
| 64 |
-
# 流式响应通常以 "data:" 开头,后跟一个JSON对象
|
| 65 |
-
decoded_line = line.decode('utf-8')
|
| 66 |
-
if decoded_line.startswith("data:"):
|
| 67 |
-
try:
|
| 68 |
-
# 解析JSON
|
| 69 |
-
json_data = json.loads(decoded_line[5:])
|
| 70 |
-
# 提取token文本
|
| 71 |
-
token = json_data.get("token", {}).get("text", "")
|
| 72 |
-
if token:
|
| 73 |
-
full_response += token
|
| 74 |
-
yield full_response
|
| 75 |
-
except json.JSONDecodeError:
|
| 76 |
-
# 忽略无法解析的行
|
| 77 |
-
continue
|
| 78 |
-
|
| 79 |
-
except requests.exceptions.RequestException as e:
|
| 80 |
-
print(f"API请求错误: {e}")
|
| 81 |
-
yield f"抱歉,与模型API通信时发生错误: {e}"
|
| 82 |
-
except Exception as e:
|
| 83 |
-
print(f"发生未知错误: {e}")
|
| 84 |
-
yield f"抱歉,发生了一个未知错误: {e}"
|
| 85 |
|
| 86 |
# --- 创建并启动Gradio界面 ---
|
| 87 |
-
|
| 88 |
-
# 使用gr.ChatInterface,它为聊天机器人提供了完整的UI
|
| 89 |
-
# fn=predict 指定了处理逻辑的函数
|
| 90 |
-
# streaming=True 告诉Gradio我们的函数是流式的(使用yield)
|
| 91 |
-
# Gradio 4.44.1中,ChatInterface会自动处理stream参数,我们只需确保函数是生成器
|
| 92 |
demo = gr.ChatInterface(
|
| 93 |
fn=predict,
|
| 94 |
-
title="小Q老师 - 基础问答",
|
| 95 |
-
description="
|
| 96 |
examples=[["你好"], ["请用python写一个快速排序算法"], ["给我讲个笑话吧"]],
|
| 97 |
cache_examples=False,
|
| 98 |
)
|
| 99 |
|
| 100 |
if __name__ == "__main__":
|
| 101 |
-
|
| 102 |
-
demo.launch() # 在Hugging Face Spaces上运行时使用
|
|
|
|
| 1 |
+
|
| 2 |
import gradio as gr
|
| 3 |
+
from threading import Thread
|
| 4 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
| 5 |
+
import torch
|
| 6 |
import os
|
|
|
|
| 7 |
|
| 8 |
# --- Configuration ---
# No API token is needed: the model runs locally inside this Space.
MODEL_ID = "badanwang/teacher_basic_qwen3-0.6b"


print("开始加载模型和分词器...")
try:
    # trust_remote_code=True is required because Qwen models ship custom code.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype="auto",   # pick a dtype suited to the device (float32 on CPU)
        device_map="auto",    # place the model on whatever device is available
        trust_remote_code=True,
    )
    print("模型和分词器加载成功!")
except Exception as e:
    print(f"模型加载失败: {e}")
    # Without the model the app cannot work at all, so fail loudly.
    # Chain the original exception (`from e`) so the traceback keeps the
    # real root cause instead of hiding it behind the gr.Error.
    raise gr.Error(f"关键错误:无法加载模型 {MODEL_ID}。错误信息: {e}") from e
| 29 |
|
| 30 |
# --- Core chat function ---
def predict(message, history):
    """Stream a chat completion from the locally loaded model.

    :param message: the user's current message (str)
    :param history: prior turns as a list of [user_msg, assistant_msg] pairs
                    (Gradio ChatInterface "tuples" history format)
    :return: a generator yielding the accumulated response text as it grows
    """
    # 1. Flatten the Gradio history into the chat-message schema that the
    #    tokenizer's chat template expects.
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # apply_chat_template renders the conversation in the model's own prompt
    # format and returns the input ids as a tensor.
    model_inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)  # keep the input tensor on the same device as the model

    # 2. The streamer turns generate()'s tokens into an iterator of text chunks.
    streamer = TextIteratorStreamer(
        tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
    )

    # 3. Run generation on a worker thread so this generator can yield partial
    #    text to the UI while tokens are still being produced.
    generation_kwargs = dict(
        inputs=model_inputs,
        streamer=streamer,
        max_new_tokens=2048,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # 4. Yield the growing response as chunks arrive.
    full_response = ""
    try:
        for new_text in streamer:
            full_response += new_text
            yield full_response
    finally:
        # Ensure the generation thread has finished before returning, even if
        # the consumer stops iterating early (e.g. the client disconnects);
        # otherwise the thread would be silently orphaned mid-generate.
        thread.join()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
# --- Create and launch the Gradio UI ---
# gr.ChatInterface supplies the complete chat UI; because `predict` is a
# generator (it uses yield), Gradio streams its output automatically.
demo = gr.ChatInterface(
    fn=predict,
    title="小Q老师 - 基础问答 (本地加载)",
    description=f"直接在Space中运行 {MODEL_ID} 模型进行流式对话。CPU推理可能较慢,请耐心等待。",
    examples=[["你好"], ["请用python写一个快速排序算法"], ["给我讲个笑话吧"]],
    # Do not pre-run the examples at startup — each run would trigger a slow
    # CPU inference before the app becomes usable.
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()
|
|