# HuggingFace Spaces app: lightweight chat UI for Qwen1.5-1.8B-Chat.
# (Scraped file-view page chrome removed; see code below.)
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load model and tokenizer (Qwen1.5-1.8B-Chat, tuned for a small 2-core/16G host).
model_name = "Qwen/Qwen1.5-1.8B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): passing load_in_4bit / bnb_4bit_compute_dtype directly to
# from_pretrained is deprecated and rejected by recent transformers releases;
# the supported path is an explicit BitsAndBytesConfig (needs the
# `bitsandbytes` package installed, as the original 4-bit flags already did).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,   # half precision to cut memory footprint
    device_map="auto",           # let accelerate place weights automatically
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,                     # 4-bit quantization lowers RAM pressure
        bnb_4bit_compute_dtype=torch.float16,  # compute in fp16 for speed
    ),
)
# 优化后的聊天函数(适配 Qwen 的对话模板)
def chat_with_model(message, history):
    """Generate one assistant reply for a Gradio ChatInterface turn.

    Args:
        message: Current user message (str).
        history: Prior conversation. Both Gradio history formats are
            accepted: a list of (user, assistant) tuples (classic
            ``type="tuples"``) or a list of ``{"role", "content"}`` dicts
            (``type="messages"``).

    Returns:
        The model's reply, decoded and stripped, as a string.
    """
    messages = []
    if history and isinstance(history[0], dict):
        # "messages" format: entries are already role/content dicts.
        # Keep only the last 3 turns (6 messages) to bound compute.
        messages.extend(
            {"role": m["role"], "content": m["content"]} for m in history[-6:]
        )
    else:
        # "tuples" format: (user, assistant) pairs; keep the last 3 turns.
        for user_msg, bot_msg in history[-3:]:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})
    # Append the current user message last so the model answers it.
    messages.append({"role": "user", "content": message})

    # Render the Qwen chat template into model-ready tensors.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.7,
            repetition_penalty=1.1,
            do_sample=True,
        )

    # Slice off the prompt tokens and decode only the newly generated reply.
    bot_response = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    ).strip()
    return bot_response
# 启动 Gradio 界面(保留资源优化配置)
# Start the Gradio UI when run as a script (resource-friendly launch settings).
if __name__ == "__main__":
    demo = gr.ChatInterface(
        fn=chat_with_model,
        title="轻量聊天助手",
        description="基于 Qwen1.5-1.8B-Chat 适配 2 核 16G 配置",
    )
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces for the Space
        server_port=7860,
        share=False,            # no public share tunnel
        inline=False,
    )