# my-llm-api / app.py
# han145's picture
# Create app.py
# cd60a14 verified
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
# Module-level cache for the model and tokenizer: both start out unset and
# are filled in by load_model(), which avoids loading them repeatedly.
model = tokenizer = None
def load_model():
    """Load the tokenizer and the causal language model into module globals.

    On success the module-level ``tokenizer`` and ``model`` are assigned and
    a confirmation message is printed. Any exception raised while loading is
    caught and reported with ``print``; it is not re-raised, so ``model`` may
    still be ``None`` when this function returns.
    """
    global model, tokenizer

    checkpoint = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    try:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        load_options = {
            "torch_dtype": torch.float16,  # half precision to reduce memory use
            "device_map": "auto",  # automatic device placement
            "low_cpu_mem_usage": True,  # lower CPU memory use while loading
        }
        model = AutoModelForCausalLM.from_pretrained(checkpoint, **load_options)
        print("模型加载成功!")
    except Exception as err:
        # Best-effort load: report the failure and leave it to the caller.
        print(f"模型加载失败: {err}")
def openai_compatible_api(message, history):
    """Generate a reply to ``message`` and wrap it in an OpenAI-style dict.

    Args:
        message: The user's current message text.
        history: Previous conversation turns as supplied by the chat
            interface. Currently unused: only ``message`` is placed in the
            prompt.

    Returns:
        A dict shaped like
        ``{"choices": [{"message": {"role": "assistant", "content": ...}}]}``
        where ``content`` is the stripped text produced by the model.

    Raises:
        RuntimeError: If the model or tokenizer is still unavailable after a
            load attempt.
    """
    # Lazy-load on first use in case the model has not been loaded yet.
    if model is None or tokenizer is None:
        load_model()
    if model is None or tokenizer is None:
        # load_model() reports failures without raising, so fail loudly here
        # rather than with an opaque "'NoneType' object is not callable".
        raise RuntimeError(
            "Model is not loaded; check the load error reported earlier."
        )

    # Simplified prompt format; adjust it to the model's own chat template
    # if the model requires a specific one.
    prompt = f"\n\nHuman: {message}\n\nAssistant:"
    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=1024
    )
    # device_map="auto" may place the model on an accelerator, so the input
    # tensors have to be moved to the same device before generating.
    inputs = inputs.to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The output sequence starts with the prompt tokens. Decode only what
    # follows them, so the reply stays intact even when the user message or
    # the generated text itself contains the marker "Assistant:".
    new_tokens = outputs[0][prompt_length:]
    generated_text = tokenizer.decode(
        new_tokens, skip_special_tokens=True
    ).strip()

    # OpenAI-compatible response shape.
    return {
        "choices": [
            {
                "message": {
                    "role": "assistant",
                    "content": generated_text,
                }
            }
        ]
    }
# Load the model eagerly before the interface is built. This is optional:
# the request handler also loads it on first use when it is still unset.
load_model()

# Chat interface that routes every message through openai_compatible_api.
# NOTE(review): the handler returns an OpenAI-style dict, while
# gr.ChatInterface is normally given a function that returns the reply
# text -- confirm the UI/API output is what clients expect.
demo = gr.ChatInterface(
    openai_compatible_api,
    description="OpenAI-compatible API for DeepSeek-R1",
    title="DeepSeek API Service",
)

# When run as a script, serve on all interfaces at port 7860 with show_api
# enabled.
if __name__ == "__main__":
    demo.launch(server_port=7860, server_name="0.0.0.0", show_api=True)