"""Serve DeepSeek-R1-Distill-Qwen-1.5B behind a Gradio chat endpoint.

The module keeps a single model/tokenizer pair in module-level globals,
exposes ``openai_compatible_api`` (which returns an OpenAI-style
``{"choices": [...]}`` payload) and wires a Gradio ``ChatInterface`` on top
of it.
"""

import json

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
MAX_PROMPT_TOKENS = 1024
MAX_NEW_TOKENS = 512

# Module-level singletons so the weights are loaded at most once per process.
model = None
tokenizer = None


def load_model():
    """Load the tokenizer and model into the module-level globals.

    Loading is best-effort: on failure the error is printed and the globals
    are left untouched (``None`` on a first attempt), so callers must check
    ``model is None`` afterwards. The globals are only assigned once *both*
    objects have loaded, so they are never left half-initialised.
    """
    global model, tokenizer
    try:
        loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        loaded_model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,  # half precision to reduce memory use
            device_map="auto",  # let the library choose device placement
            low_cpu_mem_usage=True,  # reduce CPU memory use while loading
        )
    except Exception as e:
        # Deliberately not re-raised: the request handler retries lazily and
        # reports a clear error if the model is still unavailable.
        print(f"模型加载失败: {e}")
        return
    model, tokenizer = loaded_model, loaded_tokenizer
    print("模型加载成功!")


def _build_messages(message, history):
    """Convert Gradio chat history plus the new message into role/content dicts.

    Accepts both history layouts Gradio may pass: a list of
    ``[user, assistant]`` pairs or a list of ``{"role", "content"}`` dicts.
    Entries whose content is not plain text are skipped.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            content = turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str):
                messages.append({"role": role, "content": content})
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            user_text, assistant_text = turn
            if isinstance(user_text, str):
                messages.append({"role": "user", "content": user_text})
            if isinstance(assistant_text, str):
                messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})
    return messages


def openai_compatible_api(message, history):
    """Generate a reply and return it in an OpenAI-style response dict.

    Args:
        message: The latest user message.
        history: Previous conversation turns as supplied by Gradio (may be
            empty or ``None``).

    Returns:
        ``{"choices": [{"message": {"role": "assistant", "content": <text>}}]}``

    Raises:
        RuntimeError: If the model could not be loaded.
    """
    if model is None:
        load_model()
    if model is None or tokenizer is None:
        raise RuntimeError("Model is not loaded; see the load error logged above.")

    # Render the conversation with the model's own chat template instead of a
    # hand-written "Human:/Assistant:" prompt, so earlier turns are included.
    prompt = tokenizer.apply_chat_template(
        _build_messages(message, history),
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_PROMPT_TOKENS,
        add_special_tokens=False,  # the rendered template already has them
    )
    # device_map="auto" may place the model on an accelerator; the inputs
    # must live on the same device as the model.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on a marker string breaks whenever the user or the model writes it.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()

    return {
        "choices": [
            {
                "message": {
                    "role": "assistant",
                    "content": generated_text,
                }
            }
        ]
    }


def _chat_reply(message, history):
    """Adapter for ``gr.ChatInterface``, which expects the reply text itself."""
    response = openai_compatible_api(message, history)
    return response["choices"][0]["message"]["content"]


# Load eagerly before the interface starts; if this fails, the first request
# retries the load.
load_model()

# Build the Gradio chat interface; its API endpoint is the intended entry point.
demo = gr.ChatInterface(
    fn=_chat_reply,
    title="DeepSeek API Service",
    description="OpenAI-compatible API for DeepSeek-R1",
)

# No public share link is requested; serve on all interfaces at port 7860.
if __name__ == "__main__":
    demo.launch(show_api=True, server_name="0.0.0.0", server_port=7860)