update
Browse files- app.py +89 -8
- requirements.txt +1 -0
- utils/chat_request.py +12 -0
- utils/chat_response.py +115 -0
- utils/model.py +105 -0
app.py
CHANGED
|
@@ -1,14 +1,95 @@
|
|
| 1 |
-
from fastapi import FastAPI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
# 初始化 FastAPI 应用
|
| 4 |
app = FastAPI(title="HF-Model-Runner API", version="0.0.1")
|
| 5 |
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
@app.get("/")
|
| 9 |
-
def
|
| 10 |
-
return {
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException
|
| 2 |
+
import os
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
# 导入 utils 模块
|
| 6 |
+
from utils.chat_request import ChatRequest
|
| 7 |
+
from utils.chat_response import create_chat_response, ChatResponse
|
| 8 |
+
from utils.model import check_model, initialize_pipeline, download_model, DownloadRequest
|
| 9 |
+
|
| 10 |
+
# Global state shared by the route handlers below.
model_name = None  # identifier of the currently loaded model, or None before startup
pipe = None  # transformers text-generation pipeline, or None until initialized
tokenizer = None  # tokenizer paired with `pipe`, or None until initialized

# Initialize the FastAPI application.
app = FastAPI(title="HF-Model-Runner API", version="0.0.1")
|
| 17 |
|
| 18 |
+
@app.on_event("startup")  # NOTE(review): on_event is deprecated in recent FastAPI; consider migrating to a lifespan handler
async def startup_event():
    """
    Preload the default model pipeline when the application starts.

    Reads DEFAULT_MODEL_NAME from the environment (populated from .env) and
    tries to build the global text-generation pipeline for it. Failures are
    logged but never abort startup: the server still comes up and reports
    the missing model per-request instead.
    """
    global pipe, tokenizer, model_name

    # Load environment variables from a local .env file, if present.
    load_dotenv()

    # Model to preload; falls back to a small default when the variable is unset.
    default_model = os.getenv("DEFAULT_MODEL_NAME", "unsloth/functiongemma-270m-it")
    print(f"应用启动,正在初始化模型: {default_model}")

    try:
        pipe, tokenizer, success = initialize_pipeline(default_model)
        if success:
            model_name = default_model
            print(f"✓ 模型 {default_model} 初始化成功")
        else:
            print(f"✗ 模型 {default_model} 初始化失败")
    except Exception as e:
        # Best-effort startup: log and continue so the API still boots.
        print(f"✗ 启动时模型初始化失败: {e}")
|
| 41 |
|
| 42 |
@app.get("/")
async def read_root():
    """Root endpoint: point clients at the interactive API docs."""
    greeting = "Welcome to HF-Model-Runner API! Visit /docs for API documentation."
    return {"message": greeting}
|
| 45 |
+
|
| 46 |
+
@app.post("/download")
async def download_model_endpoint(request: DownloadRequest):
    """
    Download the requested HuggingFace model and, on success, load it.

    Returns a JSON payload describing the download result and whether the
    model was also initialized as the active pipeline. Raises HTTP 500 when
    the download itself fails.
    """
    global pipe, tokenizer, model_name

    try:
        success, message = download_model(request.model)
        if not success:
            raise HTTPException(status_code=500, detail=message)

        # Download succeeded: try to make the new model the active pipeline.
        pipe, tokenizer, init_success = initialize_pipeline(request.model)
        if init_success:
            model_name = request.model
            return {
                "status": "success",
                "message": message,
                "loaded": True,
                "current_model": model_name
            }
        return {
            "status": "success",
            "message": message,
            "loaded": False,
            "error": "模型下载成功但初始化失败"
        }
    except HTTPException:
        # Bug fix: previously the blanket `except Exception` caught the
        # HTTPException raised above and re-wrapped it, turning the detail
        # into "500: <message>". Re-raise deliberate HTTP errors untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 77 |
+
|
| 78 |
+
@app.post("/v1/chat/completions", response_model=ChatResponse)
async def chat_completions(request: ChatRequest):
    """
    OpenAI-compatible chat completion endpoint.

    Hot-swaps the global pipeline when the requested model differs from the
    one currently loaded, then delegates generation to create_chat_response.
    """
    global pipe, tokenizer, model_name

    # A different model was requested: re-initialize the pipeline first.
    if request.model != model_name:
        pipe, tokenizer, ok = initialize_pipeline(request.model)
        if not ok:
            raise HTTPException(status_code=500, detail="模型初始化失败")
        model_name = request.model

    try:
        return create_chat_response(request, pipe, tokenizer)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
requirements.txt
CHANGED
|
@@ -5,3 +5,4 @@ huggingface_hub
|
|
| 5 |
torch
|
| 6 |
accelerate
|
| 7 |
python-multipart
|
|
|
|
|
|
| 5 |
torch
|
| 6 |
accelerate
|
| 7 |
python-multipart
|
| 8 |
+
python-dotenv
|
utils/chat_request.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel
|
| 2 |
+
from typing import List, Optional, Dict, Any
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class ChatRequest(BaseModel):
    """OpenAI-style chat completion request body.

    Field names and defaults mirror the OpenAI /v1/chat/completions API so
    existing OpenAI clients can talk to this server unchanged.
    """

    # Model identifier; the server swaps pipelines when this differs from
    # the currently loaded model.
    model: Optional[str] = "gpt-3.5-turbo"
    # Conversation history; entries presumably carry "role"/"content" keys —
    # TODO confirm against callers.
    messages: List[Dict[str, Any]]
    temperature: Optional[float] = 1.0
    # Cap on generated tokens; None lets the pipeline use its own default.
    max_tokens: Optional[int] = None
    top_p: Optional[float] = 1.0
    # Accepted for OpenAI API compatibility; not forwarded to the pipeline.
    frequency_penalty: Optional[float] = 0.0
    presence_penalty: Optional[float] = 0.0
|
utils/chat_response.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel
|
| 2 |
+
from typing import List, Optional, Dict, Any
|
| 3 |
+
import time
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
# Response models mirroring the OpenAI chat-completion schema.
class ChatChoice(BaseModel):
    """A single completion choice."""

    index: int  # position of this choice in the choices list
    message: Dict[str, str]  # {"role": ..., "content": ...}
    finish_reason: str  # e.g. "stop"


class ChatUsage(BaseModel):
    """Token accounting for one completion."""

    prompt_tokens: int
    completion_tokens: int
    total_tokens: int  # prompt_tokens + completion_tokens


class ChatResponse(BaseModel):
    """Top-level OpenAI-compatible chat completion response."""

    id: str  # e.g. "chatcmpl-<unix timestamp>"
    object: str  # always "chat.completion" here
    created: int  # unix timestamp of response creation
    model: str  # model name echoed back from the request
    choices: List[ChatChoice]
    usage: ChatUsage
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def convert_json_format(input_data):
    """
    Convert raw ``pipeline("text-generation")`` output into a
    ``{"generations": [[{"text": ..., "generationInfo": ...}]]}`` payload.

    Each input item is expected to look like
    ``{"generated_text": [{"role": ..., "content": ...}, ...]}``; the first
    assistant message is extracted and any ``<think>...</think>`` reasoning
    block is stripped from it.
    """
    output_generations = []
    for item in input_data:
        generated_text_list = item.get('generated_text', [])

        # Take the first assistant turn, if any.
        assistant_content = ""
        for message in generated_text_list:
            if message.get('role') == 'assistant':
                assistant_content = message.get('content', '')
                break

        # Remove <think>...</think> reasoning blocks. Bug fix: the pattern
        # was previously just r'\s*', which deleted every whitespace
        # character from the reply instead of removing the think block.
        clean_content = re.sub(r'<think>.*?</think>', '', assistant_content, flags=re.DOTALL).strip()

        output_generations.append([
            {
                "text": clean_content,
                "generationInfo": {
                    "finish_reason": "stop"
                }
            }
        ])

    return {"generations": output_generations}
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def create_chat_response(request: Any, pipe=None, tokenizer=None) -> ChatResponse:
    """
    Build an OpenAI-compatible ChatResponse for *request*.

    When *pipe* is available, runs the text-generation pipeline on the
    request messages; otherwise returns a placeholder answer so the API
    still responds while the model is loading. Token usage is computed with
    *tokenizer* when given, else estimated at roughly 4 characters/token.
    """
    if pipe is None:
        # Pipeline not initialized yet: return a placeholder response.
        response_message = {
            "role": "assistant",
            "content": "模型正在初始化中,请稍后重试..."
        }
        completion_text = response_message["content"]
    else:
        messages = request.messages

        # Bug fix: previously `max_new_tokens` was always passed, as an
        # explicit None when the client set no max_tokens. Only forward it
        # when actually set, letting the pipeline use its own default.
        if request.max_tokens is not None:
            result = pipe(messages, max_new_tokens=request.max_tokens)
        else:
            result = pipe(messages)

        # Normalize the pipeline output and pull out the generated text.
        converted_result = convert_json_format(result)
        completion_text = converted_result["generations"][0][0]["text"]

        response_message = {
            "role": "assistant",
            "content": completion_text
        }

    # Token accounting: exact when a tokenizer is available, estimated otherwise.
    if tokenizer:
        prompt_tokens = sum(len(tokenizer.encode(msg.get("content", ""))) for msg in request.messages)
        completion_tokens = len(tokenizer.encode(completion_text))
    else:
        # Heuristic: ~4 characters per token.
        prompt_tokens = sum(len(msg.get("content", "")) for msg in request.messages) // 4
        completion_tokens = len(completion_text) // 4

    return ChatResponse(
        id=f"chatcmpl-{int(time.time())}",
        object="chat.completion",
        created=int(time.time()),
        model=request.model,
        choices=[
            ChatChoice(
                index=0,
                message=response_message,
                finish_reason="stop"
            )
        ],
        usage=ChatUsage(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens
        )
    )
|
utils/model.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
|
| 5 |
+
from huggingface_hub import login
|
| 6 |
+
from fastapi import HTTPException
|
| 7 |
+
from pydantic import BaseModel
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class DownloadRequest(BaseModel):
    """Request body for POST /download."""

    # HuggingFace model identifier, e.g. "unsloth/functiongemma-270m-it".
    model: str
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def check_model(model_name):
    """
    Verify that *model_name* is present in the local cache and loadable.

    Args:
        model_name: HuggingFace model identifier from the request.

    Returns:
        (model_name, cache_dir, success) — success is False when the cached
        files exist but the tokenizer cannot be loaded from them.

    Raises:
        HTTPException: 404 when the model has not been downloaded yet.
    """
    cache_dir = "./my_model_cache"

    # HuggingFace hub cache layout: models--{org}--{name}/snapshots/<rev>/...
    model_path = Path(cache_dir) / f"models--{model_name.replace('/', '--')}"
    snapshot_path = model_path / "snapshots"

    if snapshot_path.exists() and any(snapshot_path.iterdir()):
        print(f"✓ 模型 {model_name} 已存在于缓存中")
        try:
            # Sanity probe only: ensure the cached files actually load.
            # (Fix: the tokenizer was previously bound to an unused local.)
            AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
            return model_name, cache_dir, True
        except Exception as e:
            print(f"⚠ 加载现有模型失败: {e}")
            return model_name, cache_dir, False
    else:
        raise HTTPException(status_code=404, detail=f"模型 `{model_name}` 不存在,请先下载")
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _hf_login_if_configured():
    """Best-effort HuggingFace hub login when HUGGINGFACE_TOKEN is set."""
    token = os.getenv("HUGGINGFACE_TOKEN")
    if not token:
        print("ℹ 未设置 HUGGINGFACE_TOKEN - 仅使用公开模型")
        return
    try:
        print("登录 Hugging Face...")
        login(token=token)
        print("✓ HuggingFace 登录成功!")
    except Exception as e:
        print(f"⚠ 登录失败: {e}")
        print("继续使用公开模型")


def download_model(model_name):
    """
    Download *model_name* (tokenizer + weights) into the local cache.

    Args:
        model_name: HuggingFace model identifier to fetch.

    Returns:
        (success, message) tuple; download failures are reported in the
        message rather than raised.
    """
    cache_dir = "./my_model_cache"

    print(f"开始下载模型: {model_name}")
    print(f"缓存目录: {cache_dir}")

    # Optional hub login, needed only for gated/private models.
    _hf_login_if_configured()

    try:
        print("正在下载 tokenizer...")
        AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        print("✓ Tokenizer 下载成功!")

        print("正在下载模型...")
        AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
        print("✓ 模型下载成功!")

        print(f"✓ 模型和 tokenizer 已成功下载到 {cache_dir}")
        return True, f"模型 {model_name} 下载成功"

    except Exception as e:
        print(f"✗ 下载模型时出错: {e}")
        return False, f"下载失败: {str(e)}"
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def initialize_pipeline(model_name):
    """
    Build a text-generation pipeline for *model_name* from the local cache.

    Args:
        model_name: HuggingFace model identifier from the request.

    Returns:
        (pipe, tokenizer, success); (None, None, False) on failure.

    Raises:
        HTTPException: propagated from check_model when the model has not
        been downloaded yet.
    """
    model_name, cache_dir, success = check_model(model_name)

    if not success:
        return None, None, False

    try:
        # Load both pieces explicitly from the project cache dir. Bug fix:
        # pipeline(model=<name>) took no cache_dir, so it ignored
        # ./my_model_cache and re-downloaded weights into the default HF
        # cache. Passing loaded objects keeps everything in one cache.
        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)

        print(f"使用 {model_name} 初始化 pipeline...")
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
        print("✓ Pipeline 初始化成功!")

        return pipe, tokenizer, True

    except Exception as e:
        print(f"✗ Pipeline 初始化失败: {e}")
        return None, None, False
|