ray-lei committed
Commit bc06cb3 · verified · 1 Parent(s): f2f5cda

Update app.py

Files changed (1)
  1. app.py +47 -27
app.py CHANGED
@@ -1,43 +1,63 @@
import os
+ from fastapi import FastAPI, Request
+ from fastapi.responses import JSONResponse
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import torch
+
+ # Set the cache directories to avoid /.cache permission issues
os.environ["HF_HOME"] = "/tmp"
os.environ["TRANSFORMERS_CACHE"] = "/tmp"
os.environ["HF_HUB_CACHE"] = "/tmp"

- from fastapi import FastAPI
- from vllm import LLM, SamplingParams
- from pydantic import BaseModel
- from fastapi.responses import JSONResponse
- import uvicorn
-
- # 1. Load the model (pulled automatically from the HuggingFace Hub)
- llm = LLM(model="Qwen/Qwen2.5-Coder-7B-Instruct", download_dir="/tmp")
- sampling = SamplingParams(temperature=0.2, max_tokens=1024)
-
- app = FastAPI()
-
- class Message(BaseModel):
-     role: str
-     content: str
-
- class ChatRequest(BaseModel):
-     model: str
-     messages: list[Message]
-     max_tokens: int = 1024
-
- @app.post("/v1/chat/completions")
- def chat(req: ChatRequest):
-     prompt = "\n".join([f"{m.role}: {m.content}" for m in req.messages])
-     outputs = llm.generate([prompt], sampling)
-     text = outputs[0].outputs[0].text
+ # Initialize FastAPI
+ app = FastAPI()
+
+ # Model ID
+ MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
+
+ print("Loading model... (this may take a while the first time)")
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, cache_dir="/tmp")
+
+ # Load the model onto the GPU (bfloat16 needs Ampere or newer; on a T4, use float16)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID,
+     device_map="auto",
+     torch_dtype=torch.bfloat16,
+     trust_remote_code=True,
+     cache_dir="/tmp"
+ )
+ model.eval()
+ print("Model loaded.")
+
+ # Generation endpoint (a simplified OpenAI-compatible /v1/completions)
+ @app.post("/v1/completions")
+ async def completions(request: Request):
+     data = await request.json()
+     prompt = data.get("prompt") or ""
+     max_tokens = data.get("max_tokens", 512)
+
+     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=max_tokens,
+             do_sample=True,
+             temperature=0.7,
+             top_p=0.9,
+         )
+
+     text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+     # Return the response in OpenAI API format
    return JSONResponse({
        "id": "cmpl-1",
-         "object": "chat.completion",
+         "object": "text_completion",
        "choices": [
-             {"index": 0,
-              "message": {"role": "assistant", "content": text},
-              "finish_reason": "stop"}
+             {"index": 0, "text": text, "finish_reason": "stop"}
        ]
    })

- if __name__ == "__main__":
-     uvicorn.run(app, host="0.0.0.0", port=7860)
+ @app.get("/")
+ def root():
+     return {"status": "ok", "model": MODEL_ID}
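For quick verification, a minimal client call against the new endpoint might look like the sketch below. This is not part of the commit; the base URL http://localhost:7860 is an assumption (7860 is the port the old version bound to, and the default for Spaces), so substitute your deployment's address.

import requests

# Hypothetical smoke test for the /v1/completions endpoint above.
# BASE_URL is an assumption; point it at your Space or local server.
BASE_URL = "http://localhost:7860"

resp = requests.post(
    f"{BASE_URL}/v1/completions",
    json={"prompt": "Write a Python function that reverses a string.", "max_tokens": 256},
    timeout=300,  # the first request can be slow while the model finishes loading
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])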
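One caveat with the new handler: for causal LMs, model.generate returns the prompt tokens followed by the newly generated tokens, so the decoded text echoes the prompt, which differs from OpenAI's text_completion semantics. A minimal sketch of a fix, assuming the inputs and outputs variables from the handler above (this trimming is a suggestion, not part of the commit):

# Sketch: decode only the newly generated tokens so the response
# contains just the completion, not the echoed prompt.
prompt_len = inputs["input_ids"].shape[1]
text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)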