triflix committed on
Commit
a5be475
·
verified ·
1 Parent(s): a0fd3f5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +149 -0
app.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """OpenAI-compatible API server with streaming for Qwen3-0.6B."""
3
+
4
+ import glob, json, os, time, uuid
5
+ from contextlib import asynccontextmanager
6
+
7
+ from fastapi import FastAPI, Request
8
+ from fastapi.middleware.cors import CORSMiddleware
9
+ from fastapi.responses import JSONResponse, StreamingResponse
10
+ from llama_cpp import Llama
11
+
12
+ # ── locate model ────────────────────────────────────────────────
13
# Directory searched recursively for GGUF weights; override with $MODEL_DIR.
MODEL_DIR = os.environ.get("MODEL_DIR", "/home/user/models")
# glob returns matches in arbitrary order, so sort them: with several models
# on disk the same file is then picked on every start.
gguf_files = sorted(
    glob.glob(os.path.join(MODEL_DIR, "**", "*.gguf"), recursive=True)
)
if not gguf_files:
    # Fail at import time: the server cannot do anything without a model.
    raise RuntimeError(f"No .gguf model found in {MODEL_DIR}")
MODEL_PATH = gguf_files[0]
# Model name reported by the health and model-listing endpoints.
MODEL_ID = "qwen3-0.6b"
19
+
20
+ # ── lifespan (load model once) ──────────────────────────────────
21
# Shared model handle: set by the lifespan hook at startup, None otherwise.
llm: Llama | None = None


@asynccontextmanager
async def lifespan(application: FastAPI):
    """Load the model once at application startup and release it at shutdown.

    Args:
        application: The FastAPI app this lifespan is attached to (unused).

    Yields:
        Nothing; the application serves requests while suspended here.
    """
    global llm
    print(f"Loading model: {MODEL_PATH}")
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        # Thread count is tunable through $N_THREADS (default 2).
        n_threads=int(os.environ.get("N_THREADS", 2)),
        chat_format="chatml",  # Qwen3 uses ChatML
        verbose=False,
    )
    print("Model loaded ✓")
    try:
        yield
    finally:
        # Rebind rather than `del`: deleting the global would make any later
        # read raise NameError instead of seeing the documented None state.
        # The finally clause also runs when shutdown is caused by an error.
        llm = None
37
+
38
# ASGI application; startup and shutdown work is delegated to `lifespan`.
app = FastAPI(lifespan=lifespan, title="Qwen3-0.6B API")

# CORS is left wide open: every origin, method and header is accepted.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
46
+
47
+ # ── helpers ─────────────────────────────────────────────────────
48
def _id():
    """Return a fresh completion id: ``chatcmpl-`` plus 12 random hex digits."""
    random_hex = uuid.uuid4().hex
    return "chatcmpl-" + random_hex[:12]


def _ts():
    """Return the current Unix time truncated to whole seconds."""
    now = time.time()
    return int(now)
53
+
54
+ # ── routes ──────────────────────────────────────────────────────
55
@app.get("/")
async def health():
    """Health probe: report an ok status together with the served model id."""
    return dict(status="ok", model=MODEL_ID)
58
+
59
@app.get("/v1/models")
async def list_models():
    """List the single model served by this process in OpenAI list format."""
    # `created` is stamped with the time of the request, not of the model.
    model_entry = dict(
        id=MODEL_ID,
        object="model",
        created=_ts(),
        owned_by="qwen",
    )
    return {"object": "list", "data": [model_entry]}
72
+
73
+ # ── /v1/chat/completions ───────────────────────────────────────
74
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Serve a chat completion, either streamed or as one JSON document.

    Reads ``messages``, ``stream``, ``temperature``, ``max_tokens``, ``top_p``
    and ``top_k`` from the JSON request body; missing sampling fields fall
    back to the defaults used below.

    Args:
        request: Incoming HTTP request whose body is a JSON object.

    Returns:
        A ``StreamingResponse`` of server-sent events when ``stream`` is
        truthy, otherwise a ``JSONResponse`` with the model's completion.
        A malformed request yields 400, an unloaded model 503 and a failed
        generation 500, each with an ``{"error": {...}}`` body.
    """
    try:
        body = await request.json()
    except ValueError:
        # Malformed JSON (JSONDecodeError is a ValueError); rejected below.
        body = None
    if not isinstance(body, dict):
        return JSONResponse(
            status_code=400,
            content={"error": {
                "message": "Request body must be a JSON object",
                "type": "invalid_request_error",
            }},
        )

    messages = body.get("messages")
    if not isinstance(messages, list) or not messages:
        return JSONResponse(
            status_code=400,
            content={"error": {
                "message": "'messages' must be a non-empty list",
                "type": "invalid_request_error",
            }},
        )

    if llm is None:
        # The handle is only populated by the lifespan hook at startup.
        return JSONResponse(
            status_code=503,
            content={"error": {
                "message": "Model is not loaded",
                "type": "server_error",
            }},
        )

    stream = bool(body.get("stream", False))
    params = dict(
        messages=messages,
        temperature=body.get("temperature", 0.7),
        max_tokens=body.get("max_tokens", 512),
        top_p=body.get("top_p", 0.9),
        top_k=body.get("top_k", 40),
        stream=stream,
    )

    if stream:
        return StreamingResponse(
            _stream_chat(params),
            media_type="text/event-stream",
            # Ask intermediaries not to cache or buffer the event stream.
            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
        )

    # NOTE(review): generation runs synchronously inside this coroutine, so
    # the event loop is busy until it finishes — confirm this is acceptable.
    try:
        result = llm.create_chat_completion(**params)
    except Exception as exc:
        # Report failures in the same error shape the streaming path emits.
        return JSONResponse(
            status_code=500,
            content={"error": {"message": str(exc), "type": "server_error"}},
        )
    return JSONResponse(content=result)
102
+
103
+
104
async def _stream_chat(params: dict):
    """Yield chat-completion chunks as server-sent-event ``data:`` lines.

    Each chunk from the model is JSON-encoded into its own event. If the
    model raises, a single error event is emitted instead. The stream always
    finishes with the ``[DONE]`` sentinel.

    Args:
        params: Keyword arguments forwarded to ``llm.create_chat_completion``.
    """
    try:
        chunks = llm.create_chat_completion(**params)
        for piece in chunks:
            encoded = json.dumps(piece)
            yield f"data: {encoded}\n\n"
    except Exception as exc:
        failure = {"error": {"message": str(exc), "type": "server_error"}}
        yield "data: " + json.dumps(failure) + "\n\n"
    yield "data: [DONE]\n\n"
112
+
113
+
114
+ # ── /v1/completions (text completion) ──────────────────────────
115
@app.post("/v1/completions")
async def completions(request: Request):
    """Serve a plain text completion, either streamed or as one JSON document.

    Reads ``prompt``, ``max_tokens``, ``temperature``, ``top_p``, ``top_k``
    and ``stream`` from the JSON request body; missing fields fall back to
    the defaults used below.

    Args:
        request: Incoming HTTP request whose body is a JSON object.

    Returns:
        A ``StreamingResponse`` of server-sent events when ``stream`` is
        truthy, otherwise a ``JSONResponse`` with the model's completion.
        A malformed request yields 400, an unloaded model 503 and a failed
        generation 500, each with an ``{"error": {...}}`` body.
    """
    try:
        body = await request.json()
    except ValueError:
        # Malformed JSON (JSONDecodeError is a ValueError); rejected below.
        body = None
    if not isinstance(body, dict):
        return JSONResponse(
            status_code=400,
            content={"error": {
                "message": "Request body must be a JSON object",
                "type": "invalid_request_error",
            }},
        )

    if llm is None:
        # The handle is only populated by the lifespan hook at startup.
        return JSONResponse(
            status_code=503,
            content={"error": {
                "message": "Model is not loaded",
                "type": "server_error",
            }},
        )

    params = dict(
        prompt=body.get("prompt", ""),
        max_tokens=body.get("max_tokens", 512),
        temperature=body.get("temperature", 0.7),
        top_p=body.get("top_p", 0.9),
        # Honour top_k here as well, with the same default as the chat route.
        top_k=body.get("top_k", 40),
        stream=bool(body.get("stream", False)),
    )

    if params["stream"]:
        return StreamingResponse(
            _stream_completion(params),
            media_type="text/event-stream",
            # Ask intermediaries not to cache or buffer the event stream.
            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
        )

    # NOTE(review): generation runs synchronously inside this coroutine, so
    # the event loop is busy until it finishes — confirm this is acceptable.
    try:
        result = llm.create_completion(**params)
    except Exception as exc:
        # Report failures in the same error shape the streaming path emits.
        return JSONResponse(
            status_code=500,
            content={"error": {"message": str(exc), "type": "server_error"}},
        )
    return JSONResponse(content=result)
134
+
135
+
136
async def _stream_completion(params: dict):
    """Yield text-completion chunks as server-sent-event ``data:`` lines.

    Each chunk from the model is JSON-encoded into its own event. If the
    model raises, a single error event is emitted instead. The stream always
    finishes with the ``[DONE]`` sentinel.

    Args:
        params: Keyword arguments forwarded to ``llm.create_completion``.
    """
    try:
        chunks = llm.create_completion(**params)
        for piece in chunks:
            encoded = json.dumps(piece)
            yield f"data: {encoded}\n\n"
    except Exception as exc:
        failure = {"error": {"message": str(exc), "type": "server_error"}}
        yield "data: " + json.dumps(failure) + "\n\n"
    yield "data: [DONE]\n\n"
144
+
145
+
146
+ # ── main ────────────────────────────────────────────────────────
147
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 7860.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=7860)