| |
| """OpenAI-compatible API server with streaming for Qwen3-0.6B.""" |
|
|
| import glob, json, os, time, uuid |
| from contextlib import asynccontextmanager |
|
|
| from fastapi import FastAPI, Request |
| from fastapi.middleware.cors import CORSMiddleware |
| from fastapi.responses import JSONResponse, StreamingResponse |
| from llama_cpp import Llama |
|
|
| |
# Model discovery: pick a GGUF file from MODEL_DIR (overridable via env).
MODEL_DIR = os.environ.get("MODEL_DIR", "/home/user/models")
# Sort the matches so the selected model is deterministic when more than
# one .gguf file is present (glob order is filesystem-dependent).
gguf_files = sorted(glob.glob(os.path.join(MODEL_DIR, "**", "*.gguf"), recursive=True))
if not gguf_files:
    raise RuntimeError(f"No .gguf model found in {MODEL_DIR}")
MODEL_PATH = gguf_files[0]
# Identifier reported by the OpenAI-compatible endpoints.
MODEL_ID = "qwen3-0.6b"


# Loaded once by the lifespan hook at startup; None until then.
llm: Llama | None = None
|
|
@asynccontextmanager
async def lifespan(application: FastAPI):
    """Load the GGUF model before serving requests; release it on shutdown."""
    global llm
    print(f"Loading model: {MODEL_PATH}")
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_threads=int(os.environ.get("N_THREADS", 2)),
        chat_format="chatml",  # Qwen models use the ChatML prompt template
        verbose=False,
    )
    print("Model loaded")
    yield
    # Reset to None rather than `del llm`: deleting the global leaves the
    # name undefined, so any late reference would raise NameError instead
    # of seeing the documented "not loaded" sentinel.
    llm = None
|
|
# Application object; the lifespan hook loads/unloads the model.
app = FastAPI(lifespan=lifespan, title="Qwen3-0.6B API")

# Permissive CORS so browser clients on any origin can call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_headers=["*"],
    allow_methods=["*"],
)
|
|
| |
def _id():
    """Generate a fresh OpenAI-style chat completion identifier."""
    suffix = uuid.uuid4().hex[:12]
    return "chatcmpl-" + suffix
|
|
def _ts():
    """Return the current Unix time truncated to whole seconds."""
    now = time.time()
    return int(now)
|
|
| |
@app.get("/")
async def health():
    # Lightweight liveness probe for load balancers and smoke tests.
    payload = {"status": "ok", "model": MODEL_ID}
    return payload
|
|
@app.get("/v1/models")
async def list_models():
    """OpenAI-compatible model listing: this server serves a single model."""
    entry = {
        "id": MODEL_ID,
        "object": "model",
        "created": _ts(),
        "owned_by": "qwen",
    }
    return {"object": "list", "data": [entry]}
|
|
| |
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """OpenAI-compatible chat endpoint.

    Reads `messages` plus optional sampling parameters from the JSON body
    and returns either a complete chat completion, or a Server-Sent-Events
    stream when `stream` is true.
    """
    # Malformed JSON previously escaped as an unhandled 500; reject with 400.
    try:
        body = await request.json()
    except Exception:
        return JSONResponse(
            status_code=400,
            content={"error": {"message": "Invalid JSON body", "type": "invalid_request_error"}},
        )

    messages = body.get("messages", [])
    if not messages:
        return JSONResponse(
            status_code=400,
            content={"error": {"message": "'messages' must be a non-empty list", "type": "invalid_request_error"}},
        )

    # The model loads asynchronously in the lifespan hook; guard early requests.
    if llm is None:
        return JSONResponse(
            status_code=503,
            content={"error": {"message": "Model not loaded", "type": "server_error"}},
        )

    stream = bool(body.get("stream", False))
    params = dict(
        messages=messages,
        temperature=body.get("temperature", 0.7),
        max_tokens=body.get("max_tokens", 512),
        top_p=body.get("top_p", 0.9),
        top_k=body.get("top_k", 40),
        stream=stream,
    )

    if stream:
        return StreamingResponse(
            _stream_chat(params),
            media_type="text/event-stream",
            # Disable caching/proxy buffering so tokens reach the client immediately.
            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
        )

    result = llm.create_chat_completion(**params)
    return JSONResponse(content=result)
|
|
|
|
async def _stream_chat(params: dict):
    """Yield chat-completion chunks as Server-Sent Events, ending with [DONE]."""
    try:
        for piece in llm.create_chat_completion(**params):
            yield "data: " + json.dumps(piece) + "\n\n"
    except Exception as exc:
        # Surface backend failures to the client as an in-stream error event.
        payload = {"error": {"message": str(exc), "type": "server_error"}}
        yield "data: " + json.dumps(payload) + "\n\n"
    yield "data: [DONE]\n\n"
|
|
|
|
| |
@app.post("/v1/completions")
async def completions(request: Request):
    """OpenAI-compatible (non-chat) text-completion endpoint.

    Reads `prompt` plus optional sampling parameters from the JSON body and
    returns a completion, or an SSE stream when `stream` is true.
    """
    # Consistent with the chat endpoint: 400 on malformed JSON, not a 500.
    try:
        body = await request.json()
    except Exception:
        return JSONResponse(
            status_code=400,
            content={"error": {"message": "Invalid JSON body", "type": "invalid_request_error"}},
        )

    # The model loads asynchronously in the lifespan hook; guard early requests.
    if llm is None:
        return JSONResponse(
            status_code=503,
            content={"error": {"message": "Model not loaded", "type": "server_error"}},
        )

    params = dict(
        prompt=body.get("prompt", ""),
        max_tokens=body.get("max_tokens", 512),
        temperature=body.get("temperature", 0.7),
        top_p=body.get("top_p", 0.9),
        stream=bool(body.get("stream", False)),
    )

    if params["stream"]:
        return StreamingResponse(
            _stream_completion(params),
            media_type="text/event-stream",
            # Disable caching/proxy buffering so tokens reach the client immediately.
            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
        )

    return JSONResponse(content=llm.create_completion(**params))
|
|
|
|
async def _stream_completion(params: dict):
    """Yield raw completion chunks as Server-Sent Events, ending with [DONE]."""
    try:
        for piece in llm.create_completion(**params):
            yield "data: " + json.dumps(piece) + "\n\n"
    except Exception as exc:
        # Surface backend failures to the client as an in-stream error event.
        payload = {"error": {"message": str(exc), "type": "server_error"}}
        yield "data: " + json.dumps(payload) + "\n\n"
    yield "data: [DONE]\n\n"
|
|
|
|
| |
if __name__ == "__main__":
    import uvicorn

    # Host/port are env-overridable (defaults unchanged: 0.0.0.0:7860, the
    # Hugging Face Spaces convention) so the same script runs anywhere.
    uvicorn.run(
        app,
        host=os.environ.get("HOST", "0.0.0.0"),
        port=int(os.environ.get("PORT", "7860")),
    )