from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from llama_cpp import Llama
import json
import time
import uuid

app = FastAPI()
security = HTTPBearer()

API_KEY = "connectkey"
MODEL_ID = "glm-4.7-flash"

# IQ1_S = 9.25 GB -- single file, fits on the CPU Upgrade plan (16 GB RAM)
print("==> Loading GLM-4.7-Flash IQ1_S (9.25 GB) from HF...")
llm = Llama.from_pretrained(
    repo_id="unsloth/GLM-4.7-Flash-GGUF",
    filename="GLM-4.7-Flash-IQ1_S.gguf",
    n_ctx=8192,
    n_threads=4,
    n_batch=512,
    verbose=False,
)
print("==> Model loaded!")


def verify_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    # Simple bearer-token check against the static API_KEY above.
    if credentials.credentials != API_KEY:
        raise HTTPException(status_code=401, detail="Invalid API key")
    return credentials.credentials


@app.get("/v1/models")
async def list_models(key: str = Depends(verify_key)):
    # OpenAI-compatible model listing: a single entry for the loaded GGUF model.
    return {
        "object": "list",
        "data": [{
            "id": MODEL_ID,
            "object": "model",
            "created": int(time.time()),
            "owned_by": "unsloth",
        }],
    }


@app.post("/v1/chat/completions")
async def chat_completions(request: Request, key: str = Depends(verify_key)):
    body = await request.json()
    messages = body.get("messages", [])
    stream = body.get("stream", False)
    max_tokens = body.get("max_tokens", 1024)
    temperature = body.get("temperature", 1.0)
    top_p = body.get("top_p", 0.95)
    min_p = body.get("min_p", 0.01)
    stop = body.get("stop", None)

    completion_id = f"chatcmpl-{uuid.uuid4().hex}"
    created = int(time.time())

    if stream:
        # Stream OpenAI-style chat.completion.chunk objects as server-sent events.
        def generate():
            for chunk in llm.create_chat_completion(
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                min_p=min_p,
                stop=stop,
                stream=True,
            ):
                delta = chunk["choices"][0].get("delta", {})
                finish_reason = chunk["choices"][0].get("finish_reason")
                data = {
                    "id": completion_id,
                    "object": "chat.completion.chunk",
                    "created": created,
                    "model": MODEL_ID,
                    "choices": [{
                        "index": 0,
                        "delta": delta,
                        "finish_reason": finish_reason,
                    }],
                }
                yield f"data: {json.dumps(data)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(generate(), media_type="text/event-stream")
    else:
        result = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            min_p=min_p,
            stop=stop,
            stream=False,
        )
        return {
            "id": completion_id,
            "object": "chat.completion",
            "created": created,
            "model": MODEL_ID,
            "choices": result["choices"],
            "usage": result.get("usage", {}),
        }


@app.post("/v1/completions")
async def completions(request: Request, key: str = Depends(verify_key)):
    body = await request.json()
    prompt = body.get("prompt", "")
    stream = body.get("stream", False)
    max_tokens = body.get("max_tokens", 512)
    temperature = body.get("temperature", 1.0)
    top_p = body.get("top_p", 0.95)
    min_p = body.get("min_p", 0.01)
    stop = body.get("stop", None)

    completion_id = f"cmpl-{uuid.uuid4().hex}"
    created = int(time.time())

    if stream:
        # Stream legacy text_completion chunks as server-sent events.
        def generate():
            for chunk in llm.create_completion(
                prompt=prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                min_p=min_p,
                stop=stop,
                stream=True,
            ):
                data = {
                    "id": completion_id,
                    "object": "text_completion",
                    "created": created,
                    "model": MODEL_ID,
                    "choices": chunk["choices"],
                }
                yield f"data: {json.dumps(data)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(generate(), media_type="text/event-stream")
    else:
        result = llm.create_completion(
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            min_p=min_p,
            stop=stop,
            stream=False,
        )
        return {
            "id": completion_id,
            "object": "text_completion",
            "created": created,
            "model": MODEL_ID,
            "choices": result["choices"],
            "usage": result.get("usage", {}),
        }


@app.get("/health")
async def health():
    return {"status": "ok", "model": MODEL_ID}