Spaces:
Build error
Build error
import json
import os
import secrets
import time
import uuid

from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from llama_cpp import Llama
app = FastAPI()
security = HTTPBearer()

# Bearer token clients must present. It is read from the API_KEY environment
# variable so the secret does not have to live in source; the previously
# hard-coded value stays as the fallback (also used when the variable is
# empty) so existing deployments keep working unchanged.
API_KEY = os.environ.get("API_KEY") or "connectkey"

# Model identifier reported back to clients in every response payload.
MODEL_ID = "glm-4.7-flash"

# IQ1_S = 9.25 GB — single file, fits on CPU Upgrade (16GB RAM)
print("==> Loading GLM-4.7-Flash IQ1_S (9.25 GB) from HF...")
llm = Llama.from_pretrained(
    repo_id="unsloth/GLM-4.7-Flash-GGUF",
    filename="GLM-4.7-Flash-IQ1_S.gguf",
    n_ctx=8192,
    n_threads=4,
    n_batch=512,
    verbose=False,
)
print("==> Model loaded!")
def verify_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Check the request's bearer token against ``API_KEY``.

    Args:
        credentials: Bearer credentials produced by the ``security``
            dependency.

    Returns:
        The presented token, when it matches ``API_KEY``.

    Raises:
        HTTPException: Status 401 when the token does not match.
    """
    presented = credentials.credentials
    # Constant-time comparison: a plain ``!=`` can return sooner the earlier
    # the first mismatching character is, which leaks information about the
    # secret through response timing. Both sides are encoded to bytes because
    # compare_digest rejects non-ASCII ``str`` arguments with a TypeError.
    matches = secrets.compare_digest(
        presented.encode("utf-8"),
        API_KEY.encode("utf-8"),
    )
    if not matches:
        raise HTTPException(status_code=401, detail="Invalid API key")
    return presented
# NOTE(review): no route decorator is visible for this handler in this view —
# confirm it is registered on `app`.
async def list_models(key: str = Depends(verify_key)):
    """Return a one-element model listing describing ``MODEL_ID``.

    Args:
        key: Token returned by ``verify_key``. Unused in the body; declared so
            the dependency runs before the handler.

    Returns:
        A dict with ``"object": "list"`` and a ``"data"`` list holding a single
        model entry stamped with the current Unix time.
    """
    model_entry = {
        "id": MODEL_ID,
        "object": "model",
        "created": int(time.time()),
        "owned_by": "unsloth",
    }
    return {"object": "list", "data": [model_entry]}
# NOTE(review): no route decorator is visible for this handler in this view —
# confirm it is registered on `app`.
async def chat_completions(request: Request, key: str = Depends(verify_key)):
    """Run a chat completion on ``llm`` from a JSON request body.

    Recognised body fields (with defaults): ``messages`` (``[]``), ``stream``
    (``False``), ``max_tokens`` (1024), ``temperature`` (1.0), ``top_p``
    (0.95), ``min_p`` (0.01) and ``stop`` (``None``).

    Args:
        request: Incoming request whose body carries the JSON payload.
        key: Token returned by ``verify_key``. Unused in the body; declared so
            the dependency runs before the handler.

    Returns:
        When ``stream`` is truthy, a ``StreamingResponse`` of
        ``text/event-stream`` lines, one ``data: {...}`` event per model chunk
        followed by ``data: [DONE]``. Otherwise a ``chat.completion`` dict
        carrying the model's ``choices`` and ``usage``.

    Raises:
        HTTPException: Status 400 when the body is not valid JSON or is not a
            JSON object.
    """
    try:
        body = await request.json()
    except ValueError as err:
        # JSONDecodeError and UnicodeDecodeError both derive from ValueError.
        # A malformed body is a client mistake, so answer 400 instead of
        # letting it surface as an unhandled 500.
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON"
        ) from err
    if not isinstance(body, dict):
        # Valid JSON that is a list/string/number has no ``.get``.
        raise HTTPException(
            status_code=400, detail="Request body must be a JSON object"
        )

    # Generation arguments shared by the streaming and non-streaming paths.
    params = {
        "messages": body.get("messages", []),
        "max_tokens": body.get("max_tokens", 1024),
        "temperature": body.get("temperature", 1.0),
        "top_p": body.get("top_p", 0.95),
        "min_p": body.get("min_p", 0.01),
        "stop": body.get("stop", None),
    }
    completion_id = f"chatcmpl-{uuid.uuid4().hex}"
    created = int(time.time())

    if not body.get("stream", False):
        result = llm.create_chat_completion(**params, stream=False)
        return {
            "id": completion_id,
            "object": "chat.completion",
            "created": created,
            "model": MODEL_ID,
            "choices": result["choices"],
            "usage": result.get("usage", {}),
        }

    def generate():
        # Re-wrap each model chunk so every event carries the same id,
        # creation time and model name.
        for chunk in llm.create_chat_completion(**params, stream=True):
            choice = chunk["choices"][0]
            data = {
                "id": completion_id,
                "object": "chat.completion.chunk",
                "created": created,
                "model": MODEL_ID,
                "choices": [{
                    "index": 0,
                    "delta": choice.get("delta", {}),
                    "finish_reason": choice.get("finish_reason"),
                }],
            }
            yield f"data: {json.dumps(data)}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")
# NOTE(review): no route decorator is visible for this handler in this view —
# confirm it is registered on `app`.
async def completions(request: Request, key: str = Depends(verify_key)):
    """Run a plain text completion on ``llm`` from a JSON request body.

    Recognised body fields (with defaults): ``prompt`` (``""``), ``stream``
    (``False``), ``max_tokens`` (512), ``temperature`` (1.0), ``top_p``
    (0.95), ``min_p`` (0.01) and ``stop`` (``None``).

    Args:
        request: Incoming request whose body carries the JSON payload.
        key: Token returned by ``verify_key``. Unused in the body; declared so
            the dependency runs before the handler.

    Returns:
        When ``stream`` is truthy, a ``StreamingResponse`` of
        ``text/event-stream`` lines, one ``data: {...}`` event per model chunk
        followed by ``data: [DONE]``. Otherwise a ``text_completion`` dict
        carrying the model's ``choices`` and ``usage``.

    Raises:
        HTTPException: Status 400 when the body is not valid JSON or is not a
            JSON object.
    """
    try:
        body = await request.json()
    except ValueError as err:
        # JSONDecodeError and UnicodeDecodeError both derive from ValueError.
        # A malformed body is a client mistake, so answer 400 instead of
        # letting it surface as an unhandled 500.
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON"
        ) from err
    if not isinstance(body, dict):
        # Valid JSON that is a list/string/number has no ``.get``.
        raise HTTPException(
            status_code=400, detail="Request body must be a JSON object"
        )

    # Generation arguments shared by the streaming and non-streaming paths.
    params = {
        "prompt": body.get("prompt", ""),
        "max_tokens": body.get("max_tokens", 512),
        "temperature": body.get("temperature", 1.0),
        "top_p": body.get("top_p", 0.95),
        "min_p": body.get("min_p", 0.01),
        "stop": body.get("stop", None),
    }
    completion_id = f"cmpl-{uuid.uuid4().hex}"
    created = int(time.time())

    if not body.get("stream", False):
        result = llm.create_completion(**params, stream=False)
        return {
            "id": completion_id,
            "object": "text_completion",
            "created": created,
            "model": MODEL_ID,
            "choices": result["choices"],
            "usage": result.get("usage", {}),
        }

    def generate():
        # Forward the model's choices unchanged under a stable id, creation
        # time and model name.
        for chunk in llm.create_completion(**params, stream=True):
            data = {
                "id": completion_id,
                "object": "text_completion",
                "created": created,
                "model": MODEL_ID,
                "choices": chunk["choices"],
            }
            yield f"data: {json.dumps(data)}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")
# NOTE(review): no route decorator is visible for this handler in this view —
# confirm it is registered on `app`.
async def health():
    """Return a fixed ``"ok"`` status payload naming the served model."""
    status_report = dict(status="ok", model=MODEL_ID)
    return status_report