# glm-4.7-flash / app.py
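"""Minimal OpenAI-compatible API server for GLM-4.7-Flash.

Serves the unsloth/GLM-4.7-Flash-GGUF IQ1_S quant via llama-cpp-python behind
FastAPI, exposing /v1/models, /v1/chat/completions, /v1/completions and /health
with simple Bearer-token authentication.
"""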
from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from llama_cpp import Llama
import json
import time
import uuid

app = FastAPI()
security = HTTPBearer()

API_KEY = "connectkey"
MODEL_ID = "glm-4.7-flash"
# IQ1_S = 9.25 GB: a single file that fits on the CPU Upgrade tier (16 GB RAM)
print("==> Loading GLM-4.7-Flash IQ1_S (9.25 GB) from HF...")
llm = Llama.from_pretrained(
    repo_id="unsloth/GLM-4.7-Flash-GGUF",
    filename="GLM-4.7-Flash-IQ1_S.gguf",
    n_ctx=8192,      # context window size in tokens
    n_threads=4,     # CPU threads used for inference
    n_batch=512,     # prompt-processing batch size
    verbose=False,
)
print("==> Model loaded!")

def verify_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    if credentials.credentials != API_KEY:
        raise HTTPException(status_code=401, detail="Invalid API key")
    return credentials.credentials
@app.get("/v1/models")
async def list_models(key: str = Depends(verify_key)):
return {
"object": "list",
"data": [{
"id": MODEL_ID,
"object": "model",
"created": int(time.time()),
"owned_by": "unsloth",
}]
}
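
# Example: list the available models (a sketch; the host and port are
# assumptions, since this file does not start a server itself):
#   curl http://localhost:7860/v1/models -H "Authorization: Bearer connectkey"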
@app.post("/v1/chat/completions")
async def chat_completions(request: Request, key: str = Depends(verify_key)):
body = await request.json()
messages = body.get("messages", [])
stream = body.get("stream", False)
max_tokens = body.get("max_tokens", 1024)
temperature = body.get("temperature", 1.0)
top_p = body.get("top_p", 0.95)
min_p = body.get("min_p", 0.01)
stop = body.get("stop", None)
completion_id = f"chatcmpl-{uuid.uuid4().hex}"
created = int(time.time())
if stream:
def generate():
for chunk in llm.create_chat_completion(
messages=messages,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
min_p=min_p,
stop=stop,
stream=True,
):
delta = chunk["choices"][0].get("delta", {})
finish_reason = chunk["choices"][0].get("finish_reason")
data = {
"id": completion_id,
"object": "chat.completion.chunk",
"created": created,
"model": MODEL_ID,
"choices": [{
"index": 0,
"delta": delta,
"finish_reason": finish_reason,
}]
}
yield f"data: {json.dumps(data)}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(generate(), media_type="text/event-stream")
else:
result = llm.create_chat_completion(
messages=messages,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
min_p=min_p,
stop=stop,
stream=False,
)
return {
"id": completion_id,
"object": "chat.completion",
"created": created,
"model": MODEL_ID,
"choices": result["choices"],
"usage": result.get("usage", {}),
}
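
# Example: OpenAI-style chat completion over HTTP (a sketch; host and port are
# assumptions, since this file does not start a server itself):
#   curl http://localhost:7860/v1/chat/completions \
#     -H "Authorization: Bearer connectkey" \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello"}], "stream": true}'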
@app.post("/v1/completions")
async def completions(request: Request, key: str = Depends(verify_key)):
body = await request.json()
prompt = body.get("prompt", "")
stream = body.get("stream", False)
max_tokens = body.get("max_tokens", 512)
temperature = body.get("temperature", 1.0)
top_p = body.get("top_p", 0.95)
min_p = body.get("min_p", 0.01)
stop = body.get("stop", None)
completion_id = f"cmpl-{uuid.uuid4().hex}"
created = int(time.time())
if stream:
def generate():
for chunk in llm.create_completion(
prompt=prompt,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
min_p=min_p,
stop=stop,
stream=True,
):
data = {
"id": completion_id,
"object": "text_completion",
"created": created,
"model": MODEL_ID,
"choices": chunk["choices"],
}
yield f"data: {json.dumps(data)}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(generate(), media_type="text/event-stream")
else:
result = llm.create_completion(
prompt=prompt,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
min_p=min_p,
stop=stop,
stream=False,
)
return {
"id": completion_id,
"object": "text_completion",
"created": created,
"model": MODEL_ID,
"choices": result["choices"],
"usage": result.get("usage", {}),
}
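
# Example: because the routes mirror the OpenAI API, the official openai Python
# client can be pointed at this server (a sketch; base_url is an assumption):
#   from openai import OpenAI
#   client = OpenAI(base_url="http://localhost:7860/v1", api_key="connectkey")
#   print(client.chat.completions.create(
#       model="glm-4.7-flash",
#       messages=[{"role": "user", "content": "Hello"}],
#   ).choices[0].message.content)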
@app.get("/health")
async def health():
return {"status": "ok", "model": MODEL_ID}