lila-deploy-scripts / lila_server.py
iryahayri's picture
Create lila_server.py
556b039 verified
import os, time, uuid, json
import torch
from unsloth import FastModel
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
# Identifier of the checkpoint handed to the loader (presumably a Hugging Face
# Hub repo id -- confirm), and the model id this server reports to clients.
MODEL = "iryahayri/lila-mirror-v1-merged"
SERVED = "lila-v1"

print("Loading Lila...")
model, tokenizer = FastModel.from_pretrained(
    model_name=MODEL,
    dtype=torch.bfloat16,
    load_in_4bit=False,
    max_seq_length=4096,
)
model.eval()

# When the loader returns an object that wraps an inner `.tokenizer`, use the
# inner one; otherwise use the returned object as-is.
tok = getattr(tokenizer, "tokenizer", tokenizer)
print("Model loaded.")
# ASGI application object; the route handlers below register themselves on it.
app = FastAPI()

# Fully permissive CORS: every origin, HTTP method and request header is
# allowed.
# NOTE(review): fine for a demo/internal endpoint; tighten `allow_origins`
# before exposing this server publicly.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)
@app.get("/v1/models")
async def models():
    """Return the model listing: a single entry whose id is ``SERVED``."""
    entry = {"id": SERVED, "object": "model", "owned_by": "hayri"}
    return {"object": "list", "data": [entry]}
def _bad_request(message):
    """Build a 400 response carrying an OpenAI-style error payload."""
    return JSONResponse(
        status_code=400,
        content={"error": {"message": message, "type": "invalid_request_error"}},
    )


def _param(body, key, default):
    """Return ``body[key]``, using ``default`` when the key is missing or null."""
    value = body.get(key)
    return default if value is None else value


@app.post("/v1/chat/completions")
async def chat(req: Request):
    """Generate one chat completion for the posted conversation.

    Request JSON fields read (all optional):
        messages: list of chat messages passed to the tokenizer's chat
            template (defaults to an empty list).
        max_tokens: maximum number of new tokens (default 200).
        temperature: sampling temperature (default 0.85); a value <= 0
            switches to greedy decoding.
        top_p: nucleus sampling threshold (default 0.9).
        repetition_penalty: repetition penalty (default 1.15).

    Returns:
        A ``chat.completion`` JSON object with a single choice and a
        ``usage`` block, or a 400 error object when the request is malformed.
    """
    # A malformed body raises a ValueError subclass (JSONDecodeError or
    # UnicodeDecodeError); report it as a client error, not a 500.
    try:
        body = await req.json()
    except ValueError:
        return _bad_request("Request body is not valid JSON.")
    if not isinstance(body, dict):
        return _bad_request("Request body must be a JSON object.")

    msgs = _param(body, "messages", [])
    if not isinstance(msgs, list):
        return _bad_request("'messages' must be a list.")

    # Clients frequently send explicit nulls for unset sampling parameters;
    # treat those the same as an absent key.
    try:
        max_new_tokens = int(_param(body, "max_tokens", 200))
        temperature = float(_param(body, "temperature", 0.85))
        top_p = float(_param(body, "top_p", 0.9))
        repetition_penalty = float(_param(body, "repetition_penalty", 1.15))
    except (TypeError, ValueError):
        return _bad_request("Sampling parameters must be numeric.")
    if max_new_tokens < 1:
        return _bad_request("'max_tokens' must be at least 1.")

    generate_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
        "pad_token_id": tok.eos_token_id,
    }
    if temperature > 0:
        generate_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Sampling requires a strictly positive temperature, so a request for
        # temperature 0 is served with greedy decoding instead of failing.
        generate_kwargs["do_sample"] = False

    prompt = tok.apply_chat_template(
        msgs,
        tokenize=False,
        add_generation_prompt=True,
        # Forwarded to the chat template; presumably turns off "thinking"
        # output for templates that support it -- confirm for this model.
        enable_thinking=False,
    )
    inputs = tok(prompt, return_tensors="pt").to("cuda")
    prompt_len = inputs.input_ids.shape[1]

    # NOTE(review): generate() is a synchronous call inside an async handler,
    # so the event loop is blocked for the duration of each generation.
    with torch.no_grad():
        out = model.generate(**inputs, **generate_kwargs)

    # generate() returns prompt + continuation; keep only the continuation.
    new_tokens = out[0][prompt_len:]
    completion_len = int(new_tokens.shape[0])
    text = tok.decode(new_tokens, skip_special_tokens=True)

    # Collect every end-of-sequence id known to the tokenizer or the model's
    # generation config so a natural stop can be told apart from truncation.
    stop_ids = set()
    config_eos = getattr(getattr(model, "generation_config", None), "eos_token_id", None)
    for eos in (tok.eos_token_id, config_eos):
        if eos is None:
            continue
        stop_ids.update(eos if isinstance(eos, (list, tuple)) else [eos])

    budget_exhausted = completion_len >= max_new_tokens
    ended_on_eos = completion_len > 0 and int(new_tokens[-1]) in stop_ids
    finish_reason = "length" if budget_exhausted and not ended_on_eos else "stop"

    return JSONResponse({
        "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": SERVED,
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": text},
            "finish_reason": finish_reason,
        }],
        "usage": {
            "prompt_tokens": int(prompt_len),
            "completion_tokens": completion_len,
            "total_tokens": int(prompt_len) + completion_len,
        },
    })
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when the file is run as a script.
    import uvicorn

    # Serve the app on all network interfaces, port 8000.
    uvicorn.run(app, port=8000, host="0.0.0.0")