iryahayri
/

lila-deploy-scripts

Model card Files Files and versions

lila-deploy-scripts / lila_server.py

iryahayri's picture

Create lila_server.py

556b039 verified 18 days ago

history blame contribute delete

2.06 kB

	import os, time, uuid, json
	import torch
	from unsloth import FastModel
	from fastapi import FastAPI, Request
	from fastapi.responses import JSONResponse
	from fastapi.middleware.cors import CORSMiddleware

	# Checkpoint identifier handed to FastModel.from_pretrained.
	MODEL = "iryahayri/lila-mirror-v1-merged"
	# Model id reported to API clients in /v1/models and in completion responses.
	SERVED = "lila-v1"

	# The model is loaded once, at import time, and shared by every request handler.
	print("Loading Lila...")
	model, tokenizer = FastModel.from_pretrained(
	    dtype=torch.bfloat16,
	    load_in_4bit=False,
	    max_seq_length=4096,
	    model_name=MODEL,
	)
	model.eval()
	# If the returned object exposes an inner `.tokenizer` (presumably a
	# processor-style wrapper -- confirm against the loader), use the inner one;
	# otherwise use the object as-is.
	tok = getattr(tokenizer, "tokenizer", tokenizer)
	print("Model loaded.")

	# CORS is fully open: any origin, method and header is accepted.
	_CORS_OPTIONS = {
	    "allow_origins": ["*"],
	    "allow_methods": ["*"],
	    "allow_headers": ["*"],
	}

	app = FastAPI()
	app.add_middleware(CORSMiddleware, **_CORS_OPTIONS)

	@app.get("/v1/models")
	async def models():
	    """Return a one-element model list advertising the served model id."""
	    entry = {"id": SERVED, "object": "model", "owned_by": "hayri"}
	    return {"object": "list", "data": [entry]}

	def _bad_request(message):
	    """Build a 400 JSON error response carrying *message*."""
	    return JSONResponse(
	        {"error": {"message": message, "type": "invalid_request_error"}},
	        status_code=400,
	    )


	def _param(body, key, default):
	    """Return ``body[key]``, using *default* when the key is missing or null."""
	    value = body.get(key)
	    return default if value is None else value


	@app.post("/v1/chat/completions")
	async def chat(req: Request):
	    """Generate one assistant reply for a chat-completion style request.

	    Reads ``messages`` plus the optional ``max_tokens``, ``temperature``,
	    ``top_p`` and ``repetition_penalty`` fields from the JSON body, renders
	    the messages through the tokenizer's chat template, runs generation and
	    returns a ``chat.completion`` JSON object with a single choice.

	    Returns a 400 JSON error when the body is not a JSON object, when
	    ``messages`` is not a non-empty list, or when a generation parameter is
	    not numeric.

	    NOTE: ``model.generate`` is a synchronous call inside this coroutine, so
	    the event loop is blocked for the duration of each generation.
	    """
	    try:
	        body = await req.json()
	    except ValueError:
	        return _bad_request("Request body must be valid JSON.")
	    if not isinstance(body, dict):
	        return _bad_request("Request body must be a JSON object.")

	    msgs = body.get("messages")
	    if not isinstance(msgs, list) or not msgs:
	        return _bad_request("'messages' must be a non-empty list.")

	    # Clients frequently send explicit nulls; treat them like absent keys.
	    try:
	        max_new_tokens = int(_param(body, "max_tokens", 200))
	        temperature = float(_param(body, "temperature", 0.85))
	        top_p = float(_param(body, "top_p", 0.9))
	        repetition_penalty = float(_param(body, "repetition_penalty", 1.15))
	    except (TypeError, ValueError):
	        return _bad_request("Generation parameters must be numeric.")
	    if max_new_tokens < 1:
	        return _bad_request("'max_tokens' must be at least 1.")

	    gen_kwargs = {
	        "max_new_tokens": max_new_tokens,
	        "repetition_penalty": repetition_penalty,
	        "pad_token_id": tok.eos_token_id,
	    }
	    if temperature > 0:
	        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
	    else:
	        # Sampling with a non-positive temperature is rejected by generate();
	        # a temperature of 0 is served as greedy decoding instead.
	        gen_kwargs["do_sample"] = False

	    prompt = tok.apply_chat_template(
	        msgs,
	        tokenize=False,
	        add_generation_prompt=True,
	        enable_thinking=False,
	    )
	    inputs = tok(prompt, return_tensors="pt").to("cuda")
	    prompt_tokens = int(inputs.input_ids.shape[1])

	    with torch.no_grad():
	        out = model.generate(**inputs, **gen_kwargs)

	    # generate() returns prompt + continuation; keep only the continuation.
	    new_tokens = out[0][prompt_tokens:]
	    completion_tokens = int(new_tokens.shape[0])
	    text = tok.decode(new_tokens, skip_special_tokens=True)

	    # Report "length" when the token budget was exhausted without ending on
	    # the tokenizer's EOS token. NOTE(review): if the model stops on a
	    # different end-of-turn token exactly at the limit this still reports
	    # "length" -- confirm against the model's generation config.
	    hit_limit = completion_tokens >= max_new_tokens
	    ended_on_eos = (
	        completion_tokens > 0 and new_tokens[-1].item() == tok.eos_token_id
	    )
	    finish_reason = "length" if hit_limit and not ended_on_eos else "stop"

	    return JSONResponse({
	        "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
	        "object": "chat.completion",
	        "created": int(time.time()),
	        "model": SERVED,
	        "choices": [{
	            "index": 0,
	            "message": {"role": "assistant", "content": text},
	            "finish_reason": finish_reason,
	        }],
	        "usage": {
	            "prompt_tokens": prompt_tokens,
	            "completion_tokens": completion_tokens,
	            "total_tokens": prompt_tokens + completion_tokens,
	        },
	    })

	if __name__ == "__main__":
	    # uvicorn is only needed when this file is run directly as a script.
	    import uvicorn

	    # Listen on all interfaces, port 8000.
	    uvicorn.run(app, port=8000, host="0.0.0.0")