| import os, time, uuid, json |
| import torch |
| from unsloth import FastModel |
| from fastapi import FastAPI, Request |
| from fastapi.responses import JSONResponse |
| from fastapi.middleware.cors import CORSMiddleware |
|
|
# Identifier handed to FastModel.from_pretrained as `model_name`.
MODEL = "iryahayri/lila-mirror-v1-merged"
# Model id reported back to API clients.
SERVED = "lila-v1"


# The model is loaded once, at import time, and shared by every request.
print("Loading Lila...")
model, tokenizer = FastModel.from_pretrained(
    dtype=torch.bfloat16,
    load_in_4bit=False,
    max_seq_length=4096,
    model_name=MODEL,
)
model.eval()
# If the returned object wraps an inner `.tokenizer`, use that one;
# otherwise the returned object is used directly.
tok = getattr(tokenizer, "tokenizer", tokenizer)
print("Model loaded.")
|
|
# ASGI application. CORS is left fully open: any origin, any method and any
# request header are allowed.
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)
|
|
@app.get("/v1/models")
async def models():
    """Return a model listing that contains the single served model id."""
    served_entry = {"id": SERVED, "object": "model", "owned_by": "hayri"}
    return {"object": "list", "data": [served_entry]}
|
|
def _bad_request(message):
    """Build a 400 response with an OpenAI-style error payload."""
    return JSONResponse(
        status_code=400,
        content={"error": {"message": message, "type": "invalid_request_error"}},
    )


def _param(body, key, default):
    """Return ``body[key]``, using *default* when the key is missing or null."""
    value = body.get(key)
    return default if value is None else value


@app.post("/v1/chat/completions")
async def chat(req: Request):
    """Generate one non-streaming chat completion for the posted messages.

    The JSON body is read for ``messages`` (required, non-empty list) and the
    optional ``max_tokens`` (default 200), ``temperature`` (0.85), ``top_p``
    (0.9) and ``repetition_penalty`` (1.15). A missing or ``null`` value
    falls back to its default. ``temperature == 0`` selects greedy decoding
    instead of sampling.

    Returns:
        A ``chat.completion`` JSON object with a single assistant choice and
        token ``usage`` counts, or a 400 JSON error when the body is not a
        JSON object, ``messages`` is empty/not a list, or a sampling
        parameter is non-numeric or out of range.
    """
    try:
        body = await req.json()
    except ValueError:
        # Malformed JSON (and undecodable bytes) surface as ValueError subclasses.
        return _bad_request("Request body must be valid JSON.")
    if not isinstance(body, dict):
        return _bad_request("Request body must be a JSON object.")

    msgs = body.get("messages")
    if not isinstance(msgs, list) or not msgs:
        return _bad_request("'messages' must be a non-empty list.")

    try:
        max_new = int(_param(body, "max_tokens", 200))
        temperature = float(_param(body, "temperature", 0.85))
        top_p = float(_param(body, "top_p", 0.9))
        rep_penalty = float(_param(body, "repetition_penalty", 1.15))
    except (TypeError, ValueError):
        return _bad_request("Sampling parameters must be numeric.")
    if max_new < 1:
        return _bad_request("'max_tokens' must be at least 1.")
    if temperature < 0:
        return _bad_request("'temperature' must be non-negative.")

    gen_kwargs = {
        "max_new_tokens": max_new,
        "repetition_penalty": rep_penalty,
        "pad_token_id": tok.eos_token_id,
    }
    if temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Sampling requires a strictly positive temperature; 0 means greedy.
        gen_kwargs["do_sample"] = False

    prompt = tok.apply_chat_template(
        msgs,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    # The rendered template already carries its special tokens, so the
    # tokenizer must not add them a second time.
    inputs = tok(prompt, return_tensors="pt", add_special_tokens=False)
    # Follow the model's device instead of assuming "cuda".
    inputs = inputs.to(model.device)
    prompt_len = inputs.input_ids.shape[1]

    # NOTE(review): generate() runs synchronously inside this coroutine, so
    # the event loop is blocked for the duration of the call.
    with torch.no_grad():
        out = model.generate(**inputs, **gen_kwargs)

    # Keep only the newly generated tokens (everything after the prompt).
    new_ids = out[0][prompt_len:]
    completion_len = int(new_ids.shape[0])
    text = tok.decode(new_ids, skip_special_tokens=True)

    # Report "length" when the cap was reached without ending on the
    # tokenizer's EOS token. NOTE(review): a model configured with additional
    # stop ids that stops exactly at the cap would still be reported as
    # "length" - confirm whether that matters for this model.
    hit_cap = completion_len >= max_new
    ended_on_eos = completion_len > 0 and int(new_ids[-1]) == tok.eos_token_id
    finish_reason = "length" if hit_cap and not ended_on_eos else "stop"

    return JSONResponse({
        "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": SERVED,
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": text},
            "finish_reason": finish_reason,
        }],
        "usage": {
            "prompt_tokens": int(prompt_len),
            "completion_tokens": completion_len,
            "total_tokens": int(prompt_len) + completion_len,
        },
    })
|
|
if __name__ == "__main__":
    # uvicorn is only needed when this file is executed directly as a script.
    import uvicorn

    # Bind address and port keep their previous values by default but can be
    # overridden with the LILA_HOST / LILA_PORT environment variables.
    uvicorn.run(
        app,
        host=os.environ.get("LILA_HOST", "0.0.0.0"),
        port=int(os.environ.get("LILA_PORT", "8000")),
    )