| import json
|
| import os
|
| import threading
|
| import time
|
| import uuid
|
| from functools import lru_cache
|
| from typing import Any, Dict, Iterable, List, Optional
|
|
|
| import gradio as gr
|
| from fastapi import FastAPI, Request
|
| from fastapi.responses import JSONResponse, StreamingResponse
|
| from huggingface_hub import hf_hub_download
|
| from llama_cpp import Llama
|
|
|
# Hugging Face Hub location of the GGUF weights to serve.
# Both values are overridable via environment variables.
MODEL_REPO_ID = os.environ.get("MODEL_REPO_ID", "LiquidAI/LFM2-350M-GGUF")
MODEL_FILE = os.environ.get("MODEL_FILE", "LFM2-350M-Q4_K_M.gguf")

# Runtime knobs forwarded to the Llama(...) constructor (env-overridable).
N_CTX = int(os.environ.get("N_CTX", "4096"))
N_THREADS = int(os.environ.get("N_THREADS", "2"))
N_BATCH = int(os.environ.get("N_BATCH", "512"))
# Optional chat template name; empty string means "let the library decide".
CHAT_FORMAT = os.environ.get("CHAT_FORMAT", "")
USE_MMAP = os.environ.get("USE_MMAP", "1") == "1"

# Serializes every call into the single shared Llama instance; all
# endpoints take this lock around create_chat_completion.
LOCK = threading.Lock()

api = FastAPI()
|
|
|
|
|
| def _now() -> int:
|
| return int(time.time())
|
|
|
|
|
| def _openai_id(prefix: str) -> str:
|
| return f"{prefix}-{uuid.uuid4().hex[:24]}"
|
|
|
|
|
| def _sse(obj: Any) -> str:
|
| return f"data: {json.dumps(obj, ensure_ascii=True)}\n\n"
|
|
|
|
|
| def _sse_done() -> str:
|
| return "data: [DONE]\n\n"
|
|
|
|
|
@lru_cache(maxsize=1)
def _get_llm_and_path() -> Dict[str, Any]:
    """Download the GGUF weights (once) and build the shared Llama instance.

    Cached with ``lru_cache(maxsize=1)`` so the expensive download/load runs
    at most once per process; ``health`` inspects ``cache_info()`` to report
    whether the model has been loaded yet.

    Returns a dict with keys ``"llm"`` (the Llama object) and ``"model_path"``
    (the local path of the downloaded file).
    """
    path = hf_hub_download(
        repo_id=MODEL_REPO_ID,
        filename=MODEL_FILE,
        repo_type="model",
    )

    kwargs: Dict[str, Any] = dict(
        model_path=path,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
        n_batch=N_BATCH,
        n_gpu_layers=0,  # CPU-only deployment
        verbose=False,
        use_mmap=USE_MMAP,
    )
    # Only override the chat template when explicitly configured; otherwise
    # let llama-cpp-python pick its default behavior.
    if CHAT_FORMAT:
        kwargs["chat_format"] = CHAT_FORMAT

    return {"llm": Llama(**kwargs), "model_path": path}
|
|
|
|
|
@api.get("/health")
def health() -> Dict[str, Any]:
    """Liveness probe: report configuration plus whether the model is loaded.

    Deliberately does NOT call ``_get_llm_and_path()`` — that would trigger
    the download/load. It only inspects the lru_cache state.
    """
    model_is_loaded = _get_llm_and_path.cache_info().currsize > 0
    return {
        "status": "ok",
        "backend": "llama.cpp",
        "loaded": model_is_loaded,
        "model_repo_id": MODEL_REPO_ID,
        "model_file": MODEL_FILE,
        "chat_format": CHAT_FORMAT,
        "n_ctx": N_CTX,
        "n_threads": N_THREADS,
    }
|
|
|
|
|
@api.get("/ready")
def ready() -> Dict[str, Any]:
    """Readiness probe: force a model load and run a 1-token generation."""
    llm: Llama = _get_llm_and_path()["llm"]
    # Serialize access to the shared Llama instance, as every endpoint does.
    with LOCK:
        llm.create_chat_completion(
            messages=[{"role": "user", "content": "OK"}],
            max_tokens=1,
            temperature=0.0,
            stream=False,
        )
    return {"status": "ok", "loaded": True}
|
|
|
|
|
@api.get("/v1/models")
def v1_models() -> Dict[str, Any]:
    """OpenAI-compatible model listing; exactly one model is served."""
    served = "/".join((MODEL_REPO_ID, MODEL_FILE))
    return {"object": "list", "data": [{"id": served, "object": "model"}]}
|
|
|
|
|
| def _filter_chat_kwargs(payload: Dict[str, Any]) -> Dict[str, Any]:
|
| out: Dict[str, Any] = {}
|
| for k in [
|
| "max_tokens",
|
| "temperature",
|
| "top_p",
|
| "top_k",
|
| "min_p",
|
| "typical_p",
|
| "stop",
|
| "seed",
|
| "presence_penalty",
|
| "frequency_penalty",
|
| "repeat_penalty",
|
| ]:
|
| if k in payload:
|
| out[k] = payload[k]
|
| return out
|
|
|
|
|
@api.post("/v1/chat/completions")
async def chat_completions(req: Request):
    """OpenAI-compatible chat completions endpoint.

    Accepts an OpenAI-style JSON body; requires a non-empty ``messages``
    list (400 otherwise). Sampling options are whitelisted through
    ``_filter_chat_kwargs``. Returns either a plain JSON completion or,
    when ``stream`` is truthy, a ``text/event-stream`` of SSE chunks
    terminated by the ``[DONE]`` frame.
    """
    payload = await req.json()
    messages = payload.get("messages") or []
    stream = bool(payload.get("stream") or False)

    if not isinstance(messages, list) or not messages:
        return JSONResponse(
            status_code=400,
            content={"error": {"message": "messages must be a non-empty list"}},
        )

    m = _get_llm_and_path()
    llm: Llama = m["llm"]
    created = _now()
    resp_id = _openai_id("chatcmpl")
    model_name = f"{MODEL_REPO_ID}/{MODEL_FILE}"
    kwargs = _filter_chat_kwargs(payload)

    if not stream:
        # Serialize access to the single shared Llama instance.
        with LOCK:
            out = llm.create_chat_completion(
                messages=messages, stream=False, model=model_name, **kwargs
            )
        # Stamp our own OpenAI-style metadata onto llama.cpp's response dict.
        out["id"] = resp_id
        out["created"] = created
        out["model"] = out.get("model") or model_name
        return out

    def gen() -> Iterable[str]:
        # NOTE(review): the lock is held for the whole stream, so concurrent
        # requests queue until this client finishes reading — appears
        # intentional given the single shared model instance.
        with LOCK:
            it = llm.create_chat_completion(
                messages=messages, stream=True, model=model_name, **kwargs
            )
            for chunk in it:
                chunk["id"] = resp_id
                chunk["created"] = created
                chunk["model"] = chunk.get("model") or model_name
                yield _sse(chunk)
        yield _sse_done()

    return StreamingResponse(gen(), media_type="text/event-stream")
|
|
|
|
|
def _ui_chat(
    message: str,
    history: List,
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
) -> str:
    """Gradio ChatInterface callback: run one blocking chat completion.

    ``history`` may arrive either as OpenAI-style message dicts or as
    legacy ``(user, assistant)`` pairs; both forms are normalized into a
    single message list. Returns the assistant's reply text, or ``""``
    when the model produced no content.
    """
    conversation: List[Dict[str, Any]] = [
        {"role": "system", "content": system_message}
    ]

    for entry in history or []:
        if isinstance(entry, dict) and "role" in entry:
            # Already an OpenAI-style message dict — pass through as-is.
            conversation.append(entry)
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            user_turn, bot_turn = entry
            if user_turn:
                conversation.append({"role": "user", "content": user_turn})
            if bot_turn:
                conversation.append({"role": "assistant", "content": bot_turn})

    conversation.append({"role": "user", "content": message})

    llm: Llama = _get_llm_and_path()["llm"]
    # Same lock discipline as the API endpoints: one generation at a time.
    with LOCK:
        result = llm.create_chat_completion(
            messages=conversation,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=False,
        )

    choices = result.get("choices") or [{}]
    reply = (choices[0].get("message") or {}).get("content")
    return reply or ""
|
|
|
|
|
# Markdown rendered above the Gradio chat UI; also documents the REST API.
DESCRIPTION = """
### LFM2 350M (Q4_K_M, CPU)

Liquid Foundation Model 2 - 350M parameters. Edge-ready multilingual generation.

**OpenAI-compatible API:**
- `POST /v1/chat/completions` - Chat completions (supports streaming)
- `GET /v1/models` - List models
- `GET /health` - Health check
"""
|
|
|
# Gradio chat UI wired to the local model; the extra inputs let the user
# tune the system prompt and sampling parameters per conversation.
demo = gr.ChatInterface(
    fn=_ui_chat,
    title="LFM2 350M",
    description=DESCRIPTION,
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful assistant.",
            label="System message",
            lines=2,
        ),
        gr.Slider(minimum=64, maximum=1024, value=256, step=64, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
    examples=[
        ["Hello! How are you?"],
        ["What is the capital of France?"],
        ["Write a Python function to add two numbers."],
    ],
)
|
|
|
# Mount the Gradio UI at "/" on top of the FastAPI app so the web UI and
# the JSON API are served from a single process and port.
app = gr.mount_gradio_app(api, demo, path="/")


if __name__ == "__main__":
    import uvicorn

    # 7860 is the conventional Hugging Face Spaces port.
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|