# lfm2-350m / app.py
# Uploaded by chmielvu via huggingface_hub (commit 2ff5440, verified).
import json
import os
import threading
import time
import uuid
from functools import lru_cache
from typing import Any, Dict, Iterable, List, Optional
import gradio as gr
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Model selection and llama.cpp runtime knobs; every value is overridable
# through an environment variable so the Space can be re-tuned without edits.
MODEL_REPO_ID = os.environ.get("MODEL_REPO_ID", "LiquidAI/LFM2-350M-GGUF")  # HF repo holding the GGUF
MODEL_FILE = os.environ.get("MODEL_FILE", "LFM2-350M-Q4_K_M.gguf")  # quantized weights filename
N_CTX = int(os.environ.get("N_CTX", "4096"))  # context window, in tokens
N_THREADS = int(os.environ.get("N_THREADS", "2"))  # CPU threads used for inference
N_BATCH = int(os.environ.get("N_BATCH", "512"))  # prompt-evaluation batch size
CHAT_FORMAT = os.environ.get("CHAT_FORMAT", "")  # optional llama.cpp chat-template override ("" = auto)
USE_MMAP = os.environ.get("USE_MMAP", "1") == "1"  # memory-map the model file ("1" enables)
LOCK = threading.Lock()  # serializes all inference on the single, non-thread-safe Llama instance
api = FastAPI()
def _now() -> int:
return int(time.time())
def _openai_id(prefix: str) -> str:
return f"{prefix}-{uuid.uuid4().hex[:24]}"
def _sse(obj: Any) -> str:
return f"data: {json.dumps(obj, ensure_ascii=True)}\n\n"
def _sse_done() -> str:
return "data: [DONE]\n\n"
@lru_cache(maxsize=1)
def _get_llm_and_path() -> Dict[str, Any]:
    """Download the GGUF weights (HF cache) and construct the Llama engine.

    ``lru_cache(maxsize=1)`` turns this into a lazy singleton: the model is
    fetched and loaded on first use, then shared by every later caller.

    Returns:
        Dict with keys ``"llm"`` (the Llama instance) and ``"model_path"``
        (local filesystem path of the downloaded GGUF file).
    """
    path = hf_hub_download(
        repo_id=MODEL_REPO_ID, filename=MODEL_FILE, repo_type="model"
    )
    kwargs: Dict[str, Any] = dict(
        model_path=path,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
        n_batch=N_BATCH,
        n_gpu_layers=0,  # CPU-only deployment
        verbose=False,
        use_mmap=USE_MMAP,
    )
    # Only override the chat template when one was explicitly configured;
    # otherwise llama.cpp auto-detects it from the GGUF metadata.
    if CHAT_FORMAT:
        kwargs["chat_format"] = CHAT_FORMAT
    return {"llm": Llama(**kwargs), "model_path": path}
@api.get("/health")
def health() -> Dict[str, Any]:
    """Liveness probe: report static config plus whether the model is resident.

    Does NOT trigger a model load — it only inspects the lru_cache, so it
    stays cheap even before the first inference.
    """
    cache_state = _get_llm_and_path.cache_info()
    return dict(
        status="ok",
        backend="llama.cpp",
        loaded=cache_state.currsize > 0,
        model_repo_id=MODEL_REPO_ID,
        model_file=MODEL_FILE,
        chat_format=CHAT_FORMAT,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
    )
@api.get("/ready")
def ready() -> Dict[str, Any]:
    """Readiness probe: force the model to load and run a 1-token generation.

    Unlike /health this is expensive on first call (download + load), which is
    exactly the point — once it returns, real requests will be fast.
    """
    llm: Llama = _get_llm_and_path()["llm"]
    probe = [{"role": "user", "content": "OK"}]
    with LOCK:
        llm.create_chat_completion(
            messages=probe,
            max_tokens=1,
            temperature=0.0,
            stream=False,
        )
    return {"status": "ok", "loaded": True}
@api.get("/v1/models")
def v1_models() -> Dict[str, Any]:
    """OpenAI-compatible model listing; this server serves exactly one model."""
    entry = {"id": f"{MODEL_REPO_ID}/{MODEL_FILE}", "object": "model"}
    return {"object": "list", "data": [entry]}
def _filter_chat_kwargs(payload: Dict[str, Any]) -> Dict[str, Any]:
out: Dict[str, Any] = {}
for k in [
"max_tokens",
"temperature",
"top_p",
"top_k",
"min_p",
"typical_p",
"stop",
"seed",
"presence_penalty",
"frequency_penalty",
"repeat_penalty",
]:
if k in payload:
out[k] = payload[k]
return out
@api.post("/v1/chat/completions")
async def chat_completions(req: Request):
    """OpenAI-compatible chat completions endpoint (streaming + non-streaming).

    Request body follows the OpenAI schema; ``messages`` is required,
    ``stream`` selects SSE streaming, and sampling params are filtered
    through ``_filter_chat_kwargs``. Returns a JSON completion dict, a 400
    ``JSONResponse`` for a missing/empty ``messages``, or a
    ``StreamingResponse`` of SSE frames terminated by ``data: [DONE]``.
    """
    payload = await req.json()
    messages = payload.get("messages") or []
    stream = bool(payload.get("stream") or False)
    if not isinstance(messages, list) or not messages:
        return JSONResponse(
            status_code=400,
            content={"error": {"message": "messages must be a non-empty list"}},
        )
    llm: Llama = _get_llm_and_path()["llm"]
    created = _now()
    resp_id = _openai_id("chatcmpl")
    model_name = f"{MODEL_REPO_ID}/{MODEL_FILE}"
    kwargs = _filter_chat_kwargs(payload)

    if not stream:
        with LOCK:
            out = llm.create_chat_completion(
                messages=messages, stream=False, model=model_name, **kwargs
            )
        out["id"] = resp_id
        out["created"] = created
        out["model"] = out.get("model") or model_name
        return out

    def gen() -> Iterable[str]:
        # create_chat_completion(stream=True) returns a *lazy* generator:
        # tokens are decoded during iteration, not when the generator is
        # created. The lock must therefore stay held for the entire
        # iteration — locking only the creation would let concurrent
        # requests interleave decode calls on the single, non-thread-safe
        # Llama instance.
        with LOCK:
            it = llm.create_chat_completion(
                messages=messages, stream=True, model=model_name, **kwargs
            )
            for chunk in it:
                chunk["id"] = resp_id
                chunk["created"] = created
                chunk["model"] = chunk.get("model") or model_name
                yield _sse(chunk)
        yield _sse_done()

    return StreamingResponse(gen(), media_type="text/event-stream")
def _ui_chat(
    message: str,
    history: List,
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
) -> str:
    """Gradio ChatInterface callback: rebuild the conversation and generate.

    Accepts both history formats Gradio may pass: a list of
    ``{"role": ..., "content": ...}`` dicts, or legacy
    ``(user, assistant)`` 2-tuples. Returns the assistant's reply text
    (empty string when the model yields nothing).
    """
    msgs: List[Dict[str, Any]] = [{"role": "system", "content": system_message}]
    for item in history or []:
        if isinstance(item, dict) and "role" in item:
            # Already in messages format — pass through unchanged.
            msgs.append(item)
        elif isinstance(item, (list, tuple)) and len(item) == 2:
            # Legacy tuple format; skip empty turns.
            user_turn, bot_turn = item
            if user_turn:
                msgs.append({"role": "user", "content": user_turn})
            if bot_turn:
                msgs.append({"role": "assistant", "content": bot_turn})
    msgs.append({"role": "user", "content": message})

    llm: Llama = _get_llm_and_path()["llm"]
    with LOCK:
        out = llm.create_chat_completion(
            messages=msgs,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=False,
        )
    choices = out.get("choices") or [{}]
    reply = choices[0].get("message") or {}
    return reply.get("content") or ""
# Markdown rendered above the Gradio chat box; also advertises the REST API.
DESCRIPTION = """
### LFM2 350M (Q4_K_M, CPU)
Liquid Foundation Model 2 - 350M parameters. Edge-ready multilingual generation.
**OpenAI-compatible API:**
- `POST /v1/chat/completions` - Chat completions (supports streaming)
- `GET /v1/models` - List models
- `GET /health` - Health check
"""
# Gradio chat UI wired to _ui_chat; the extra inputs map positionally to
# _ui_chat's (system_message, max_tokens, temperature, top_p) parameters.
demo = gr.ChatInterface(
    fn=_ui_chat,
    title="LFM2 350M",
    description=DESCRIPTION,
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful assistant.",
            label="System message",
            lines=2,
        ),
        gr.Slider(minimum=64, maximum=1024, value=256, step=64, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
    examples=[
        ["Hello! How are you?"],
        ["What is the capital of France?"],
        ["Write a Python function to add two numbers."],
    ],
)
# Mount the Gradio UI at "/" on top of the FastAPI app so the REST endpoints
# (/health, /ready, /v1/*) and the web UI share a single server process.
app = gr.mount_gradio_app(api, demo, path="/")
if __name__ == "__main__":
    # Local/dev entry point; HF Spaces expects the app on port 7860.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)