# lfm2-350m / app.py
# Uploaded by chmielvu via huggingface_hub (commit 2ff5440, verified).
import json
import os
import threading
import time
import uuid
from functools import lru_cache
from typing import Any, Dict, Iterable, List, Optional
import gradio as gr
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Model selection and llama.cpp runtime knobs; every value is overridable
# through an environment variable so the Space can be re-tuned without edits.
MODEL_REPO_ID = os.environ.get("MODEL_REPO_ID", "LiquidAI/LFM2-350M-GGUF")  # HF repo holding the GGUF
MODEL_FILE = os.environ.get("MODEL_FILE", "LFM2-350M-Q4_K_M.gguf")  # quantized weights filename
N_CTX = int(os.environ.get("N_CTX", "4096"))  # context window, in tokens
N_THREADS = int(os.environ.get("N_THREADS", "2"))  # CPU threads used for inference
N_BATCH = int(os.environ.get("N_BATCH", "512"))  # prompt-evaluation batch size
CHAT_FORMAT = os.environ.get("CHAT_FORMAT", "")  # optional llama.cpp chat-template override ("" = auto)
USE_MMAP = os.environ.get("USE_MMAP", "1") == "1"  # memory-map the model file ("1" enables)
LOCK = threading.Lock()  # serializes all inference on the single, non-thread-safe Llama instance
api = FastAPI()
def _now() -> int:
return int(time.time())
def _openai_id(prefix: str) -> str:
return f"{prefix}-{uuid.uuid4().hex[:24]}"
def _sse(obj: Any) -> str:
return f"data: {json.dumps(obj, ensure_ascii=True)}\n\n"
def _sse_done() -> str:
return "data: [DONE]\n\n"
@lru_cache(maxsize=1)
def _get_llm_and_path() -> Dict[str, Any]:
    """Download the GGUF weights (HF cache) and construct the Llama engine.

    ``lru_cache(maxsize=1)`` turns this into a lazy singleton: the model is
    fetched and loaded on first use, then shared by every later caller.

    Returns:
        Dict with keys ``"llm"`` (the Llama instance) and ``"model_path"``
        (local filesystem path of the downloaded GGUF file).
    """
    path = hf_hub_download(
        repo_id=MODEL_REPO_ID, filename=MODEL_FILE, repo_type="model"
    )
    kwargs: Dict[str, Any] = dict(
        model_path=path,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
        n_batch=N_BATCH,
        n_gpu_layers=0,  # CPU-only deployment
        verbose=False,
        use_mmap=USE_MMAP,
    )
    # Only override the chat template when one was explicitly configured;
    # otherwise llama.cpp auto-detects it from the GGUF metadata.
    if CHAT_FORMAT:
        kwargs["chat_format"] = CHAT_FORMAT
    return {"llm": Llama(**kwargs), "model_path": path}
@api.get("/health")
def health() -> Dict[str, Any]:
    """Liveness probe: report static config plus whether the model is resident.

    Does NOT trigger a model load — it only inspects the lru_cache, so it
    stays cheap even before the first inference.
    """
    cache_state = _get_llm_and_path.cache_info()
    return dict(
        status="ok",
        backend="llama.cpp",
        loaded=cache_state.currsize > 0,
        model_repo_id=MODEL_REPO_ID,
        model_file=MODEL_FILE,
        chat_format=CHAT_FORMAT,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
    )
@api.get("/ready")
def ready() -> Dict[str, Any]:
    """Readiness probe: force the model to load and run a 1-token generation.

    Unlike /health this is expensive on first call (download + load), which is
    exactly the point — once it returns, real requests will be fast.
    """
    llm: Llama = _get_llm_and_path()["llm"]
    probe = [{"role": "user", "content": "OK"}]
    with LOCK:
        llm.create_chat_completion(
            messages=probe,
            max_tokens=1,
            temperature=0.0,
            stream=False,
        )
    return {"status": "ok", "loaded": True}
@api.get("/v1/models")
def v1_models() -> Dict[str, Any]:
    """OpenAI-compatible model listing; this server serves exactly one model."""
    entry = {"id": f"{MODEL_REPO_ID}/{MODEL_FILE}", "object": "model"}
    return {"object": "list", "data": [entry]}
def _filter_chat_kwargs(payload: Dict[str, Any]) -> Dict[str, Any]:
out: Dict[str, Any] = {}
for k in [
"max_tokens",
"temperature",
"top_p",
"top_k",
"min_p",
"typical_p",
"stop",
"seed",
"presence_penalty",
"frequency_penalty",
"repeat_penalty",
]:
if k in payload:
out[k] = payload[k]
return out
@api.post("/v1/chat/completions")
async def chat_completions(req: Request):
    """OpenAI-compatible chat completions endpoint (streaming + non-streaming).

    Request body follows the OpenAI schema; ``messages`` is required,
    ``stream`` selects SSE streaming, and sampling params are filtered
    through ``_filter_chat_kwargs``. Returns a JSON completion dict, a 400
    ``JSONResponse`` for a missing/empty ``messages``, or a
    ``StreamingResponse`` of SSE frames terminated by ``data: [DONE]``.
    """
    payload = await req.json()
    messages = payload.get("messages") or []
    stream = bool(payload.get("stream") or False)
    if not isinstance(messages, list) or not messages:
        return JSONResponse(
            status_code=400,
            content={"error": {"message": "messages must be a non-empty list"}},
        )
    llm: Llama = _get_llm_and_path()["llm"]
    created = _now()
    resp_id = _openai_id("chatcmpl")
    model_name = f"{MODEL_REPO_ID}/{MODEL_FILE}"
    kwargs = _filter_chat_kwargs(payload)

    if not stream:
        with LOCK:
            out = llm.create_chat_completion(
                messages=messages, stream=False, model=model_name, **kwargs
            )
        out["id"] = resp_id
        out["created"] = created
        out["model"] = out.get("model") or model_name
        return out

    def gen() -> Iterable[str]:
        # create_chat_completion(stream=True) returns a *lazy* generator:
        # tokens are decoded during iteration, not when the generator is
        # created. The lock must therefore stay held for the entire
        # iteration — locking only the creation would let concurrent
        # requests interleave decode calls on the single, non-thread-safe
        # Llama instance.
        with LOCK:
            it = llm.create_chat_completion(
                messages=messages, stream=True, model=model_name, **kwargs
            )
            for chunk in it:
                chunk["id"] = resp_id
                chunk["created"] = created
                chunk["model"] = chunk.get("model") or model_name
                yield _sse(chunk)
        yield _sse_done()

    return StreamingResponse(gen(), media_type="text/event-stream")
def _ui_chat(
    message: str,
    history: List,
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
) -> str:
    """Gradio ChatInterface callback: rebuild the conversation and generate.

    Accepts both history formats Gradio may pass: a list of
    ``{"role": ..., "content": ...}`` dicts, or legacy
    ``(user, assistant)`` 2-tuples. Returns the assistant's reply text
    (empty string when the model yields nothing).
    """
    msgs: List[Dict[str, Any]] = [{"role": "system", "content": system_message}]
    for item in history or []:
        if isinstance(item, dict) and "role" in item:
            # Already in messages format — pass through unchanged.
            msgs.append(item)
        elif isinstance(item, (list, tuple)) and len(item) == 2:
            # Legacy tuple format; skip empty turns.
            user_turn, bot_turn = item
            if user_turn:
                msgs.append({"role": "user", "content": user_turn})
            if bot_turn:
                msgs.append({"role": "assistant", "content": bot_turn})
    msgs.append({"role": "user", "content": message})

    llm: Llama = _get_llm_and_path()["llm"]
    with LOCK:
        out = llm.create_chat_completion(
            messages=msgs,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=False,
        )
    choices = out.get("choices") or [{}]
    reply = choices[0].get("message") or {}
    return reply.get("content") or ""
# Markdown rendered above the Gradio chat box; also advertises the REST API.
DESCRIPTION = """
### LFM2 350M (Q4_K_M, CPU)
Liquid Foundation Model 2 - 350M parameters. Edge-ready multilingual generation.
**OpenAI-compatible API:**
- `POST /v1/chat/completions` - Chat completions (supports streaming)
- `GET /v1/models` - List models
- `GET /health` - Health check
"""
# Gradio chat UI wired to _ui_chat; the extra inputs map positionally to
# _ui_chat's (system_message, max_tokens, temperature, top_p) parameters.
demo = gr.ChatInterface(
    fn=_ui_chat,
    title="LFM2 350M",
    description=DESCRIPTION,
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful assistant.",
            label="System message",
            lines=2,
        ),
        gr.Slider(minimum=64, maximum=1024, value=256, step=64, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
    examples=[
        ["Hello! How are you?"],
        ["What is the capital of France?"],
        ["Write a Python function to add two numbers."],
    ],
)
# Mount the Gradio UI at "/" on top of the FastAPI app so the REST endpoints
# (/health, /ready, /v1/*) and the web UI share a single server process.
app = gr.mount_gradio_app(api, demo, path="/")
if __name__ == "__main__":
    # Local/dev entry point; HF Spaces expects the app on port 7860.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)