chmielvu committed
Commit 84010f0 · verified · 1 Parent(s): 0375f6c

Upload folder using huggingface_hub

Files changed (4)
  1. Dockerfile +20 -0
  2. README.md +35 -10
  3. app.py +236 -0
  4. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,20 @@
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     libopenblas-dev \
+     libgomp1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ RUN pip install --no-cache-dir \
+     https://huggingface.co/Luigi/llama-cpp-python-wheels-hf-spaces-free-cpu/resolve/main/llama_cpp_python-0.3.22-cp310-cp310-linux_x86_64.whl
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY app.py .
+
+ EXPOSE 7860
+
+ CMD ["python", "app.py"]
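Note that the image installs the prebuilt llama-cpp-python wheel (v0.3.22) in its own step, and requirements.txt does not list llama-cpp-python, so that wheel is the only source of the package. A minimal sanity check one could run inside the built container is sketched below; it assumes the `__version__` attribute exposed by recent llama-cpp-python releases and is not part of this commit.

```python
# Hypothetical smoke test, e.g. `docker run --rm <image> python -c "..."`.
# Confirms the pinned prebuilt wheel is the installed llama-cpp-python build.
import llama_cpp

# The Dockerfile pins llama_cpp_python-0.3.22; a different version here would
# mean pip resolved another build later in the install sequence.
print("llama-cpp-python version:", llama_cpp.__version__)
```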
README.md CHANGED
@@ -1,10 +1,35 @@
- ---
- title: Lfm2 350m
- emoji: 😻
- colorFrom: blue
- colorTo: pink
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: LFM2 350M
+ emoji: 💧
+ colorFrom: green
+ colorTo: green
+ sdk: docker
+ pinned: false
+ license: other
+ preload_from_hub:
+   - LiquidAI/LFM2-350M-GGUF
+ ---
+
+ # LFM2 350M (Q4_K_M)
+
+ Liquid Foundation Model 2 - 350M parameters. Edge-ready multilingual generation.
+
+ ## API Endpoints
+
+ - `POST /v1/chat/completions` - OpenAI-compatible chat completions (supports streaming)
+ - `GET /v1/models` - List available models
+ - `GET /health` - Health check
+
+ ## Usage
+
+ ```bash
+ curl -X POST "https://YOUR_SPACE.hf.space/v1/chat/completions" \
+   -H "Content-Type: application/json" \
+   -d '{"messages": [{"role": "user", "content": "Hello!"}]}'
+ ```
+
+ ## Tech
+
+ - llama.cpp via JamePeng fork (Luigi wheel v0.3.22)
+ - chat_format: chatml
+ - Model: LFM2-350M-GGUF (Q4_K_M)
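Since the endpoint is OpenAI-compatible, the README's curl call can also be issued from Python. The sketch below uses the `requests` library on the client side (not part of this Space's requirements.txt) and keeps the `YOUR_SPACE` placeholder from the README:

```python
# Python equivalent of the README curl example (non-streaming).
import requests

url = "https://YOUR_SPACE.hf.space/v1/chat/completions"
payload = {
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 128,    # optional sampling parameters forwarded by the server
    "temperature": 0.7,
}

resp = requests.post(url, json=payload, timeout=120)
resp.raise_for_status()
data = resp.json()

# OpenAI-style response shape: the reply text is in choices[0].message.content.
print(data["choices"][0]["message"]["content"])
```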
app.py ADDED
@@ -0,0 +1,236 @@
+ import json
+ import os
+ import threading
+ import time
+ import uuid
+ from functools import lru_cache
+ from typing import Any, Dict, Iterable, List, Optional
+
+ import gradio as gr
+ from fastapi import FastAPI, Request
+ from fastapi.responses import JSONResponse, StreamingResponse
+ from huggingface_hub import hf_hub_download
+ from llama_cpp import Llama
+
+ MODEL_REPO_ID = os.environ.get("MODEL_REPO_ID", "LiquidAI/LFM2-350M-GGUF")
+ MODEL_FILE = os.environ.get("MODEL_FILE", "LFM2-350M-Q4_K_M.gguf")
+
+ N_CTX = int(os.environ.get("N_CTX", "4096"))
+ N_THREADS = int(os.environ.get("N_THREADS", "2"))
+ N_BATCH = int(os.environ.get("N_BATCH", "512"))
+ CHAT_FORMAT = os.environ.get("CHAT_FORMAT", "chatml")
+ USE_MMAP = os.environ.get("USE_MMAP", "1") == "1"
+
+ LOCK = threading.Lock()
+ api = FastAPI()
+
+
+ def _now() -> int:
+     return int(time.time())
+
+
+ def _openai_id(prefix: str) -> str:
+     return f"{prefix}-{uuid.uuid4().hex[:24]}"
+
+
+ def _sse(obj: Any) -> str:
+     return f"data: {json.dumps(obj, ensure_ascii=True)}\n\n"
+
+
+ def _sse_done() -> str:
+     return "data: [DONE]\n\n"
+
+
+ @lru_cache(maxsize=1)
+ def _get_llm_and_path() -> Dict[str, Any]:
+     model_path = hf_hub_download(
+         repo_id=MODEL_REPO_ID, filename=MODEL_FILE, repo_type="model"
+     )
+
+     init_kwargs: Dict[str, Any] = {
+         "model_path": model_path,
+         "n_ctx": N_CTX,
+         "n_threads": N_THREADS,
+         "n_batch": N_BATCH,
+         "n_gpu_layers": 0,
+         "verbose": False,
+         "use_mmap": USE_MMAP,
+         "chat_format": CHAT_FORMAT,
+     }
+
+     llm = Llama(**init_kwargs)
+     return {"llm": llm, "model_path": model_path}
+
+
+ @api.get("/health")
+ def health() -> Dict[str, Any]:
+     loaded = _get_llm_and_path.cache_info().currsize > 0
+     return {
+         "status": "ok",
+         "backend": "llama.cpp",
+         "loaded": loaded,
+         "model_repo_id": MODEL_REPO_ID,
+         "model_file": MODEL_FILE,
+         "chat_format": CHAT_FORMAT,
+         "n_ctx": N_CTX,
+         "n_threads": N_THREADS,
+     }
+
+
+ @api.get("/ready")
+ def ready() -> Dict[str, Any]:
+     m = _get_llm_and_path()
+     llm: Llama = m["llm"]
+     with LOCK:
+         llm.create_chat_completion(
+             messages=[{"role": "user", "content": "OK"}],
+             max_tokens=1,
+             temperature=0.0,
+             stream=False,
+         )
+     return {"status": "ok", "loaded": True}
+
+
+ @api.get("/v1/models")
+ def v1_models() -> Dict[str, Any]:
+     model_name = f"{MODEL_REPO_ID}/{MODEL_FILE}"
+     return {"object": "list", "data": [{"id": model_name, "object": "model"}]}
+
+
+ def _filter_chat_kwargs(payload: Dict[str, Any]) -> Dict[str, Any]:
+     out: Dict[str, Any] = {}
+     for k in [
+         "max_tokens",
+         "temperature",
+         "top_p",
+         "top_k",
+         "min_p",
+         "typical_p",
+         "stop",
+         "seed",
+         "presence_penalty",
+         "frequency_penalty",
+         "repeat_penalty",
+     ]:
+         if k in payload:
+             out[k] = payload[k]
+     return out
+
+
+ @api.post("/v1/chat/completions")
+ async def chat_completions(req: Request):
+     payload = await req.json()
+     messages = payload.get("messages") or []
+     stream = bool(payload.get("stream") or False)
+
+     if not isinstance(messages, list) or not messages:
+         return JSONResponse(
+             status_code=400,
+             content={"error": {"message": "messages must be a non-empty list"}},
+         )
+
+     m = _get_llm_and_path()
+     llm: Llama = m["llm"]
+     created = _now()
+     resp_id = _openai_id("chatcmpl")
+     model_name = f"{MODEL_REPO_ID}/{MODEL_FILE}"
+     kwargs = _filter_chat_kwargs(payload)
+
+     if not stream:
+         with LOCK:
+             out = llm.create_chat_completion(
+                 messages=messages, stream=False, model=model_name, **kwargs
+             )
+         out["id"] = resp_id
+         out["created"] = created
+         out["model"] = out.get("model") or model_name
+         return out
+
+     def gen() -> Iterable[str]:
+         with LOCK:
+             it = llm.create_chat_completion(
+                 messages=messages, stream=True, model=model_name, **kwargs
+             )
+             for chunk in it:
+                 chunk["id"] = resp_id
+                 chunk["created"] = created
+                 chunk["model"] = chunk.get("model") or model_name
+                 yield _sse(chunk)
+         yield _sse_done()
+
+     return StreamingResponse(gen(), media_type="text/event-stream")
+
+
+ def _ui_chat(
+     message: str,
+     history: List,
+     system_message: str,
+     max_tokens: int,
+     temperature: float,
+     top_p: float,
+ ) -> str:
+     msgs: List[Dict[str, Any]] = [{"role": "system", "content": system_message}]
+
+     for h in history or []:
+         if isinstance(h, dict) and "role" in h:
+             msgs.append(h)
+         elif isinstance(h, (list, tuple)) and len(h) == 2:
+             if h[0]:
+                 msgs.append({"role": "user", "content": h[0]})
+             if h[1]:
+                 msgs.append({"role": "assistant", "content": h[1]})
+
+     msgs.append({"role": "user", "content": message})
+
+     m = _get_llm_and_path()
+     llm: Llama = m["llm"]
+     with LOCK:
+         out = llm.create_chat_completion(
+             messages=msgs,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             stream=False,
+         )
+     return (((out.get("choices") or [{}])[0].get("message") or {}).get("content")) or ""
+
+
+ DESCRIPTION = """
+ ### LFM2 350M (Q4_K_M, CPU)
+
+ Liquid Foundation Model 2 - 350M parameters. Edge-ready multilingual generation.
+
+ **OpenAI-compatible API:**
+ - `POST /v1/chat/completions` - Chat completions (supports streaming)
+ - `GET /v1/models` - List models
+ - `GET /health` - Health check
+ """
+
+ demo = gr.ChatInterface(
+     fn=_ui_chat,
+     title="LFM2 350M",
+     description=DESCRIPTION,
+     additional_inputs=[
+         gr.Textbox(
+             value="You are a helpful assistant.",
+             label="System message",
+             lines=2,
+         ),
+         gr.Slider(minimum=64, maximum=1024, value=256, step=64, label="Max tokens"),
+         gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
+         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
+     ],
+     examples=[
+         ["Hello! How are you?"],
+         ["What is the capital of France?"],
+         ["Write a Python function to add two numbers."],
+     ],
+ )
+
+ app = gr.mount_gradio_app(api, demo, path="/")
+
+
+ if __name__ == "__main__":
+     import uvicorn
+
+     uvicorn.run(app, host="0.0.0.0", port=7860)
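When `stream: true` is set, the endpoint returns server-sent events in the OpenAI chunk format (`data: {...}` lines terminated by `data: [DONE]`, as produced by `_sse` / `_sse_done` above). A client-side consumption sketch, again using `requests` and the `YOUR_SPACE` placeholder:

```python
# Streaming client sketch for the SSE output of /v1/chat/completions.
import json
import requests

url = "https://YOUR_SPACE.hf.space/v1/chat/completions"
payload = {"messages": [{"role": "user", "content": "Hello!"}], "stream": True}

with requests.post(url, json=payload, stream=True, timeout=300) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        # Each OpenAI-style chunk carries the next fragment in choices[0].delta.
        delta = chunk["choices"][0].get("delta", {})
        print(delta.get("content", ""), end="", flush=True)
print()
```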
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio>=4.0.0
+ fastapi>=0.115.0
+ uvicorn[standard]>=0.30.0
+ huggingface_hub>=0.26.0
+ numpy