chmielvu committed
Commit 84010f0 · verified · 1 Parent(s): 0375f6c

Upload folder using huggingface_hub

Files changed (4)
  1. Dockerfile +20 -0
  2. README.md +35 -10
  3. app.py +236 -0
  4. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,20 @@
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     libopenblas-dev \
+     libgomp1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ RUN pip install --no-cache-dir \
+     https://huggingface.co/Luigi/llama-cpp-python-wheels-hf-spaces-free-cpu/resolve/main/llama_cpp_python-0.3.22-cp310-cp310-linux_x86_64.whl
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY app.py .
+
+ EXPOSE 7860
+
+ CMD ["python", "app.py"]
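Note that the image installs the prebuilt llama-cpp-python wheel (v0.3.22) in its own step, and requirements.txt does not list llama-cpp-python, so that wheel is the only source of the package. A minimal sanity check one could run inside the built container is sketched below; it assumes the `__version__` attribute exposed by recent llama-cpp-python releases and is not part of this commit.

```python
# Hypothetical smoke test, e.g. `docker run --rm <image> python -c "..."`.
# Confirms the pinned prebuilt wheel is the installed llama-cpp-python build.
import llama_cpp

# The Dockerfile pins llama_cpp_python-0.3.22; a different version here would
# mean pip resolved another build later in the install sequence.
print("llama-cpp-python version:", llama_cpp.__version__)
```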
README.md CHANGED
@@ -1,10 +1,35 @@
- ---
- title: Lfm2 350m
- emoji: 😻
- colorFrom: blue
- colorTo: pink
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: LFM2 350M
+ emoji: 💧
+ colorFrom: green
+ colorTo: green
+ sdk: docker
+ pinned: false
+ license: other
+ preload_from_hub:
+   - LiquidAI/LFM2-350M-GGUF
+ ---
+
+ # LFM2 350M (Q4_K_M)
+
+ Liquid Foundation Model 2 - 350M parameters. Edge-ready multilingual generation.
+
+ ## API Endpoints
+
+ - `POST /v1/chat/completions` - OpenAI-compatible chat completions (supports streaming)
+ - `GET /v1/models` - List available models
+ - `GET /health` - Health check
+
+ ## Usage
+
+ ```bash
+ curl -X POST "https://YOUR_SPACE.hf.space/v1/chat/completions" \
+   -H "Content-Type: application/json" \
+   -d '{"messages": [{"role": "user", "content": "Hello!"}]}'
+ ```
+
+ ## Tech
+
+ - llama.cpp via JamePeng fork (Luigi wheel v0.3.22)
+ - chat_format: chatml
+ - Model: LFM2-350M-GGUF (Q4_K_M)
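Since the endpoint is OpenAI-compatible, the README's curl call can also be issued from Python. The sketch below uses the `requests` library on the client side (not part of this Space's requirements.txt) and keeps the `YOUR_SPACE` placeholder from the README:

```python
# Python equivalent of the README curl example (non-streaming).
import requests

url = "https://YOUR_SPACE.hf.space/v1/chat/completions"
payload = {
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 128,    # optional sampling parameters forwarded by the server
    "temperature": 0.7,
}

resp = requests.post(url, json=payload, timeout=120)
resp.raise_for_status()
data = resp.json()

# OpenAI-style response shape: the reply text is in choices[0].message.content.
print(data["choices"][0]["message"]["content"])
```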
app.py ADDED
@@ -0,0 +1,236 @@
+ import json
+ import os
+ import threading
+ import time
+ import uuid
+ from functools import lru_cache
+ from typing import Any, Dict, Iterable, List, Optional
+
+ import gradio as gr
+ from fastapi import FastAPI, Request
+ from fastapi.responses import JSONResponse, StreamingResponse
+ from huggingface_hub import hf_hub_download
+ from llama_cpp import Llama
+
+ MODEL_REPO_ID = os.environ.get("MODEL_REPO_ID", "LiquidAI/LFM2-350M-GGUF")
+ MODEL_FILE = os.environ.get("MODEL_FILE", "LFM2-350M-Q4_K_M.gguf")
+
+ N_CTX = int(os.environ.get("N_CTX", "4096"))
+ N_THREADS = int(os.environ.get("N_THREADS", "2"))
+ N_BATCH = int(os.environ.get("N_BATCH", "512"))
+ CHAT_FORMAT = os.environ.get("CHAT_FORMAT", "chatml")
+ USE_MMAP = os.environ.get("USE_MMAP", "1") == "1"
+
+ LOCK = threading.Lock()
+ api = FastAPI()
+
+
+ def _now() -> int:
+     return int(time.time())
+
+
+ def _openai_id(prefix: str) -> str:
+     return f"{prefix}-{uuid.uuid4().hex[:24]}"
+
+
+ def _sse(obj: Any) -> str:
+     return f"data: {json.dumps(obj, ensure_ascii=True)}\n\n"
+
+
+ def _sse_done() -> str:
+     return "data: [DONE]\n\n"
+
+
+ @lru_cache(maxsize=1)
+ def _get_llm_and_path() -> Dict[str, Any]:
+     model_path = hf_hub_download(
+         repo_id=MODEL_REPO_ID, filename=MODEL_FILE, repo_type="model"
+     )
+
+     init_kwargs: Dict[str, Any] = {
+         "model_path": model_path,
+         "n_ctx": N_CTX,
+         "n_threads": N_THREADS,
+         "n_batch": N_BATCH,
+         "n_gpu_layers": 0,
+         "verbose": False,
+         "use_mmap": USE_MMAP,
+         "chat_format": CHAT_FORMAT,
+     }
+
+     llm = Llama(**init_kwargs)
+     return {"llm": llm, "model_path": model_path}
+
+
+ @api.get("/health")
+ def health() -> Dict[str, Any]:
+     loaded = _get_llm_and_path.cache_info().currsize > 0
+     return {
+         "status": "ok",
+         "backend": "llama.cpp",
+         "loaded": loaded,
+         "model_repo_id": MODEL_REPO_ID,
+         "model_file": MODEL_FILE,
+         "chat_format": CHAT_FORMAT,
+         "n_ctx": N_CTX,
+         "n_threads": N_THREADS,
+     }
+
+
+ @api.get("/ready")
+ def ready() -> Dict[str, Any]:
+     m = _get_llm_and_path()
+     llm: Llama = m["llm"]
+     with LOCK:
+         llm.create_chat_completion(
+             messages=[{"role": "user", "content": "OK"}],
+             max_tokens=1,
+             temperature=0.0,
+             stream=False,
+         )
+     return {"status": "ok", "loaded": True}
+
+
+ @api.get("/v1/models")
+ def v1_models() -> Dict[str, Any]:
+     model_name = f"{MODEL_REPO_ID}/{MODEL_FILE}"
+     return {"object": "list", "data": [{"id": model_name, "object": "model"}]}
+
+
+ def _filter_chat_kwargs(payload: Dict[str, Any]) -> Dict[str, Any]:
+     out: Dict[str, Any] = {}
+     for k in [
+         "max_tokens",
+         "temperature",
+         "top_p",
+         "top_k",
+         "min_p",
+         "typical_p",
+         "stop",
+         "seed",
+         "presence_penalty",
+         "frequency_penalty",
+         "repeat_penalty",
+     ]:
+         if k in payload:
+             out[k] = payload[k]
+     return out
+
+
+ @api.post("/v1/chat/completions")
+ async def chat_completions(req: Request):
+     payload = await req.json()
+     messages = payload.get("messages") or []
+     stream = bool(payload.get("stream") or False)
+
+     if not isinstance(messages, list) or not messages:
+         return JSONResponse(
+             status_code=400,
+             content={"error": {"message": "messages must be a non-empty list"}},
+         )
+
+     m = _get_llm_and_path()
+     llm: Llama = m["llm"]
+     created = _now()
+     resp_id = _openai_id("chatcmpl")
+     model_name = f"{MODEL_REPO_ID}/{MODEL_FILE}"
+     kwargs = _filter_chat_kwargs(payload)
+
+     if not stream:
+         with LOCK:
+             out = llm.create_chat_completion(
+                 messages=messages, stream=False, model=model_name, **kwargs
+             )
+         out["id"] = resp_id
+         out["created"] = created
+         out["model"] = out.get("model") or model_name
+         return out
+
+     def gen() -> Iterable[str]:
+         with LOCK:
+             it = llm.create_chat_completion(
+                 messages=messages, stream=True, model=model_name, **kwargs
+             )
+             for chunk in it:
+                 chunk["id"] = resp_id
+                 chunk["created"] = created
+                 chunk["model"] = chunk.get("model") or model_name
+                 yield _sse(chunk)
+         yield _sse_done()
+
+     return StreamingResponse(gen(), media_type="text/event-stream")
+
+
+ def _ui_chat(
+     message: str,
+     history: List,
+     system_message: str,
+     max_tokens: int,
+     temperature: float,
+     top_p: float,
+ ) -> str:
+     msgs: List[Dict[str, Any]] = [{"role": "system", "content": system_message}]
+
+     for h in history or []:
+         if isinstance(h, dict) and "role" in h:
+             msgs.append(h)
+         elif isinstance(h, (list, tuple)) and len(h) == 2:
+             if h[0]:
+                 msgs.append({"role": "user", "content": h[0]})
+             if h[1]:
+                 msgs.append({"role": "assistant", "content": h[1]})
+
+     msgs.append({"role": "user", "content": message})
+
+     m = _get_llm_and_path()
+     llm: Llama = m["llm"]
+     with LOCK:
+         out = llm.create_chat_completion(
+             messages=msgs,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             stream=False,
+         )
+     return (((out.get("choices") or [{}])[0].get("message") or {}).get("content")) or ""
+
+
+ DESCRIPTION = """
+ ### LFM2 350M (Q4_K_M, CPU)
+
+ Liquid Foundation Model 2 - 350M parameters. Edge-ready multilingual generation.
+
+ **OpenAI-compatible API:**
+ - `POST /v1/chat/completions` - Chat completions (supports streaming)
+ - `GET /v1/models` - List models
+ - `GET /health` - Health check
+ """
+
+ demo = gr.ChatInterface(
+     fn=_ui_chat,
+     title="LFM2 350M",
+     description=DESCRIPTION,
+     additional_inputs=[
+         gr.Textbox(
+             value="You are a helpful assistant.",
+             label="System message",
+             lines=2,
+         ),
+         gr.Slider(minimum=64, maximum=1024, value=256, step=64, label="Max tokens"),
+         gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
+         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
+     ],
+     examples=[
+         ["Hello! How are you?"],
+         ["What is the capital of France?"],
+         ["Write a Python function to add two numbers."],
+     ],
+ )
+
+ app = gr.mount_gradio_app(api, demo, path="/")
+
+
+ if __name__ == "__main__":
+     import uvicorn
+
+     uvicorn.run(app, host="0.0.0.0", port=7860)
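When `stream: true` is set, the endpoint returns server-sent events in the OpenAI chunk format (`data: {...}` lines terminated by `data: [DONE]`, as produced by `_sse` / `_sse_done` above). A client-side consumption sketch, again using `requests` and the `YOUR_SPACE` placeholder:

```python
# Streaming client sketch for the SSE output of /v1/chat/completions.
import json
import requests

url = "https://YOUR_SPACE.hf.space/v1/chat/completions"
payload = {"messages": [{"role": "user", "content": "Hello!"}], "stream": True}

with requests.post(url, json=payload, stream=True, timeout=300) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        # Each OpenAI-style chunk carries the next fragment in choices[0].delta.
        delta = chunk["choices"][0].get("delta", {})
        print(delta.get("content", ""), end="", flush=True)
print()
```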
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio>=4.0.0
+ fastapi>=0.115.0
+ uvicorn[standard]>=0.30.0
+ huggingface_hub>=0.26.0
+ numpy