Rajhuggingface4253 committed on
Commit 0f735d4 · verified · 1 Parent(s): c03878f

Upload 4 files

Files changed (4)
  1. Dockerfile +43 -0
  2. app.py +850 -0
  3. config.py +54 -0
  4. requirements.txt +25 -0
Dockerfile ADDED
@@ -0,0 +1,43 @@
# LFM2.5 FastAPI Backend - Dockerfile
# Optimized for HuggingFace Spaces

FROM python:3.11-slim

# Install minimal dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user
RUN useradd -m -u 1000 appuser

# Set working directory
WORKDIR /app

# Copy requirements first for caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY --chown=appuser:appuser app.py config.py ./

# Switch to non-root user
USER appuser

# Environment variables for HuggingFace Spaces
ENV PYTHONUNBUFFERED=1 \
    LFM_HOST=0.0.0.0 \
    LFM_PORT=7860

# Expose HuggingFace Spaces port
EXPOSE 7860

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=300s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# Run
CMD ["python", "app.py"]
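The HEALTHCHECK above probes `/health`, which app.py only answers with 200 once the ONNX model has finished downloading and loading (hence the generous 300s start period). A minimal sketch of the same readiness check run from the host, assuming the image is started locally with the port published (e.g. `docker run -p 7860:7860 <image>`); the script name, timeout, and interval are illustrative:

```python
# wait_healthy.py - poll the same /health endpoint the Dockerfile HEALTHCHECK uses.
# Assumes the container is running locally with port 7860 published.
import time
import urllib.error
import urllib.request

URL = "http://localhost:7860/health"


def wait_until_healthy(timeout: float = 360.0, interval: float = 5.0) -> bool:
    """Return True once /health responds with 200, False if the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(URL, timeout=10) as resp:
                if resp.status == 200:
                    return True
        except (urllib.error.URLError, OSError):
            pass  # 503 while the model is still loading, or connection refused before uvicorn is up
        time.sleep(interval)
    return False


if __name__ == "__main__":
    print("healthy" if wait_until_healthy() else "timed out")
```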
app.py ADDED
@@ -0,0 +1,850 @@
"""
LFM2.5 FastAPI Backend - ONNX Runtime Edition
==============================================
Lightweight, CPU-friendly FastAPI backend for LiquidAI LFM2.5-1.2B-Instruct.
Uses the official ONNX model for fast inference without heavy PyTorch dependencies.

Features:
- ONNX Runtime for fast CPU inference (no GPU required)
- Q8 quantization for 95%+ accuracy retention
- Streaming SSE responses
- OpenAI-compatible API
- Optimized for HuggingFace Spaces (2 vCPU, 16GB RAM)
"""

import asyncio
import json
import logging
import time
import uuid
import threading
import queue  # Thread-safe queue for true streaming
from contextlib import asynccontextmanager
from typing import AsyncGenerator, Dict, List, Optional, Union
from pathlib import Path

import numpy as np
import onnxruntime as ort
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from huggingface_hub import hf_hub_download, list_repo_files
from pydantic import BaseModel, Field
from sse_starlette.sse import EventSourceResponse
from transformers import AutoTokenizer, PreTrainedTokenizerFast

from config import settings

# Configure logging
logging.basicConfig(
    level=getattr(logging, settings.log_level.upper()),
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


# ==============================================================================
# Pydantic Models for OpenAI-compatible API
# ==============================================================================

class ChatMessage(BaseModel):
    role: str = Field(..., description="Role: 'system', 'user', or 'assistant'")
    content: str = Field(..., description="Message content")


class ChatCompletionRequest(BaseModel):
    model: str = Field(default="lfm", description="Model identifier")
    messages: List[ChatMessage] = Field(..., description="Conversation messages")
    temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0)
    top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    top_k: Optional[int] = Field(default=None, ge=0)
    max_tokens: Optional[int] = Field(default=None, ge=1)
    stream: bool = Field(default=False, description="Enable streaming response")
    stop: Optional[Union[str, List[str]]] = Field(default=None)


class CompletionRequest(BaseModel):
    model: str = Field(default="lfm", description="Model identifier")
    prompt: str = Field(..., description="Text prompt")
    temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0)
    top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    top_k: Optional[int] = Field(default=None, ge=0)
    max_tokens: Optional[int] = Field(default=None, ge=1)
    stream: bool = Field(default=False, description="Enable streaming response")


class ChatCompletionChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Optional[str] = None


class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[ChatCompletionChoice]
    usage: Dict[str, int]


class CompletionChoice(BaseModel):
    index: int
    text: str
    finish_reason: Optional[str] = None


class CompletionResponse(BaseModel):
    id: str
    object: str = "text_completion"
    created: int
    model: str
    choices: List[CompletionChoice]
    usage: Dict[str, int]


class ModelInfo(BaseModel):
    id: str
    object: str = "model"
    created: int
    owned_by: str = "liquid-ai"


class ModelListResponse(BaseModel):
    object: str = "list"
    data: List[ModelInfo]


# ==============================================================================
# ONNX Model Manager
# ==============================================================================

# ONNX dtype mapping
ONNX_DTYPE = {
    "tensor(float)": np.float32,
    "tensor(float16)": np.float16,
    "tensor(int64)": np.int64
}


class ONNXModelManager:
    """Manages the ONNX model with a KV cache for efficient generation."""

    def __init__(self):
        self._session = None
        self._tokenizer = None
        self._cache_template = None
        self._use_position_ids = False
        self._lock = threading.Lock()

    @property
    def is_loaded(self) -> bool:
        return self._session is not None

    def download_model(self) -> str:
        """Download ONNX model files from HuggingFace."""
        model_id = settings.model_id
        variant = settings.model_variant

        logger.info(f"Downloading model: {model_id} (variant: {variant})")

        # Download main model file
        model_filename = f"onnx/model_{variant}.onnx"
        model_path = hf_hub_download(model_id, model_filename)

        # Download all data files for this variant
        for f in list_repo_files(model_id):
            if f.startswith(f"onnx/model_{variant}.onnx_data"):
                logger.info(f"Downloading: {f}")
                hf_hub_download(model_id, f)

        return model_path

    def load_model(self) -> None:
        """Load the ONNX model and tokenizer."""
        with self._lock:
            if self._session is not None:
                return

            logger.info("=" * 60)
            logger.info("Loading LFM2.5-1.2B-Instruct ONNX model...")
            logger.info(f"Model: {settings.model_id}")
            logger.info(f"Variant: {settings.model_variant} (Q8 = ~95% accuracy)")
            logger.info("=" * 60)

            start_time = time.time()

            # Download model
            model_path = self.download_model()

            # Configure ONNX Runtime for CPU
            sess_options = ort.SessionOptions()
            sess_options.intra_op_num_threads = settings.num_threads
            sess_options.inter_op_num_threads = settings.num_threads
            sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

            # Load ONNX session
            self._session = ort.InferenceSession(
                model_path,
                sess_options=sess_options,
                providers=['CPUExecutionProvider']
            )

            # Load tokenizer with fallback for models with an invalid tokenizer_class
            try:
                self._tokenizer = AutoTokenizer.from_pretrained(
                    settings.model_id,
                    trust_remote_code=True
                )
            except ValueError as e:
                if "TokenizersBackend" in str(e):
                    # LFM models incorrectly specify TokenizersBackend as tokenizer_class.
                    # Fall back to PreTrainedTokenizerFast, which works with the tokenizers backend.
                    logger.warning(
                        "AutoTokenizer failed with TokenizersBackend error. "
                        "Falling back to PreTrainedTokenizerFast..."
                    )
                    self._tokenizer = PreTrainedTokenizerFast.from_pretrained(
                        settings.model_id,
                        trust_remote_code=True
                    )
                else:
                    raise

            # Initialize cache template
            self._init_cache_template()

            # Check whether the model uses position_ids
            input_names = {inp.name for inp in self._session.get_inputs()}
            self._use_position_ids = "position_ids" in input_names

            load_time = time.time() - start_time
            logger.info("=" * 60)
            logger.info(f"✓ Model loaded in {load_time:.2f}s")
            logger.info(f"  Threads: {settings.num_threads}")
            logger.info(f"  Provider: CPU")
            logger.info("=" * 60)

    def _init_cache_template(self) -> None:
        """Initialize the KV cache template from the session's input signatures."""
        self._cache_template = {}
        for inp in self._session.get_inputs():
            if inp.name in {"input_ids", "attention_mask", "position_ids"}:
                continue

            shape = [d if isinstance(d, int) else 1 for d in inp.shape]
            for i, d in enumerate(inp.shape):
                if isinstance(d, str) and "sequence" in d.lower():
                    shape[i] = 0

            dtype = ONNX_DTYPE.get(inp.type, np.float32)
            self._cache_template[inp.name] = (shape, dtype)

    def _create_empty_cache(self) -> Dict[str, np.ndarray]:
        """Create a new empty KV cache."""
        return {
            name: np.zeros(shape, dtype=dtype)
            for name, (shape, dtype) in self._cache_template.items()
        }

    @property
    def session(self):
        if self._session is None:
            raise RuntimeError("Model not loaded")
        return self._session

    @property
    def tokenizer(self):
        if self._tokenizer is None:
            raise RuntimeError("Tokenizer not loaded")
        return self._tokenizer

    def generate(
        self,
        input_ids: np.ndarray,
        max_tokens: int = 512,
        temperature: float = 0.1,
        top_k: int = 50,
        top_p: float = 0.1,
        stop_tokens: Optional[List[int]] = None
    ) -> List[int]:
        """Generate tokens using the ONNX model (non-streaming)."""
        if stop_tokens is None:
            stop_tokens = [self._tokenizer.eos_token_id]

        cache = self._create_empty_cache()
        seq_len = input_ids.shape[1]
        generated_tokens = []

        for step in range(max_tokens):
            if step == 0:
                ids = input_ids
                pos = np.arange(seq_len, dtype=np.int64).reshape(1, -1)
            else:
                ids = np.array([[generated_tokens[-1]]], dtype=np.int64)
                pos = np.array([[seq_len + len(generated_tokens) - 1]], dtype=np.int64)

            attn_mask = np.ones((1, seq_len + len(generated_tokens)), dtype=np.int64)

            feed = {"input_ids": ids, "attention_mask": attn_mask, **cache}
            if self._use_position_ids:
                feed["position_ids"] = pos

            outputs = self._session.run(None, feed)

            # Get logits and apply temperature
            logits = outputs[0][0, -1]

            if temperature > 0:
                logits = logits / temperature

                # Apply top-k
                if top_k > 0:
                    indices_to_remove = np.argsort(logits)[:-top_k]
                    logits[indices_to_remove] = -np.inf

                # Apply top-p (nucleus sampling)
                if top_p < 1.0:
                    sorted_indices = np.argsort(logits)[::-1]
                    sorted_logits = logits[sorted_indices]
                    probs = np.exp(sorted_logits - np.max(sorted_logits))
                    probs = probs / probs.sum()
                    cumulative_probs = np.cumsum(probs)
                    sorted_indices_to_remove = cumulative_probs > top_p
                    sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
                    sorted_indices_to_remove[0] = False
                    indices_to_remove = sorted_indices[sorted_indices_to_remove]
                    logits[indices_to_remove] = -np.inf

                # Sample
                probs = np.exp(logits - np.max(logits))
                probs = probs / probs.sum()
                next_token = int(np.random.choice(len(probs), p=probs))
            else:
                next_token = int(np.argmax(logits))

            generated_tokens.append(next_token)

            # Update cache
            for i, out in enumerate(self._session.get_outputs()[1:], 1):
                name = out.name.replace("present_conv", "past_conv").replace("present.", "past_key_values.")
                if name in cache:
                    cache[name] = outputs[i]

            if next_token in stop_tokens:
                break

        return generated_tokens

    def generate_stream(
        self,
        input_ids: np.ndarray,
        max_tokens: int = 2000,
        temperature: float = 0.1,
        top_k: int = 50,
        top_p: float = 0.1,
        stop_tokens: Optional[List[int]] = None
    ):
        """Optimized streaming generation; yields token ids one at a time."""
        if stop_tokens is None:
            stop_tokens = [self._tokenizer.eos_token_id]

        cache = self._create_empty_cache()
        seq_len = input_ids.shape[1]

        # Pre-allocate the attention mask for the longest possible sequence
        max_possible_len = seq_len + max_tokens
        attn_mask = np.ones((1, max_possible_len), dtype=np.int64)

        # Pre-compute sampling flags
        use_temp = temperature > 0
        use_top_k = top_k > 0
        use_top_p = top_p < 1.0

        # Reuse this dict to avoid rebuilding the feed on every step
        feed = {}

        # Generated token ids so far
        generated_tokens = []

        for step in range(max_tokens):
            current_len = seq_len + step

            # Input preparation
            if step == 0:
                ids = input_ids
                if self._use_position_ids:
                    pos = np.arange(seq_len, dtype=np.int64).reshape(1, -1)
            else:
                # Feed only the last generated token; the KV cache carries the earlier context
                ids = np.array([[generated_tokens[-1]]], dtype=np.int64)
                if self._use_position_ids:
                    pos = np.array([[current_len - 1]], dtype=np.int64)

            # Update the feed dict in place (faster than creating a new dict)
            feed.clear()
            feed["input_ids"] = ids
            feed["attention_mask"] = attn_mask[:, :current_len]
            if self._use_position_ids:
                feed["position_ids"] = pos
            feed.update(cache)  # merge in the KV cache

            # Inference
            outputs = self._session.run(None, feed)
            logits = outputs[0][0, -1]

            # --- Fast sampling ---
            if use_temp:
                logits /= temperature

                # 1. Top-k selection (argpartition is O(N))
                if use_top_k and top_k < len(logits):
                    # Moves the largest k elements to the end; order within them is arbitrary
                    top_k_idx = np.argpartition(logits, -top_k)[-top_k:]
                    # Mask everything else
                    mask = np.ones(logits.shape, dtype=bool)
                    mask[top_k_idx] = False
                    logits[mask] = -np.inf

                # 2. Top-p (nucleus)
                if use_top_p:
                    valid_mask = logits > -np.inf
                    if valid_mask.any():
                        valid_logits = logits[valid_mask]
                        valid_indices = np.where(valid_mask)[0]

                        # Sort only the valid candidates (small N after top-k)
                        sorted_indices = np.argsort(valid_logits)[::-1]
                        sorted_logits = valid_logits[sorted_indices]

                        # Softmax on the valid set
                        exp_logits = np.exp(sorted_logits - np.max(sorted_logits))
                        probs = exp_logits / exp_logits.sum()

                        cumulative = np.cumsum(probs)

                        # Find the nucleus cutoff
                        cutoff = np.searchsorted(cumulative, top_p)
                        # Ensure we keep at least one token
                        cutoff = min(cutoff + 1, len(sorted_logits))

                        # Filter and re-normalize
                        accepted_indices = sorted_indices[:cutoff]
                        accepted_probs = probs[:cutoff]
                        accepted_probs /= accepted_probs.sum()

                        # Fast weighted sample: searchsorted over the CDF
                        # (avoids the Python overhead of np.random.choice)
                        sample_idx = np.searchsorted(np.cumsum(accepted_probs), np.random.rand())
                        next_token = int(valid_indices[accepted_indices[sample_idx]])
                    else:
                        next_token = int(np.argmax(logits))
                else:
                    # Fallback if only top-k was used
                    valid_mask = logits > -np.inf
                    valid_logits = logits[valid_mask]
                    valid_indices = np.where(valid_mask)[0]
                    exp_logits = np.exp(valid_logits - np.max(valid_logits))
                    probs = exp_logits / exp_logits.sum()
                    sample_idx = np.searchsorted(np.cumsum(probs), np.random.rand())
                    next_token = int(valid_indices[sample_idx])
            else:
                next_token = int(np.argmax(logits))

            # Store and emit
            generated_tokens.append(next_token)
            yield next_token

            if next_token in stop_tokens:
                break

            # Update cache
            for i, out in enumerate(self._session.get_outputs()[1:], 1):
                name = out.name.replace("present_conv", "past_conv").replace("present.", "past_key_values.")
                if name in cache:
                    cache[name] = outputs[i]

    def unload(self) -> None:
        """Unload the model from memory."""
        with self._lock:
            if self._session is not None:
                del self._session
                del self._tokenizer
                self._session = None
                self._tokenizer = None
                logger.info("Model unloaded")


# Global model manager
model_manager = ONNXModelManager()


# ==============================================================================
# Application Lifecycle
# ==============================================================================

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan handler."""
    logger.info("Starting LFM2.5 API Server (ONNX Runtime)...")

    loop = asyncio.get_event_loop()
    await loop.run_in_executor(None, model_manager.load_model)

    yield

    logger.info("Shutting down...")
    model_manager.unload()


# ==============================================================================
# FastAPI Application
# ==============================================================================

app = FastAPI(
    title=settings.app_name,
    description="Fast CPU inference for LiquidAI LFM2.5-1.2B-Instruct using ONNX Runtime",
    version=settings.app_version,
    lifespan=lifespan,
    docs_url="/docs",
    redoc_url="/redoc",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=False,  # Must be False when using wildcard origins
    allow_methods=["*"],
    allow_headers=["*"],
    expose_headers=["*"],  # Expose all headers for SSE
)


# Custom middleware to handle a null origin (file:// protocol)
@app.middleware("http")
async def add_cors_for_null_origin(request: Request, call_next):
    """Handle CORS for a null origin (when the HTML page is opened from file://)."""
    origin = request.headers.get("origin", "")
    response = await call_next(request)

    # If the origin is null (file:// protocol), add explicit CORS headers
    if origin == "null" or not origin:
        response.headers["Access-Control-Allow-Origin"] = "*"
        response.headers["Access-Control-Allow-Methods"] = "GET, POST, PUT, DELETE, OPTIONS"
        response.headers["Access-Control-Allow-Headers"] = "*"
        response.headers["Access-Control-Expose-Headers"] = "*"

    return response


# ==============================================================================
# Helper Functions
# ==============================================================================

def generate_id() -> str:
    return f"chatcmpl-{uuid.uuid4().hex[:12]}"


async def stream_chat_completion(request: ChatCompletionRequest) -> AsyncGenerator[dict, None]:
    """
    Low-latency streaming.
    Uses asyncio.Queue + call_soon_threadsafe so each token reaches the client as soon
    as it is generated, without polling or blocking the event loop.
    """
    request_id = generate_id()
    created = int(time.time())

    # Capture the running event loop to bridge the background thread safely
    loop = asyncio.get_running_loop()
    # An asyncio.Queue allows 'await get()', which never blocks the event loop
    async_queue = asyncio.Queue()

    tokenizer = model_manager.tokenizer

    # Prepare inputs
    messages = [{"role": m.role, "content": m.content} for m in request.messages]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    input_ids = np.array([tokenizer.encode(prompt, add_special_tokens=False)], dtype=np.int64)

    # Config
    max_tokens = request.max_tokens or settings.max_tokens
    temperature = request.temperature if request.temperature is not None else settings.temperature
    top_k = request.top_k if request.top_k is not None else settings.top_k
    top_p = request.top_p if request.top_p is not None else settings.top_p

    # Prepare stop tokens
    stop_tokens = [tokenizer.eos_token_id]
    if request.stop:
        if isinstance(request.stop, str):
            encoded = tokenizer.encode(request.stop, add_special_tokens=False)
            if encoded:
                stop_tokens.append(encoded[0])
        elif isinstance(request.stop, list):
            for stop_str in request.stop:
                encoded = tokenizer.encode(stop_str, add_special_tokens=False)
                if encoded:
                    stop_tokens.append(encoded[0])

    def generate_tokens():
        """Background thread: pushes tokens directly into the async loop."""
        try:
            # Use the optimized generate_stream from ONNXModelManager
            for token in model_manager.generate_stream(
                input_ids,
                max_tokens=max_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                stop_tokens=stop_tokens
            ):
                # Schedule the 'put' on the main loop immediately;
                # this wakes the awaiting consumer with no polling delay.
                loop.call_soon_threadsafe(async_queue.put_nowait, ("token", token))
        except Exception as e:
            logger.error(f"Stream generation error: {e}")
            loop.call_soon_threadsafe(async_queue.put_nowait, ("error", str(e)))
        finally:
            loop.call_soon_threadsafe(async_queue.put_nowait, ("done", None))

    # Start generation in a background thread
    threading.Thread(target=generate_tokens, daemon=True).start()

    # Main async loop - no timeouts or sleeps, just await the next queue item
    try:
        while True:
            # Waits until data is pushed; yields control to other requests while waiting
            msg_type, data = await async_queue.get()

            if msg_type == "token":
                text = tokenizer.decode([data], skip_special_tokens=True)
                if text:
                    chunk = {
                        "id": request_id,
                        "object": "chat.completion.chunk",
                        "created": created,
                        "model": request.model,
                        "choices": [{
                            "index": 0,
                            "delta": {"content": text},
                            "finish_reason": None
                        }]
                    }
                    # Yield in the format expected by EventSourceResponse
                    yield {"data": json.dumps(chunk)}

            elif msg_type == "done":
                final = {
                    "id": request_id,
                    "object": "chat.completion.chunk",
                    "created": created,
                    "model": request.model,
                    "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
                }
                yield {"data": json.dumps(final)}
                yield {"data": "[DONE]"}
                break

            elif msg_type == "error":
                logger.error(f"Stream error: {data}")
                yield {"data": json.dumps({"error": {"message": data}})}
                break

    except asyncio.CancelledError:
        logger.info(f"Stream cancelled for request {request_id[:8]}")
        raise
    except Exception as e:
        logger.error(f"Streaming error: {e}")
        yield {"data": json.dumps({"error": {"message": str(e)}})}


# ==============================================================================
# API Endpoints
# ==============================================================================

@app.get("/", response_class=JSONResponse)
async def health_check():
    """Health check with model status."""
    return {
        "status": "ready" if model_manager.is_loaded else "loading",
        "model": {
            "id": settings.model_id,
            "variant": settings.model_variant,
            "loaded": model_manager.is_loaded,
            "backend": "ONNX Runtime"
        },
        "server": {
            "name": settings.app_name,
            "version": settings.app_version,
            "port": settings.port
        }
    }


@app.get("/health")
async def health():
    if not model_manager.is_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded")
    return {"status": "healthy"}


@app.get("/v1/models", response_model=ModelListResponse)
async def list_models():
    return ModelListResponse(
        data=[
            ModelInfo(id="lfm", created=int(time.time())),
            ModelInfo(id="lfm-2.5-1.2b-instruct-onnx", created=int(time.time()))
        ]
    )


@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """OpenAI-compatible chat completion."""
    if not model_manager.is_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded")

    if request.stream:
        return EventSourceResponse(
            stream_chat_completion(request),
            media_type="text/event-stream",
            ping=30,  # 30 second keep-alive (sse-starlette's ping interval is in seconds)
            ping_message_factory=lambda: '{"type": "ping"}'
        )

    try:
        tokenizer = model_manager.tokenizer

        messages = [{"role": m.role, "content": m.content} for m in request.messages]
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        input_ids = np.array([tokenizer.encode(prompt, add_special_tokens=False)], dtype=np.int64)

        max_tokens = request.max_tokens or settings.max_tokens
        temperature = request.temperature if request.temperature is not None else settings.temperature
        top_k = request.top_k if request.top_k is not None else settings.top_k
        top_p = request.top_p if request.top_p is not None else settings.top_p

        start_time = time.time()

        loop = asyncio.get_event_loop()
        tokens = await loop.run_in_executor(
            None,
            lambda: model_manager.generate(
                input_ids,
                max_tokens=max_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p
            )
        )

        response_text = tokenizer.decode(tokens, skip_special_tokens=True)
        gen_time = time.time() - start_time

        logger.debug(f"Generated {len(tokens)} tokens in {gen_time:.2f}s")

        return ChatCompletionResponse(
            id=generate_id(),
            created=int(time.time()),
            model=request.model,
            choices=[
                ChatCompletionChoice(
                    index=0,
                    message=ChatMessage(role="assistant", content=response_text),
                    finish_reason="stop"
                )
            ],
            usage={
                "prompt_tokens": input_ids.shape[1],
                "completion_tokens": len(tokens),
                "total_tokens": input_ids.shape[1] + len(tokens)
            }
        )

    except Exception as e:
        logger.error(f"Chat completion error: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/v1/completions")
async def completions(request: CompletionRequest):
    """OpenAI-compatible text completion."""
    if not model_manager.is_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        tokenizer = model_manager.tokenizer
        input_ids = np.array([tokenizer.encode(request.prompt)], dtype=np.int64)

        max_tokens = request.max_tokens or settings.max_tokens
        temperature = request.temperature if request.temperature is not None else settings.temperature
        top_k = request.top_k if request.top_k is not None else settings.top_k
        top_p = request.top_p if request.top_p is not None else settings.top_p

        loop = asyncio.get_event_loop()
        tokens = await loop.run_in_executor(
            None,
            lambda: model_manager.generate(
                input_ids,
                max_tokens=max_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p
            )
        )

        response_text = tokenizer.decode(tokens, skip_special_tokens=True)

        return CompletionResponse(
            id=generate_id(),
            created=int(time.time()),
            model=request.model,
            choices=[
                CompletionChoice(index=0, text=response_text, finish_reason="stop")
            ],
            usage={
                "prompt_tokens": input_ids.shape[1],
                "completion_tokens": len(tokens),
                "total_tokens": input_ids.shape[1] + len(tokens)
            }
        )

    except Exception as e:
        logger.error(f"Completion error: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    logger.error(f"Unhandled exception: {exc}", exc_info=True)
    return JSONResponse(
        status_code=500,
        content={"error": {"message": "Internal server error", "type": "server_error"}}
    )


# ==============================================================================
# Main Entry Point
# ==============================================================================

if __name__ == "__main__":
    import uvicorn

    print(f"""
╔═══════════════════════════════════════════════════════════════╗
║             LFM2.5 FastAPI Backend (ONNX Runtime)              ║
╠═══════════════════════════════════════════════════════════════╣
║  Model:   LiquidAI/LFM2.5-1.2B-Instruct-ONNX                   ║
║  Variant: Q8 (~95% accuracy, fast CPU inference)               ║
║  Host:    {settings.host}:{settings.port}
║  Docs:    http://{settings.host}:{settings.port}/docs
╚═══════════════════════════════════════════════════════════════╝
""")

    uvicorn.run(
        "app:app",
        host=settings.host,
        port=settings.port,
        log_level=settings.log_level,
        workers=1,
    )
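Because the endpoints above are OpenAI-compatible and stream over SSE, any plain HTTP client can consume them. A minimal sketch of a streaming consumer, assuming the server is reachable at http://localhost:7860 (adjust `BASE_URL` for a deployed Space) and that the `requests` package is installed:

```python
# client_stream.py - minimal streaming client for POST /v1/chat/completions.
import json

import requests

BASE_URL = "http://localhost:7860"  # assumption: local server; use the Space URL otherwise

payload = {
    "model": "lfm",
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "stream": True,
}

with requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, stream=True, timeout=600) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        # EventSourceResponse emits lines like "data: {...}"; skip blanks and keep-alive pings
        if not line or not line.startswith("data:"):
            continue
        data = line[len("data:"):].strip()
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        delta = chunk.get("choices", [{}])[0].get("delta", {})
        print(delta.get("content", ""), end="", flush=True)
print()
```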
config.py ADDED
@@ -0,0 +1,54 @@
"""
Configuration for LFM2.5 FastAPI Backend.
Optimized for HuggingFace Spaces deployment (2 vCPU, 16GB RAM).
Uses ONNX Runtime for fast CPU inference.
"""

from functools import lru_cache
from typing import List

from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """Application settings optimized for HuggingFace Spaces."""

    # Application metadata
    app_name: str = "LFM2.5 API"
    app_version: str = "1.0.0"

    # Model settings - official ONNX model with Q8 for ~95% accuracy
    model_id: str = "LiquidAI/LFM2.5-1.2B-Instruct-ONNX"
    model_variant: str = "q8"  # Options: q4 (fastest), q8 (balanced), fp16 (best quality)

    # Server settings (HuggingFace Spaces uses port 7860)
    host: str = "0.0.0.0"
    port: int = 7860

    # CORS settings
    cors_origins: List[str] = ["*"]

    # Generation defaults (from LiquidAI recommendations)
    temperature: float = 0.1
    top_k: int = 50
    top_p: float = 0.1
    max_tokens: int = 2000  # Max output tokens (model supports 32K context)
    repetition_penalty: float = 1.05

    # CPU optimization - increase threads for better performance
    num_threads: int = 2  # Set higher if you have more cores (check with: python -c "import os; print(os.cpu_count())")

    # Logging
    log_level: str = "info"

    class Config:
        env_prefix = "LFM_"


@lru_cache()
def get_settings() -> Settings:
    """Get cached settings."""
    return Settings()


settings = get_settings()
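Every field in Settings can be overridden through an `LFM_`-prefixed environment variable (the Dockerfile already does this for LFM_HOST and LFM_PORT). A small sketch with illustrative override values; the variables must be set before `config` is imported, since the module-level `settings` is created at import time:

```python
# settings_override.py - sketch of overriding defaults via LFM_* environment variables.
import os

# Illustrative overrides, not recommended defaults:
os.environ["LFM_MODEL_VARIANT"] = "q4"   # fastest quantization variant
os.environ["LFM_NUM_THREADS"] = "4"      # more CPU threads, if available
os.environ["LFM_LOG_LEVEL"] = "debug"

from config import Settings

settings = Settings()  # pydantic-settings picks up the LFM_* variables
print(settings.model_variant, settings.num_threads, settings.log_level)
```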
requirements.txt ADDED
@@ -0,0 +1,25 @@
# FastAPI LFM2.5 Backend Dependencies
# Lightweight, CPU-friendly, with ONNX Runtime (no heavy PyTorch GPU deps)

# Web Framework
fastapi>=0.109.0
uvicorn[standard]>=0.27.0

# Server-Sent Events for Streaming
sse-starlette>=2.0.0

# ONNX Runtime for fast CPU inference (lightweight, no compilation)
onnxruntime>=1.17.0
optimum[onnxruntime]>=1.17.0

# Transformers for tokenizer only (lightweight)
transformers>=4.40.0
huggingface-hub>=0.21.0
tokenizers>=0.19.0  # Required for LFM custom tokenizer

# Configuration
pydantic-settings>=2.1.0

# Utilities
python-multipart>=0.0.9
numpy>=1.24.0
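A quick sketch to confirm that the key packages app.py imports resolve after `pip install -r requirements.txt`:

```python
# check_env.py - print versions of the main runtime dependencies.
import fastapi
import numpy
import onnxruntime
import transformers

for name, mod in [
    ("fastapi", fastapi),
    ("numpy", numpy),
    ("onnxruntime", onnxruntime),
    ("transformers", transformers),
]:
    print(f"{name:>12}: {mod.__version__}")
```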