Rajhuggingface4253 committed on
Commit 0f735d4 · verified · 1 Parent(s): c03878f

Upload 4 files

Files changed (4)
  1. Dockerfile +43 -0
  2. app.py +850 -0
  3. config.py +54 -0
  4. requirements.txt +25 -0
Dockerfile ADDED
@@ -0,0 +1,43 @@
# LFM2.5 FastAPI Backend - Dockerfile
# Optimized for HuggingFace Spaces

FROM python:3.11-slim

# Install minimal dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user
RUN useradd -m -u 1000 appuser

# Set working directory
WORKDIR /app

# Copy requirements first for caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY --chown=appuser:appuser app.py config.py ./

# Switch to non-root user
USER appuser

# Environment variables for HuggingFace Spaces
ENV PYTHONUNBUFFERED=1 \
    LFM_HOST=0.0.0.0 \
    LFM_PORT=7860

# Expose HuggingFace Spaces port
EXPOSE 7860

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=300s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# Run
CMD ["python", "app.py"]
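The HEALTHCHECK above probes `/health`, which app.py only answers with 200 once the ONNX model has finished downloading and loading (hence the generous 300s start period). A minimal sketch of the same readiness check run from the host, assuming the image is started locally with the port published (e.g. `docker run -p 7860:7860 <image>`); the script name, timeout, and interval are illustrative:

```python
# wait_healthy.py - poll the same /health endpoint the Dockerfile HEALTHCHECK uses.
# Assumes the container is running locally with port 7860 published.
import time
import urllib.error
import urllib.request

URL = "http://localhost:7860/health"


def wait_until_healthy(timeout: float = 360.0, interval: float = 5.0) -> bool:
    """Return True once /health responds with 200, False if the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(URL, timeout=10) as resp:
                if resp.status == 200:
                    return True
        except (urllib.error.URLError, OSError):
            pass  # 503 while the model is still loading, or connection refused before uvicorn is up
        time.sleep(interval)
    return False


if __name__ == "__main__":
    print("healthy" if wait_until_healthy() else "timed out")
```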
app.py ADDED
@@ -0,0 +1,850 @@
"""
LFM2.5 FastAPI Backend - ONNX Runtime Edition
==============================================
Lightweight, CPU-friendly FastAPI backend for LiquidAI LFM2.5-1.2B-Instruct.
Uses the official ONNX model for fast inference without heavy PyTorch dependencies.

Features:
- ONNX Runtime for fast CPU inference (no GPU required)
- Q8 quantization for 95%+ accuracy retention
- Streaming SSE responses
- OpenAI-compatible API
- Optimized for HuggingFace Spaces (2 vCPU, 16GB RAM)
"""

import asyncio
import json
import logging
import time
import uuid
import threading
import queue  # Thread-safe queue for true streaming
from contextlib import asynccontextmanager
from typing import AsyncGenerator, Dict, List, Optional, Union
from pathlib import Path

import numpy as np
import onnxruntime as ort
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from huggingface_hub import hf_hub_download, list_repo_files
from pydantic import BaseModel, Field
from sse_starlette.sse import EventSourceResponse
from transformers import AutoTokenizer, PreTrainedTokenizerFast

from config import settings

# Configure logging
logging.basicConfig(
    level=getattr(logging, settings.log_level.upper()),
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


# ==============================================================================
# Pydantic Models for OpenAI-compatible API
# ==============================================================================

class ChatMessage(BaseModel):
    role: str = Field(..., description="Role: 'system', 'user', or 'assistant'")
    content: str = Field(..., description="Message content")


class ChatCompletionRequest(BaseModel):
    model: str = Field(default="lfm", description="Model identifier")
    messages: List[ChatMessage] = Field(..., description="Conversation messages")
    temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0)
    top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    top_k: Optional[int] = Field(default=None, ge=0)
    max_tokens: Optional[int] = Field(default=None, ge=1)
    stream: bool = Field(default=False, description="Enable streaming response")
    stop: Optional[Union[str, List[str]]] = Field(default=None)


class CompletionRequest(BaseModel):
    model: str = Field(default="lfm", description="Model identifier")
    prompt: str = Field(..., description="Text prompt")
    temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0)
    top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    top_k: Optional[int] = Field(default=None, ge=0)
    max_tokens: Optional[int] = Field(default=None, ge=1)
    stream: bool = Field(default=False, description="Enable streaming response")


class ChatCompletionChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Optional[str] = None


class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[ChatCompletionChoice]
    usage: Dict[str, int]


class CompletionChoice(BaseModel):
    index: int
    text: str
    finish_reason: Optional[str] = None


class CompletionResponse(BaseModel):
    id: str
    object: str = "text_completion"
    created: int
    model: str
    choices: List[CompletionChoice]
    usage: Dict[str, int]


class ModelInfo(BaseModel):
    id: str
    object: str = "model"
    created: int
    owned_by: str = "liquid-ai"


class ModelListResponse(BaseModel):
    object: str = "list"
    data: List[ModelInfo]


# ==============================================================================
# ONNX Model Manager
# ==============================================================================

# ONNX dtype mapping
ONNX_DTYPE = {
    "tensor(float)": np.float32,
    "tensor(float16)": np.float16,
    "tensor(int64)": np.int64
}


class ONNXModelManager:
    """Manages the ONNX model with a KV cache for efficient generation."""

    def __init__(self):
        self._session = None
        self._tokenizer = None
        self._cache_template = None
        self._use_position_ids = False
        self._lock = threading.Lock()

    @property
    def is_loaded(self) -> bool:
        return self._session is not None

    def download_model(self) -> str:
        """Download ONNX model files from HuggingFace."""
        model_id = settings.model_id
        variant = settings.model_variant

        logger.info(f"Downloading model: {model_id} (variant: {variant})")

        # Download main model file
        model_filename = f"onnx/model_{variant}.onnx"
        model_path = hf_hub_download(model_id, model_filename)

        # Download all data files for this variant
        for f in list_repo_files(model_id):
            if f.startswith(f"onnx/model_{variant}.onnx_data"):
                logger.info(f"Downloading: {f}")
                hf_hub_download(model_id, f)

        return model_path

    def load_model(self) -> None:
        """Load the ONNX model and tokenizer."""
        with self._lock:
            if self._session is not None:
                return

            logger.info("=" * 60)
            logger.info("Loading LFM2.5-1.2B-Instruct ONNX model...")
            logger.info(f"Model: {settings.model_id}")
            logger.info(f"Variant: {settings.model_variant} (Q8 = ~95% accuracy)")
            logger.info("=" * 60)

            start_time = time.time()

            # Download model
            model_path = self.download_model()

            # Configure ONNX Runtime for CPU
            sess_options = ort.SessionOptions()
            sess_options.intra_op_num_threads = settings.num_threads
            sess_options.inter_op_num_threads = settings.num_threads
            sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

            # Load ONNX session
            self._session = ort.InferenceSession(
                model_path,
                sess_options=sess_options,
                providers=['CPUExecutionProvider']
            )

            # Load tokenizer with fallback for models with an invalid tokenizer_class
            try:
                self._tokenizer = AutoTokenizer.from_pretrained(
                    settings.model_id,
                    trust_remote_code=True
                )
            except ValueError as e:
                if "TokenizersBackend" in str(e):
                    # LFM models incorrectly specify TokenizersBackend as tokenizer_class.
                    # Fall back to PreTrainedTokenizerFast, which works with the tokenizers backend.
                    logger.warning(
                        "AutoTokenizer failed with TokenizersBackend error. "
                        "Falling back to PreTrainedTokenizerFast..."
                    )
                    self._tokenizer = PreTrainedTokenizerFast.from_pretrained(
                        settings.model_id,
                        trust_remote_code=True
                    )
                else:
                    raise

            # Initialize cache template
            self._init_cache_template()

            # Check whether the model uses position_ids
            input_names = {inp.name for inp in self._session.get_inputs()}
            self._use_position_ids = "position_ids" in input_names

            load_time = time.time() - start_time
            logger.info("=" * 60)
            logger.info(f"✓ Model loaded in {load_time:.2f}s")
            logger.info(f"  Threads: {settings.num_threads}")
            logger.info(f"  Provider: CPU")
            logger.info("=" * 60)

    def _init_cache_template(self) -> None:
        """Initialize the KV cache template from the session's input signatures."""
        self._cache_template = {}
        for inp in self._session.get_inputs():
            if inp.name in {"input_ids", "attention_mask", "position_ids"}:
                continue

            shape = [d if isinstance(d, int) else 1 for d in inp.shape]
            for i, d in enumerate(inp.shape):
                if isinstance(d, str) and "sequence" in d.lower():
                    shape[i] = 0

            dtype = ONNX_DTYPE.get(inp.type, np.float32)
            self._cache_template[inp.name] = (shape, dtype)

    def _create_empty_cache(self) -> Dict[str, np.ndarray]:
        """Create a new empty KV cache."""
        return {
            name: np.zeros(shape, dtype=dtype)
            for name, (shape, dtype) in self._cache_template.items()
        }

    @property
    def session(self):
        if self._session is None:
            raise RuntimeError("Model not loaded")
        return self._session

    @property
    def tokenizer(self):
        if self._tokenizer is None:
            raise RuntimeError("Tokenizer not loaded")
        return self._tokenizer

    def generate(
        self,
        input_ids: np.ndarray,
        max_tokens: int = 512,
        temperature: float = 0.1,
        top_k: int = 50,
        top_p: float = 0.1,
        stop_tokens: Optional[List[int]] = None
    ) -> List[int]:
        """Generate tokens using the ONNX model (non-streaming)."""
        if stop_tokens is None:
            stop_tokens = [self._tokenizer.eos_token_id]

        cache = self._create_empty_cache()
        seq_len = input_ids.shape[1]
        generated_tokens = []

        for step in range(max_tokens):
            if step == 0:
                ids = input_ids
                pos = np.arange(seq_len, dtype=np.int64).reshape(1, -1)
            else:
                ids = np.array([[generated_tokens[-1]]], dtype=np.int64)
                pos = np.array([[seq_len + len(generated_tokens) - 1]], dtype=np.int64)

            attn_mask = np.ones((1, seq_len + len(generated_tokens)), dtype=np.int64)

            feed = {"input_ids": ids, "attention_mask": attn_mask, **cache}
            if self._use_position_ids:
                feed["position_ids"] = pos

            outputs = self._session.run(None, feed)

            # Get logits and apply temperature
            logits = outputs[0][0, -1]

            if temperature > 0:
                logits = logits / temperature

                # Apply top-k
                if top_k > 0:
                    indices_to_remove = np.argsort(logits)[:-top_k]
                    logits[indices_to_remove] = -np.inf

                # Apply top-p (nucleus sampling)
                if top_p < 1.0:
                    sorted_indices = np.argsort(logits)[::-1]
                    sorted_logits = logits[sorted_indices]
                    probs = np.exp(sorted_logits - np.max(sorted_logits))
                    probs = probs / probs.sum()
                    cumulative_probs = np.cumsum(probs)
                    sorted_indices_to_remove = cumulative_probs > top_p
                    sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
                    sorted_indices_to_remove[0] = False
                    indices_to_remove = sorted_indices[sorted_indices_to_remove]
                    logits[indices_to_remove] = -np.inf

                # Sample
                probs = np.exp(logits - np.max(logits))
                probs = probs / probs.sum()
                next_token = int(np.random.choice(len(probs), p=probs))
            else:
                next_token = int(np.argmax(logits))

            generated_tokens.append(next_token)

            # Update cache
            for i, out in enumerate(self._session.get_outputs()[1:], 1):
                name = out.name.replace("present_conv", "past_conv").replace("present.", "past_key_values.")
                if name in cache:
                    cache[name] = outputs[i]

            if next_token in stop_tokens:
                break

        return generated_tokens

    def generate_stream(
        self,
        input_ids: np.ndarray,
        max_tokens: int = 2000,
        temperature: float = 0.1,
        top_k: int = 50,
        top_p: float = 0.1,
        stop_tokens: Optional[List[int]] = None
    ):
        """Optimized streaming generation; yields token ids one at a time."""
        if stop_tokens is None:
            stop_tokens = [self._tokenizer.eos_token_id]

        cache = self._create_empty_cache()
        seq_len = input_ids.shape[1]

        # Pre-allocate the attention mask for the longest possible sequence
        max_possible_len = seq_len + max_tokens
        attn_mask = np.ones((1, max_possible_len), dtype=np.int64)

        # Pre-compute sampling flags
        use_temp = temperature > 0
        use_top_k = top_k > 0
        use_top_p = top_p < 1.0

        # Reuse this dict to avoid rebuilding the feed on every step
        feed = {}

        # Generated token ids so far
        generated_tokens = []

        for step in range(max_tokens):
            current_len = seq_len + step

            # Input preparation
            if step == 0:
                ids = input_ids
                if self._use_position_ids:
                    pos = np.arange(seq_len, dtype=np.int64).reshape(1, -1)
            else:
                # Feed only the last generated token; the KV cache carries the earlier context
                ids = np.array([[generated_tokens[-1]]], dtype=np.int64)
                if self._use_position_ids:
                    pos = np.array([[current_len - 1]], dtype=np.int64)

            # Update the feed dict in place (faster than creating a new dict)
            feed.clear()
            feed["input_ids"] = ids
            feed["attention_mask"] = attn_mask[:, :current_len]
            if self._use_position_ids:
                feed["position_ids"] = pos
            feed.update(cache)  # merge in the KV cache

            # Inference
            outputs = self._session.run(None, feed)
            logits = outputs[0][0, -1]

            # --- Fast sampling ---
            if use_temp:
                logits /= temperature

                # 1. Top-k selection (argpartition is O(N))
                if use_top_k and top_k < len(logits):
                    # Moves the largest k elements to the end; order within them is arbitrary
                    top_k_idx = np.argpartition(logits, -top_k)[-top_k:]
                    # Mask everything else
                    mask = np.ones(logits.shape, dtype=bool)
                    mask[top_k_idx] = False
                    logits[mask] = -np.inf

                # 2. Top-p (nucleus)
                if use_top_p:
                    valid_mask = logits > -np.inf
                    if valid_mask.any():
                        valid_logits = logits[valid_mask]
                        valid_indices = np.where(valid_mask)[0]

                        # Sort only the valid candidates (small N after top-k)
                        sorted_indices = np.argsort(valid_logits)[::-1]
                        sorted_logits = valid_logits[sorted_indices]

                        # Softmax on the valid set
                        exp_logits = np.exp(sorted_logits - np.max(sorted_logits))
                        probs = exp_logits / exp_logits.sum()

                        cumulative = np.cumsum(probs)

                        # Find the nucleus cutoff
                        cutoff = np.searchsorted(cumulative, top_p)
                        # Ensure we keep at least one token
                        cutoff = min(cutoff + 1, len(sorted_logits))

                        # Filter and re-normalize
                        accepted_indices = sorted_indices[:cutoff]
                        accepted_probs = probs[:cutoff]
                        accepted_probs /= accepted_probs.sum()

                        # Fast weighted sample: searchsorted over the CDF
                        # (avoids the Python overhead of np.random.choice)
                        sample_idx = np.searchsorted(np.cumsum(accepted_probs), np.random.rand())
                        next_token = int(valid_indices[accepted_indices[sample_idx]])
                    else:
                        next_token = int(np.argmax(logits))
                else:
                    # Fallback if only top-k was used
                    valid_mask = logits > -np.inf
                    valid_logits = logits[valid_mask]
                    valid_indices = np.where(valid_mask)[0]
                    exp_logits = np.exp(valid_logits - np.max(valid_logits))
                    probs = exp_logits / exp_logits.sum()
                    sample_idx = np.searchsorted(np.cumsum(probs), np.random.rand())
                    next_token = int(valid_indices[sample_idx])
            else:
                next_token = int(np.argmax(logits))

            # Store and emit
            generated_tokens.append(next_token)
            yield next_token

            if next_token in stop_tokens:
                break

            # Update cache
            for i, out in enumerate(self._session.get_outputs()[1:], 1):
                name = out.name.replace("present_conv", "past_conv").replace("present.", "past_key_values.")
                if name in cache:
                    cache[name] = outputs[i]

    def unload(self) -> None:
        """Unload the model from memory."""
        with self._lock:
            if self._session is not None:
                del self._session
                del self._tokenizer
                self._session = None
                self._tokenizer = None
                logger.info("Model unloaded")


# Global model manager
model_manager = ONNXModelManager()


# ==============================================================================
# Application Lifecycle
# ==============================================================================

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan handler."""
    logger.info("Starting LFM2.5 API Server (ONNX Runtime)...")

    loop = asyncio.get_event_loop()
    await loop.run_in_executor(None, model_manager.load_model)

    yield

    logger.info("Shutting down...")
    model_manager.unload()


# ==============================================================================
# FastAPI Application
# ==============================================================================

app = FastAPI(
    title=settings.app_name,
    description="Fast CPU inference for LiquidAI LFM2.5-1.2B-Instruct using ONNX Runtime",
    version=settings.app_version,
    lifespan=lifespan,
    docs_url="/docs",
    redoc_url="/redoc",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=False,  # Must be False when using wildcard origins
    allow_methods=["*"],
    allow_headers=["*"],
    expose_headers=["*"],  # Expose all headers for SSE
)


# Custom middleware to handle a null origin (file:// protocol)
@app.middleware("http")
async def add_cors_for_null_origin(request: Request, call_next):
    """Handle CORS for a null origin (when the HTML page is opened from file://)."""
    origin = request.headers.get("origin", "")
    response = await call_next(request)

    # If the origin is null (file:// protocol), add explicit CORS headers
    if origin == "null" or not origin:
        response.headers["Access-Control-Allow-Origin"] = "*"
        response.headers["Access-Control-Allow-Methods"] = "GET, POST, PUT, DELETE, OPTIONS"
        response.headers["Access-Control-Allow-Headers"] = "*"
        response.headers["Access-Control-Expose-Headers"] = "*"

    return response


# ==============================================================================
# Helper Functions
# ==============================================================================

def generate_id() -> str:
    return f"chatcmpl-{uuid.uuid4().hex[:12]}"


async def stream_chat_completion(request: ChatCompletionRequest) -> AsyncGenerator[dict, None]:
    """
    Low-latency streaming.
    Uses asyncio.Queue + call_soon_threadsafe so each token reaches the client as soon
    as it is generated, without polling or blocking the event loop.
    """
    request_id = generate_id()
    created = int(time.time())

    # Capture the running event loop to bridge the background thread safely
    loop = asyncio.get_running_loop()
    # An asyncio.Queue allows 'await get()', which never blocks the event loop
    async_queue = asyncio.Queue()

    tokenizer = model_manager.tokenizer

    # Prepare inputs
    messages = [{"role": m.role, "content": m.content} for m in request.messages]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    input_ids = np.array([tokenizer.encode(prompt, add_special_tokens=False)], dtype=np.int64)

    # Config
    max_tokens = request.max_tokens or settings.max_tokens
    temperature = request.temperature if request.temperature is not None else settings.temperature
    top_k = request.top_k if request.top_k is not None else settings.top_k
    top_p = request.top_p if request.top_p is not None else settings.top_p

    # Prepare stop tokens
    stop_tokens = [tokenizer.eos_token_id]
    if request.stop:
        if isinstance(request.stop, str):
            encoded = tokenizer.encode(request.stop, add_special_tokens=False)
            if encoded:
                stop_tokens.append(encoded[0])
        elif isinstance(request.stop, list):
            for stop_str in request.stop:
                encoded = tokenizer.encode(stop_str, add_special_tokens=False)
                if encoded:
                    stop_tokens.append(encoded[0])

    def generate_tokens():
        """Background thread: pushes tokens directly into the async loop."""
        try:
            # Use the optimized generate_stream from ONNXModelManager
            for token in model_manager.generate_stream(
                input_ids,
                max_tokens=max_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                stop_tokens=stop_tokens
            ):
                # Schedule the 'put' on the main loop immediately;
                # this wakes the awaiting consumer with no polling delay.
                loop.call_soon_threadsafe(async_queue.put_nowait, ("token", token))
        except Exception as e:
            logger.error(f"Stream generation error: {e}")
            loop.call_soon_threadsafe(async_queue.put_nowait, ("error", str(e)))
        finally:
            loop.call_soon_threadsafe(async_queue.put_nowait, ("done", None))

    # Start generation in a background thread
    threading.Thread(target=generate_tokens, daemon=True).start()

    # Main async loop - no timeouts or sleeps, just await the next queue item
    try:
        while True:
            # Waits until data is pushed; yields control to other requests while waiting
            msg_type, data = await async_queue.get()

            if msg_type == "token":
                text = tokenizer.decode([data], skip_special_tokens=True)
                if text:
                    chunk = {
                        "id": request_id,
                        "object": "chat.completion.chunk",
                        "created": created,
                        "model": request.model,
                        "choices": [{
                            "index": 0,
                            "delta": {"content": text},
                            "finish_reason": None
                        }]
                    }
                    # Yield in the format expected by EventSourceResponse
                    yield {"data": json.dumps(chunk)}

            elif msg_type == "done":
                final = {
                    "id": request_id,
                    "object": "chat.completion.chunk",
                    "created": created,
                    "model": request.model,
                    "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
                }
                yield {"data": json.dumps(final)}
                yield {"data": "[DONE]"}
                break

            elif msg_type == "error":
                logger.error(f"Stream error: {data}")
                yield {"data": json.dumps({"error": {"message": data}})}
                break

    except asyncio.CancelledError:
        logger.info(f"Stream cancelled for request {request_id[:8]}")
        raise
    except Exception as e:
        logger.error(f"Streaming error: {e}")
        yield {"data": json.dumps({"error": {"message": str(e)}})}


# ==============================================================================
# API Endpoints
# ==============================================================================

@app.get("/", response_class=JSONResponse)
async def health_check():
    """Health check with model status."""
    return {
        "status": "ready" if model_manager.is_loaded else "loading",
        "model": {
            "id": settings.model_id,
            "variant": settings.model_variant,
            "loaded": model_manager.is_loaded,
            "backend": "ONNX Runtime"
        },
        "server": {
            "name": settings.app_name,
            "version": settings.app_version,
            "port": settings.port
        }
    }


@app.get("/health")
async def health():
    if not model_manager.is_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded")
    return {"status": "healthy"}


@app.get("/v1/models", response_model=ModelListResponse)
async def list_models():
    return ModelListResponse(
        data=[
            ModelInfo(id="lfm", created=int(time.time())),
            ModelInfo(id="lfm-2.5-1.2b-instruct-onnx", created=int(time.time()))
        ]
    )


@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """OpenAI-compatible chat completion."""
    if not model_manager.is_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded")

    if request.stream:
        return EventSourceResponse(
            stream_chat_completion(request),
            media_type="text/event-stream",
            ping=30,  # 30 second keep-alive (sse-starlette's ping interval is in seconds)
            ping_message_factory=lambda: '{"type": "ping"}'
        )

    try:
        tokenizer = model_manager.tokenizer

        messages = [{"role": m.role, "content": m.content} for m in request.messages]
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        input_ids = np.array([tokenizer.encode(prompt, add_special_tokens=False)], dtype=np.int64)

        max_tokens = request.max_tokens or settings.max_tokens
        temperature = request.temperature if request.temperature is not None else settings.temperature
        top_k = request.top_k if request.top_k is not None else settings.top_k
        top_p = request.top_p if request.top_p is not None else settings.top_p

        start_time = time.time()

        loop = asyncio.get_event_loop()
        tokens = await loop.run_in_executor(
            None,
            lambda: model_manager.generate(
                input_ids,
                max_tokens=max_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p
            )
        )

        response_text = tokenizer.decode(tokens, skip_special_tokens=True)
        gen_time = time.time() - start_time

        logger.debug(f"Generated {len(tokens)} tokens in {gen_time:.2f}s")

        return ChatCompletionResponse(
            id=generate_id(),
            created=int(time.time()),
            model=request.model,
            choices=[
                ChatCompletionChoice(
                    index=0,
                    message=ChatMessage(role="assistant", content=response_text),
                    finish_reason="stop"
                )
            ],
            usage={
                "prompt_tokens": input_ids.shape[1],
                "completion_tokens": len(tokens),
                "total_tokens": input_ids.shape[1] + len(tokens)
            }
        )

    except Exception as e:
        logger.error(f"Chat completion error: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/v1/completions")
async def completions(request: CompletionRequest):
    """OpenAI-compatible text completion."""
    if not model_manager.is_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        tokenizer = model_manager.tokenizer
        input_ids = np.array([tokenizer.encode(request.prompt)], dtype=np.int64)

        max_tokens = request.max_tokens or settings.max_tokens
        temperature = request.temperature if request.temperature is not None else settings.temperature
        top_k = request.top_k if request.top_k is not None else settings.top_k
        top_p = request.top_p if request.top_p is not None else settings.top_p

        loop = asyncio.get_event_loop()
        tokens = await loop.run_in_executor(
            None,
            lambda: model_manager.generate(
                input_ids,
                max_tokens=max_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p
            )
        )

        response_text = tokenizer.decode(tokens, skip_special_tokens=True)

        return CompletionResponse(
            id=generate_id(),
            created=int(time.time()),
            model=request.model,
            choices=[
                CompletionChoice(index=0, text=response_text, finish_reason="stop")
            ],
            usage={
                "prompt_tokens": input_ids.shape[1],
                "completion_tokens": len(tokens),
                "total_tokens": input_ids.shape[1] + len(tokens)
            }
        )

    except Exception as e:
        logger.error(f"Completion error: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    logger.error(f"Unhandled exception: {exc}", exc_info=True)
    return JSONResponse(
        status_code=500,
        content={"error": {"message": "Internal server error", "type": "server_error"}}
    )


# ==============================================================================
# Main Entry Point
# ==============================================================================

if __name__ == "__main__":
    import uvicorn

    print(f"""
╔═══════════════════════════════════════════════════════════════╗
║             LFM2.5 FastAPI Backend (ONNX Runtime)              ║
╠═══════════════════════════════════════════════════════════════╣
║  Model:   LiquidAI/LFM2.5-1.2B-Instruct-ONNX                   ║
║  Variant: Q8 (~95% accuracy, fast CPU inference)               ║
║  Host:    {settings.host}:{settings.port}
║  Docs:    http://{settings.host}:{settings.port}/docs
╚═══════════════════════════════════════════════════════════════╝
""")

    uvicorn.run(
        "app:app",
        host=settings.host,
        port=settings.port,
        log_level=settings.log_level,
        workers=1,
    )
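Because the endpoints above are OpenAI-compatible and stream over SSE, any plain HTTP client can consume them. A minimal sketch of a streaming consumer, assuming the server is reachable at http://localhost:7860 (adjust `BASE_URL` for a deployed Space) and that the `requests` package is installed:

```python
# client_stream.py - minimal streaming client for POST /v1/chat/completions.
import json

import requests

BASE_URL = "http://localhost:7860"  # assumption: local server; use the Space URL otherwise

payload = {
    "model": "lfm",
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "stream": True,
}

with requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, stream=True, timeout=600) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        # EventSourceResponse emits lines like "data: {...}"; skip blanks and keep-alive pings
        if not line or not line.startswith("data:"):
            continue
        data = line[len("data:"):].strip()
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        delta = chunk.get("choices", [{}])[0].get("delta", {})
        print(delta.get("content", ""), end="", flush=True)
print()
```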
config.py ADDED
@@ -0,0 +1,54 @@
"""
Configuration for LFM2.5 FastAPI Backend.
Optimized for HuggingFace Spaces deployment (2 vCPU, 16GB RAM).
Uses ONNX Runtime for fast CPU inference.
"""

from functools import lru_cache
from typing import List

from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """Application settings optimized for HuggingFace Spaces."""

    # Application metadata
    app_name: str = "LFM2.5 API"
    app_version: str = "1.0.0"

    # Model settings - official ONNX model with Q8 for ~95% accuracy
    model_id: str = "LiquidAI/LFM2.5-1.2B-Instruct-ONNX"
    model_variant: str = "q8"  # Options: q4 (fastest), q8 (balanced), fp16 (best quality)

    # Server settings (HuggingFace Spaces uses port 7860)
    host: str = "0.0.0.0"
    port: int = 7860

    # CORS settings
    cors_origins: List[str] = ["*"]

    # Generation defaults (from LiquidAI recommendations)
    temperature: float = 0.1
    top_k: int = 50
    top_p: float = 0.1
    max_tokens: int = 2000  # Max output tokens (model supports 32K context)
    repetition_penalty: float = 1.05

    # CPU optimization - increase threads for better performance
    num_threads: int = 2  # Set higher if you have more cores (check with: python -c "import os; print(os.cpu_count())")

    # Logging
    log_level: str = "info"

    class Config:
        env_prefix = "LFM_"


@lru_cache()
def get_settings() -> Settings:
    """Get cached settings."""
    return Settings()


settings = get_settings()
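Every field in Settings can be overridden through an `LFM_`-prefixed environment variable (the Dockerfile already does this for LFM_HOST and LFM_PORT). A small sketch with illustrative override values; the variables must be set before `config` is imported, since the module-level `settings` is created at import time:

```python
# settings_override.py - sketch of overriding defaults via LFM_* environment variables.
import os

# Illustrative overrides, not recommended defaults:
os.environ["LFM_MODEL_VARIANT"] = "q4"   # fastest quantization variant
os.environ["LFM_NUM_THREADS"] = "4"      # more CPU threads, if available
os.environ["LFM_LOG_LEVEL"] = "debug"

from config import Settings

settings = Settings()  # pydantic-settings picks up the LFM_* variables
print(settings.model_variant, settings.num_threads, settings.log_level)
```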
requirements.txt ADDED
@@ -0,0 +1,25 @@
# FastAPI LFM2.5 Backend Dependencies
# Lightweight, CPU-friendly, with ONNX Runtime (no heavy PyTorch GPU deps)

# Web Framework
fastapi>=0.109.0
uvicorn[standard]>=0.27.0

# Server-Sent Events for Streaming
sse-starlette>=2.0.0

# ONNX Runtime for fast CPU inference (lightweight, no compilation)
onnxruntime>=1.17.0
optimum[onnxruntime]>=1.17.0

# Transformers for tokenizer only (lightweight)
transformers>=4.40.0
huggingface-hub>=0.21.0
tokenizers>=0.19.0  # Required for LFM custom tokenizer

# Configuration
pydantic-settings>=2.1.0

# Utilities
python-multipart>=0.0.9
numpy>=1.24.0
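A quick sketch to confirm that the key packages app.py imports resolve after `pip install -r requirements.txt`:

```python
# check_env.py - print versions of the main runtime dependencies.
import fastapi
import numpy
import onnxruntime
import transformers

for name, mod in [
    ("fastapi", fastapi),
    ("numpy", numpy),
    ("onnxruntime", onnxruntime),
    ("transformers", transformers),
]:
    print(f"{name:>12}: {mod.__version__}")
```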