Spaces:

sidmaz666
/

bonsaiapi

Paused

App Files Files Community

sidmaz666 committed on Apr 20

Commit

fb6570d

verified ·

1 Parent(s): b4e4f36

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -370

app.py CHANGED Viewed

@@ -7,31 +7,40 @@ import os
 import time
 import uuid
 from contextlib import asynccontextmanager
-from typing import Any, Dict, List, Optional, Union
-import numpy as np
-import onnxruntime as ort
 from fastapi import FastAPI, HTTPException, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
-from huggingface_hub import snapshot_download
 from pydantic import BaseModel, Field, ValidationError
-from transformers import AutoTokenizer
 # ---------- Configuration ----------
-MODEL_ID = os.getenv("MODEL_ID", "onnx-community/Bonsai-1.7B-ONNX")
-MODEL_QUANTIZATION = os.getenv("MODEL_QUANTIZATION", "q1")
-ONNX_MODEL_FILE = f"model_{MODEL_QUANTIZATION}.onnx"
 HF_TOKEN = os.getenv("HF_TOKEN")
-LOCAL_MODEL_DIR = os.getenv("LOCAL_MODEL_DIR", "/data/bonsai-onnx")
 MAX_NEW_TOKENS_DEFAULT = int(os.getenv("MAX_NEW_TOKENS_DEFAULT", "256"))
 API_KEY = os.getenv("API_KEY", None)
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("uvicorn.error")
-# ---------- Pydantic Models ----------
 class Message(BaseModel):
     role: str = Field(..., pattern="^(system|user|assistant)$")
     content: str
@@ -65,30 +74,20 @@ class ChatCompletionResponse(BaseModel):
 class ModelInfo(BaseModel):
     model_id: str
-    quantization: str
-    onnx_model_file: str
     device: str
 class ErrorResponse(BaseModel):
     error: str
     detail: Optional[str] = None
 # ---------- Global State ----------
-tokenizer = None
-ort_session = None
 model_load_error = None
 MODEL_LOCK = asyncio.Lock()
-# Cached model metadata
-past_input_names = []
-past_output_names = []
-num_layers = 0
-num_kv_heads = 0
-head_dim = 0
-kv_cache_dtype = np.float32
-has_num_logits_input = False
-has_position_ids = False
 # ---------- Helper Functions ----------
 def _verify_api_key(request: Request) -> None:
     if API_KEY is None:
@@ -97,366 +96,105 @@ def _verify_api_key(request: Request) -> None:
     if not auth or auth != API_KEY:
         raise HTTPException(status_code=401, detail="Invalid or missing API key")
-def _model_device() -> str:
-    return "cuda" if ort.get_device().lower() == "gpu" else "cpu"
-def _download_model_snapshot() -> str:
     os.makedirs(LOCAL_MODEL_DIR, exist_ok=True)
-    allow_patterns = [
-        "config.json",
-        "tokenizer.json",
-        "tokenizer_config.json",
-        "chat_template.jinja",
-        f"onnx/{ONNX_MODEL_FILE}",
-        f"onnx/{ONNX_MODEL_FILE}_data",
-    ]
     try:
-        snapshot_download(
             repo_id=MODEL_ID,
             local_dir=LOCAL_MODEL_DIR,
-            local_dir_use_symlinks=False,
-            allow_patterns=allow_patterns,
             token=HF_TOKEN,
         )
     except Exception as e:
         logger.error(f"Model download failed: {e}")
         raise RuntimeError(f"Failed to download model: {str(e)}")
-    return LOCAL_MODEL_DIR
-def _create_ort_session(model_path: str) -> ort.InferenceSession:
-    # --- OPTIMIZATION 1: Configure Session Options for LLMs ---
-    so = ort.SessionOptions()
-    # Disable all graph optimizations; they can be counter-productive for LLMs.
-    so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
-    # Set intra-op threads to 1 to reduce thread pool overhead for small batch sizes.
-    so.intra_op_num_threads = 1
-    so.inter_op_num_threads = 1
-    so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
-    # Disable memory pattern optimization; it can increase memory fragmentation.
-    so.enable_mem_pattern = False
-    # Enable CPU memory arena for faster allocations.
-    so.enable_cpu_mem_arena = True
-    # Add an optimized execution provider for Intel CPUs.
-    # This provider is specifically designed to accelerate LLM inference.
-    providers = [
-        ('OpenVINOExecutionProvider', {'device_type': 'CPU_FP32'}),
-        'CPUExecutionProvider'
-    ]
-    try:
-        return ort.InferenceSession(model_path, sess_options=so, providers=providers)
-    except Exception as e:
-        logger.error(f"Failed to load ONNX session from {model_path}: {e}")
-        raise RuntimeError(f"ONNX session creation failed: {str(e)}")
 async def _ensure_loaded():
-    global tokenizer, ort_session, model_load_error
-    global past_input_names, past_output_names, num_layers, num_kv_heads, head_dim, kv_cache_dtype
-    global has_num_logits_input, has_position_ids
     async with MODEL_LOCK:
-        if tokenizer is not None and ort_session is not None:
             return
         if model_load_error:
             raise HTTPException(status_code=503, detail=f"Model failed to load: {model_load_error}")
         try:
-            local_dir = _download_model_snapshot()
-            tokenizer = AutoTokenizer.from_pretrained(local_dir, trust_remote_code=True)
-            onnx_path = os.path.join(local_dir, "onnx", ONNX_MODEL_FILE)
-            ort_session = _create_ort_session(onnx_path)
-            # Read model architecture from config.json
-            import json
-            with open(os.path.join(local_dir, "config.json"), "r") as f:
-                config = json.load(f)
-            num_layers = config.get("num_hidden_layers", 28)
-            num_kv_heads = config.get("num_key_value_heads", 8)
-            head_dim = config.get("head_dim", 128)
-            # Identify input/output names and special inputs
-            inputs = ort_session.get_inputs()
-            outputs = ort_session.get_outputs()
-            past_input_names = [inp.name for inp in inputs if inp.name.startswith("past_key_values")]
-            past_output_names = [out.name for out in outputs if out.name.startswith("present")]
-            for inp in inputs:
-                if inp.name.startswith("past_key_values"):
-                    kv_cache_dtype = np.float16 if inp.type == "tensor(float16)" else np.float32
-                    break
-            has_num_logits_input = "num_logits_to_keep" in [inp.name for inp in inputs]
-            has_position_ids = "position_ids" in [inp.name for inp in inputs]
-            logger.info(f"Model loaded: {MODEL_ID} ({MODEL_QUANTIZATION})")
-            logger.info(f"Layers: {num_layers}, KV heads: {num_kv_heads}, head dim: {head_dim}")
-            logger.info(f"Past inputs: {len(past_input_names)}, outputs: {len(past_output_names)}")
-            logger.info(f"num_logits_to_keep: {has_num_logits_input}, position_ids: {has_position_ids}")
         except Exception as e:
             model_load_error = str(e)
             logger.exception("Model loading failed")
             raise HTTPException(status_code=503, detail=f"Model unavailable: {model_load_error}")
 def _build_chat_prompt(messages: List[Message]) -> str:
-    if tokenizer is None:
-        raise HTTPException(status_code=503, detail="Tokenizer not loaded")
-    try:
-        formatted_messages = [{"role": msg.role, "content": msg.content} for msg in messages]
-        prompt = tokenizer.apply_chat_template(
-            formatted_messages,
-            tokenize=False,
-            add_generation_prompt=True,
-        )
-        return prompt
-    except Exception as e:
-        logger.error(f"Chat template error: {e}")
-        prompt = ""
-        for msg in messages:
-            prompt += f"<|{msg.role}|>\n{msg.content}\n"
-        prompt += "<|assistant|>\n"
-        return prompt
-def _count_tokens(text: str) -> int:
-    if tokenizer is None:
-        return len(text.split())
-    return len(tokenizer.encode(text))
-def _softmax(x: np.ndarray) -> np.ndarray:
-    e_x = np.exp(x - np.max(x))
-    return e_x / e_x.sum(axis=-1, keepdims=True)
-def _top_p_sampling(logits: np.ndarray, top_p: float) -> int:
-    sorted_indices = np.argsort(logits)[::-1]
-    sorted_logits = logits[sorted_indices]
-    probs = _softmax(sorted_logits)
-    cum_probs = np.cumsum(probs)
-    cutoff_index = np.searchsorted(cum_probs, top_p) + 1
-    top_indices = sorted_indices[:cutoff_index]
-    top_probs = probs[:cutoff_index]
-    top_probs /= top_probs.sum()
-    return int(np.random.choice(top_indices, p=top_probs))
-def _sample_token(logits: np.ndarray, temperature: float, top_p: float) -> int:
-    if temperature <= 0:
-        return int(np.argmax(logits))
-    logits = logits / temperature
-    if top_p < 1.0:
-        return _top_p_sampling(logits, top_p)
-    probs = _softmax(logits)
-    return int(np.random.choice(len(probs), p=probs))
-def _init_past_key_values(batch_size: int = 1) -> Dict[str, np.ndarray]:
-    """Create zero-filled KV cache tensors with correct shape and dtype."""
-    past = {}
-    empty_shape = (batch_size, num_kv_heads, 0, head_dim)
-    empty_tensor = np.zeros(empty_shape, dtype=kv_cache_dtype)
-    for name in past_input_names:
-        past[name] = empty_tensor.copy()
-    return past
-def _prepare_inputs(
-    input_ids: np.ndarray,
-    attention_mask: np.ndarray,
-    past_kv: Dict[str, np.ndarray],
-    position_ids: Optional[np.ndarray] = None,
-) -> Dict[str, np.ndarray]:
-    """Build input feed dictionary with all required tensors."""
-    feed = {
-        "input_ids": input_ids.astype(np.int64),
-        "attention_mask": attention_mask.astype(np.int64),
-    }
-    for name, tensor in past_kv.items():
-        feed[name] = tensor
-    if has_position_ids and position_ids is not None:
-        feed["position_ids"] = position_ids.astype(np.int64)
-    if has_num_logits_input:
-        feed["num_logits_to_keep"] = np.array(1, dtype=np.int64)
-    return feed
-def _generate_full_sync(
-    prompt: str,
-    max_new_tokens: int,
-    temperature: float,
-    top_p: float,
-    stop_sequences: Optional[List[str]] = None,
-) -> str:
-    if ort_session is None or tokenizer is None:
         raise HTTPException(status_code=503, detail="Model not loaded")
-    input_ids = tokenizer.encode(prompt, return_tensors="np").astype(np.int64)
-    attention_mask = np.ones_like(input_ids, dtype=np.int64)
-    seq_len = input_ids.shape[1]
-    past_kv = _init_past_key_values(batch_size=1)
-    generated_tokens = []
-    stop_sequences = stop_sequences or []
-    eos_token_id = tokenizer.eos_token_id
-    # --- OPTIMIZATION 2: Use IOBinding to avoid copying KV cache ---
-    # Create an IOBinding object to bind inputs and outputs to device memory
-    io_binding = ort_session.io_binding()
-    # Prefill step
-    position_ids = np.arange(seq_len, dtype=np.int64).reshape(1, -1) if has_position_ids else None
-    feed = _prepare_inputs(input_ids, attention_mask, past_kv, position_ids)
-    # Bind inputs
-    for name, tensor in feed.items():
-        io_binding.bind_cpu_input(name, tensor)
-    # Bind outputs
-    for output in ort_session.get_outputs():
-        io_binding.bind_output(output.name)
-    ort_session.run_with_iobinding(io_binding)
-    outputs = io_binding.copy_outputs_to_cpu()
-    logits = outputs[0][:, -1, :]
-    next_token = _sample_token(logits[0], temperature, top_p)
-    generated_tokens.append(next_token)
-    past_kv_outputs = outputs[1:]
-    past_kv = dict(zip(past_input_names, past_kv_outputs))
-    for _ in range(1, max_new_tokens):
-        last_token = np.array([[next_token]], dtype=np.int64)
-        attention_mask = np.ones((1, seq_len + 1), dtype=np.int64)
-        position_ids = np.array([[seq_len]], dtype=np.int64) if has_position_ids else None
-        seq_len += 1
-        feed = _prepare_inputs(last_token, attention_mask, past_kv, position_ids)
-        # Bind inputs for the next token
-        io_binding.clear_binding_inputs()
-        for name, tensor in feed.items():
-            io_binding.bind_cpu_input(name, tensor)
-        # Bind outputs
-        io_binding.clear_binding_outputs()
-        for output in ort_session.get_outputs():
-            io_binding.bind_output(output.name)
-        ort_session.run_with_iobinding(io_binding)
-        outputs = io_binding.copy_outputs_to_cpu()
-        logits = outputs[0][:, -1, :]
-        next_token = _sample_token(logits[0], temperature, top_p)
-        generated_tokens.append(next_token)
-        past_kv_outputs = outputs[1:]
-        past_kv = dict(zip(past_input_names, past_kv_outputs))
-        if next_token == eos_token_id:
-            break
-        partial_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-        for stop_seq in stop_sequences:
-            if stop_seq in partial_text:
-                return partial_text.split(stop_seq)[0].strip()
-    full_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-    return full_text.strip()
-async def _generate_full(
-    prompt: str,
-    max_new_tokens: int,
-    temperature: float,
-    top_p: float,
-    stop_sequences: Optional[List[str]] = None,
-) -> str:
     return await asyncio.to_thread(
-        _generate_full_sync,
-        prompt, max_new_tokens, temperature, top_p, stop_sequences
     )
-async def _generate_stream(
-    prompt: str,
-    max_new_tokens: int,
-    temperature: float,
-    top_p: float,
-    stop_sequences: Optional[List[str]] = None,
-):
-    if ort_session is None or tokenizer is None:
         raise HTTPException(status_code=503, detail="Model not loaded")
-    input_ids = tokenizer.encode(prompt, return_tensors="np").astype(np.int64)
-    attention_mask = np.ones_like(input_ids, dtype=np.int64)
-    seq_len = input_ids.shape[1]
-    past_kv = _init_past_key_values(batch_size=1)
-    generated_tokens = []
-    stop_sequences = stop_sequences or []
-    eos_token_id = tokenizer.eos_token_id
-    # Use IOBinding for prefill
-    io_binding = ort_session.io_binding()
-    def prefill_step():
-        position_ids = np.arange(seq_len, dtype=np.int64).reshape(1, -1) if has_position_ids else None
-        feed = _prepare_inputs(input_ids, attention_mask, past_kv, position_ids)
-        io_binding.clear_binding_inputs()
-        io_binding.clear_binding_outputs()
-        for name, tensor in feed.items():
-            io_binding.bind_cpu_input(name, tensor)
-        for output in ort_session.get_outputs():
-            io_binding.bind_output(output.name)
-        ort_session.run_with_iobinding(io_binding)
-        return io_binding.copy_outputs_to_cpu()
-    outputs = await asyncio.to_thread(prefill_step)
-    logits = outputs[0][:, -1, :]
-    next_token = _sample_token(logits[0], temperature, top_p)
-    generated_tokens.append(next_token)
-    past_kv_outputs = outputs[1:]
-    past_kv = dict(zip(past_input_names, past_kv_outputs))
-    new_text = tokenizer.decode([next_token], skip_special_tokens=True)
-    if new_text:
-        yield new_text
-    for _ in range(1, max_new_tokens):
-        last_token = np.array([[next_token]], dtype=np.int64)
-        attention_mask = np.ones((1, seq_len + 1), dtype=np.int64)
-        position_ids = np.array([[seq_len]], dtype=np.int64) if has_position_ids else None
-        seq_len += 1
-        def step():
-            feed = _prepare_inputs(last_token, attention_mask, past_kv, position_ids)
-            io_binding.clear_binding_inputs()
-            io_binding.clear_binding_outputs()
-            for name, tensor in feed.items():
-                io_binding.bind_cpu_input(name, tensor)
-            for output in ort_session.get_outputs():
-                io_binding.bind_output(output.name)
-            ort_session.run_with_iobinding(io_binding)
-            return io_binding.copy_outputs_to_cpu()
-        outputs = await asyncio.to_thread(step)
-        logits = outputs[0][:, -1, :]
-        next_token = _sample_token(logits[0], temperature, top_p)
-        generated_tokens.append(next_token)
-        past_kv_outputs = outputs[1:]
-        past_kv = dict(zip(past_input_names, past_kv_outputs))
-        new_text = tokenizer.decode([next_token], skip_special_tokens=True)
-        if new_text:
-            yield new_text
-        full_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-        for stop_seq in stop_sequences:
-            if stop_seq in full_text:
-                return
-        if next_token == eos_token_id:
-            break
-# ---------- FastAPI App ----------
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     try:
@@ -465,14 +203,13 @@ async def lifespan(app: FastAPI):
     except Exception as e:
         logger.error(f"Startup model load failed: {e}")
     yield
-    global tokenizer, ort_session
-    tokenizer = None
-    ort_session = None
 app = FastAPI(
-    title="Bonsai ONNX Inference API",
-    version="1.0.0",
-    description="Fast, production-ready inference for 1-bit Bonsai LLMs using ONNX Runtime.",
     docs_url="/docs",
     redoc_url="/redoc",
     lifespan=lifespan,
@@ -516,17 +253,16 @@ async def generic_exception_handler(request, exc):
 @app.get("/", summary="Root")
 def root():
-    return {"message": "Bonsai ONNX API is running", "docs": "/docs"}
 @app.get("/health", summary="Health check")
 def health():
-    loaded = tokenizer is not None and ort_session is not None
     return {
         "status": "ok" if loaded else "degraded",
         "model_loaded": loaded,
         "model_id": MODEL_ID,
-        "quantization": MODEL_QUANTIZATION,
-        "device": _model_device(),
         "error": model_load_error if model_load_error else None,
     }
@@ -534,9 +270,10 @@ def health():
 def model_info():
     return ModelInfo(
         model_id=MODEL_ID,
-        quantization=MODEL_QUANTIZATION,
-        onnx_model_file=ONNX_MODEL_FILE,
-        device=_model_device(),
     )
 @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
@@ -564,9 +301,9 @@ async def chat_completions(req: ChatCompletionRequest):
         text = await _generate_full(prompt, req.max_tokens, req.temperature, req.top_p, stop_seq)
         assistant_msg = Message(role="assistant", content=text)
         usage = Usage(
-            prompt_tokens=_count_tokens(prompt),
-            completion_tokens=_count_tokens(text),
-            total_tokens=_count_tokens(prompt) + _count_tokens(text),
         )
         return ChatCompletionResponse(
             id=f"chatcmpl-{uuid.uuid4().hex[:12]}",

 import time
 import uuid
 from contextlib import asynccontextmanager
+from typing import List, Optional, Union
 from fastapi import FastAPI, HTTPException, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
+from huggingface_hub import hf_hub_download
 from pydantic import BaseModel, Field, ValidationError
+# NEW: Import llama.cpp
+from llama_cpp import Llama
 # ---------- Configuration ----------
+# You can now use GGUF models for even faster inference!
+# These are specifically optimized by the PrismML team.
+MODEL_ID = os.getenv("MODEL_ID", "prism-ml/Bonsai-1.7B-gguf")
+MODEL_FILENAME = os.getenv("MODEL_FILENAME", "Bonsai-1.7B-v1.0-Q1_0.gguf")
+# Quantization types in GGUF: Q1_0 is for 1-bit models.
+# For 8B, use MODEL_ID="prism-ml/Bonsai-8B-gguf" and MODEL_FILENAME="Bonsai-8B-v1.0-Q1_0.gguf"
 HF_TOKEN = os.getenv("HF_TOKEN")
+LOCAL_MODEL_DIR = os.getenv("LOCAL_MODEL_DIR", "/data/models")
 MAX_NEW_TOKENS_DEFAULT = int(os.getenv("MAX_NEW_TOKENS_DEFAULT", "256"))
 API_KEY = os.getenv("API_KEY", None)
+# Performance settings for CPU inference
+N_CTX = int(os.getenv("N_CTX", "4096"))  # Context window
+N_THREADS = int(os.getenv("N_THREADS", "4")) # Number of CPU threads to use
+N_BATCH = int(os.getenv("N_BATCH", "512")) # Batch size for prompt processing
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("uvicorn.error")
+# ---------- Pydantic Models (Same as before) ----------
 class Message(BaseModel):
     role: str = Field(..., pattern="^(system|user|assistant)$")
     content: str
 class ModelInfo(BaseModel):
     model_id: str
+    filename: str
     device: str
+    n_ctx: int
+    n_threads: int
 class ErrorResponse(BaseModel):
     error: str
     detail: Optional[str] = None
 # ---------- Global State ----------
+llm = None
 model_load_error = None
 MODEL_LOCK = asyncio.Lock()
 # ---------- Helper Functions ----------
 def _verify_api_key(request: Request) -> None:
     if API_KEY is None:
     if not auth or auth != API_KEY:
         raise HTTPException(status_code=401, detail="Invalid or missing API key")
def _download_model() -> str:
    """Return a local path to the configured GGUF model file.

    If ``MODEL_FILENAME`` already exists as a file under ``LOCAL_MODEL_DIR``
    it is reused; otherwise it is fetched from the ``MODEL_ID`` repository
    with ``hf_hub_download``.

    Returns:
        The filesystem path of the model file.

    Raises:
        RuntimeError: if the download fails (the original exception is
            chained as the cause).
    """
    os.makedirs(LOCAL_MODEL_DIR, exist_ok=True)
    local_path = os.path.join(LOCAL_MODEL_DIR, MODEL_FILENAME)
    # isfile (not exists): a stray directory with this name is not a model.
    if os.path.isfile(local_path):
        logger.info(f"Model already downloaded at {local_path}")
        return local_path
    logger.info(f"Downloading model {MODEL_ID}/{MODEL_FILENAME}...")
    try:
        # Trust the path reported by the hub client rather than rebuilding it.
        downloaded_path = hf_hub_download(
            repo_id=MODEL_ID,
            filename=MODEL_FILENAME,
            local_dir=LOCAL_MODEL_DIR,
            token=HF_TOKEN,
        )
    except Exception as e:
        logger.error(f"Model download failed: {e}")
        raise RuntimeError(f"Failed to download model: {str(e)}") from e
    logger.info("Model downloaded successfully.")
    return downloaded_path
 async def _ensure_loaded():
     """Load the llama.cpp model into the global ``llm`` if it is not loaded yet.

     The whole check-and-load sequence runs under ``MODEL_LOCK``. A failed
     load is remembered in ``model_load_error`` and is not retried.

     Raises:
         HTTPException: 503 when a previous load attempt failed, or when the
             current attempt fails.
     """
     global llm, model_load_error
     async with MODEL_LOCK:
         if llm is not None:
             return
         if model_load_error:
             raise HTTPException(status_code=503, detail=f"Model failed to load: {model_load_error}")
         try:
             # Download and model construction are blocking calls; run them in
             # a worker thread so the event loop keeps serving other requests.
             model_path = await asyncio.to_thread(_download_model)
             llm = await asyncio.to_thread(
                 Llama,
                 model_path=model_path,
                 n_ctx=N_CTX,         # Context window
                 n_threads=N_THREADS, # Number of CPU threads
                 n_batch=N_BATCH,     # Batch size for prompt processing
                 verbose=False,
             )
             logger.info(f"Model loaded successfully: {MODEL_ID} ({MODEL_FILENAME})")
             logger.info(f"Context: {N_CTX}, Threads: {N_THREADS}, Batch: {N_BATCH}")
         except Exception as e:
             model_load_error = str(e)
             logger.exception("Model loading failed")
             raise HTTPException(status_code=503, detail=f"Model unavailable: {model_load_error}") from e
 def _build_chat_prompt(messages: List[Message]) -> List[dict]:
     """Convert request messages into the plain-dict form passed to llama.cpp.

     No template is applied here; the returned list is handed to
     ``create_chat_completion`` as its ``messages`` argument by the
     generation helpers.

     Args:
         messages: the chat messages of the incoming request.

     Returns:
         One ``{"role": ..., "content": ...}`` dict per message, in order.

     Raises:
         HTTPException: 503 when the model has not been loaded.
     """
     if llm is None:
         raise HTTPException(status_code=503, detail="Model not loaded")
     return [{"role": msg.role, "content": msg.content} for msg in messages]
async def _generate_full(prompt: list, max_new_tokens: int, temperature: float, top_p: float, stop_sequences: Optional[List[str]] = None) -> str:
    """Produce one complete (non-streamed) chat completion.

    Args:
        prompt: message dicts forwarded as ``messages``.
        max_new_tokens: forwarded as ``max_tokens``.
        temperature: sampling temperature.
        top_p: nucleus-sampling threshold.
        stop_sequences: optional stop strings, forwarded as ``stop``.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        HTTPException: 503 when the model has not been loaded.
    """
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    def _complete():
        # Blocking llama.cpp call; executed off the event loop below.
        response = llm.create_chat_completion(
            messages=prompt,
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            stop=stop_sequences,
            stream=False,
        )
        first_choice = response["choices"][0]
        return first_choice["message"]["content"]

    return await asyncio.to_thread(_complete)
async def _generate_stream(prompt: list, max_new_tokens: int, temperature: float, top_p: float, stop_sequences: Optional[List[str]] = None):
    """Yield completion text pieces as llama.cpp produces them.

    The blocking streaming call runs in a worker thread and hands each piece
    to the event loop through a queue, so pieces are yielded as they are
    generated instead of after the full completion has been collected.

    Args:
        prompt: message dicts forwarded as ``messages``.
        max_new_tokens: forwarded as ``max_tokens``.
        temperature: sampling temperature.
        top_p: nucleus-sampling threshold.
        stop_sequences: optional stop strings, forwarded as ``stop``.

    Yields:
        The ``content`` value of every streamed delta that carries one.

    Raises:
        HTTPException: 503 when the model has not been loaded.
        Exception: any error raised by the llama.cpp call is re-raised here.
    """
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    loop = asyncio.get_running_loop()
    queue = asyncio.Queue()
    finished = object()  # sentinel marking the end of the stream
    cancelled = False    # set when the consumer stops reading early

    def produce():
        # Runs in a worker thread; asyncio.Queue is not thread-safe, so every
        # put is scheduled onto the loop with call_soon_threadsafe.
        try:
            for chunk in llm.create_chat_completion(
                messages=prompt,
                max_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                stop=stop_sequences,
                stream=True,
            ):
                if cancelled:
                    break
                delta = chunk["choices"][0]["delta"]
                if "content" in delta:
                    loop.call_soon_threadsafe(queue.put_nowait, delta["content"])
        except Exception as exc:
            loop.call_soon_threadsafe(queue.put_nowait, exc)
        finally:
            loop.call_soon_threadsafe(queue.put_nowait, finished)

    worker = asyncio.create_task(asyncio.to_thread(produce))
    try:
        while True:
            item = await queue.get()
            if item is finished:
                break
            if isinstance(item, Exception):
                raise item
            yield item
    finally:
        # Tell the producer to stop at its next chunk, then wait for it so the
        # thread never outlives this generator unnoticed.
        cancelled = True
        await worker
+# ---------- FastAPI App (Same structure) ----------
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     try:
     except Exception as e:
         logger.error(f"Startup model load failed: {e}")
     yield
+    global llm
+    llm = None
 app = FastAPI(
+    title="Bonsai CPU-Optimized Inference API",
+    version="2.0.0",
+    description="Lightning-fast inference for 1-bit Bonsai LLMs using llama.cpp.",
     docs_url="/docs",
     redoc_url="/redoc",
     lifespan=lifespan,
 @app.get("/", summary="Root")
 def root():
     """Return a short liveness message together with the docs path."""
     payload = {"message": "Bonsai CPU API is running", "docs": "/docs"}
     return payload
 @app.get("/health", summary="Health check")
 def health():
     """Report whether the model is loaded, plus the last load error if any.

     ``status`` is ``"ok"`` when the global ``llm`` is set and ``"degraded"``
     otherwise.
     """
     is_loaded = llm is not None
     status = "ok" if is_loaded else "degraded"
     # A falsy error value is reported as None.
     last_error = model_load_error or None
     return {
         "status": status,
         "model_loaded": is_loaded,
         "model_id": MODEL_ID,
         "filename": MODEL_FILENAME,
         "error": last_error,
     }
 def model_info():
     """Describe the configured model: repository, file, device and CPU settings."""
     info = ModelInfo(
         model_id=MODEL_ID,
         filename=MODEL_FILENAME,
         device="CPU",
         n_ctx=N_CTX,
         n_threads=N_THREADS,
     )
     return info
 @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
         text = await _generate_full(prompt, req.max_tokens, req.temperature, req.top_p, stop_seq)
         assistant_msg = Message(role="assistant", content=text)
         usage = Usage(
+            prompt_tokens=0, # llama.cpp can return this, but we can omit for simplicity
+            completion_tokens=0,
+            total_tokens=0,
         )
         return ChatCompletionResponse(
             id=f"chatcmpl-{uuid.uuid4().hex[:12]}",