Spaces:

sidmaz666
/

bonsaiapi

Paused

App Files Files Community

sidmaz666 committed on Apr 20

Commit

28340f8

verified ·

1 Parent(s): 4862316

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -57

app.py CHANGED Viewed

@@ -14,33 +14,25 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
 from huggingface_hub import hf_hub_download
 from pydantic import BaseModel, Field, ValidationError
-# NEW: Import llama.cpp
 from llama_cpp import Llama
 # ---------- Configuration ----------
-# You can now use GGUF models for even faster inference!
-# These are specifically optimized by the PrismML team.
 MODEL_ID = os.getenv("MODEL_ID", "prism-ml/Bonsai-1.7B-gguf")
-MODEL_FILENAME = os.getenv("MODEL_FILENAME", "Bonsai-1.7B-v1.0-Q1_0.gguf")
-# Quantization types in GGUF: Q1_0 is for 1-bit models.
-# For 8B, use MODEL_ID="prism-ml/Bonsai-8B-gguf" and MODEL_FILENAME="Bonsai-8B-v1.0-Q1_0.gguf"
 HF_TOKEN = os.getenv("HF_TOKEN")
 LOCAL_MODEL_DIR = os.getenv("LOCAL_MODEL_DIR", "/data/models")
 MAX_NEW_TOKENS_DEFAULT = int(os.getenv("MAX_NEW_TOKENS_DEFAULT", "256"))
 API_KEY = os.getenv("API_KEY", None)
-# Performance settings for CPU inference
-N_CTX = int(os.getenv("N_CTX", "4096"))  # Context window
-N_THREADS = int(os.getenv("N_THREADS", "4")) # Number of CPU threads to use
-N_BATCH = int(os.getenv("N_BATCH", "512")) # Batch size for prompt processing
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("uvicorn.error")
-# ---------- Pydantic Models (Same as before) ----------
 class Message(BaseModel):
     role: str = Field(..., pattern="^(system|user|assistant)$")
     content: str
@@ -127,12 +119,11 @@ async def _ensure_loaded():
             raise HTTPException(status_code=503, detail=f"Model failed to load: {model_load_error}")
         try:
             model_path = _download_model()
-            # Load the model with CPU-optimized settings
             llm = Llama(
                 model_path=model_path,
-                n_ctx=N_CTX,         # Context window
-                n_threads=N_THREADS, # Number of CPU threads
-                n_batch=N_BATCH,     # Batch size for prompt processing
                 verbose=False,
             )
             logger.info(f"Model loaded successfully: {MODEL_ID} ({MODEL_FILENAME})")
@@ -142,21 +133,13 @@ async def _ensure_loaded():
             logger.exception("Model loading failed")
             raise HTTPException(status_code=503, detail=f"Model unavailable: {model_load_error}")
-def _build_chat_prompt(messages: List[Message]) -> str:
-    # llama.cpp handles chat templates automatically, so we can just pass the messages directly.
-    # This is for compatibility; the actual formatting is done by llama.cpp.
-    if llm is None:
-        raise HTTPException(status_code=503, detail="Model not loaded")
-    # The create_chat_completion method expects a list of messages in this format
     return [{"role": msg.role, "content": msg.content} for msg in messages]
 async def _generate_full(prompt: list, max_new_tokens: int, temperature: float, top_p: float, stop_sequences: Optional[List[str]] = None) -> str:
     if llm is None:
         raise HTTPException(status_code=503, detail="Model not loaded")
-    # Run the blocking llama.cpp call in a thread
-    return await asyncio.to_thread(
         lambda: llm.create_chat_completion(
             messages=prompt,
             max_tokens=max_new_tokens,
@@ -164,15 +147,14 @@ async def _generate_full(prompt: list, max_new_tokens: int, temperature: float,
             top_p=top_p,
             stop=stop_sequences,
             stream=False,
-        )["choices"][0]["message"]["content"]
     )
 async def _generate_stream(prompt: list, max_new_tokens: int, temperature: float, top_p: float, stop_sequences: Optional[List[str]] = None):
     if llm is None:
         raise HTTPException(status_code=503, detail="Model not loaded")
-    # llama.cpp can yield a Python generator. We'll run it in a thread and yield the results.
-    def generator():
         for chunk in llm.create_chat_completion(
             messages=prompt,
             max_tokens=max_new_tokens,
@@ -183,18 +165,12 @@ async def _generate_stream(prompt: list, max_new_tokens: int, temperature: float
         ):
             if "content" in chunk["choices"][0]["delta"]:
                 yield chunk["choices"][0]["delta"]["content"]
-    # We need a helper to bridge the sync generator to an async one
-    def sync_generator():
-        for item in generator():
-            yield item
-    # Run the sync generator in a thread and yield items as they come
-    for item in await asyncio.to_thread(list, sync_generator()):
-        yield item
-        await asyncio.sleep(0) # Yield control to the event loop
-# ---------- FastAPI App (Same structure) ----------
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     try:
@@ -233,14 +209,14 @@ async def auth_middleware(request: Request, call_next):
 async def http_exception_handler(request, exc):
     return JSONResponse(
         status_code=exc.status_code,
-        content=ErrorResponse(error=exc.detail, detail=str(exc.detail)).dict(),
     )
 @app.exception_handler(ValidationError)
 async def validation_exception_handler(request, exc):
     return JSONResponse(
         status_code=422,
-        content=ErrorResponse(error="Validation error", detail=str(exc)).dict(),
     )
 @app.exception_handler(Exception)
@@ -248,7 +224,7 @@ async def generic_exception_handler(request, exc):
     logger.exception("Unhandled exception")
     return JSONResponse(
         status_code=500,
-        content=ErrorResponse(error="Internal server error", detail=str(exc)).dict(),
     )
 @app.get("/", summary="Root")
@@ -279,12 +255,7 @@ def model_info():
 @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
 async def chat_completions(req: ChatCompletionRequest):
     await _ensure_loaded()
-    try:
-        prompt = _build_chat_prompt(req.messages)
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Prompt formatting error: {str(e)}")
     stop_seq = req.stop if isinstance(req.stop, list) else ([req.stop] if req.stop else None)
     if req.stream:
@@ -300,11 +271,7 @@ async def chat_completions(req: ChatCompletionRequest):
     else:
         text = await _generate_full(prompt, req.max_tokens, req.temperature, req.top_p, stop_seq)
         assistant_msg = Message(role="assistant", content=text)
-        usage = Usage(
-            prompt_tokens=0, # llama.cpp can return this, but we can omit for simplicity
-            completion_tokens=0,
-            total_tokens=0,
-        )
         return ChatCompletionResponse(
             id=f"chatcmpl-{uuid.uuid4().hex[:12]}",
             created=int(time.time()),

 from fastapi.responses import JSONResponse, StreamingResponse
 from huggingface_hub import hf_hub_download
 from pydantic import BaseModel, Field, ValidationError
 from llama_cpp import Llama
 # ---------- Configuration ----------
 MODEL_ID = os.getenv("MODEL_ID", "prism-ml/Bonsai-1.7B-gguf")
+MODEL_FILENAME = os.getenv("MODEL_FILENAME", "Bonsai-1.7B-Q1_0.gguf")
 HF_TOKEN = os.getenv("HF_TOKEN")
 LOCAL_MODEL_DIR = os.getenv("LOCAL_MODEL_DIR", "/data/models")
 MAX_NEW_TOKENS_DEFAULT = int(os.getenv("MAX_NEW_TOKENS_DEFAULT", "256"))
 API_KEY = os.getenv("API_KEY", None)
+# Performance settings
+N_CTX = int(os.getenv("N_CTX", "4096"))
+N_THREADS = int(os.getenv("N_THREADS", "4"))
+N_BATCH = int(os.getenv("N_BATCH", "512"))
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("uvicorn.error")
+# ---------- Pydantic Models ----------
 class Message(BaseModel):
     role: str = Field(..., pattern="^(system|user|assistant)$")
     content: str
             raise HTTPException(status_code=503, detail=f"Model failed to load: {model_load_error}")
         try:
             model_path = _download_model()
             llm = Llama(
                 model_path=model_path,
+                n_ctx=N_CTX,
+                n_threads=N_THREADS,
+                n_batch=N_BATCH,
                 verbose=False,
             )
             logger.info(f"Model loaded successfully: {MODEL_ID} ({MODEL_FILENAME})")
             logger.exception("Model loading failed")
             raise HTTPException(status_code=503, detail=f"Model unavailable: {model_load_error}")
+def _build_chat_prompt(messages: List[Message]) -> list:
     return [{"role": msg.role, "content": msg.content} for msg in messages]
 async def _generate_full(prompt: list, max_new_tokens: int, temperature: float, top_p: float, stop_sequences: Optional[List[str]] = None) -> str:
     if llm is None:
         raise HTTPException(status_code=503, detail="Model not loaded")
+    result = await asyncio.to_thread(
         lambda: llm.create_chat_completion(
             messages=prompt,
             max_tokens=max_new_tokens,
             top_p=top_p,
             stop=stop_sequences,
             stream=False,
+        )
     )
+    return result["choices"][0]["message"]["content"]
 async def _generate_stream(prompt: list, max_new_tokens: int, temperature: float, top_p: float, stop_sequences: Optional[List[str]] = None):
     if llm is None:
         raise HTTPException(status_code=503, detail="Model not loaded")
+    def sync_gen():
         for chunk in llm.create_chat_completion(
             messages=prompt,
             max_tokens=max_new_tokens,
         ):
             if "content" in chunk["choices"][0]["delta"]:
                 yield chunk["choices"][0]["delta"]["content"]
+    # Bridge the sync generator to async; list() buffers every token in a worker thread before the first yield
+    for token in await asyncio.to_thread(list, sync_gen()):
+        yield token
+        await asyncio.sleep(0)
+# ---------- FastAPI App ----------
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     try:
 async def http_exception_handler(request, exc):
     return JSONResponse(
         status_code=exc.status_code,
+        content=ErrorResponse(error=exc.detail, detail=str(exc.detail)).model_dump(),
     )
 @app.exception_handler(ValidationError)
 async def validation_exception_handler(request, exc):
     return JSONResponse(
         status_code=422,
+        content=ErrorResponse(error="Validation error", detail=str(exc)).model_dump(),
     )
 @app.exception_handler(Exception)
     logger.exception("Unhandled exception")
     return JSONResponse(
         status_code=500,
+        content=ErrorResponse(error="Internal server error", detail=str(exc)).model_dump(),
     )
 @app.get("/", summary="Root")
 @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
 async def chat_completions(req: ChatCompletionRequest):
     await _ensure_loaded()
+    prompt = _build_chat_prompt(req.messages)
     stop_seq = req.stop if isinstance(req.stop, list) else ([req.stop] if req.stop else None)
     if req.stream:
     else:
         text = await _generate_full(prompt, req.max_tokens, req.temperature, req.top_p, stop_seq)
         assistant_msg = Message(role="assistant", content=text)
+        usage = Usage(prompt_tokens=0, completion_tokens=0, total_tokens=0)
         return ChatCompletionResponse(
             id=f"chatcmpl-{uuid.uuid4().hex[:12]}",
             created=int(time.time()),