Spaces:

Vishinka
/

Code_LLM

Sleeping

AnatoliiG committed on Jan 19

Commit

2d1f66e

1 Parent(s): 17af010

multithread

Files changed (2) hide show

src/api/routes.py CHANGED Viewed

@@ -106,13 +106,13 @@ async def chat_completions(request: Request):
     if stream:
         return StreamingResponse(stream_generator(), media_type="text/event-stream")
-    # For non-streaming responses: keep sequential processing but run blocking work in a thread
-    async with engine.lock:
-        result = await asyncio.to_thread(
-            engine.generate,
-            messages,
-            data.get("max_tokens", settings.DEFAULT_MAX_TOKENS),
-            data.get("temperature", settings.DEFAULT_TEMP),
-            stream=False,
-        )
-        return result

     if stream:
         return StreamingResponse(stream_generator(), media_type="text/event-stream")
+    else:
+        async with engine.lock:
+            result = await asyncio.to_thread(
+                engine.generate,
+                messages,
+                data.get("max_tokens", settings.DEFAULT_MAX_TOKENS),
+                data.get("temperature", settings.DEFAULT_TEMP),
+                stream=False,
+            )
+            return result

src/core/config.py CHANGED Viewed

@@ -1,3 +1,5 @@
 from pydantic_settings import BaseSettings
@@ -8,7 +10,7 @@ class Settings(BaseSettings):
     CONTEXT_SIZE: int = 8192
     DEFAULT_MAX_TOKENS: int = 4096
     DEFAULT_TEMP: float = 0.4
-    N_THREADS: int = 2
     N_GPU_LAYERS: int = 0

+import os
 from pydantic_settings import BaseSettings
     CONTEXT_SIZE: int = 8192
     DEFAULT_MAX_TOKENS: int = 4096
     DEFAULT_TEMP: float = 0.4
+    N_THREADS: int = os.cpu_count()
     N_GPU_LAYERS: int = 0