AnatoliiG committed on
Commit
2d1f66e
·
1 Parent(s): 17af010

multithread

Browse files
Files changed (2) hide show
  1. src/api/routes.py +10 -10
  2. src/core/config.py +3 -1
src/api/routes.py CHANGED
@@ -106,13 +106,13 @@ async def chat_completions(request: Request):
106
  if stream:
107
  return StreamingResponse(stream_generator(), media_type="text/event-stream")
108
 
109
- # For non-streaming responses: keep sequential processing but run blocking work in a thread
110
- async with engine.lock:
111
- result = await asyncio.to_thread(
112
- engine.generate,
113
- messages,
114
- data.get("max_tokens", settings.DEFAULT_MAX_TOKENS),
115
- data.get("temperature", settings.DEFAULT_TEMP),
116
- stream=False,
117
- )
118
- return result
 
106
  if stream:
107
  return StreamingResponse(stream_generator(), media_type="text/event-stream")
108
 
109
+ else:
110
+ async with engine.lock:
111
+ result = await asyncio.to_thread(
112
+ engine.generate,
113
+ messages,
114
+ data.get("max_tokens", settings.DEFAULT_MAX_TOKENS),
115
+ data.get("temperature", settings.DEFAULT_TEMP),
116
+ stream=False,
117
+ )
118
+ return result
src/core/config.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  from pydantic_settings import BaseSettings
2
 
3
 
@@ -8,7 +10,7 @@ class Settings(BaseSettings):
8
  CONTEXT_SIZE: int = 8192
9
  DEFAULT_MAX_TOKENS: int = 4096
10
  DEFAULT_TEMP: float = 0.4
11
- N_THREADS: int = 2
12
  N_GPU_LAYERS: int = 0
13
 
14
 
 
1
+ import os
2
+
3
  from pydantic_settings import BaseSettings
4
 
5
 
 
10
  CONTEXT_SIZE: int = 8192
11
  DEFAULT_MAX_TOKENS: int = 4096
12
  DEFAULT_TEMP: float = 0.4
13
+ N_THREADS: int = os.cpu_count()
14
  N_GPU_LAYERS: int = 0
15
 
16