Dmitry Beresnev committed on
Commit
62a5a49
·
1 Parent(s): e1e4b82

fix context window size

Browse files
Files changed (1) hide show
  1. app.py +5 -1
app.py CHANGED
@@ -112,6 +112,7 @@ WEB_SEARCH_CACHE_TTL = int(os.getenv("WEB_SEARCH_CACHE_TTL", "3600")) # 1 hour
112
  REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "300")) # 5 minutes
113
  LOG_REQUEST_BODY = os.getenv("LOG_REQUEST_BODY", "1") == "1"
114
  LOG_REQUEST_BODY_MAX_CHARS = int(os.getenv("LOG_REQUEST_BODY_MAX_CHARS", "2000"))
 
115
 
116
 
117
  @dataclass
@@ -594,7 +595,7 @@ async def start_llama_server(model_id: str, port: int) -> tuple[subprocess.Popen
594
  "-hf", model_id,
595
  "--host", "0.0.0.0",
596
  "--port", str(port),
597
- "-c", "2048", # Context size
598
  "-t", "4", # CPU threads
599
  "-ngl", "0", # GPU layers (0 for CPU-only)
600
  "--cont-batching", # Enable continuous batching
@@ -925,6 +926,9 @@ async def chat_completions(request: ChatCompletionRequest, raw_request: Request)
925
  metrics.record_request(current_model, request_latency)
926
 
927
  return result
 
 
 
928
  except aiohttp.ClientError as e:
929
  logger.exception(f"request_id={request_id} llama-server error")
930
  raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
 
112
  REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "300")) # 5 minutes
113
  LOG_REQUEST_BODY = os.getenv("LOG_REQUEST_BODY", "1") == "1"
114
  LOG_REQUEST_BODY_MAX_CHARS = int(os.getenv("LOG_REQUEST_BODY_MAX_CHARS", "2000"))
115
+ CONTEXT_SIZE = int(os.getenv("CONTEXT_SIZE", "2048"))
116
 
117
 
118
  @dataclass
 
595
  "-hf", model_id,
596
  "--host", "0.0.0.0",
597
  "--port", str(port),
598
+ "-c", str(CONTEXT_SIZE), # Context size
599
  "-t", "4", # CPU threads
600
  "-ngl", "0", # GPU layers (0 for CPU-only)
601
  "--cont-batching", # Enable continuous batching
 
926
  metrics.record_request(current_model, request_latency)
927
 
928
  return result
929
+ except aiohttp.ClientResponseError as e:
930
+ logger.exception(f"request_id={request_id} llama-server error")
931
+ raise HTTPException(status_code=e.status, detail=f"llama-server error: {e.message}")
932
  except aiohttp.ClientError as e:
933
  logger.exception(f"request_id={request_id} llama-server error")
934
  raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")