Spaces:
Running
Running
Dmitry Beresnev committed on
Commit ·
62a5a49
1
Parent(s): e1e4b82
fix context window size
Browse files
app.py
CHANGED
|
@@ -112,6 +112,7 @@ WEB_SEARCH_CACHE_TTL = int(os.getenv("WEB_SEARCH_CACHE_TTL", "3600")) # 1 hour
|
|
| 112 |
REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "300")) # 5 minutes
|
| 113 |
LOG_REQUEST_BODY = os.getenv("LOG_REQUEST_BODY", "1") == "1"
|
| 114 |
LOG_REQUEST_BODY_MAX_CHARS = int(os.getenv("LOG_REQUEST_BODY_MAX_CHARS", "2000"))
|
|
|
|
| 115 |
|
| 116 |
|
| 117 |
@dataclass
|
|
@@ -594,7 +595,7 @@ async def start_llama_server(model_id: str, port: int) -> tuple[subprocess.Popen
|
|
| 594 |
"-hf", model_id,
|
| 595 |
"--host", "0.0.0.0",
|
| 596 |
"--port", str(port),
|
| 597 |
-
"-c",
|
| 598 |
"-t", "4", # CPU threads
|
| 599 |
"-ngl", "0", # GPU layers (0 for CPU-only)
|
| 600 |
"--cont-batching", # Enable continuous batching
|
|
@@ -925,6 +926,9 @@ async def chat_completions(request: ChatCompletionRequest, raw_request: Request)
|
|
| 925 |
metrics.record_request(current_model, request_latency)
|
| 926 |
|
| 927 |
return result
|
|
|
|
|
|
|
|
|
|
| 928 |
except aiohttp.ClientError as e:
|
| 929 |
logger.exception(f"request_id={request_id} llama-server error")
|
| 930 |
raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
|
|
|
|
| 112 |
REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "300")) # 5 minutes
|
| 113 |
LOG_REQUEST_BODY = os.getenv("LOG_REQUEST_BODY", "1") == "1"
|
| 114 |
LOG_REQUEST_BODY_MAX_CHARS = int(os.getenv("LOG_REQUEST_BODY_MAX_CHARS", "2000"))
|
| 115 |
+
CONTEXT_SIZE = int(os.getenv("CONTEXT_SIZE", "2048"))
|
| 116 |
|
| 117 |
|
| 118 |
@dataclass
|
|
|
|
| 595 |
"-hf", model_id,
|
| 596 |
"--host", "0.0.0.0",
|
| 597 |
"--port", str(port),
|
| 598 |
+
"-c", str(CONTEXT_SIZE), # Context size
|
| 599 |
"-t", "4", # CPU threads
|
| 600 |
"-ngl", "0", # GPU layers (0 for CPU-only)
|
| 601 |
"--cont-batching", # Enable continuous batching
|
|
|
|
| 926 |
metrics.record_request(current_model, request_latency)
|
| 927 |
|
| 928 |
return result
|
| 929 |
+
except aiohttp.ClientResponseError as e:
|
| 930 |
+
logger.exception(f"request_id={request_id} llama-server error")
|
| 931 |
+
raise HTTPException(status_code=e.status, detail=f"llama-server error: {e.message}")
|
| 932 |
except aiohttp.ClientError as e:
|
| 933 |
logger.exception(f"request_id={request_id} llama-server error")
|
| 934 |
raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
|