github-actions committed
Commit: 5bafa93
Parent(s): 952058d

fix: add /v1/chat/completions endpoint, resolving the LLMPlannerAdapter 404

Files changed: src/inference/api_server.py (+71 -0)
src/inference/api_server.py
CHANGED
@@ -1376,6 +1376,77 @@ async def generate(
     )
 
 
+@app.post("/v1/chat/completions")
+async def chat_completions(
+    request: Request,
+    _: None = Depends(verify_api_key),
+):
+    """OpenAI-compatible /v1/chat/completions, intended for LLMPlannerAdapter(ChatOpenAI) only.
+
+    Calls vLLM AsyncLLM directly and generates from a prompt built by simply concatenating the messages, without a chat template.
+    Tool calling / function calling is not supported.
+    """
+    body = await request.json()
+    messages: list[dict] = body.get("messages", [])
+    max_tokens: int = int(body.get("max_tokens", 512))
+    temperature: float = float(body.get("temperature", 0.7))
+    model: str = body.get("model", runtime_config.model.model_path)
+
+    # Convert messages to a prompt (EXAONE chat template format)
+    prompt_parts: list[str] = []
+    for msg in messages:
+        role = msg.get("role", "user")
+        content = msg.get("content", "")
+        if role == "system":
+            prompt_parts.append(f"[|system|]{content}[|endofturn|]")
+        elif role == "user":
+            prompt_parts.append(f"[|user|]{content}[|endofturn|]")
+        elif role == "assistant":
+            prompt_parts.append(f"[|assistant|]{content}[|endofturn|]")
+    prompt_parts.append("[|assistant|]")
+    prompt = "\n".join(prompt_parts)
+
+    if manager.engine is None:
+        raise HTTPException(status_code=503, detail="Model engine not initialized.")
+
+    request_id = str(uuid.uuid4())
+    sampling_params = SamplingParams(
+        max_tokens=max_tokens,
+        temperature=temperature,
+        stop=["[|endofturn|]"],
+    )
+
+    try:
+        final_output = await manager._run_engine(prompt, sampling_params, request_id)
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=str(exc))
+
+    if final_output is None:
+        raise HTTPException(status_code=500, detail="Generation failed.")
+
+    text = manager._strip_thought_blocks(final_output.outputs[0].text)
+    prompt_tokens = len(final_output.prompt_token_ids)
+    completion_tokens = len(final_output.outputs[0].token_ids)
+
+    return {
+        "id": f"chatcmpl-{request_id}",
+        "object": "chat.completion",
+        "model": model,
+        "choices": [
+            {
+                "index": 0,
+                "message": {"role": "assistant", "content": text},
+                "finish_reason": "stop",
+            }
+        ],
+        "usage": {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
+    }
+
+
 @app.post("/v1/stream")
 @_rate_limit("30/minute")
 async def stream_generate(
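For quick verification of the new contract, a minimal client-side sketch follows. The request fields (messages, max_tokens, temperature, model) and the response shape are exactly what the handler above reads and returns; the host/port and the auth header are assumptions, since verify_api_key is not shown in this diff.

# Minimal smoke test against the new endpoint (payload and response fields
# taken from the handler above; base URL and auth header are placeholders).
import requests

BASE_URL = "http://localhost:8000"                       # assumed api_server.py address
HEADERS = {"Authorization": "Bearer <API_KEY>"}          # placeholder; match what verify_api_key expects

payload = {
    "model": "exaone",                                   # optional; server falls back to runtime_config.model.model_path
    "messages": [
        {"role": "system", "content": "You are a planning assistant."},
        {"role": "user", "content": "Plan the next step."},
    ],
    "max_tokens": 256,
    "temperature": 0.2,
}

resp = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, headers=HEADERS, timeout=120)
resp.raise_for_status()
data = resp.json()

# The response follows the chat.completion shape built in the handler:
# a single choice plus prompt/completion token usage from the vLLM output.
print(data["choices"][0]["message"]["content"])
print(data["usage"]["total_tokens"])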
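As for the client wiring that motivated the commit: assuming LLMPlannerAdapter wraps LangChain's ChatOpenAI (as the docstring suggests), pointing its base_url at this server's /v1 prefix makes the adapter's request land on the endpoint added above instead of returning 404. Host, key, and model name below are deployment-specific placeholders.

# Hypothetical LLMPlannerAdapter-style wiring: ChatOpenAI posting to the new
# /v1/chat/completions route. Only the endpoint path comes from this commit;
# base_url, api_key, and model are placeholders.
from langchain_openai import ChatOpenAI

planner_llm = ChatOpenAI(
    base_url="http://localhost:8000/v1",   # /v1 prefix so the client resolves .../chat/completions
    api_key="<API_KEY>",
    model="exaone",
    temperature=0.2,
    max_tokens=256,
)

print(planner_llm.invoke("Plan the next step.").content)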