github-actions committed
Commit 5bafa93 · 1 parent: 952058d

fix: add /v1/chat/completions endpoint to resolve LLMPlannerAdapter 404

Files changed (1):
  1. src/inference/api_server.py  +71 −0
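
Background: LLMPlannerAdapter drives a ChatOpenAI client, which POSTs to <base_url>/chat/completions; since the API server did not expose that route, the planner received 404 responses. A minimal client-side sketch of how the adapter can now reach the new endpoint is shown below; the host/port, API-key handling, and model name are assumptions for illustration and are not part of this diff.

# Sketch: wiring LLMPlannerAdapter's ChatOpenAI client to the new route.
# localhost:8000, the Bearer-token style accepted by verify_api_key, and the
# model name are assumptions; "model" is echoed back by the server and defaults
# to runtime_config.model.model_path when omitted.
from langchain_openai import ChatOpenAI

planner_llm = ChatOpenAI(
    base_url="http://localhost:8000/v1",  # assumed server address
    api_key="YOUR_API_KEY",               # validated by verify_api_key
    model="exaone",                       # placeholder model name
    temperature=0.7,
    max_tokens=512,
)

print(planner_llm.invoke("Outline the next step for the current task.").content)
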
src/inference/api_server.py CHANGED
@@ -1376,6 +1376,77 @@ async def generate(
     )


+ @app.post("/v1/chat/completions")
1380
+ async def chat_completions(
1381
+ request: Request,
1382
+ _: None = Depends(verify_api_key),
1383
+ ):
1384
+ """OpenAI-compatible /v1/chat/completions — LLMPlannerAdapter(ChatOpenAI) 전용.
1385
+
1386
+ vLLM AsyncLLM을 직접 호출하여 chat template 없이 메시지를 단순 연결한 프롬프트로 생성한다.
1387
+ tool calling / function calling 은 지원하지 않는다.
1388
+ """
1389
+ body = await request.json()
1390
+ messages: list[dict] = body.get("messages", [])
1391
+ max_tokens: int = int(body.get("max_tokens", 512))
1392
+ temperature: float = float(body.get("temperature", 0.7))
1393
+ model: str = body.get("model", runtime_config.model.model_path)
1394
+
1395
+ # 메시지 → 프롬프트 변환 (EXAONE chat template 형식)
1396
+ prompt_parts: list[str] = []
1397
+ for msg in messages:
1398
+ role = msg.get("role", "user")
1399
+ content = msg.get("content", "")
1400
+ if role == "system":
1401
+ prompt_parts.append(f"[|system|]{content}[|endofturn|]")
1402
+ elif role == "user":
1403
+ prompt_parts.append(f"[|user|]{content}[|endofturn|]")
1404
+ elif role == "assistant":
1405
+ prompt_parts.append(f"[|assistant|]{content}[|endofturn|]")
1406
+ prompt_parts.append("[|assistant|]")
1407
+ prompt = "\n".join(prompt_parts)
+
+    if manager.engine is None:
+        raise HTTPException(status_code=503, detail="Model engine not initialized.")
+
+    request_id = str(uuid.uuid4())
+    sampling_params = SamplingParams(
+        max_tokens=max_tokens,
+        temperature=temperature,
+        stop=["[|endofturn|]"],
+    )
+
+    try:
+        final_output = await manager._run_engine(prompt, sampling_params, request_id)
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=str(exc))
+
+    if final_output is None:
+        raise HTTPException(status_code=500, detail="Generation failed.")
+
+    text = manager._strip_thought_blocks(final_output.outputs[0].text)
+    prompt_tokens = len(final_output.prompt_token_ids)
+    completion_tokens = len(final_output.outputs[0].token_ids)
+
+    return {
+        "id": f"chatcmpl-{request_id}",
+        "object": "chat.completion",
+        "model": model,
+        "choices": [
+            {
+                "index": 0,
+                "message": {"role": "assistant", "content": text},
+                "finish_reason": "stop",
+            }
+        ],
+        "usage": {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
+    }
+
+
 @app.post("/v1/stream")
 @_rate_limit("30/minute")
 async def stream_generate(
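
For reference, the manual message-to-prompt conversion in chat_completions produces an EXAONE-style turn sequence ending in an open assistant turn; generation then stops at the [|endofturn|] token via the SamplingParams stop list. The snippet below re-derives that prompt for a two-message conversation (illustration only; the example conversation is made up, and the actual handler also skips unrecognized roles):

# Re-deriving the prompt string the handler builds for a small conversation.
messages = [
    {"role": "system", "content": "You are a planning assistant."},
    {"role": "user", "content": "What should we do next?"},
]
parts = [f"[|{m['role']}|]{m['content']}[|endofturn|]" for m in messages]
parts.append("[|assistant|]")  # open assistant turn; the model completes it
print("\n".join(parts))
# [|system|]You are a planning assistant.[|endofturn|]
# [|user|]What should we do next?[|endofturn|]
# [|assistant|]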