jeanbaptdzd committed on
Commit 772dd21 · 1 Parent(s): efa48fc

Add detailed error logging to vLLM provider and router

app/main.py CHANGED
@@ -1,8 +1,11 @@
 from fastapi import FastAPI
 from app.middleware import api_key_guard
-
 from app.routers import openai_api, extract
+import logging
 
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 app = FastAPI(title="PRIIPs LLM Service (vLLM)")
 
@@ -13,9 +16,24 @@ app.include_router(extract.router)
 # Optional API key middleware
 app.middleware("http")(api_key_guard)
 
-
+@app.on_event("startup")
+async def startup_event():
+    """Preload the model on startup"""
+    logger.info("Starting PRIIPs LLM Service...")
+    logger.info("Model will be loaded on first request to optimize startup time")
+
 @app.get("/")
 async def root():
-    return {"status": "ok"}
+    return {
+        "status": "ok",
+        "service": "PRIIPs LLM Service",
+        "version": "1.0.0",
+        "model": "DragonLLM/LLM-Pro-Finance-Small",
+        "backend": "vLLM"
+    }
+
+@app.get("/health")
+async def health():
+    return {"status": "healthy", "service": "PRIIPs LLM Service"}
 
 
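The new `/` and `/health` routes are plain GET endpoints, so this part of the commit can be smoke-tested with a short client script. A minimal sketch, assuming the service is running locally on port 8000 (host and port are assumptions, not part of the commit):

import httpx

# Hit the expanded root endpoint added by this commit
print(httpx.get("http://localhost:8000/").json())
# -> {"status": "ok", "service": "PRIIPs LLM Service", "version": "1.0.0", ...}

# Hit the new health endpoint
print(httpx.get("http://localhost:8000/health").json())
# -> {"status": "healthy", "service": "PRIIPs LLM Service"}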
app/providers/vllm.py CHANGED
@@ -1,24 +1,135 @@
-import httpx
-from app.config import settings
-
-
-async def list_models():
-    async with httpx.AsyncClient(timeout=30) as client:
-        r = await client.get(f"{settings.vllm_base_url}/models")
-        r.raise_for_status()
-        return r.json()
-
-
-async def chat(payload, stream: bool = False):
-    async with httpx.AsyncClient(timeout=None) as client:
-        if stream:
-            return await client.stream(
-                "POST", f"{settings.vllm_base_url}/chat/completions", json=payload
-            )
-        r = await client.post(
-            f"{settings.vllm_base_url}/chat/completions", json=payload
-        )
-        r.raise_for_status()
-        return r.json()
+import os
+from typing import Dict, Any, AsyncIterator
+from vllm import LLM, SamplingParams
+from vllm.entrypoints.openai.api_server import build_async_engine_client
+import asyncio
+
+# Model configuration
+model_name = "DragonLLM/LLM-Pro-Finance-Small"
+llm_engine = None
+
+def initialize_vllm():
+    """Initialize vLLM engine with the model"""
+    global llm_engine
+
+    if llm_engine is None:
+        print(f"Initializing vLLM with model: {model_name}")
+
+        # Get HF token from environment
+        hf_token = os.getenv("HF_TOKEN_LC")
+        if hf_token:
+            os.environ["HF_TOKEN"] = hf_token
+            os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token
+
+        try:
+            # Initialize vLLM engine
+            llm_engine = LLM(
+                model=model_name,
+                trust_remote_code=True,
+                dtype="float16",
+                max_model_len=4096,
+                gpu_memory_utilization=0.9,
+                tensor_parallel_size=1,  # L40 has 1 GPU
+                download_dir="/tmp/huggingface",
+            )
+            print(f"vLLM engine initialized successfully!")
+        except Exception as e:
+            print(f"Error initializing vLLM: {e}")
+            raise
+
+
+class VLLMProvider:
+    def __init__(self):
+        # Don't initialize at import time
+        pass
+
+    async def list_models(self) -> Dict[str, Any]:
+        return {
+            "object": "list",
+            "data": [
+                {
+                    "id": model_name,
+                    "object": "model",
+                    "created": 1677610602,
+                    "owned_by": "DragonLLM",
+                    "permission": [],
+                    "root": model_name,
+                    "parent": None,
+                }
+            ]
+        }
+
+    async def chat(self, payload: Dict[str, Any], stream: bool = False) -> Dict[str, Any]:
+        import logging
+        logger = logging.getLogger(__name__)
+
+        try:
+            # Initialize vLLM on first use
+            if llm_engine is None:
+                logger.info("vLLM engine not initialized, initializing now...")
+                initialize_vllm()
+                logger.info("vLLM engine initialized successfully")
+
+            messages = payload.get("messages", [])
+            temperature = payload.get("temperature", 0.7)
+            max_tokens = payload.get("max_tokens", 1000)
+            top_p = payload.get("top_p", 1.0)
+
+            # Convert messages to prompt
+            prompt = self._messages_to_prompt(messages)
+            logger.info(f"Generating response for prompt: {prompt[:100]}...")
+
+            # Set up sampling parameters
+            sampling_params = SamplingParams(
+                temperature=temperature,
+                top_p=top_p,
+                max_tokens=max_tokens,
+            )
+
+            # Generate response using vLLM
+            outputs = llm_engine.generate([prompt], sampling_params)
+
+            # Extract the generated text
+            generated_text = outputs[0].outputs[0].text
+            logger.info(f"Generated text: {generated_text[:100]}...")
+
+            # Build OpenAI-compatible response
+            return {
+                "id": f"chatcmpl-{os.urandom(12).hex()}",
+                "object": "chat.completion",
+                "created": int(asyncio.get_event_loop().time()),
+                "model": model_name,
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {
+                            "role": "assistant",
+                            "content": generated_text
+                        },
+                        "finish_reason": "stop"
+                    }
+                ],
+                "usage": {
+                    "prompt_tokens": len(outputs[0].prompt_token_ids),
+                    "completion_tokens": len(outputs[0].outputs[0].token_ids),
+                    "total_tokens": len(outputs[0].prompt_token_ids) + len(outputs[0].outputs[0].token_ids)
+                }
+            }
+        except Exception as e:
+            logger.error(f"Error in chat completion: {str(e)}", exc_info=True)
+            raise
+
+    def _messages_to_prompt(self, messages: list) -> str:
+        """Convert OpenAI messages format to prompt"""
+        prompt = ""
+        for message in messages:
+            role = message["role"]
+            content = message["content"]
+            if role == "system":
+                prompt += f"System: {content}\n"
+            elif role == "user":
+                prompt += f"User: {content}\n"
+            elif role == "assistant":
+                prompt += f"Assistant: {content}\n"
+        prompt += "Assistant: "
+        return prompt
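`_messages_to_prompt` flattens the OpenAI-style message list into a single role-prefixed string ending in a generation cue. A standalone sketch of the same transform (the sample messages are hypothetical, not part of the commit) shows the prompt shape the engine actually receives:

# Mirror of the _messages_to_prompt logic above; sample messages are hypothetical
messages = [
    {"role": "system", "content": "You are a PRIIPs document assistant."},
    {"role": "user", "content": "Summarise the risk indicator section."},
]

prompt = ""
for message in messages:
    role, content = message["role"], message["content"]
    if role == "system":
        prompt += f"System: {content}\n"
    elif role == "user":
        prompt += f"User: {content}\n"
    elif role == "assistant":
        prompt += f"Assistant: {content}\n"
prompt += "Assistant: "

print(repr(prompt))
# 'System: You are a PRIIPs document assistant.\nUser: Summarise the risk indicator section.\nAssistant: '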
app/routers/openai_api.py CHANGED
@@ -19,31 +19,43 @@ async def list_models():
 
 @router.post("/chat/completions")
 async def chat_completions(body: ChatCompletionRequest):
-    payload: Dict[str, Any] = {
-        "model": body.model or settings.model,
-        "messages": [m.model_dump() for m in body.messages],
-        "temperature": body.temperature,
-        **({"max_tokens": body.max_tokens} if body.max_tokens is not None else {}),
-        "stream": body.stream or False,
-    }
-
-    if body.stream:
-        upstream = await chat_service.chat(payload, stream=True)
-
-        async def event_stream():
-            async for line in upstream.aiter_lines():
-                if not line:
-                    continue
-                if line.startswith("data:"):
-                    yield f"{line}\n\n"
-                else:
-                    yield f"data: {line}\n\n"
-
-        return StreamingResponse(event_stream(), media_type="text/event-stream")
-
-    data = await chat_service.chat(payload, stream=False)
-    # Assume vLLM already returns OpenAI-compatible schema; pass through.
-    # If needed, normalize here.
-    return JSONResponse(content=data)
+    import logging
+    logger = logging.getLogger(__name__)
+
+    try:
+        payload: Dict[str, Any] = {
+            "model": body.model or settings.model,
+            "messages": [m.model_dump() for m in body.messages],
+            "temperature": body.temperature,
+            **({"max_tokens": body.max_tokens} if body.max_tokens is not None else {}),
+            "stream": body.stream or False,
+        }
+
+        logger.info(f"Chat completion request: {payload}")
+
+        if body.stream:
+            upstream = await chat_service.chat(payload, stream=True)
+
+            async def event_stream():
+                async for line in upstream.aiter_lines():
+                    if not line:
+                        continue
+                    if line.startswith("data:"):
+                        yield f"{line}\n\n"
+                    else:
+                        yield f"data: {line}\n\n"
+
+            return StreamingResponse(event_stream(), media_type="text/event-stream")
+
+        data = await chat_service.chat(payload, stream=False)
+        # Assume vLLM already returns OpenAI-compatible schema; pass through.
+        # If needed, normalize here.
+        return JSONResponse(content=data)
+    except Exception as e:
+        logger.error(f"Error in chat completions endpoint: {str(e)}", exc_info=True)
+        return JSONResponse(
+            status_code=500,
+            content={"error": {"message": str(e), "type": "internal_error"}}
+        )
 
 
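Because the endpoint now catches exceptions and returns a structured JSON error instead of an unhandled 500, clients can branch on the response body. A minimal sketch, assuming the router is mounted under `/v1` and the service runs on localhost:8000 (both are assumptions, not confirmed by this diff):

import httpx

# Call the chat completions endpoint guarded by the new try/except
resp = httpx.post(
    "http://localhost:8000/v1/chat/completions",  # mount path is an assumption
    json={
        "model": "DragonLLM/LLM-Pro-Finance-Small",
        "messages": [{"role": "user", "content": "Hello"}],
    },
    timeout=120,
)
body = resp.json()
if resp.status_code == 500:
    # Error shape introduced by this commit
    print("server error:", body["error"]["message"], body["error"]["type"])
else:
    print(body["choices"][0]["message"]["content"])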
app/services/chat_service.py CHANGED
@@ -1,12 +1,12 @@
 from typing import Any, Dict
-
-from app.providers import vllm as provider
+from app.providers.vllm import VLLMProvider
 
+# Initialize the provider
+provider = VLLMProvider()
 
 async def list_models() -> Dict[str, Any]:
     return await provider.list_models()
 
-
 async def chat(payload: Dict[str, Any], stream: bool = False):
     return await provider.chat(payload, stream=stream)
 
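With `provider` now a module-level `VLLMProvider` singleton, every caller of `chat_service` shares one instance, and therefore one lazily initialized engine. A hypothetical caller-side sketch (requires a GPU host where vLLM can actually load the model):

import asyncio
from app.services import chat_service

async def main():
    # Both calls go through the same module-level provider instance
    models = await chat_service.list_models()
    print([m["id"] for m in models["data"]])
    reply = await chat_service.chat({"messages": [{"role": "user", "content": "Hi"}]})
    print(reply["choices"][0]["message"]["content"])

asyncio.run(main())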