Commit 772dd21
Parent(s): efa48fc

Add detailed error logging to vLLM provider and router

Files changed:
- app/main.py (+20 -2)
- app/providers/vllm.py (+130 -19)
- app/routers/openai_api.py (+38 -26)
- app/services/chat_service.py (+3 -3)
app/main.py
CHANGED

@@ -1,8 +1,11 @@
 from fastapi import FastAPI
 from app.middleware import api_key_guard
-
 from app.routers import openai_api, extract
+import logging

+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

 app = FastAPI(title="PRIIPs LLM Service (vLLM)")

@@ -13,9 +16,24 @@ app.include_router(extract.router)
 # Optional API key middleware
 app.middleware("http")(api_key_guard)

+@app.on_event("startup")
+async def startup_event():
+    """Preload the model on startup"""
+    logger.info("Starting PRIIPs LLM Service...")
+    logger.info("Model will be loaded on first request to optimize startup time")

 @app.get("/")
 async def root():
-    return {
+    return {
+        "status": "ok",
+        "service": "PRIIPs LLM Service",
+        "version": "1.0.0",
+        "model": "DragonLLM/LLM-Pro-Finance-Small",
+        "backend": "vLLM"
+    }
+
+@app.get("/health")
+async def health():
+    return {"status": "healthy", "service": "PRIIPs LLM Service"}

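The two new endpoints are easy to smoke-test without touching the model, since the engine is only created on the first chat request. A minimal sketch, assuming fastapi's TestClient (and its httpx dependency) is installed and that the optional API-key guard does not block unauthenticated requests in the test environment; the test file name is hypothetical:

# tests/test_health.py (hypothetical) -- smoke test for the endpoints
# added in this commit; importing app.main does not load the model.
from fastapi.testclient import TestClient

from app.main import app

client = TestClient(app)

def test_root():
    r = client.get("/")
    assert r.status_code == 200
    body = r.json()
    assert body["status"] == "ok"
    assert body["backend"] == "vLLM"

def test_health():
    r = client.get("/health")
    assert r.status_code == 200
    assert r.json() == {"status": "healthy", "service": "PRIIPs LLM Service"}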
app/providers/vllm.py
CHANGED

@@ -1,24 +1,135 @@
-[19 removed lines: the previous HTTP pass-through provider; not captured here beyond fragments, ending in "return r.json()"]
+import os
+from typing import Dict, Any, AsyncIterator
+from vllm import LLM, SamplingParams
+from vllm.entrypoints.openai.api_server import build_async_engine_client
+import asyncio

+# Model configuration
+model_name = "DragonLLM/LLM-Pro-Finance-Small"
+llm_engine = None

+def initialize_vllm():
+    """Initialize vLLM engine with the model"""
+    global llm_engine
+
+    if llm_engine is None:
+        print(f"Initializing vLLM with model: {model_name}")
+
+        # Get HF token from environment
+        hf_token = os.getenv("HF_TOKEN_LC")
+        if hf_token:
+            os.environ["HF_TOKEN"] = hf_token
+            os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token
+
+        try:
+            # Initialize vLLM engine
+            llm_engine = LLM(
+                model=model_name,
+                trust_remote_code=True,
+                dtype="float16",
+                max_model_len=4096,
+                gpu_memory_utilization=0.9,
+                tensor_parallel_size=1,  # L40 has 1 GPU
+                download_dir="/tmp/huggingface",
             )
+            print(f"vLLM engine initialized successfully!")
+        except Exception as e:
+            print(f"Error initializing vLLM: {e}")
+            raise


+class VLLMProvider:
+    def __init__(self):
+        # Don't initialize at import time
+        pass
+
+    async def list_models(self) -> Dict[str, Any]:
+        return {
+            "object": "list",
+            "data": [
+                {
+                    "id": model_name,
+                    "object": "model",
+                    "created": 1677610602,
+                    "owned_by": "DragonLLM",
+                    "permission": [],
+                    "root": model_name,
+                    "parent": None,
+                }
+            ]
+        }
+
+    async def chat(self, payload: Dict[str, Any], stream: bool = False) -> Dict[str, Any]:
+        import logging
+        logger = logging.getLogger(__name__)
+
+        try:
+            # Initialize vLLM on first use
+            if llm_engine is None:
+                logger.info("vLLM engine not initialized, initializing now...")
+                initialize_vllm()
+                logger.info("vLLM engine initialized successfully")
+
+            messages = payload.get("messages", [])
+            temperature = payload.get("temperature", 0.7)
+            max_tokens = payload.get("max_tokens", 1000)
+            top_p = payload.get("top_p", 1.0)
+
+            # Convert messages to prompt
+            prompt = self._messages_to_prompt(messages)
+            logger.info(f"Generating response for prompt: {prompt[:100]}...")
+
+            # Set up sampling parameters
+            sampling_params = SamplingParams(
+                temperature=temperature,
+                top_p=top_p,
+                max_tokens=max_tokens,
+            )
+
+            # Generate response using vLLM
+            outputs = llm_engine.generate([prompt], sampling_params)
+
+            # Extract the generated text
+            generated_text = outputs[0].outputs[0].text
+            logger.info(f"Generated text: {generated_text[:100]}...")
+
+            # Build OpenAI-compatible response
+            return {
+                "id": f"chatcmpl-{os.urandom(12).hex()}",
+                "object": "chat.completion",
+                "created": int(asyncio.get_event_loop().time()),
+                "model": model_name,
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {
+                            "role": "assistant",
+                            "content": generated_text
+                        },
+                        "finish_reason": "stop"
+                    }
+                ],
+                "usage": {
+                    "prompt_tokens": len(outputs[0].prompt_token_ids),
+                    "completion_tokens": len(outputs[0].outputs[0].token_ids),
+                    "total_tokens": len(outputs[0].prompt_token_ids) + len(outputs[0].outputs[0].token_ids)
+                }
+            }
+        except Exception as e:
+            logger.error(f"Error in chat completion: {str(e)}", exc_info=True)
+            raise
+
+    def _messages_to_prompt(self, messages: list) -> str:
+        """Convert OpenAI messages format to prompt"""
+        prompt = ""
+        for message in messages:
+            role = message["role"]
+            content = message["content"]
+            if role == "system":
+                prompt += f"System: {content}\n"
+            elif role == "user":
+                prompt += f"User: {content}\n"
+            elif role == "assistant":
+                prompt += f"Assistant: {content}\n"
+        prompt += "Assistant: "
+        return prompt
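Two details of the new provider are worth flagging. First, vLLM's LLM.generate() is a synchronous call, so invoking it inside the async chat handler blocks the event loop for the entire generation (and the stream flag is accepted but never used). Second, asyncio.get_event_loop().time() is a monotonic clock, not epoch time, while the OpenAI "created" field carries Unix seconds. A minimal sketch of both fixes, assuming the module-level llm_engine and model_name from the diff above; the helper name is hypothetical:

import asyncio
import os
import time

async def generate_off_loop(prompt: str, sampling_params) -> dict:
    # Hypothetical helper: LLM.generate() blocks, so run it in a worker
    # thread; the FastAPI event loop stays free to serve /health etc.
    outputs = await asyncio.to_thread(llm_engine.generate, [prompt], sampling_params)
    generated_text = outputs[0].outputs[0].text
    return {
        "id": f"chatcmpl-{os.urandom(12).hex()}",
        "object": "chat.completion",
        # Unix epoch seconds, as the OpenAI schema expects;
        # get_event_loop().time() would give a monotonic value instead.
        "created": int(time.time()),
        "model": model_name,
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": generated_text},
            "finish_reason": "stop",
        }],
    }

For reference, the prompt format produced by _messages_to_prompt is plain role-prefixed text: [{"role": "user", "content": "Hi"}] becomes "User: Hi\nAssistant: ". Note also that AsyncIterator and build_async_engine_client are imported but never used.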
app/routers/openai_api.py
CHANGED

@@ -19,31 +19,43 @@ async def list_models():

 @router.post("/chat/completions")
 async def chat_completions(body: ChatCompletionRequest):
-[26 removed lines: the previous handler body; not captured]
+    import logging
+    logger = logging.getLogger(__name__)
+
+    try:
+        payload: Dict[str, Any] = {
+            "model": body.model or settings.model,
+            "messages": [m.model_dump() for m in body.messages],
+            "temperature": body.temperature,
+            **({"max_tokens": body.max_tokens} if body.max_tokens is not None else {}),
+            "stream": body.stream or False,
+        }
+
+        logger.info(f"Chat completion request: {payload}")
+
+        if body.stream:
+            upstream = await chat_service.chat(payload, stream=True)
+
+            async def event_stream():
+                async for line in upstream.aiter_lines():
+                    if not line:
+                        continue
+                    if line.startswith("data:"):
+                        yield f"{line}\n\n"
+                    else:
+                        yield f"data: {line}\n\n"
+
+            return StreamingResponse(event_stream(), media_type="text/event-stream")
+
+        data = await chat_service.chat(payload, stream=False)
+        # Assume vLLM already returns OpenAI-compatible schema; pass through.
+        # If needed, normalize here.
+        return JSONResponse(content=data)
+    except Exception as e:
+        logger.error(f"Error in chat completions endpoint: {str(e)}", exc_info=True)
+        return JSONResponse(
+            status_code=500,
+            content={"error": {"message": str(e), "type": "internal_error"}}
+        )

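Note that the streaming branch still assumes the old HTTP pass-through provider: aiter_lines() is a method of an httpx streaming response, while the new VLLMProvider.chat ignores its stream flag and returns a plain dict, so "stream": true requests would fail at runtime (and be caught by the new 500 handler). The non-streaming path can be exercised as below; a sketch assuming the router is mounted under /v1 and no API key is enforced, with host, port, and the question text purely illustrative:

# Hypothetical client call against a locally running instance.
import httpx

resp = httpx.post(
    "http://localhost:8000/v1/chat/completions",  # assumed mount point
    json={
        "model": "DragonLLM/LLM-Pro-Finance-Small",
        "messages": [{"role": "user", "content": "Summarise what a PRIIPs KID is."}],
        "temperature": 0.2,
        "max_tokens": 256,
        "stream": False,  # streaming would currently fail; see note above
    },
    timeout=300.0,  # the first call also triggers model loading
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])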
app/services/chat_service.py
CHANGED

@@ -1,12 +1,12 @@
 from typing import Any, Dict
+from app.providers.vllm import VLLMProvider

-[2 removed lines: the previous provider import and setup; not captured]
+# Initialize the provider
+provider = VLLMProvider()

 async def list_models() -> Dict[str, Any]:
     return await provider.list_models()

-[1 removed line; not captured]
 async def chat(payload: Dict[str, Any], stream: bool = False):
     return await provider.chat(payload, stream=stream)

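Instantiating VLLMProvider() at import time is cheap here because its __init__ is deliberately a no-op; the heavy initialize_vllm() call only happens inside the first chat(). With the provider wired in, the service layer can be driven directly, e.g. from a script or REPL. A sketch, assuming a GPU environment where the model can load (expect a long pause on the first chat call); the dict shapes match what VLLMProvider returns:

# Hypothetical driver script for the service layer.
import asyncio

from app.services import chat_service

async def main() -> None:
    models = await chat_service.list_models()
    print([m["id"] for m in models["data"]])  # ['DragonLLM/LLM-Pro-Finance-Small']

    reply = await chat_service.chat(
        {"messages": [{"role": "user", "content": "Hello"}], "max_tokens": 32}
    )
    print(reply["choices"][0]["message"]["content"])
    print(reply["usage"])  # prompt/completion/total token counts

asyncio.run(main())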