Commit e3878fa
Parent(s): 8c38d11

Add vLLM deployment for Koyeb with CUDA optimizations

- Add Dockerfile.koyeb: vLLM-optimized image with native OpenAI API
- Add start-vllm.sh: vLLM server startup script with optimizations
- Add start.sh: HF Spaces startup script
- Update README.md: Document both HF Spaces and Koyeb deployments
- Add KOYEB_VLLM_DEPLOYMENT.md: Detailed Koyeb setup guide
- Remove redundant status/setup docs
- Remove ad-hoc test scripts

Docker Hub public images:
- jeanbapt/dragon-llm-inference:vllm (Koyeb - vLLM)
- jeanbapt/dragon-llm-inference:latest (HF Spaces - Transformers)
Files changed:
- Dockerfile               +12 -3
- Dockerfile.koyeb         +57 -0
- KOYEB_VLLM_DEPLOYMENT.md +101 -0
- README.md                +80 -99
- start-vllm.sh            +68 -0
- start.sh                 +10 -0
- test_deployment.sh       +0 -101
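Both images listed in the commit message are public, so either deployment target can be pulled and smoke-tested from any Docker host; a minimal sketch using only the tags named above:

```bash
# Pull the published images (both are public on Docker Hub)
docker pull jeanbapt/dragon-llm-inference:vllm    # Koyeb - vLLM backend
docker pull jeanbapt/dragon-llm-inference:latest  # HF Spaces - Transformers backend
```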
Dockerfile
CHANGED

@@ -68,14 +68,23 @@ RUN test -f /app/app/providers/transformers_provider.py && \
     grep -q "def initialize_model" /app/app/providers/transformers_provider.py || \
     (echo "ERROR: transformers_provider.py not found or invalid!" && exit 1)
 
+# Copy startup script
+COPY start.sh /app/start.sh
+
 # Create non-root user and cache directories in single layer
 # Use ${HF_HOME} variable (defaults to /tmp/huggingface if not set)
 RUN useradd -m -u 1000 user && \
     mkdir -p ${HF_HOME:-/tmp/huggingface} /tmp/torch/inductor /tmp/triton && \
-
+    chmod +x /app/start.sh && \
+    chown -R user:user /app ${HF_HOME:-/tmp/huggingface} /tmp/torch /tmp/triton && \
+    # Verify startup script is executable and has correct shebang
+    test -x /app/start.sh && head -1 /app/start.sh | grep -q "^#!/bin/bash" || (echo "ERROR: start.sh not executable or wrong shebang!" && exit 1)
 
 USER user
 
-
+# Expose ports for both HF Spaces (7860) and Koyeb (8000)
+# PORT environment variable controls which port the app actually uses
+EXPOSE 7860 8000
 
-
+# Use startup script for more reliable execution
+CMD ["/app/start.sh"]
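Since the updated Dockerfile exposes both ports and defers the choice to `start.sh` via `PORT`, a local build can exercise either mode. A minimal sketch, assuming a GPU host and an illustrative local tag `dragon-llm-inference:dev` (not part of the commit):

```bash
# Build the HF Spaces image locally (the "dev" tag is hypothetical)
docker build -t dragon-llm-inference:dev .

# Run in HF Spaces mode (start.sh defaults PORT to 7860)
docker run --rm --gpus all -p 7860:7860 -e HF_TOKEN_LC2=your_token dragon-llm-inference:dev

# Or override PORT to mimic the Koyeb-style port
docker run --rm --gpus all -p 8000:8000 -e PORT=8000 -e HF_TOKEN_LC2=your_token dragon-llm-inference:dev
```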
Dockerfile.koyeb
ADDED

@@ -0,0 +1,57 @@
+# Koyeb-optimized Dockerfile using vLLM's native OpenAI API server
+# This leverages vLLM's built-in optimizations: continuous batching, PagedAttention, CUDA graphs
+
+FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
+
+# Build argument for cache control
+ARG CACHE_BUST=20250125_vllm
+
+ENV PYTHONUNBUFFERED=1 \
+    DEBIAN_FRONTEND=noninteractive \
+    HF_HOME=/tmp/huggingface \
+    VLLM_ATTENTION_BACKEND=FLASH_ATTN \
+    CUDA_VISIBLE_DEVICES=0
+
+# Install system dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        python3.11 \
+        python3.11-dev \
+        python3-pip \
+        git \
+        curl && \
+    rm -rf /var/lib/apt/lists/* && \
+    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
+    update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
+    python3 -m pip install --upgrade pip
+
+WORKDIR /app
+
+# Install PyTorch with CUDA 12.4 (specifiers quoted so ">=" is not shell redirection)
+RUN pip install --no-cache-dir \
+    "torch>=2.5.0" \
+    --index-url https://download.pytorch.org/whl/cu124
+
+# Install vLLM with all CUDA optimizations
+# vLLM includes: Flash Attention, PagedAttention, continuous batching, CUDA graphs
+RUN pip install --no-cache-dir \
+    "vllm>=0.6.0" \
+    "huggingface-hub>=0.20.0"
+
+# Create non-root user and cache directories
+RUN useradd -m -u 1000 user && \
+    mkdir -p /tmp/huggingface /tmp/vllm && \
+    chown -R user:user /app /tmp/huggingface /tmp/vllm
+
+# Copy startup script
+COPY start-vllm.sh /app/start-vllm.sh
+RUN chmod +x /app/start-vllm.sh && chown user:user /app/start-vllm.sh
+
+USER user
+
+# vLLM OpenAI server default port
+EXPOSE 8000
+
+# Use vLLM's native OpenAI-compatible server
+CMD ["/app/start-vllm.sh"]
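For reference, the resulting image can be exercised locally before pointing Koyeb at it; a hedged sketch, assuming an NVIDIA GPU and the NVIDIA Container Toolkit are installed (`your_token` is a placeholder):

```bash
# Run the vLLM image locally with GPU access
docker run --rm --gpus all -p 8000:8000 \
  -e MODEL=DragonLLM/Qwen-Open-Finance-R-8B \
  -e HF_TOKEN_LC2=your_token \
  jeanbapt/dragon-llm-inference:vllm

# Once the model has loaded, the OpenAI-compatible endpoints respond:
curl http://localhost:8000/v1/models
```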
KOYEB_VLLM_DEPLOYMENT.md
ADDED

@@ -0,0 +1,101 @@
+# Koyeb vLLM Deployment
+
+## Overview
+
+The Koyeb deployment uses **vLLM's native OpenAI-compatible API server** with full CUDA optimizations for maximum inference performance.
+
+## Docker Image
+
+**Public image on Docker Hub:**
+```
+jeanbapt/dragon-llm-inference:vllm
+```
+
+Built from `Dockerfile.koyeb` with:
+- NVIDIA CUDA 12.4 base
+- vLLM 0.6.0+ with all optimizations
+- Native OpenAI-compatible server
+
+## vLLM Optimizations
+
+| Feature | Benefit |
+|---------|---------|
+| **Flash Attention 2** | Faster attention computation |
+| **PagedAttention** | Efficient KV cache management |
+| **Continuous Batching** | Handle multiple requests simultaneously |
+| **Prefix Caching** | Reuse KV cache for common prefixes |
+| **Chunked Prefill** | Better memory utilization |
+| **CUDA Graphs** | Reduced kernel launch overhead |
+
+## Koyeb Configuration
+
+### Environment Variables
+
+| Variable | Value | Description |
+|----------|-------|-------------|
+| `MODEL` | `DragonLLM/Qwen-Open-Finance-R-8B` | Model to serve |
+| `HF_TOKEN_LC2` | (secret) | Hugging Face token |
+| `PORT` | `8000` | Server port |
+| `MAX_MODEL_LEN` | `8192` | Maximum context length |
+| `GPU_MEMORY_UTILIZATION` | `0.90` | GPU memory usage (90%) |
+
+### Instance Type
+
+- **Recommended**: `gpu-nvidia-l40s` (48GB VRAM)
+- **Alternative**: `gpu-nvidia-rtx-4000-sff-ada` (20GB VRAM)
+
+### Health Check
+
+- **Path**: `/health`
+- **Port**: 8000
+- **Grace Period**: 300s (model loading time)
+- **Interval**: 60s
+
+## API Endpoints
+
+vLLM's native OpenAI-compatible server provides:
+
+```
+POST /v1/chat/completions  - Chat completions
+POST /v1/completions       - Text completions
+GET  /v1/models            - List models
+GET  /health               - Health check
+```
+
+## Usage Example
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="https://dragon-llm-dealexmachina.koyeb.app/v1",
+    api_key="not-needed"
+)
+
+response = client.chat.completions.create(
+    model="DragonLLM/Qwen-Open-Finance-R-8B",
+    messages=[
+        {"role": "user", "content": "Analyze the impact of rising interest rates on bond portfolios"}
+    ],
+    temperature=0.7,
+    max_tokens=1024
+)
+
+print(response.choices[0].message.content)
+```
+
+## Build & Push (Development)
+
+```bash
+# Build vLLM image
+docker build -f Dockerfile.koyeb -t jeanbapt/dragon-llm-inference:vllm .
+
+# Push to Docker Hub
+docker push jeanbapt/dragon-llm-inference:vllm
+```
+
+## Performance Notes
+
+- **First request**: Slower due to model loading + CUDA warmup
+- **Subsequent requests**: Benefit from batching, KV cache reuse, CUDA graphs
+- **L40s GPU**: 48GB VRAM provides ample room for 8B model with long context
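The guide's usage example is non-streaming; since vLLM's OpenAI-compatible server also honors `"stream": true` (server-sent events), a small illustrative curl variant against the same endpoint:

```bash
# Streaming chat completion (-N disables curl buffering so SSE chunks print live)
curl -N -X POST "https://dragon-llm-dealexmachina.koyeb.app/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "DragonLLM/Qwen-Open-Finance-R-8B",
    "messages": [{"role": "user", "content": "Summarize duration risk in one paragraph"}],
    "max_tokens": 256,
    "stream": true
  }'
```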
README.md
CHANGED

@@ -11,33 +11,38 @@ suggested_hardware: l4x1
 
 # Open Finance LLM 8B
 
-OpenAI-compatible API powered by DragonLLM/Qwen-Open-Finance-R-8B
+OpenAI-compatible API powered by DragonLLM/Qwen-Open-Finance-R-8B.
 
-##
-
-
+## Deployment Options
+
+| Platform | Backend | Docker Image | Port |
+|----------|---------|--------------|------|
+| **HF Spaces** | Transformers | Default (builds from `Dockerfile`) | 7860 |
+| **Koyeb** | vLLM (optimized) | `jeanbapt/dragon-llm-inference:vllm` | 8000 |
+
+### Docker Hub Public Images
+
+```
+jeanbapt/dragon-llm-inference:vllm    # Koyeb - vLLM with CUDA optimizations
+jeanbapt/dragon-llm-inference:latest  # HF Spaces - Transformers backend
+```
 
 ## Features
 
-- OpenAI-compatible API - Drop-in replacement for OpenAI
-- French and English support - Automatic language detection
-- Rate limiting - Built-in protection (30 req/min, 500 req/hour)
-- Statistics tracking - Token usage and request metrics via `/v1/stats`
-- Health monitoring - Model readiness status in `/health` endpoint
-- Streaming support - Real-time response streaming
-- Tool calls support - OpenAI-compatible tool/function calling
-- Structured outputs - JSON format support via response_format
+- **OpenAI-compatible API** - Drop-in replacement for OpenAI SDK
+- **French and English support** - Automatic language detection
+- **Rate limiting** - Built-in protection (30 req/min, 500 req/hour)
+- **Statistics tracking** - Token usage and request metrics via `/v1/stats`
+- **Health monitoring** - Model readiness status in `/health` endpoint
+- **Streaming support** - Real-time response streaming
+- **Tool calls support** - OpenAI-compatible tool/function calling
+- **Structured outputs** - JSON format support via `response_format`
 
 ## API Endpoints
 
-### List Models
-```bash
-curl -X GET "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/models"
-```
-
 ### Chat Completions
 ```bash
-curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
+curl -X POST "https://your-endpoint/v1/chat/completions" \
   -H "Content-Type: application/json" \
   -d '{
     "model": "DragonLLM/Qwen-Open-Finance-R-8B",

@@ -47,9 +52,14 @@ curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
   }'
 ```
 
+### List Models
+```bash
+curl -X GET "https://your-endpoint/v1/models"
+```
+
 ### Streaming
 ```bash
-curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
+curl -X POST "https://your-endpoint/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "DragonLLM/Qwen-Open-Finance-R-8B",

@@ -58,25 +68,11 @@ curl -X POST "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/chat/completions" \
   }'
 ```
 
-### Statistics
-```bash
-curl -X GET "https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1/stats"
-```
-
 ### Health Check
 ```bash
-curl -X GET "https://jeanbaptdzd-open-finance-llm-8b.hf.space/health"
+curl -X GET "https://your-endpoint/health"
 ```
 
-## Response Format
-
-Responses include chain-of-thought reasoning in `<think>` tags followed by the answer. Reasoning typically consumes 40-60% of tokens.
-
-**Recommended `max_tokens`:**
-- Simple queries: 300-400
-- Complex queries: 500-800
-- Detailed analysis: 800-1200
-
 ## Configuration
 
 ### Environment Variables

@@ -85,11 +81,10 @@ Responses include chain-of-thought reasoning in `<think>` tags followed by the answer.
 - `HF_TOKEN_LC2` - Hugging Face token with access to DragonLLM models
 
 **Optional:**
-- `MODEL` - Model name (default: DragonLLM/Qwen-Open-Finance-R-8B)
+- `MODEL` - Model name (default: `DragonLLM/Qwen-Open-Finance-R-8B`)
+- `PORT` - Server port (default: 7860 for HF, 8000 for Koyeb)
 - `SERVICE_API_KEY` - API key for authentication
-- `LOG_LEVEL` - Logging level (default: info)
-- `HF_HOME` - Hugging Face cache directory (default: /tmp/huggingface)
-- `FORCE_MODEL_RELOAD` - Force reload model from Hub on startup (default: false)
+- `LOG_LEVEL` - Logging level (default: `info`)
 
 Token priority: `HF_TOKEN_LC2` > `HF_TOKEN_LC` > `HF_TOKEN` > `HUGGING_FACE_HUB_TOKEN`

@@ -103,8 +98,8 @@ Token priority: `HF_TOKEN_LC2` > `HF_TOKEN_LC` > `HF_TOKEN` > `HUGGING_FACE_HUB_TOKEN`
 from openai import OpenAI
 
 client = OpenAI(
-    base_url="https://jeanbaptdzd-open-finance-llm-8b.hf.space/v1",
-    api_key="not-needed"
+    base_url="https://your-endpoint/v1",
+    api_key="not-needed"  # or your SERVICE_API_KEY
 )
 
 response = client.chat.completions.create(

@@ -114,6 +109,27 @@ response = client.chat.completions.create(
 )
 ```
 
+## Koyeb Deployment (vLLM)
+
+The Koyeb deployment uses vLLM's native OpenAI-compatible server with full CUDA optimizations:
+
+- **Flash Attention 2** - Faster attention computation
+- **PagedAttention** - Efficient GPU memory management
+- **Continuous batching** - High-throughput inference
+- **Prefix caching** - Reuse KV cache for common prefixes
+
+See [KOYEB_VLLM_DEPLOYMENT.md](KOYEB_VLLM_DEPLOYMENT.md) for detailed setup.
+
+### Quick Deploy to Koyeb
+
+1. Create app in Koyeb dashboard
+2. Set Docker image: `jeanbapt/dragon-llm-inference:vllm`
+3. Add environment variables:
+   - `MODEL`: `DragonLLM/Qwen-Open-Finance-R-8B`
+   - `HF_TOKEN_LC2`: (your HF token as secret)
+   - `PORT`: `8000`
+4. Select GPU instance (L40s recommended)
+5. Set health check: `GET /health` on port 8000
 
 ## Technical Specifications

@@ -122,36 +138,36 @@ response = client.chat.completions.create(
 - Fine-tuned on financial data
 - English and French support
 
-**Backend:**
+**HF Spaces Backend:**
 - Transformers 4.45.0+
 - PyTorch 2.5.0+ (CUDA 12.4)
-- Accelerate 0.30.0+
 
-
-
-
-
+**Koyeb Backend:**
+- vLLM 0.6.0+
+- Flash Attention 2
+- CUDA 12.4
 
 **Hardware:**
-
-
+- Minimum: L4 GPU (24GB VRAM)
+- Recommended: L40s GPU (48GB VRAM)
 
-## Recent Improvements
-
-### Code Quality & Hugging Face Best Practices Alignment
-
-- **Simplified Memory Management**: Removed redundant manual GPU memory cleanup - `device_map="auto"` handles this automatically
-- **Streamlined Token Management**: Hugging Face Hub now auto-detects tokens from environment variables
-- **Auto-Loading Chat Templates**: Leverages transformers 4.45.0+ automatic chat template loading
-- **Automatic Device Placement**: Removed manual device management - `device_map="auto"` handles GPU/CPU placement
-- **Improved Thread Safety**: Enhanced model access checks with thread-safe helpers
-- **Centralized Version Management**: Single source of truth for API version
-
-### Deprecated Functions
-
+## Project Structure
+
+```
+.
+├── app/                 # Main API application
+│   ├── main.py          # FastAPI app (HF Spaces)
+│   ├── routers/         # API routes
+│   ├── providers/       # Model providers (Transformers)
+│   ├── middleware/      # Rate limiting, auth
+│   └── utils/           # Utilities, stats tracking
+├── Dockerfile           # HF Spaces (Transformers)
+├── Dockerfile.koyeb     # Koyeb (vLLM)
+├── start.sh             # HF Spaces startup
+├── start-vllm.sh        # Koyeb vLLM startup
+├── docs/                # Technical documentation
+└── tests/               # Test suite
+```
 
 ## Development

@@ -164,50 +180,15 @@ uvicorn app.main:app --reload --port 8080
 
 ### Testing
 
-**Unit Tests:**
 ```bash
+# Unit tests
 pytest tests/ -v
-```
-
-**Integration Tests:**
-The integration tests evaluate the model's ability to produce valid JSON outputs and execute tool calls, which are critical requirements for financial applications.
 
-```bash
-# Basic API functionality
+# Integration tests
 python tests/integration/test_space_basic.py
-
-# Tool calls and JSON format
-python tests/integration/test_space_with_tools.py
-
-# Detailed tool call validation
 python tests/integration/test_tool_calls.py
 ```
 
-**Test Coverage:**
-- API endpoints (health, models, chat completions)
-- Tool calls with `tool_choice` parameter
-- Structured JSON outputs via `response_format`
-- Model response parsing and validation
-
-These tests verify that the small 8B model can reliably produce valid JSON and execute tool calls, which is mandatory for financial workflows requiring structured data and function execution.
-
-## Project Structure
-
-```
-.
-├── app/                 # Main API application
-│   ├── main.py          # FastAPI app
-│   ├── routers/         # API routes
-│   ├── providers/       # Model providers
-│   ├── middleware/      # Rate limiting, auth
-│   └── utils/           # Utilities, stats tracking
-├── docs/                # Documentation
-├── tests/               # Test suite
-│   ├── integration/     # Integration tests (API, tool calls, JSON)
-│   └── performance/     # Performance benchmarks
-└── scripts/             # Utility scripts
-```
-
 ## License
 
 MIT License - see [LICENSE](LICENSE) file.
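The README's feature list advertises OpenAI-compatible tool calling with `tool_choice`; a hedged sketch of what such a request could look like (the endpoint placeholder and the `get_quote` function schema are purely illustrative, not part of this repository):

```bash
# Tool-calling request in the standard OpenAI chat-completions shape
curl -X POST "https://your-endpoint/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "DragonLLM/Qwen-Open-Finance-R-8B",
    "messages": [{"role": "user", "content": "What is the current price of ACME?"}],
    "tools": [{
      "type": "function",
      "function": {
        "name": "get_quote",
        "description": "Fetch the latest quote for a ticker",
        "parameters": {
          "type": "object",
          "properties": {"ticker": {"type": "string"}},
          "required": ["ticker"]
        }
      }
    }],
    "tool_choice": "auto"
  }'
```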
start-vllm.sh
ADDED

@@ -0,0 +1,68 @@
+#!/bin/bash
+# vLLM OpenAI-compatible API server startup script for Koyeb
+# Uses vLLM's native server with all CUDA optimizations
+
+set -e
+
+# Configuration from environment
+MODEL=${MODEL:-"DragonLLM/Qwen-Open-Finance-R-8B"}
+PORT=${PORT:-8000}
+MAX_MODEL_LEN=${MAX_MODEL_LEN:-8192}
+GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.90}
+
+# HF Token (try multiple env var names)
+HF_TOKEN="${HF_TOKEN_LC2:-${HF_TOKEN:-${HUGGING_FACE_HUB_TOKEN:-}}}"
+
+echo "=========================================="
+echo "vLLM OpenAI Server - Koyeb Deployment"
+echo "=========================================="
+echo "Model: $MODEL"
+echo "Port: $PORT"
+echo "Max Model Length: $MAX_MODEL_LEN"
+echo "GPU Memory Utilization: $GPU_MEMORY_UTILIZATION"
+echo "HF Token: ${HF_TOKEN:+set (${#HF_TOKEN} chars)}"
+echo "CUDA Devices: ${CUDA_VISIBLE_DEVICES:-auto}"
+echo "=========================================="
+
+# Check for GPU
+if command -v nvidia-smi &> /dev/null; then
+    echo "GPU Info:"
+    nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader
+    echo "=========================================="
+fi
+
+# Build vLLM serve command with optimizations
+VLLM_ARGS=(
+    "--model" "$MODEL"
+    "--port" "$PORT"
+    "--host" "0.0.0.0"
+    "--dtype" "bfloat16"
+    "--max-model-len" "$MAX_MODEL_LEN"
+    "--gpu-memory-utilization" "$GPU_MEMORY_UTILIZATION"
+    "--trust-remote-code"
+    # Optimization flags
+    "--enable-prefix-caching"          # Cache common prefixes for faster inference
+    "--enable-chunked-prefill"         # Better memory management
+    "--max-num-batched-tokens" "8192"  # Batch optimization
+    "--max-num-seqs" "256"             # Concurrent request handling
+    # Disable logging overhead in production
+    "--disable-log-requests"
+)
+
+# Add HF token if available
+if [ -n "$HF_TOKEN" ]; then
+    export HF_TOKEN
+    export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
+fi
+
+echo "Starting vLLM OpenAI server..."
+echo "Endpoints available:"
+echo "  - POST /v1/chat/completions"
+echo "  - POST /v1/completions"
+echo "  - GET  /v1/models"
+echo "  - GET  /health"
+echo "=========================================="
+
+# Start vLLM server
+exec python -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}"
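Model loading dominates startup here (hence the 300s health-check grace period in the Koyeb guide), so any automation should gate on `/health` before sending traffic. A minimal sketch, assuming the server runs on localhost:8000:

```bash
# Wait for the vLLM server to report healthy, then confirm the model is served
until curl -sf http://localhost:8000/health > /dev/null; do
  echo "waiting for model to load..."
  sleep 10
done
curl -s http://localhost:8000/v1/models
```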
start.sh
ADDED

@@ -0,0 +1,10 @@
+#!/bin/bash
+# Get port from environment variable, default to 7860
+PORT=${PORT:-7860}
+
+# Redirect all output to stderr so it shows in logs
+exec >&2
+
+# Start uvicorn with the specified port
+exec python -m uvicorn app.main:app --host 0.0.0.0 --port "$PORT"
test_deployment.sh
DELETED

@@ -1,101 +0,0 @@
-#!/bin/bash
-# Quick deployment test script
-# Tests the new features without requiring the full model to be loaded
-
-set -e
-
-echo "=========================================="
-echo "Testing New Features"
-echo "=========================================="
-echo ""
-
-# Check if server is running
-if ! curl -s http://localhost:8080/health > /dev/null 2>&1; then
-    echo "⚠️  Server not running on localhost:8080"
-    echo "   Start server with: uvicorn app.main:app --host 0.0.0.0 --port 8080"
-    echo ""
-    echo "Or test against deployed instance by setting API_URL:"
-    echo "  export API_URL=https://your-space.hf.space"
-    echo "  ./test_deployment.sh"
-    exit 1
-fi
-
-API_URL="${API_URL:-http://localhost:8080}"
-echo "Testing against: $API_URL"
-echo ""
-
-# Test 1: Health endpoint
-echo "1. Testing /health endpoint..."
-HEALTH=$(curl -s "$API_URL/health")
-if echo "$HEALTH" | grep -q "model_ready"; then
-    echo "   ✅ Health endpoint includes model_ready field"
-    echo "   Response: $HEALTH"
-else
-    echo "   ❌ Health endpoint missing model_ready field"
-    exit 1
-fi
-echo ""
-
-# Test 2: Stats endpoint
-echo "2. Testing /v1/stats endpoint..."
-STATS=$(curl -s "$API_URL/v1/stats")
-if echo "$STATS" | grep -q "total_requests"; then
-    echo "   ✅ Stats endpoint working"
-    echo "   Response preview: $(echo "$STATS" | head -c 200)..."
-else
-    echo "   ❌ Stats endpoint not working"
-    exit 1
-fi
-echo ""
-
-# Test 3: Rate limiting headers
-echo "3. Testing rate limiting headers..."
-HEADERS=$(curl -s -I "$API_URL/v1/models")
-if echo "$HEADERS" | grep -q "X-RateLimit-Limit-Minute"; then
-    echo "   ✅ Rate limit headers present"
-    echo "$HEADERS" | grep "X-RateLimit"
-else
-    echo "   ❌ Rate limit headers missing"
-    exit 1
-fi
-echo ""
-
-# Test 4: Error sanitization
-echo "4. Testing error sanitization..."
-ERROR_RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "$API_URL/v1/chat/completions" \
-    -H "Content-Type: application/json" \
-    -d '{"model":"test","messages":[]}')
-HTTP_CODE=$(echo "$ERROR_RESPONSE" | tail -n1)
-ERROR_BODY=$(echo "$ERROR_RESPONSE" | head -n-1)
-
-if [ "$HTTP_CODE" = "400" ]; then
-    if echo "$ERROR_BODY" | grep -q "messages list cannot be empty"; then
-        echo "   ✅ Error properly formatted (400 with clear message)"
-    else
-        echo "   ⚠️  Got 400 but error message format unexpected"
-    fi
-else
-    echo "   ⚠️  Expected 400, got $HTTP_CODE"
-fi
-echo ""
-
-# Test 5: Root endpoint
-echo "5. Testing / endpoint..."
-ROOT=$(curl -s "$API_URL/")
-if echo "$ROOT" | grep -q "status"; then
-    echo "   ✅ Root endpoint working"
-else
-    echo "   ❌ Root endpoint not working"
-    exit 1
-fi
-echo ""
-
-echo "=========================================="
-echo "✅ All basic tests passed!"
-echo "=========================================="
-echo ""
-echo "Next steps:"
-echo "1. Test with actual model requests (requires model to be loaded)"
-echo "2. Test rate limiting by making 31 requests in a minute"
-echo "3. Check stats endpoint after making some requests"