Commit 0d30de3
Parent(s): e3878fa

Update Dockerfile.koyeb to use official vLLM base image

- Based on vllm/vllm-openai:latest (Koyeb's proven approach)
- Includes all CUDA/vLLM optimizations out of the box
- Flash Attention 2, PagedAttention, continuous batching
- Model args embedded in CMD

Files changed:
- Dockerfile.koyeb +19 -49
- KOYEB_VLLM_DEPLOYMENT.md +43 -49
- start-vllm.sh +51 -35
Dockerfile.koyeb CHANGED

@@ -1,57 +1,27 @@
-# Koyeb-optimized Dockerfile using vLLM
-#
-FROM
-#
-    VLLM_ATTENTION_BACKEND=FLASH_ATTN \
-    CUDA_VISIBLE_DEVICES=0
-#
-    apt-get install -y --no-install-recommends \
-    python3.11 \
-    python3.11-dev \
-    python3-pip \
-    git \
-    curl && \
-    rm -rf /var/lib/apt/lists/* && \
-    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
-    update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
-    python3 -m pip install --upgrade pip
-# Install PyTorch with CUDA 12.4
-RUN pip install --no-cache-dir \
-    torch>=2.5.0 \
-    --index-url https://download.pytorch.org/whl/cu124
-# Install vLLM with all CUDA optimizations
-# vLLM includes: Flash Attention, PagedAttention, continuous batching, CUDA graphs
-RUN pip install --no-cache-dir \
-    vllm>=0.6.0 \
-    huggingface-hub>=0.20.0
-# Create non-root user and cache directories
-RUN useradd -m -u 1000 user && \
-    mkdir -p /tmp/huggingface /tmp/vllm && \
-    chown -R user:user /app /tmp/huggingface /tmp/vllm
-# Copy startup script
-COPY start-vllm.sh /app/start-vllm.sh
-RUN chmod +x /app/start-vllm.sh && chown user:user /app/start-vllm.sh
-USER user
-# vLLM OpenAI server default port
+# Koyeb-optimized Dockerfile using official vLLM OpenAI image
+# Based on Koyeb's proven vLLM deployment approach
+
+FROM vllm/vllm-openai:latest
+
+# Environment variables
+ENV HF_HOME=/tmp/huggingface \
+    VLLM_ATTENTION_BACKEND=FLASH_ATTN
+
+# Create cache directories with proper permissions
+USER root
+RUN mkdir -p /tmp/huggingface && chmod 777 /tmp/huggingface
+
+# Switch back to default user
+USER 1000
+
+# Expose vLLM default port
 EXPOSE 8000
+
+# Default model and settings - can be overridden via Koyeb env/args
+ENV MODEL="DragonLLM/Qwen-Open-Finance-R-8B"
+ENV MAX_MODEL_LEN="8192"
+ENV DTYPE="bfloat16"
+
+# Use vLLM's native OpenAI server entrypoint
+# Model is specified via environment or command args
+CMD ["--model", "DragonLLM/Qwen-Open-Finance-R-8B", "--trust-remote-code", "--dtype", "bfloat16", "--max-model-len", "8192", "--gpu-memory-utilization", "0.90"]
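The rebuilt image can be smoke-tested outside Koyeb before deploying. A minimal sketch, assuming a local NVIDIA GPU with the NVIDIA Container Toolkit installed and reusing the `jeanbapt/dragon-llm-inference:vllm` tag from the deployment docs:

```bash
# Build the image from the repository root
docker build -f Dockerfile.koyeb -t jeanbapt/dragon-llm-inference:vllm .

# Run it locally; the base image's entrypoint is vLLM's OpenAI server,
# so the CMD arguments above are passed straight through to it.
# HF_TOKEN must grant access to the gated model.
docker run --rm --gpus all -p 8000:8000 \
  -e HF_TOKEN="$HF_TOKEN" \
  jeanbapt/dragon-llm-inference:vllm
```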
KOYEB_VLLM_DEPLOYMENT.md CHANGED

@@ -2,61 +2,51 @@
 
 ## Overview
 
-The Koyeb deployment uses **vLLM's
+The Koyeb deployment uses **vLLM's official Docker image** (`vllm/vllm-openai`) for maximum compatibility and performance.
 
-**Public image on Docker Hub:**
-```
-jeanbapt/dragon-llm-inference:vllm
-```
-
-Built from `Dockerfile.koyeb` with:
-- NVIDIA CUDA 12.4 base
-- vLLM 0.6.0+ with all optimizations
-- Native OpenAI-compatible server
-
-|---------|---------|
-| **Flash Attention 2** | Faster attention computation |
-| **PagedAttention** | Efficient KV cache management |
-| **Continuous Batching** | Handle multiple requests simultaneously |
-| **Prefix Caching** | Reuse KV cache for common prefixes |
-| **Chunked Prefill** | Better memory utilization |
-| **CUDA Graphs** | Reduced kernel launch overhead |
+## Koyeb Configuration
+
+### Using Official vLLM Image (Recommended)
+
+**Docker Image:** `vllm/vllm-openai:latest`
+
+**Command args:**
+```
+--model DragonLLM/Qwen-Open-Finance-R-8B --trust-remote-code --dtype bfloat16 --max-model-len 8192
+```
 
 ### Environment Variables
 
 | Variable | Value | Description |
 |----------|-------|-------------|
-| `PORT` | `8000` | Server port |
-| `MAX_MODEL_LEN` | `8192` | Maximum context length |
-| `GPU_MEMORY_UTILIZATION` | `0.90` | GPU memory usage (90%) |
+| `HF_TOKEN` | (secret) | Hugging Face token for gated model |
+| `VLLM_API_KEY` | (optional) | API key to protect the endpoint |
 
 ### Instance Type
 
 - **Recommended**: `gpu-nvidia-l40s` (48GB VRAM)
+- **Region**: `na` (North America) - where L40s is most available
 
 ### Health Check
 
+- **Type**: TCP
 - **Port**: 8000
-- **Grace Period**:
-- **Interval**: 60s
+- **Grace Period**: 900 seconds (15 minutes for model loading)
+
+## Koyeb Dashboard Setup
+
+1. **Create new service** in `dragon-llm` app
+2. **Docker image**: `vllm/vllm-openai:latest`
+3. **Command args**: `--model DragonLLM/Qwen-Open-Finance-R-8B --trust-remote-code --dtype bfloat16 --max-model-len 8192`
+4. **Environment**: Add `HF_TOKEN` secret (your HuggingFace token)
+5. **Instance**: `gpu-nvidia-l40s` in `na` region
+6. **Port**: 8000 (HTTP)
+7. **Health check**: TCP on port 8000, grace period 900s
+
+## API Endpoints (vLLM Native)
 
 ```
-POST /v1/chat/completions - Chat completions
+POST /v1/chat/completions - Chat completions (OpenAI compatible)
 POST /v1/completions - Text completions
 GET /v1/models - List models
 GET /health - Health check
@@ -69,33 +59,37 @@ from openai import OpenAI
 
 client = OpenAI(
     base_url="https://dragon-llm-dealexmachina.koyeb.app/v1",
-    api_key="not-needed"
+    api_key="not-needed"  # or your VLLM_API_KEY
 )
 
 response = client.chat.completions.create(
     model="DragonLLM/Qwen-Open-Finance-R-8B",
     messages=[
-        {"role": "user", "content": "Analyze the impact of rising interest rates
+        {"role": "user", "content": "Analyze the impact of rising interest rates"}
     ],
     temperature=0.7,
     max_tokens=1024
 )
-
-print(response.choices[0].message.content)
 ```
 
-# Build vLLM image
-docker build -f Dockerfile.koyeb -t jeanbapt/dragon-llm-inference:vllm .
-
-- **Subsequent requests**: Benefit from batching, KV cache reuse, CUDA graphs
-- **L40s GPU**: 48GB VRAM provides ample room for 8B model with long context
+## Troubleshooting
+
+### "Application exited with code 8" with no logs
+
+This usually means GPU allocation failed at the hypervisor level. Try:
+1. Different region (try `na` for L40s availability)
+2. Different GPU type (`gpu-nvidia-a100`)
+3. Wait and retry later (GPU availability varies)
+
+### Model download issues
+
+Ensure `HF_TOKEN` is set and the token has access to the gated model.
+
+## Custom Image (Alternative)
+
+If you prefer a custom image, use:
+```
+jeanbapt/dragon-llm-inference:vllm
+```
+
+Built from `Dockerfile.koyeb` in this repository.
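The deployed endpoint can also be exercised without the Python client. A curl sketch against the same Koyeb URL used above (the `Authorization` header is only needed if `VLLM_API_KEY` is set on the service):

```bash
curl -s https://dragon-llm-dealexmachina.koyeb.app/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer $VLLM_API_KEY" \
  -d '{
    "model": "DragonLLM/Qwen-Open-Finance-R-8B",
    "messages": [{"role": "user", "content": "Analyze the impact of rising interest rates"}],
    "temperature": 0.7,
    "max_tokens": 1024
  }'
```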
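Because the TCP health check allows a 900-second grace period for model download and load, it can help to wait for `/health` before sending traffic. A small polling sketch against the same deployment URL:

```bash
# Poll until vLLM reports healthy; first start-up includes the model download.
until curl -sf https://dragon-llm-dealexmachina.koyeb.app/health > /dev/null; do
  echo "Waiting for vLLM to finish loading the model..."
  sleep 15
done
echo "vLLM is ready"
```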
start-vllm.sh CHANGED

@@ -2,7 +2,15 @@
 # vLLM OpenAI-compatible API server startup script for Koyeb
 # Uses vLLM's native server with all CUDA optimizations
 
+# Redirect all output to stderr for Koyeb logs
+exec 2>&1
+
+echo "=========================================="
+echo "vLLM OpenAI Server - Starting"
+echo "=========================================="
+echo "Date: $(date)"
+echo "User: $(whoami)"
+echo "PWD: $(pwd)"
 
 # Configuration from environment
 MODEL=${MODEL:-"DragonLLM/Qwen-Open-Finance-R-8B"}
@@ -14,55 +22,63 @@ GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.90}
 HF_TOKEN="${HF_TOKEN_LC2:-${HF_TOKEN:-${HUGGING_FACE_HUB_TOKEN:-}}}"
 
 echo "=========================================="
-echo "HF Token: ${HF_TOKEN:+set (${#HF_TOKEN} chars)}"
-echo "CUDA Devices: ${CUDA_VISIBLE_DEVICES:-auto}"
+echo "Configuration:"
+echo "  Model: $MODEL"
+echo "  Port: $PORT"
+echo "  Max Model Length: $MAX_MODEL_LEN"
+echo "  GPU Memory Utilization: $GPU_MEMORY_UTILIZATION"
+echo "  HF Token: ${HF_TOKEN:+set (${#HF_TOKEN} chars)}"
 echo "=========================================="
 
+# Check Python
+echo "Checking Python..."
+which python || { echo "ERROR: python not found!"; exit 1; }
+python --version
+
+# Check vLLM
+echo "Checking vLLM installation..."
+python -c "import vllm; print(f'vLLM version: {vllm.__version__}')" || {
+    echo "ERROR: vLLM not installed correctly!"
+    exit 1
+}
+
 # Check for GPU
+echo "=========================================="
+echo "GPU Information:"
 if command -v nvidia-smi &> /dev/null; then
-    nvidia-smi
+    nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader || echo "nvidia-smi failed"
+    nvidia-smi || echo "nvidia-smi full output failed"
+else
+    echo "WARNING: nvidia-smi not found - GPU may not be available!"
 fi
+echo "=========================================="
 
-VLLM_ARGS=(
-    "--model" "$MODEL"
-    "--port" "$PORT"
-    "--host" "0.0.0.0"
-    "--dtype" "bfloat16"
-    "--max-model-len" "$MAX_MODEL_LEN"
-    "--gpu-memory-utilization" "$GPU_MEMORY_UTILIZATION"
-    "--trust-remote-code"
-    # Optimization flags
-    "--enable-prefix-caching"          # Cache common prefixes for faster inference
-    "--enable-chunked-prefill"         # Better memory management
-    "--max-num-batched-tokens" "8192"  # Batch optimization
-    "--max-num-seqs" "256"             # Concurrent request handling
-    # Disable logging overhead in production
-    "--disable-log-requests"
-)
-
-# Add HF token if available
+# Set HF token for model download
 if [ -n "$HF_TOKEN" ]; then
     export HF_TOKEN
     export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
+    echo "HF Token exported for model download"
+else
+    echo "WARNING: No HF token set - model download may fail for gated models!"
 fi
 
+echo "=========================================="
 echo "Starting vLLM OpenAI server..."
-echo "Endpoints
+echo "Endpoints:"
 echo " - POST /v1/chat/completions"
 echo " - POST /v1/completions"
 echo " - GET /v1/models"
 echo " - GET /health"
 echo "=========================================="
 
-exec python -m vllm.entrypoints.openai.api_server
+# Build vLLM serve command
+exec python -m vllm.entrypoints.openai.api_server \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --host "0.0.0.0" \
+    --dtype "bfloat16" \
+    --max-model-len "$MAX_MODEL_LEN" \
+    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
+    --trust-remote-code \
+    --enable-prefix-caching \
+    --disable-log-requests
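Every setting in the script falls back to a default, so a deployment can override behavior purely through the environment. A usage sketch with the variable names defined in the script:

```bash
# Run the startup script with explicit overrides (all optional)
MODEL="DragonLLM/Qwen-Open-Finance-R-8B" \
PORT=8000 \
MAX_MODEL_LEN=8192 \
GPU_MEMORY_UTILIZATION=0.90 \
./start-vllm.sh
```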