Commit 5550dcb
Parent(s): cc88da6
Revert to working CUDA/PyTorch/vLLM combination
Back to the combination that was working:
- PyTorch: 2.4.0+cu124 (was working)
- vLLM: 0.6.4.post1 (was working)
- CUDA: 12.4.0-devel (unchanged)
- Model: DragonLLM/qwen3-8b-fin-v1.0 (back to original)
The issue was version incompatibility, not the model itself.
This combination was loading successfully before we started upgrading.
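The pinned combination can also be asserted at runtime. As a minimal sketch (not part of this commit), a startup check like the following would fail fast if the installed wheels ever drift from the versions the commit message describes; the expected version strings below are taken from that message:

```python
# Hypothetical startup check (not part of this commit): fail fast if the
# installed wheels drift from the combination the commit message pins.
import torch
import vllm

EXPECTED_TORCH = "2.4.0+cu124"  # from the commit message
EXPECTED_VLLM = "0.6.4.post1"   # from the commit message

def check_pinned_versions():
    # cu124 wheels embed the CUDA tag in torch.__version__
    if torch.__version__ != EXPECTED_TORCH:
        raise RuntimeError(f"torch {torch.__version__} != {EXPECTED_TORCH}")
    if vllm.__version__ != EXPECTED_VLLM:
        raise RuntimeError(f"vllm {vllm.__version__} != {EXPECTED_VLLM}")
    # correct wheel versions do not guarantee a usable GPU
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available inside the container")

check_pinned_versions()
```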
- Dockerfile +2 -2
- app/providers/vllm.py +4 -5
Dockerfile
CHANGED

@@ -26,11 +26,11 @@ WORKDIR /app
 
 # Install PyTorch with CUDA 12.4 support FIRST (critical for vLLM compatibility)
 RUN pip install --no-cache-dir \
-    torch==2.5.1 \
+    torch==2.4.0 \
     --index-url https://download.pytorch.org/whl/cu124
 
 # Install vLLM (will use the PyTorch we just installed)
-RUN pip install --no-cache-dir vllm==0.
+RUN pip install --no-cache-dir vllm==0.6.4.post1
 
 # Install application dependencies
 RUN pip install --no-cache-dir \
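The Dockerfile comment calls the install order critical: pinning the cu124 PyTorch wheel first means the later vLLM install resolves against it rather than pulling in its own torch build. As a hedged sketch (not part of the commit), one way to confirm after a build that pip kept the pinned wheel is to compare the installed torch against the torch requirement the installed vLLM distribution declares; `torch_requirement_of_vllm` is a hypothetical helper:

```python
# Hypothetical post-build check (not in this commit): compare the torch
# wheel we pinned first against the torch requirement that the installed
# vLLM distribution declares in its package metadata.
import re
from importlib.metadata import requires, version

def torch_requirement_of_vllm():
    # requires() lists the declared dependencies of the vllm package
    for req in requires("vllm") or []:
        # drop any environment marker, then isolate the distribution name
        spec = req.split(";")[0].strip()
        name = re.split(r"[<>=!~ (]", spec, maxsplit=1)[0]
        if name == "torch":
            return spec
    return None

print("installed torch:", version("torch"))           # should match the pin
print("vllm declares:", torch_requirement_of_vllm())  # declared requirement
```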
app/providers/vllm.py
CHANGED

@@ -5,7 +5,7 @@ from vllm.entrypoints.openai.api_server import build_async_engine_client
 import asyncio
 from huggingface_hub import login
 
-# Model configuration -
+# Model configuration - back to working DragonLLM model
 model_name = "DragonLLM/qwen3-8b-fin-v1.0"
 llm_engine = None
 
@@ -38,15 +38,14 @@ def initialize_vllm():
     try:
         # Initialize vLLM engine with explicit token
         print(f"Attempting to load model: {model_name}")
-        print(f"Model type: Qwen3 8B (bfloat16) -
+        print(f"Model type: DragonLLM Qwen3 8B (bfloat16) - Back to working combo")
         print(f"Download directory: /tmp/huggingface")
         print(f"Trust remote code: True")
         print(f"L4 GPU: 24GB VRAM available")
         print(f"Mode: Eager mode (CUDA graphs disabled for L4)")
         print(f"GPU memory utilization: 0.85")
-        print(f"vLLM: v0.
-        print(f"
-        print(f"PyTorch: 2.5.1+cu124 (CUDA 12.4 binary)")
+        print(f"vLLM: v0.6.4.post1 (working combination)")
+        print(f"PyTorch: 2.4.0+cu124 (CUDA 12.4 binary)")
 
         llm_engine = LLM(
             model=model_name,
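The diff is cut off inside the LLM(...) call. Based on the values echoed by the print statements just above it, the constructor plausibly looks like the sketch below; every argument other than `model` is inferred from those prints rather than read from the diff, though all of the keyword arguments shown are standard `vllm.LLM` parameters:

```python
# Hypothetical reconstruction of the truncated LLM(...) call; argument
# values are inferred from the print statements above, not from the diff.
from vllm import LLM

llm_engine = LLM(
    model=model_name,                 # "DragonLLM/qwen3-8b-fin-v1.0"
    trust_remote_code=True,           # "Trust remote code: True"
    download_dir="/tmp/huggingface",  # "Download directory: /tmp/huggingface"
    dtype="bfloat16",                 # "Qwen3 8B (bfloat16)"
    gpu_memory_utilization=0.85,      # "GPU memory utilization: 0.85"
    enforce_eager=True,               # "Eager mode (CUDA graphs disabled for L4)"
)
```

Disabling CUDA graphs via `enforce_eager=True` trades some decode throughput for a smaller memory footprint, which is consistent with the 24 GB L4 noted in the logging.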