jeanbaptdzd committed
Commit 5550dcb · 1 Parent(s): cc88da6

Revert to working CUDA/PyTorch/vLLM combination


Back to the combination that was working:
- PyTorch: 2.4.0+cu124 (was working)
- vLLM: 0.6.4.post1 (was working)
- CUDA: 12.4.0-devel (unchanged)
- Model: DragonLLM/qwen3-8b-fin-v1.0 (back to original)

The issue was version incompatibility, not the model itself.
This combination was loading successfully before we started upgrading.

Files changed (2)
  1. Dockerfile +2 -2
  2. app/providers/vllm.py +4 -5
Dockerfile CHANGED
@@ -26,11 +26,11 @@ WORKDIR /app
 
 # Install PyTorch with CUDA 12.4 support FIRST (critical for vLLM compatibility)
 RUN pip install --no-cache-dir \
-    torch==2.5.1 \
+    torch==2.4.0 \
     --index-url https://download.pytorch.org/whl/cu124
 
 # Install vLLM (will use the PyTorch we just installed)
-RUN pip install --no-cache-dir vllm==0.9.0
+RUN pip install --no-cache-dir vllm==0.6.4.post1
 
 # Install application dependencies
 RUN pip install --no-cache-dir \
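For reference, a quick runtime check (not part of this commit) can confirm that the pinned combination actually resolved inside the image. A minimal sketch, assuming the pins above:

# Sanity check: confirm the pinned versions and CUDA build landed in the image.
import torch
import vllm

print(torch.__version__)   # expected: 2.4.0+cu124
print(torch.version.cuda)  # expected: 12.4
print(vllm.__version__)    # expected: 0.6.4.post1
assert torch.cuda.is_available(), "CUDA runtime not visible to PyTorch"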
app/providers/vllm.py CHANGED
@@ -5,7 +5,7 @@ from vllm.entrypoints.openai.api_server import build_async_engine_client
 import asyncio
 from huggingface_hub import login
 
-# Model configuration - optimized for 8B Qwen3 on L4
+# Model configuration - back to working DragonLLM model
 model_name = "DragonLLM/qwen3-8b-fin-v1.0"
 llm_engine = None
 
@@ -38,15 +38,14 @@ def initialize_vllm():
     try:
         # Initialize vLLM engine with explicit token
         print(f"Attempting to load model: {model_name}")
-        print(f"Model type: Qwen3 8B (bfloat16) - Optimized for L4")
+        print(f"Model type: DragonLLM Qwen3 8B (bfloat16) - Back to working combo")
         print(f"Download directory: /tmp/huggingface")
         print(f"Trust remote code: True")
         print(f"L4 GPU: 24GB VRAM available")
         print(f"Mode: Eager mode (CUDA graphs disabled for L4)")
         print(f"GPU memory utilization: 0.85")
-        print(f"vLLM: v0.9.0 (official Qwen3 support, stable)")
-        print(f"Engine: Legacy (v0) - single-process, stable (VLLM_USE_V1=0)")
-        print(f"PyTorch: 2.5.1+cu124 (CUDA 12.4 binary)")
+        print(f"vLLM: v0.6.4.post1 (working combination)")
+        print(f"PyTorch: 2.4.0+cu124 (CUDA 12.4 binary)")
 
         llm_engine = LLM(
             model=model_name,
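The diff truncates the LLM(...) call after model=model_name. Based on the settings echoed in the log lines above, a plausible reconstruction of the full initialization is sketched below; the argument values are taken from the prints, and the actual call in the repo may differ or include additional parameters:

# Hypothetical reconstruction of the truncated LLM(...) call, inferred
# from the log lines above; the actual call in the repo may differ.
from vllm import LLM

llm_engine = LLM(
    model=model_name,                 # "DragonLLM/qwen3-8b-fin-v1.0"
    dtype="bfloat16",                 # "Model type: ... (bfloat16)"
    trust_remote_code=True,           # "Trust remote code: True"
    download_dir="/tmp/huggingface",  # "Download directory: /tmp/huggingface"
    gpu_memory_utilization=0.85,      # "GPU memory utilization: 0.85"
    enforce_eager=True,               # "Eager mode (CUDA graphs disabled for L4)"
)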