Spaces:
Paused
Paused
feat(cuda): add cuda information
Browse files
main.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import torch
|
| 2 |
from typing import Any
|
| 3 |
from typing import Optional
|
|
@@ -8,6 +9,26 @@ from vllm import LLM, SamplingParams, RequestOutput
|
|
| 8 |
|
| 9 |
# Don't forget to set HF_TOKEN in the env during running
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
app = FastAPI()
|
| 12 |
|
| 13 |
# Initialize the LLM engine
|
|
@@ -22,7 +43,7 @@ engine_llama_3_2: LLM = LLM(
|
|
| 22 |
max_num_batched_tokens=512, # Reduced for T4
|
| 23 |
max_num_seqs=16, # Reduced for T4
|
| 24 |
gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
|
| 25 |
-
tensor_parallel_size=
|
| 26 |
# Llama-3.2-3B-Instruct max context length is 131072, but we reduce it to 32k.
|
| 27 |
# 32k tokens, 3/4 of 32k is 24k words, each page average is 500 or 0.5k words,
|
| 28 |
# so that's basically 24k / .5k = 24 x 2 =~48 pages.
|
|
@@ -43,7 +64,7 @@ engine_sailor_chat: LLM = LLM(
|
|
| 43 |
max_num_batched_tokens=512, # Reduced for T4
|
| 44 |
max_num_seqs=16, # Reduced for T4
|
| 45 |
gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
|
| 46 |
-
tensor_parallel_size=
|
| 47 |
# max_model_len=32768,
|
| 48 |
enforce_eager=True, # Disable CUDA graph
|
| 49 |
dtype='auto', # Use 'half' if you want half precision
|
|
|
|
| 1 |
+
import random
|
| 2 |
import torch
|
| 3 |
from typing import Any
|
| 4 |
from typing import Optional
|
|
|
|
| 9 |
|
| 10 |
# Don't forget to set HF_TOKEN in the env during running
|
| 11 |
|
| 12 |
+
# Number of CUDA devices visible to this process; stays 0 when CUDA is not
# available. This value is passed as `tensor_parallel_size` to the LLM
# engines created later in this file.
# NOTE(review): vLLM presumably requires tensor_parallel_size >= 1 -- confirm
# how the no-CUDA case (value 0) is meant to be handled.
cuda_num_device: int = 0

# torch.cuda.is_available() returns a bool. The previous comparison against
# the string 'cuda' was always False, so this branch never executed and the
# device count was never recorded.
if torch.cuda.is_available():
    cuda_num_device = torch.cuda.device_count()

    # Fixed seed for both Python's RNG and the current CUDA device's RNG.
    random_seed = 42
    random.seed(random_seed)

    device = torch.device('cuda')
    torch.cuda.manual_seed(random_seed)

    # Startup diagnostics about the CUDA runtime.
    print(f"Using device: {device}")
    print(f"CUDA available and enabled. {torch.cuda}")
    print(f"CUDA is available: {torch.cuda.is_available()}")
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"CUDA current device: {torch.cuda.current_device()}")

    # Per-device name and memory statistics, converted from bytes to GiB.
    for i in range(cuda_num_device):
        print('=================================================================')
        print(torch.cuda.get_device_name(i))
        print('Memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(i) / 1024 ** 3, 1), 'GB')
        print('Cached:   ', round(torch.cuda.memory_reserved(i) / 1024 ** 3, 1), 'GB')
|
| 32 |
app = FastAPI()
|
| 33 |
|
| 34 |
# Initialize the LLM engine
|
|
|
|
| 43 |
max_num_batched_tokens=512, # Reduced for T4
|
| 44 |
max_num_seqs=16, # Reduced for T4
|
| 45 |
gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
|
| 46 |
+
tensor_parallel_size=cuda_num_device,
|
| 47 |
# Llama-3.2-3B-Instruct max context length is 131072, but we reduce it to 32k.
|
| 48 |
# 32k tokens, 3/4 of 32k is 24k words, each page average is 500 or 0.5k words,
|
| 49 |
# so that's basically 24k / .5k = 24 x 2 =~48 pages.
|
|
|
|
| 64 |
max_num_batched_tokens=512, # Reduced for T4
|
| 65 |
max_num_seqs=16, # Reduced for T4
|
| 66 |
gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
|
| 67 |
+
tensor_parallel_size=cuda_num_device,
|
| 68 |
# max_model_len=32768,
|
| 69 |
enforce_eager=True, # Disable CUDA graph
|
| 70 |
dtype='auto', # Use 'half' if you want half precision
|