chore: Update GPU mapping for model launches by commenting out unused configurations and adjust GPU memory utilization to 0.3. Add additional dependencies to requirements.txt for enhanced functionality.
Browse files
- app.py +15 -8
- requirements.txt +7 -1
app.py
CHANGED
|
@@ -20,13 +20,20 @@ MODELS = dict()
|
|
| 20 |
|
| 21 |
# Launch models via vLLM
|
| 22 |
model_gpu_mapping = [
|
| 23 |
-
(0, 1000),
|
| 24 |
-
|
| 25 |
-
(
|
| 26 |
-
|
| 27 |
-
(
|
| 28 |
-
|
| 29 |
-
(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
]
|
| 31 |
|
| 32 |
for index, (gpu_id, iter_num) in enumerate(model_gpu_mapping):
|
|
@@ -44,7 +51,7 @@ for index, (gpu_id, iter_num) in enumerate(model_gpu_mapping):
|
|
| 44 |
"--model", model_name,
|
| 45 |
"--port", str(port),
|
| 46 |
"--quantization", "bitsandbytes",
|
| 47 |
-
"--gpu-memory-utilization", "0.
|
| 48 |
"--trust-remote-code",
|
| 49 |
],
|
| 50 |
env={**os.environ, "CUDA_VISIBLE_DEVICES": str(gpu_id)},
|
|
|
|
| 20 |
|
| 21 |
# Launch models via vLLM
|
| 22 |
model_gpu_mapping = [
|
| 23 |
+
# (0, 1000),
|
| 24 |
+
# (0, 1500),
|
| 25 |
+
# (1, 2000),
|
| 26 |
+
# (1, 2500),
|
| 27 |
+
# (2, 3000),
|
| 28 |
+
# (2, 3500),
|
| 29 |
+
(2, 4000),
|
| 30 |
+
# (3, 4500),
|
| 31 |
+
(2, 5000),
|
| 32 |
+
# (4, 5500),
|
| 33 |
+
(3, 6000),
|
| 34 |
+
# (5, 6500),
|
| 35 |
+
(3, 7000),
|
| 36 |
+
# (6, 7500),
|
| 37 |
]
|
| 38 |
|
| 39 |
for index, (gpu_id, iter_num) in enumerate(model_gpu_mapping):
|
|
|
|
| 51 |
"--model", model_name,
|
| 52 |
"--port", str(port),
|
| 53 |
"--quantization", "bitsandbytes",
|
| 54 |
+
"--gpu-memory-utilization", "0.3",
|
| 55 |
"--trust-remote-code",
|
| 56 |
],
|
| 57 |
env={**os.environ, "CUDA_VISIBLE_DEVICES": str(gpu_id)},
|
requirements.txt
CHANGED
|
@@ -1,2 +1,8 @@
|
|
| 1 |
gradio
|
| 2 |
-
gradio[oauth]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
gradio
|
| 2 |
+
gradio[oauth]
|
| 3 |
+
vllm
|
| 4 |
+
bitsandbytes
|
| 5 |
+
transformers
|
| 6 |
+
datasets
|
| 7 |
+
ninja
|
| 8 |
+
flash-attn
|