Spaces:
Paused
Paused
aeb56
committed on
Commit
·
96b6724
1
Parent(s):
3fb1215
Fix multi-GPU: use parallelize=True instead of device_map, update env var
Browse files
app.py
CHANGED
|
@@ -9,7 +9,7 @@ import time
|
|
| 9 |
|
| 10 |
# Set environment variables for flash-linear-attention and memory management
|
| 11 |
os.environ["FLA_USE_TRITON"] = "1"
|
| 12 |
-
os.environ["
|
| 13 |
|
| 14 |
# Model configuration
|
| 15 |
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
|
|
@@ -179,10 +179,11 @@ class ChatBot:
|
|
| 179 |
yield f"✅ **Memory cleared! Starting evaluation...**\n\nThis will take 30-60 minutes total.\n\n"
|
| 180 |
|
| 181 |
# Run lm_eval with optimized memory settings
|
|
|
|
| 182 |
cmd = [
|
| 183 |
"lm_eval",
|
| 184 |
"--model", "hf",
|
| 185 |
-
"--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,
|
| 186 |
"--tasks", task_string,
|
| 187 |
"--batch_size", "1", # Reduced to minimize memory usage
|
| 188 |
"--output_path", output_dir,
|
|
|
|
| 9 |
|
| 10 |
# Set environment variables for flash-linear-attention and memory management
|
| 11 |
os.environ["FLA_USE_TRITON"] = "1"
|
| 12 |
+
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" # Updated from PYTORCH_CUDA_ALLOC_CONF
|
| 13 |
|
| 14 |
# Model configuration
|
| 15 |
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
|
|
|
|
| 179 |
yield f"✅ **Memory cleared! Starting evaluation...**\n\nThis will take 30-60 minutes total.\n\n"
|
| 180 |
|
| 181 |
# Run lm_eval with optimized memory settings
|
| 182 |
+
# Note: We use parallelize=True to distribute across GPUs instead of device_map in model_args
|
| 183 |
cmd = [
|
| 184 |
"lm_eval",
|
| 185 |
"--model", "hf",
|
| 186 |
+
"--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True",
|
| 187 |
"--tasks", task_string,
|
| 188 |
"--batch_size", "1", # Reduced to minimize memory usage
|
| 189 |
"--output_path", output_dir,
|