aeb56 committed on
Commit 96b6724 · 1 Parent(s): 3fb1215

Fix multi-GPU: use parallelize=True instead of device_map, update env var

Files changed (1): app.py +3 -2
app.py CHANGED
@@ -9,7 +9,7 @@ import time
 
 # Set environment variables for flash-linear-attention and memory management
 os.environ["FLA_USE_TRITON"] = "1"
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"  # Updated from PYTORCH_CUDA_ALLOC_CONF
 
 # Model configuration
 MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
@@ -179,10 +179,11 @@ class ChatBot:
         yield f"✅ **Memory cleared! Starting evaluation...**\n\nThis will take 30-60 minutes total.\n\n"
 
         # Run lm_eval with optimized memory settings
+        # Note: We use parallelize=True to distribute across GPUs instead of device_map in model_args
         cmd = [
             "lm_eval",
             "--model", "hf",
-            "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,device_map=auto,low_cpu_mem_usage=True",
+            "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True",
             "--tasks", task_string,
             "--batch_size", "1",  # Reduced to minimize memory usage
             "--output_path", output_dir,
 