aeb56 committed
Commit ef25cbe · Parent(s): 0cefed5

Add flash-attn dependency required by Kimi model

Files changed (2):
  1. app.py +2 -2
  2. requirements.txt +3 -0
app.py CHANGED
@@ -217,11 +217,11 @@ class ChatBot:
 
         # Run lm_eval with optimized memory settings
         # Note: We use parallelize=True to distribute across GPUs instead of device_map in model_args
-        # attn_implementation=eager is required because flash attention isn't properly installed
+        # We need to install flash-attn for this model to work properly
         cmd = [
             "lm_eval",
             "--model", "hf",
-            "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True,attn_implementation=eager",
+            "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True",
             "--tasks", task_string,
             "--batch_size", "1",  # Reduced to minimize memory usage
             "--output_path", output_dir,
requirements.txt CHANGED
@@ -10,6 +10,9 @@ triton>=3.0.0
 # Flash Linear Attention (required by Kimi model)
 git+https://github.com/sustcsonglin/flash-linear-attention.git@main
 
+# Flash Attention (required for attention layers)
+flash-attn>=2.5.0
+
 # Evaluation
 lm-eval>=0.4.0
 
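
One caveat with the flash-attn>=2.5.0 pin: flash-attn is a compiled CUDA extension that builds against the already-installed torch, so its documentation recommends installing it with pip's --no-build-isolation flag after torch is present; a successful install also does not guarantee a clean import at runtime. A startup guard like the following sketch could fail fast if the build is broken; the check is an assumption about how one might harden the Space, not part of this commit:

    # Sketch: fail fast at app startup if flash-attn did not build
    # correctly, rather than crashing mid-evaluation.
    import sys

    try:
        import flash_attn
    except ImportError as exc:
        sys.exit(
            "flash-attn failed to import; the Kimi model's attention "
            f"layers require it (see requirements.txt): {exc}"
        )

    print(f"flash-attn {flash_attn.__version__} available")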