Spaces:
Paused
Paused
aeb56
committed on
Commit
·
ef25cbe
1
Parent(s):
0cefed5
Add flash-attn dependency required by Kimi model
Browse files- app.py +2 -2
- requirements.txt +3 -0
app.py
CHANGED
|
@@ -217,11 +217,11 @@ class ChatBot:
|
|
| 217 |
|
| 218 |
# Run lm_eval with optimized memory settings
|
| 219 |
# Note: We use parallelize=True to distribute across GPUs instead of device_map in model_args
|
| 220 |
-
#
|
| 221 |
cmd = [
|
| 222 |
"lm_eval",
|
| 223 |
"--model", "hf",
|
| 224 |
-
"--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True
|
| 225 |
"--tasks", task_string,
|
| 226 |
"--batch_size", "1", # Reduced to minimize memory usage
|
| 227 |
"--output_path", output_dir,
|
|
|
|
| 217 |
|
| 218 |
# Run lm_eval with optimized memory settings
|
| 219 |
# Note: We use parallelize=True to distribute across GPUs instead of device_map in model_args
|
| 220 |
+
# We need to install flash-attn for this model to work properly
|
| 221 |
cmd = [
|
| 222 |
"lm_eval",
|
| 223 |
"--model", "hf",
|
| 224 |
+
"--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True",
|
| 225 |
"--tasks", task_string,
|
| 226 |
"--batch_size", "1", # Reduced to minimize memory usage
|
| 227 |
"--output_path", output_dir,
|
requirements.txt
CHANGED
|
@@ -10,6 +10,9 @@ triton>=3.0.0
|
|
| 10 |
# Flash Linear Attention (required by Kimi model)
|
| 11 |
git+https://github.com/sustcsonglin/flash-linear-attention.git@main
|
| 12 |
|
|
|
|
|
|
|
|
|
|
| 13 |
# Evaluation
|
| 14 |
lm-eval>=0.4.0
|
| 15 |
|
|
|
|
| 10 |
# Flash Linear Attention (required by Kimi model)
|
| 11 |
git+https://github.com/sustcsonglin/flash-linear-attention.git@main
|
| 12 |
|
| 13 |
+
# Flash Attention (required for attention layers)
|
| 14 |
+
flash-attn>=2.5.0
|
| 15 |
+
|
| 16 |
# Evaluation
|
| 17 |
lm-eval>=0.4.0
|
| 18 |
|