Spaces: Running on Zero

Commit ff1808d (parent: 859642d)
jedick committed: Use attn_implementation="sdpa"

Files changed:
- main.py: +1 -2
- requirements.txt: +2 -1
main.py CHANGED

@@ -157,8 +157,7 @@ def GetChatModel(compute_mode, ckpt_dir=None):
         # Enable FlashAttention (requires pip install flash-attn)
         # https://huggingface.co/docs/transformers/en/attention_interface
         # https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention
-        attn_implementation="
-        device_map="auto",
+        attn_implementation="sdpa",
     )
     # For Flash Attention version of Qwen3
     tokenizer.padding_side = "left"
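
For context, a minimal sketch of how the new setting is passed when the model is loaded. The checkpoint id and the body of GetChatModel are assumptions for illustration, not code from this repository; only the attn_implementation="sdpa" argument, the function signature, and the left-padding line come from the diff above.

from transformers import AutoModelForCausalLM, AutoTokenizer

def GetChatModel(compute_mode, ckpt_dir=None):
    # Hypothetical checkpoint id; the Space's actual model is not shown in this diff
    model_id = ckpt_dir or "Qwen/Qwen3-0.6B"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        # "sdpa" uses PyTorch's built-in scaled_dot_product_attention, so it needs
        # neither the flash-attn package nor (unlike device_map="auto") accelerate
        attn_implementation="sdpa",
    )
    # For Flash Attention version of Qwen3
    tokenizer.padding_side = "left"
    return model, tokenizer
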
requirements.txt CHANGED

@@ -15,7 +15,8 @@ flash-attn==2.8.2
 # ValueError: Max cache length is not consistent across layers
 transformers==4.51.3
 tokenizers==0.21.2
-
+# Only needed with AutoModelForCausalLM.from_pretrained(device_map="auto")
+#accelerate==1.8.1
 
 # Required by langchain-huggingface
 sentence-transformers==5.0.0
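
Since device_map="auto" was dropped from main.py, the accelerate pin can stay commented out. A hedged sketch of the manual alternative, placing the model on a single device explicitly; the checkpoint id and dtype are illustrative assumptions.

import torch
from transformers import AutoModelForCausalLM

model_id = "Qwen/Qwen3-0.6B"  # hypothetical checkpoint for illustration

# Without device_map="auto" (and therefore without accelerate), the model
# loads on the CPU first and is moved to the GPU explicitly if one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
).to(device)
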