Spaces: Running on Zero

Commit ff1808d (parent: 859642d)
jedick committed: Use attn_implementation="sdpa"

Files changed:
- main.py: +1 -2
- requirements.txt: +2 -1
main.py CHANGED

@@ -157,8 +157,7 @@ def GetChatModel(compute_mode, ckpt_dir=None):
         # Enable FlashAttention (requires pip install flash-attn)
         # https://huggingface.co/docs/transformers/en/attention_interface
         # https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention
-        attn_implementation="
-        device_map="auto",
+        attn_implementation="sdpa",
     )
     # For Flash Attention version of Qwen3
     tokenizer.padding_side = "left"
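
For context, a minimal sketch of how the new setting is passed when the model is loaded. The checkpoint id and the body of GetChatModel are assumptions for illustration, not code from this repository; only the attn_implementation="sdpa" argument, the function signature, and the left-padding line come from the diff above.

from transformers import AutoModelForCausalLM, AutoTokenizer

def GetChatModel(compute_mode, ckpt_dir=None):
    # Hypothetical checkpoint id; the Space's actual model is not shown in this diff
    model_id = ckpt_dir or "Qwen/Qwen3-0.6B"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        # "sdpa" uses PyTorch's built-in scaled_dot_product_attention, so it needs
        # neither the flash-attn package nor (unlike device_map="auto") accelerate
        attn_implementation="sdpa",
    )
    # For Flash Attention version of Qwen3
    tokenizer.padding_side = "left"
    return model, tokenizer
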
requirements.txt CHANGED

@@ -15,7 +15,8 @@ flash-attn==2.8.2
 # ValueError: Max cache length is not consistent across layers
 transformers==4.51.3
 tokenizers==0.21.2
-
+# Only needed with AutoModelForCausalLM.from_pretrained(device_map="auto")
+#accelerate==1.8.1
 
 # Required by langchain-huggingface
 sentence-transformers==5.0.0
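
Since device_map="auto" was dropped from main.py, the accelerate pin can stay commented out. A hedged sketch of the manual alternative, placing the model on a single device explicitly; the checkpoint id and dtype are illustrative assumptions.

import torch
from transformers import AutoModelForCausalLM

model_id = "Qwen/Qwen3-0.6B"  # hypothetical checkpoint for illustration

# Without device_map="auto" (and therefore without accelerate), the model
# loads on the CPU first and is moved to the GPU explicitly if one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
).to(device)
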