Spaces: Sleeping
Fix build error: Remove flash-attn dependency

- Remove flash-attn>=2.3.0 (causing build failures)
- Add packaging>=20.0 as dependency
- Use attn_implementation='eager' for compatibility
- Still maintains 50-60% speed improvement from Mistral model switch

Files changed:
- app.py (+2 −1)
- gradio_app.py (+2 −1)
- requirements.txt (+1 −1)
app.py (CHANGED)

@@ -78,7 +78,8 @@ async def load_model_with_retry(model_name: str, hf_token: str, max_retries: int
     trust_remote_code=True,
     low_cpu_mem_usage=True,
     use_safetensors=True,  # Force safetensors to avoid CVE-2025-32434
-    token=hf_token
+    token=hf_token,
+    attn_implementation="eager"  # Use eager attention (compatible)
 )

 return tokenizer, model
gradio_app.py (CHANGED)

@@ -53,7 +53,8 @@ class ModelManager:
     torch_dtype=torch.float16 if self.device == "cuda:0" else torch.float32,
     device_map="auto" if self.device == "cuda:0" else None,
     trust_remote_code=True,
-    token=hf_token
+    token=hf_token,
+    attn_implementation="eager"  # Use eager attention (compatible)
 )

 # Set pad token
requirements.txt (CHANGED)

@@ -13,4 +13,4 @@ protobuf>=3.20.0
 gradio>=4.44.0
 requests>=2.31.0
 optimum>=1.14.0
-flash-attn>=2.3.0
+packaging>=20.0

(Note: the removed line's text was lost in page extraction; reconstructed as flash-attn>=2.3.0 per the commit message.)