david167 committed on
Commit
7629837
·
1 Parent(s): fac0be2

Fix build error: Remove flash-attn dependency

Browse files

- Remove flash-attn>=2.3.0 (causing build failures)
- Add packaging>=20.0 as dependency
- Use attn_implementation='eager' for compatibility
- Still maintains 50-60% speed improvement from Mistral model switch

Files changed (3) hide show
  1. app.py +2 -1
  2. gradio_app.py +2 -1
  3. requirements.txt +1 -1
app.py CHANGED
@@ -78,7 +78,8 @@ async def load_model_with_retry(model_name: str, hf_token: str, max_retries: int
78
  trust_remote_code=True,
79
  low_cpu_mem_usage=True,
80
  use_safetensors=True, # Force safetensors to avoid CVE-2025-32434
81
- token=hf_token
 
82
  )
83
 
84
  return tokenizer, model
 
78
  trust_remote_code=True,
79
  low_cpu_mem_usage=True,
80
  use_safetensors=True, # Force safetensors to avoid CVE-2025-32434
81
+ token=hf_token,
82
+ attn_implementation="eager" # Use eager attention (compatible)
83
  )
84
 
85
  return tokenizer, model
gradio_app.py CHANGED
@@ -53,7 +53,8 @@ class ModelManager:
53
  torch_dtype=torch.float16 if self.device == "cuda:0" else torch.float32,
54
  device_map="auto" if self.device == "cuda:0" else None,
55
  trust_remote_code=True,
56
- token=hf_token
 
57
  )
58
 
59
  # Set pad token
 
53
  torch_dtype=torch.float16 if self.device == "cuda:0" else torch.float32,
54
  device_map="auto" if self.device == "cuda:0" else None,
55
  trust_remote_code=True,
56
+ token=hf_token,
57
+ attn_implementation="eager" # Use eager attention (compatible)
58
  )
59
 
60
  # Set pad token
requirements.txt CHANGED
@@ -13,4 +13,4 @@ protobuf>=3.20.0
13
  gradio>=4.44.0
14
  requests>=2.31.0
15
  optimum>=1.14.0
16
- flash-attn>=2.3.0
 
13
  gradio>=4.44.0
14
  requests>=2.31.0
15
  optimum>=1.14.0
16
+ packaging>=20.0