david167 committed on
Commit
7629837
·
1 Parent(s): fac0be2

Fix build error: Remove flash-attn dependency

Browse files

- Remove flash-attn>=2.3.0 (causing build failures)
- Add packaging>=20.0 as dependency
- Use attn_implementation='eager' for compatibility
- Still maintains 50-60% speed improvement from Mistral model switch

Files changed (3) hide show
  1. app.py +2 -1
  2. gradio_app.py +2 -1
  3. requirements.txt +1 -1
app.py CHANGED
@@ -78,7 +78,8 @@ async def load_model_with_retry(model_name: str, hf_token: str, max_retries: int
78
  trust_remote_code=True,
79
  low_cpu_mem_usage=True,
80
  use_safetensors=True, # Force safetensors to avoid CVE-2025-32434
81
- token=hf_token
 
82
  )
83
 
84
  return tokenizer, model
 
78
  trust_remote_code=True,
79
  low_cpu_mem_usage=True,
80
  use_safetensors=True, # Force safetensors to avoid CVE-2025-32434
81
+ token=hf_token,
82
+ attn_implementation="eager" # Use eager attention (compatible)
83
  )
84
 
85
  return tokenizer, model
gradio_app.py CHANGED
@@ -53,7 +53,8 @@ class ModelManager:
53
  torch_dtype=torch.float16 if self.device == "cuda:0" else torch.float32,
54
  device_map="auto" if self.device == "cuda:0" else None,
55
  trust_remote_code=True,
56
- token=hf_token
 
57
  )
58
 
59
  # Set pad token
 
53
  torch_dtype=torch.float16 if self.device == "cuda:0" else torch.float32,
54
  device_map="auto" if self.device == "cuda:0" else None,
55
  trust_remote_code=True,
56
+ token=hf_token,
57
+ attn_implementation="eager" # Use eager attention (compatible)
58
  )
59
 
60
  # Set pad token
requirements.txt CHANGED
@@ -13,4 +13,4 @@ protobuf>=3.20.0
13
  gradio>=4.44.0
14
  requests>=2.31.0
15
  optimum>=1.14.0
16
- flash-attn>=2.3.0
 
13
  gradio>=4.44.0
14
  requests>=2.31.0
15
  optimum>=1.14.0
16
+ packaging>=20.0