added flash_attention

Browse files
- app.py +3 -1
- requirements.txt +1 -0
app.py CHANGED

@@ -18,7 +18,9 @@
 model = LlavaForConditionalGeneration.from_pretrained(
     model_id,
     quantization_config=quantization_config,
-    device_map="auto"
+    device_map="auto",
+    use_flash_attention_2=True,
+    low_cpu_mem_usage=True
 )
requirements.txt CHANGED

@@ -150,3 +150,4 @@
 websocket-client==1.7.0
 websockets==11.0.3
 widgetsnbextension==4.0.9
+sentence_transformers