Spaces:
Paused
Paused
add flash-attn
Browse files
- app.py +4 -4
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -29,8 +29,6 @@ import modelscope_studio as mgr
|
|
| 29 |
os.system("pip list|grep torch")
|
| 30 |
os.system("pip list|grep trans")
|
| 31 |
os.system("pip list|grep flash")
|
| 32 |
-
os.system("nvidia-smi")
|
| 33 |
-
os.system("ll /usr/local/cuda*")
|
| 34 |
|
| 35 |
# Argparser
|
| 36 |
parser = argparse.ArgumentParser(description='demo')
|
|
@@ -46,7 +44,8 @@ if 'int4' in model_path:
|
|
| 46 |
if device == 'mps':
|
| 47 |
print('Error: running int4 model with bitsandbytes on Mac is not supported right now.')
|
| 48 |
exit()
|
| 49 |
-
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa')
|
|
|
|
| 50 |
else:
|
| 51 |
if args.multi_gpus:
|
| 52 |
from accelerate import load_checkpoint_and_dispatch, init_empty_weights, infer_auto_device_map
|
|
@@ -72,7 +71,8 @@ else:
|
|
| 72 |
|
| 73 |
model = load_checkpoint_and_dispatch(model, model_path, dtype=torch.bfloat16, device_map=device_map)
|
| 74 |
else:
|
| 75 |
-
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16)
|
|
|
|
| 76 |
model = model.to(device=device)
|
| 77 |
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
| 78 |
model.eval()
|
|
|
|
| 29 |
os.system("pip list|grep torch")
|
| 30 |
os.system("pip list|grep trans")
|
| 31 |
os.system("pip list|grep flash")
|
|
|
|
|
|
|
| 32 |
|
| 33 |
# Argparser
|
| 34 |
parser = argparse.ArgumentParser(description='demo')
|
|
|
|
| 44 |
if device == 'mps':
|
| 45 |
print('Error: running int4 model with bitsandbytes on Mac is not supported right now.')
|
| 46 |
exit()
|
| 47 |
+
#model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa')
|
| 48 |
+
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
|
| 49 |
else:
|
| 50 |
if args.multi_gpus:
|
| 51 |
from accelerate import load_checkpoint_and_dispatch, init_empty_weights, infer_auto_device_map
|
|
|
|
| 71 |
|
| 72 |
model = load_checkpoint_and_dispatch(model, model_path, dtype=torch.bfloat16, device_map=device_map)
|
| 73 |
else:
|
| 74 |
+
#model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16)
|
| 75 |
+
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)
|
| 76 |
model = model.to(device=device)
|
| 77 |
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
| 78 |
model.eval()
|
requirements.txt
CHANGED
|
@@ -3,6 +3,7 @@ torch==2.1.2
|
|
| 3 |
torchvision==0.16.2
|
| 4 |
transformers==4.40.2
|
| 5 |
sentencepiece==0.1.99
|
|
|
|
| 6 |
opencv-python
|
| 7 |
decord
|
| 8 |
gradio==4.22.0
|
|
|
|
| 3 |
torchvision==0.16.2
|
| 4 |
transformers==4.40.2
|
| 5 |
sentencepiece==0.1.99
|
| 6 |
+
https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.2/flash_attn-2.6.2+cu123torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
| 7 |
opencv-python
|
| 8 |
decord
|
| 9 |
gradio==4.22.0
|