if pytorch 使用 conda 安装 :
    conda install xformers -c xformers

elif pytorch 使用 pip 安装 :
    # cuda 11.8 version
    pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu118
    # cuda 12.1 version
    pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121

更多信息可参考 installing-xformers。

然后，加载模型时设置 unpad_inputs 和 use_memory_efficient_attention 为 true，并设置 torch_dtype 为 torch.float16 (or torch.bfloat16)，即可获得加速。

import torch
from transformers import AutoModel, AutoTokenizer

path = 'Alibaba-NLP/gte-Qwen2-1.5B-instruct'
device = torch.device('cuda')
tokenzier = AutoTokenizer.from_pretrained(path)
model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,
    unpad_inputs=True,
    use_memory_efficient_attention=True,
    torch_dtype=torch.float16
).to(device)

inputs = tokenzier(['test input'], truncation=True, max_length=8192, padding=True, return_tensors='pt')

with torch.inference_mode():
    outputs = model(**inputs.to(device))

也可以直接修改模型的 config.json 中 unpad_inputs 和 use_memory_efficient_attention 为 true，省去代码中的设置。

Citation

@misc{zhang2024mgte,
  title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval}, 
  author={Xin Zhang and Yanzhao Zhang and Dingkun Long and Wen Xie and Ziqi Dai and Jialong Tang and Huan Lin and Baosong Yang and Pengjun Xie and Fei Huang and Meishan Zhang and Wenjie Li and Min Zhang},
  year={2024},
  eprint={2407.19669},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2407.19669}, 
}