# text2vector/app/embeddings.py
import torch
from torch import Tensor
from transformers import AutoModel, AutoTokenizer

from app.logger import logger

# Load the multilingual E5 model and tokenizer once at module import time.
model = AutoModel.from_pretrained("intfloat/multilingual-e5-large")
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large")

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool the token embeddings, ignoring padding positions."""
    # Zero out hidden states at padding positions so they do not contribute to the sum.
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    # Divide the per-sequence sum by the number of real (non-padding) tokens.
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
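
# A minimal sanity-check sketch for average_pool (assumed tiny shapes, illustrative only):
# with one sequence of two tokens where the second token is padding, only the first
# token's vector should survive the pooling.
#
#   hidden = torch.tensor([[[1.0, 2.0, 3.0], [9.0, 9.0, 9.0]]])  # shape (1, 2, 3)
#   mask = torch.tensor([[1, 0]])                                 # shape (1, 2)
#   average_pool(hidden, mask)  # -> tensor([[1., 2., 3.]])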

def embed_text(texts: list[str]) -> list[list[float]]:
    """
    Generate embeddings for a list of texts.

    The model accepts at most 512 tokens per input, which typically corresponds to
    roughly 2000-2500 characters. To avoid silently truncating important information,
    each input text is limited to 2000 characters.
    """
    if not texts:
        raise ValueError("No input texts provided.")
    if any(len(text) > 2000 for text in texts):
        raise ValueError(
            "One or more input texts exceed the maximum length of 2000 characters."
        )

    # Pad to the longest sequence in the batch and truncate to the model's 512-token limit.
    batch_dict = tokenizer(
        texts, max_length=512, padding=True, truncation=True, return_tensors="pt"
    )
    logger.info(
        f"Tokenized {len(texts)} texts with number of tokens per text: "
        f"{batch_dict['input_ids'].ne(tokenizer.pad_token_id).sum(dim=1).tolist()}"
    )

    # Inference only, so skip gradient tracking to save memory.
    with torch.no_grad():
        outputs = model(**batch_dict)

    # Mean-pool the last hidden states into one fixed-size vector per input text.
    embeddings = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
    return embeddings.cpu().tolist()
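

# A minimal usage sketch (assumption: run as a script from the project root so that
# "app.logger" is importable; the example texts are illustrative).
if __name__ == "__main__":
    vectors = embed_text(["Hello world", "Bonjour le monde"])
    # multilingual-e5-large is expected to produce 1024-dimensional vectors.
    print(f"{len(vectors)} embeddings of dimension {len(vectors[0])}")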