import torch
from torch import Tensor
from transformers import AutoModel, AutoTokenizer

from app.logger import logger

# multilingual-e5-large accepts at most 512 tokens per input, which for
# typical text corresponds to roughly 2000-2500 characters.
MODEL_NAME = "intfloat/multilingual-e5-large"
MAX_INPUT_CHARS = 2000
MAX_TOKENS = 512

model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Inference only: disable dropout etc. so embeddings are deterministic.
model.eval()


def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Average pool the token embeddings.

    Args:
        last_hidden_states: Token embeddings, shape (batch, seq_len, dim).
        attention_mask: Mask of shape (batch, seq_len); 1 marks real tokens,
            0 marks padding.

    Returns:
        Mean of the non-padding token embeddings, shape (batch, dim).
    """
    # Zero out padding positions so they do not contribute to the sum.
    last_hidden = last_hidden_states.masked_fill(
        ~attention_mask[..., None].bool(), 0.0
    )
    # Divide by the number of real tokens per sequence, not by seq_len.
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


def embed_text(texts: list[str]) -> list[list[float]]:
    """
    Generate embeddings for a list of texts.

    The model supports a maximum of 512 tokens per input which typically
    corresponds to about 2000-2500 characters. To avoid losing important
    information, we set a limit of 2000 characters per input text.

    Args:
        texts: Non-empty list of input strings, each at most 2000 characters.

    Returns:
        One embedding (list of floats) per input text, in input order.

    Raises:
        ValueError: If ``texts`` is empty or any text exceeds the limit.
    """
    if not texts:
        raise ValueError("No input texts provided.")
    if any(len(text) > MAX_INPUT_CHARS for text in texts):
        raise ValueError(
            "One or more input texts exceed the maximum length of 2000 characters."
        )

    batch_dict = tokenizer(
        texts,
        max_length=MAX_TOKENS,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    # attention_mask is the authoritative real-token count; comparing
    # input_ids against pad_token_id would miscount if a genuine token
    # ever shared the pad id.
    token_counts = batch_dict["attention_mask"].sum(dim=1).tolist()
    # Lazy %-args: skip formatting entirely when INFO logging is disabled.
    logger.info(
        "Tokenized %d texts with number of tokens per text: %s",
        len(texts),
        token_counts,
    )

    # Embedding is pure inference: no_grad avoids building an autograd
    # graph (and makes the former .detach() unnecessary).
    with torch.no_grad():
        outputs = model(**batch_dict)
    embeddings = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
    # NOTE(review): E5-family models are documented as expecting "query: " /
    # "passage: " prefixes on inputs; presumably callers add them upstream —
    # confirm.
    return embeddings.cpu().tolist()