File size: 1,559 Bytes
5a5e912
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import torch
from torch import Tensor
from transformers import AutoModel, AutoTokenizer

from app.logger import logger

# Loaded once at import time so every embed_text() call reuses the same weights.
# NOTE(review): from_pretrained downloads the checkpoint on first use (network /
# disk side effect at import) and returns the model in eval mode by default.
model = AutoModel.from_pretrained("intfloat/multilingual-e5-large")
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large")


def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool token embeddings, ignoring padded positions.

    Embeddings at positions where the attention mask is 0 are zeroed out
    before summing, and the per-sequence sum is divided by the number of
    real (unmasked) tokens.
    """
    mask = attention_mask.unsqueeze(-1).bool()
    summed = last_hidden_states.masked_fill(~mask, 0.0).sum(dim=1)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return summed / token_counts


def embed_text(texts: list[str]) -> list[list[float]]:
    """
    Generate embeddings for a list of texts.

    The model supports a maximum of 512 tokens per input which typically corresponds to about 2000-2500 characters.
    To avoid losing important information, we set a limit of 2000 characters per input text.

    Args:
        texts: Non-empty list of input strings, each at most 2000 characters.

    Returns:
        One embedding vector (list of floats) per input text, in input order.

    Raises:
        ValueError: If `texts` is empty or any text exceeds 2000 characters.
    """
    if not texts:
        raise ValueError("No input texts provided.")
    if any(len(text) > 2000 for text in texts):
        raise ValueError(
            "One or more input texts exceed the maximum length of 2000 characters."
        )

    batch_dict = tokenizer(
        texts, max_length=512, padding=True, truncation=True, return_tensors="pt"
    )
    # attention_mask counts real (non-padding) tokens directly; comparing
    # input_ids against pad_token_id would miscount if the pad id ever
    # occurred as an actual token.
    logger.info(
        f"Tokenized {len(texts)} texts with number of tokens per text: {batch_dict['attention_mask'].sum(dim=1).tolist()}"
    )
    # Inference only: disable autograd so no computation graph is built,
    # saving memory and time. This also makes .detach() unnecessary.
    with torch.no_grad():
        outputs = model(**batch_dict)
        embeddings = average_pool(
            outputs.last_hidden_state, batch_dict["attention_mask"]
        )

    return embeddings.cpu().tolist()