Spaces:
Sleeping
Sleeping
import torch
from torch import Tensor
from transformers import AutoModel, AutoTokenizer

from app.logger import logger
| model = AutoModel.from_pretrained("intfloat/multilingual-e5-large") | |
| tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large") | |
| def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: | |
| """Average pool the token embeddings.""" | |
| last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0) | |
| return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] | |
| def embed_text(texts: list[str]) -> list[list[float]]: | |
| """ | |
| Generate embeddings for a list of texts. | |
| The model supports a maximum of 512 tokens per input which typically corresponds to about 2000-2500 characters. | |
| To avoid losing important information, we set a limit of 2000 characters per input text. | |
| """ | |
| if not texts: | |
| raise ValueError("No input texts provided.") | |
| if any(len(text) > 2000 for text in texts): | |
| raise ValueError( | |
| "One or more input texts exceed the maximum length of 2000 characters." | |
| ) | |
| batch_dict = tokenizer( | |
| texts, max_length=512, padding=True, truncation=True, return_tensors="pt" | |
| ) | |
| logger.info( | |
| f"Tokenized {len(texts)} texts with number of tokens per text: {batch_dict['input_ids'].ne(tokenizer.pad_token_id).sum(dim=1).tolist()}" | |
| ) | |
| outputs = model(**batch_dict) | |
| embeddings = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"]) | |
| return embeddings.detach().cpu().tolist() | |