Spaces:
Sleeping
Sleeping
| import torch | |
| from qdrant_client import models | |
| from qdrant_client.models import NamedVector | |
| from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer | |
class DenseEmbeddings:
    """
    Turn a piece of text into Qdrant query vectors.

    Despite the name, the class holds two model/tokenizer pairs: one used by
    :meth:`get_dense_vector` and one used by :meth:`get_sparse_vector`.
    """

    # Sparse entries whose score is not strictly above this value are dropped.
    SPARSE_SCORE_THRESHOLD = -5.0

    def __init__(
        self,
        dense_model: AutoModel,
        dense_tokenizer: AutoTokenizer,
        sparse_model: AutoModelForMaskedLM,
        sparse_tokenizer: AutoTokenizer,
    ):
        """
        Store the already-loaded models and tokenizers.

        :param dense_model: encoder whose ``last_hidden_state`` is pooled
        :param dense_tokenizer: tokenizer matching ``dense_model``
        :param sparse_model: masked-LM whose ``logits`` provide token scores
        :param sparse_tokenizer: tokenizer matching ``sparse_model``
        """
        self.dense_model = dense_model
        self.dense_tokenizer = dense_tokenizer
        self.sparse_model = sparse_model
        self.sparse_tokenizer = sparse_tokenizer

    def get_dense_vector(self, text: str) -> NamedVector:
        """
        Get dense vector from the dense model

        The vector is the mean of the model's last hidden state over the
        token axis.

        :param text: str
        :return: NamedVector named ``"text-dense"``
        """
        inputs = self.dense_tokenizer(
            text, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            outputs = self.dense_model(**inputs)
        # Average over the token axis (dim=1), then take the single batch
        # item explicitly instead of relying on squeeze().
        pooled = outputs.last_hidden_state.mean(dim=1)[0]
        # NamedVector expects a list of plain Python floats, not a NumPy
        # float32 array; tolist() also works for tensors that are not on CPU.
        return NamedVector(name="text-dense", vector=pooled.tolist())

    def get_sparse_vector(self, text: str) -> models.SparseVector:
        """
        Get sparse vector from the sparse model

        Each distinct token id of the tokenized text becomes an index; its
        value is the highest logit the model assigns to that vocabulary
        entry at any position of the input. Entries scoring at or below
        ``SPARSE_SCORE_THRESHOLD`` are left out.

        :param text: str
        :return: SparseVector
        """
        inputs = self.sparse_tokenizer(
            text, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            outputs = self.sparse_model(**inputs)
        # logits[0] is (seq_len, vocab_size) for the single input; the max
        # over dim=0 leaves one score per *vocabulary id*.
        vocab_scores = outputs.logits[0].max(dim=0).values
        # Distinct token ids present in the input (sparse indices must be
        # unique). unique() also returns them sorted.
        token_ids = inputs["input_ids"][0].unique()
        # Look the score up by token id. Pairing the two tensors positionally
        # would give token i the score of vocabulary entry i.
        scores = vocab_scores[token_ids]
        keep = scores > self.SPARSE_SCORE_THRESHOLD
        return models.SparseVector(
            indices=token_ids[keep].tolist(),
            values=scores[keep].tolist(),
        )