Spaces:
Sleeping
Sleeping
| """Utils for processing and encoding text.""" | |
| import torch | |
def lemmatize_verbs(verbs: list):
    """Reduce each verb in *verbs* to its base (lemma) form via WordNet."""
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for verb in verbs:
        # 'v' tells WordNet to treat the token as a verb.
        lemmas.append(lemmatizer.lemmatize(verb, 'v'))
    return lemmas
def lemmatize_adverbs(adverbs: list):
    """Return the WordNet lemma of every adverb in *adverbs*."""
    from nltk.stem import WordNetLemmatizer
    wnl = WordNetLemmatizer()
    # 'r' is WordNet's part-of-speech tag for adverbs.
    return list(map(lambda adverb: wnl.lemmatize(adverb, 'r'), adverbs))
class SentenceEncoder:
    """Produces fixed-size sentence embeddings from a RoBERTa model.

    Embeddings are taken from the hidden state of the leading ``<s>``
    token (RoBERTa's equivalent of BERT's ``[CLS]``).
    """

    def __init__(self, model_name="roberta-base"):
        """Load the tokenizer and model.

        Args:
            model_name: Hugging Face model identifier. Only
                ``'roberta-base'`` is currently supported.

        Raises:
            ValueError: If *model_name* is not a supported model.
        """
        # Validate up front: the original code silently skipped loading
        # for unsupported names, leaving a half-built instance that
        # failed later with AttributeError. Checking before the (slow)
        # transformers import also makes bad names fail fast.
        if model_name != 'roberta-base':
            raise ValueError(
                f"Unsupported model_name: {model_name!r}; "
                "only 'roberta-base' is supported."
            )
        from transformers import RobertaTokenizer, RobertaModel
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        self.model = RobertaModel.from_pretrained(model_name)

    def encode_sentence(self, sentence):
        """Encode a single sentence.

        Args:
            sentence: Raw text string to encode.

        Returns:
            A ``(1, hidden_size)`` tensor: the hidden state of the
            first (``<s>``) token.
        """
        inputs = self.tokenizer.encode_plus(
            sentence, add_special_tokens=True, return_tensors='pt',
        )
        # Inference only — no gradients needed.
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Use the first token's hidden state as the sentence vector
        # (mean pooling is a known alternative, deliberately not used).
        sentence_embedding = outputs.last_hidden_state[:, 0, :]
        return sentence_embedding

    def encode_sentences(self, sentences):
        """Encodes a list of sentences using model.

        Args:
            sentences: List of raw text strings.

        Returns:
            A ``(len(sentences), hidden_size)`` tensor of first-token
            hidden states; inputs are padded/truncated to a common length.
        """
        tokenized_input = self.tokenizer(
            sentences, padding=True, truncation=True, return_tensors='pt'
        )
        with torch.no_grad():
            outputs = self.model(**tokenized_input)
        embeddings = outputs.last_hidden_state[:, 0, :]
        return embeddings