| """Utils for processing and encoding text.""" |
|
|
|
|
| def lemmatize_verbs(verbs: list): |
| import nltk |
| from nltk.stem import WordNetLemmatizer |
| wnl = WordNetLemmatizer() |
| return [wnl.lemmatize(verb, 'v') for verb in verbs] |
|
|
|
|
| def lemmatize_adverbs(adverbs: list): |
| import nltk |
| from nltk.stem import WordNetLemmatizer |
| wnl = WordNetLemmatizer() |
| return [wnl.lemmatize(adverb, 'r') for adverb in adverbs] |
|
|
|
|
| class SentenceEncoder: |
|
|
| def __init__(self, model_name="roberta-base"): |
| from transformers import RobertaTokenizer, RobertaModel |
| if model_name == 'roberta-base': |
| self.tokenizer = RobertaTokenizer.from_pretrained(model_name) |
| self.model = RobertaModel.from_pretrained(model_name) |
| |
| def encode_sentence(self, sentence): |
| import torch |
| inputs = self.tokenizer.encode_plus( |
| sentence, add_special_tokens=True, return_tensors='pt', |
| ) |
| with torch.no_grad(): |
| outputs = self.model(**inputs) |
| |
| sentence_embedding = outputs.last_hidden_state[:, 0, :] |
| return sentence_embedding |
| |
| def encode_sentences(self, sentences): |
| """Encodes a list of sentences using model.""" |
| import torch |
| tokenized_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt') |
| with torch.no_grad(): |
| outputs = self.model(**tokenized_input) |
| embeddings = outputs.last_hidden_state[:, 0, :] |
| return embeddings |
| |