File size: 1,053 Bytes
9f5a022 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel
import torch
# Load the pretrained DistilBERT tokenizer and encoder once, at import time.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# output_hidden_states=True is required: vectorize_text_with_bert reads
# `outputs.hidden_states` from the forward pass.
model = DistilBertModel.from_pretrained('distilbert-base-uncased', output_hidden_states=True)
model.eval()  # inference only

# Pick the best available backend: CUDA first, then Apple MPS, then CPU.
# Checking only MPS would leave a CUDA GPU unused.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
model = model.to(device)
def vectorize_text_with_bert(text):  # from hf docs
    """Embed text as the mean of the model's last-layer token vectors.

    Padding positions are excluded from the mean, so a text's embedding does
    not depend on how long the other texts in the same batch are.

    Args:
        text: Input passed straight to the module-level ``tokenizer``;
            presumably a string or a list of strings.

    Returns:
        The mean-pooled last-layer hidden states with the leading batch
        dimension squeezed away when it has size 1 (i.e. one vector for a
        single text, one row per text for a larger batch).
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    last_layer_hidden_states = outputs.hidden_states[-1]

    # Broadcast the attention mask over the feature axis so padded positions
    # contribute zero to the sum.
    mask = inputs["attention_mask"].unsqueeze(-1).to(last_layer_hidden_states.dtype)
    summed = (last_layer_hidden_states * mask).sum(dim=1)
    # Divide by the number of real tokens; clamp guards against division by
    # zero should a row ever be fully masked.
    token_counts = mask.sum(dim=1).clamp(min=1)

    text_representation = (summed / token_counts).squeeze(0)
    return text_representation
if __name__ == "__main__":
    # Small demo: embed one sentence, then show the vector and its shape.
    sample_sentence = "A man walking down the street with a dog holding a balloon in one hand."
    embedding = vectorize_text_with_bert(sample_sentence)
    print("Vectorized representation:", embedding)
    print(embedding.shape)