Bugpie committed · Commit 524b597 · 1 Parent(s): 13a96e2

Update README.md

Files changed (1): README.md (+40 −0)

README.md CHANGED
@@ -56,4 +56,44 @@ OSCAR or Open Super-large Crawled Aggregated coRpus is a multilingual corpus obt
  'token': 1654,
  'token_str': 'parfait',
  'sequence': 'Le camembert est parfait :)'}]
+ ```
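The `tokenizer` and `camembert` objects used in the snippets below (and the fill-mask output shown above) come from setup earlier in the README, outside this hunk. A minimal sketch of that setup, assuming the standard `transformers` classes; the variable names here are this editor's assumption, not taken from the diff:

```python
from transformers import CamembertModel, CamembertTokenizer, pipeline

# Assumed setup (not part of this hunk): load the pretrained tokenizer and model
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
camembert = CamembertModel.from_pretrained("camembert-base")

# Output like the block above can be produced with the fill-mask pipeline
camembert_fill_mask = pipeline("fill-mask", model="camembert-base", tokenizer="camembert-base")
results = camembert_fill_mask("Le camembert est <mask> :)")
```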
+
+ **Extract contextual embedding features from CamemBERT output**
+ ```python
+ import torch
+ # Tokenize into sub-words with SentencePiece
+ tokenized_sentence = tokenizer.tokenize("J'aime le camembert !")
+ # ['▁J', "'", 'aime', '▁le', '▁ca', 'member', 't', '▁!']
+
+ # Map the tokens to their vocabulary IDs and add the special start and end tokens
+ encoded_sentence = tokenizer.encode(tokenized_sentence)
+ # [5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]
+ # NB: this can be done in one step: tokenizer.encode("J'aime le camembert !")
+
+ # Feed the tokens to CamemBERT as a torch tensor (batch of size 1)
+ encoded_sentence = torch.tensor(encoded_sentence).unsqueeze(0)
+ embeddings, _ = camembert(encoded_sentence)
+ # Call embeddings.detach() if you do not need gradients
+ # embeddings.size() == torch.Size([1, 10, 768])
+ # tensor([[[-0.0254,  0.0235,  0.1027,  ..., -0.1459, -0.0205, -0.0116],
+ #          [ 0.0606, -0.1811, -0.0418,  ..., -0.1815,  0.0880, -0.0766],
+ #          [-0.1561, -0.1127,  0.2687,  ..., -0.0648,  0.0249,  0.0446],
+ #          ...,
+ ```
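A possible next step, not part of the original card: pool the per-token embeddings into a single sentence vector. A minimal sketch, assuming `embeddings` is the `[1, 10, 768]` tensor produced above; mean pooling is one common choice among several:

```python
# Mean-pool over the token dimension to get one vector per sentence
# (illustrative usage, not from the README)
sentence_embedding = embeddings.detach().mean(dim=1)
# sentence_embedding.size() == torch.Size([1, 768])
```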
+
+ **Extract contextual embedding features from all CamemBERT layers**
+ ```python
+ from transformers import CamembertConfig
+ # The model must be reloaded with a config that exposes all hidden states
+ config = CamembertConfig.from_pretrained("camembert-base", output_hidden_states=True)
+ camembert = CamembertModel.from_pretrained("camembert-base", config=config)
+
+ embeddings, _, all_layer_embeddings = camembert(encoded_sentence)
+ # all_layer_embeddings is a list with len(all_layer_embeddings) == 13
+ # (the input embedding layer + the 12 self-attention layers)
+ all_layer_embeddings[5]
+ # Layer 5 contextual embeddings: size torch.Size([1, 10, 768])
+ # tensor([[[-0.0032,  0.0075,  0.0040,  ..., -0.0025, -0.0178, -0.0210],
+ #          [-0.0996, -0.1474,  0.1057,  ..., -0.0278,  0.1690, -0.2982],
+ #          [ 0.0557, -0.0588,  0.0547,  ..., -0.0726, -0.0867,  0.0699],
+ #          ...,
  ```
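One illustrative way to use the per-layer outputs, again an editor's sketch rather than part of the model card: combine the upper layers, for example by averaging the last four hidden states:

```python
import torch

# Average the last four layers' hidden states (a common heuristic for feature
# extraction, not a recommendation from the README)
stacked = torch.stack(all_layer_embeddings[-4:])  # size [4, 1, 10, 768]
avg_last_four_layers = stacked.mean(dim=0)        # size [1, 10, 768]
```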