Bugpie committed on
Commit
9a971d5
·
1 Parent(s): 524b597

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +9 -24
README.md CHANGED
@@ -36,6 +36,7 @@ OSCAR or Open Super-large Crawled Aggregated coRpus is a multilingual corpus obt
36
  >>> camembert_fill_mask = pipeline("fill-mask", model="camembert-base")
37
  >>> results = camembert_fill_mask("Le camembert est <mask> :)")
38
 
 
39
  [{'score': 0.49091097712516785,
40
  'token': 7200,
41
  'token_str': 'délicieux',
@@ -59,18 +60,19 @@ OSCAR or Open Super-large Crawled Aggregated coRpus is a multilingual corpus obt
59
  ```
60
 
61
  -**Extract contextual embedding features from Camembert output**
 
62
  ```python
63
  import torch
64
- # Tokenize in sub-words with SentencePiece
65
- tokenized_sentence = tokenizer.tokenize("J'aime le camembert !")
66
- # ['▁J', "'", 'aime', '▁le', '▁ca', 'member', 't', '▁!']
67
 
68
- # 1-hot encode and add special starting and end tokens
69
- encoded_sentence = tokenizer.encode(tokenized_sentence)
 
 
 
 
 
70
  # [5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]
71
- # NB: Can be done in one step : tokenize.encode("J'aime le camembert !")
72
 
73
- # Feed tokens to Camembert as a torch tensor (batch dim 1)
74
  encoded_sentence = torch.tensor(encoded_sentence).unsqueeze(0)
75
  embeddings, _ = camembert(encoded_sentence)
76
  # embeddings.detach()
@@ -79,21 +81,4 @@ embeddings, _ = camembert(encoded_sentence)
79
  # [ 0.0606, -0.1811, -0.0418, ..., -0.1815, 0.0880, -0.0766],
80
  # [-0.1561, -0.1127, 0.2687, ..., -0.0648, 0.0249, 0.0446],
81
  # ...,
82
- ```
83
-
84
- -**Extract contextual embedding features from all Camembert layers**
85
- ```python
86
- from transformers import CamembertConfig
87
- # (Need to reload the model with new config)
88
- config = CamembertConfig.from_pretrained("camembert-base", output_hidden_states=True)
89
- camembert = CamembertModel.from_pretrained("camembert-base", config=config)
90
-
91
- embeddings, _, all_layer_embeddings = camembert(encoded_sentence)
92
- # all_layer_embeddings list of len(all_layer_embeddings) == 13 (input embedding layer + 12 self attention layers)
93
- all_layer_embeddings[5]
94
- # layer 5 contextual embedding : size torch.Size([1, 10, 768])
95
- #tensor([[[-0.0032, 0.0075, 0.0040, ..., -0.0025, -0.0178, -0.0210],
96
- # [-0.0996, -0.1474, 0.1057, ..., -0.0278, 0.1690, -0.2982],
97
- # [ 0.0557, -0.0588, 0.0547, ..., -0.0726, -0.0867, 0.0699],
98
- # ...,
99
  ```
 
36
  >>> camembert_fill_mask = pipeline("fill-mask", model="camembert-base")
37
  >>> results = camembert_fill_mask("Le camembert est <mask> :)")
38
 
39
+ >>> results
40
  [{'score': 0.49091097712516785,
41
  'token': 7200,
42
  'token_str': 'délicieux',
 
60
  ```
61
 
62
  -**Extract contextual embedding features from Camembert output**
63
+
64
  ```python
65
  import torch
 
 
 
66
 
67
+ >>> tokenized_sentence = tokenizer.tokenize("J'aime le camembert !")
68
+ >>> encoded_sentence = tokenizer.encode(tokenized_sentence)
69
+ # Can be done in one step: tokenizer.encode("J'aime le camembert !")
70
+
71
+ >>> tokenized_sentence
72
+ ['▁J', "'", 'aime', '▁le', '▁ca', 'member', 't', '▁!']
73
+ >>> encoded_sentence
74
  # [5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]
 
75
 
 
76
  encoded_sentence = torch.tensor(encoded_sentence).unsqueeze(0)
77
  embeddings, _ = camembert(encoded_sentence)
78
  # embeddings.detach()
 
81
  # [ 0.0606, -0.1811, -0.0418, ..., -0.1815, 0.0880, -0.0766],
82
  # [-0.1561, -0.1127, 0.2687, ..., -0.0648, 0.0249, 0.0446],
83
  # ...,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  ```