Update README.md
Browse files
README.md
CHANGED
|
@@ -26,19 +26,16 @@ Our model achieves nearly state-of-the-art performance with less than 1% of trai
|
|
| 26 |
| StoriesLM | 30 | 110 | 0.47|0.90|0.87|0.80|0.87|
|
| 27 |
| NoLBERT | 30 | 109 | 0.43|0.91|0.91|0.82|0.89|
|
| 28 |
|
| 29 |
-
|
| 30 |
|
| 31 |
-
|
| 32 |
|
| 33 |
```python
|
| 34 |
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
| 35 |
import torch
|
| 36 |
|
| 37 |
-
|
| 38 |
-
# Using GPU
|
| 39 |
device = 'cuda:0'
|
| 40 |
-
|
| 41 |
-
|
| 42 |
checkpoint_path = "alikLab/NoLBERT"
|
| 43 |
|
| 44 |
model = AutoModelForMaskedLM.from_pretrained(checkpoint_path).to(device)
|
|
@@ -68,6 +65,43 @@ for i, (token, prob) in enumerate(zip(top_10_tokens, top_10_probs)):
|
|
| 68 |
print(f"{i+1:2d}. {token:<12} (probability: {prob:.4f})")
|
| 69 |
```
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
|
| 73 |
## Citation
|
|
|
|
| 26 |
| StoriesLM | 30 | 110 | 0.47|0.90|0.87|0.80|0.87|
|
| 27 |
| NoLBERT | 30 | 109 | 0.43|0.91|0.91|0.82|0.89|
|
| 28 |
|
| 29 |
+
## Usage Examples
|
| 30 |
|
| 31 |
+
### Masked Language Modeling
|
| 32 |
|
| 33 |
```python
|
| 34 |
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
| 35 |
import torch
|
| 36 |
|
| 37 |
+
# Using GPU
|
|
|
|
| 38 |
device = 'cuda:0'
|
|
|
|
|
|
|
| 39 |
checkpoint_path = "alikLab/NoLBERT"
|
| 40 |
|
| 41 |
model = AutoModelForMaskedLM.from_pretrained(checkpoint_path).to(device)
|
|
|
|
| 65 |
print(f"{i+1:2d}. {token:<12} (probability: {prob:.4f})")
|
| 66 |
```
|
| 67 |
|
| 68 |
+
### Getting Text Embeddings
|
| 69 |
+
|
| 70 |
+
```python
|
| 71 |
+
from transformers import AutoTokenizer, AutoModel
|
| 72 |
+
import torch
|
| 73 |
+
|
| 74 |
+
# Using GPU
|
| 75 |
+
device = 'cuda:0'
|
| 76 |
+
checkpoint_path = "alikLab/NoLBERT"
|
| 77 |
+
|
| 78 |
+
# Use AutoModel instead of AutoModelForMaskedLM to get embeddings
|
| 79 |
+
model = AutoModel.from_pretrained(checkpoint_path).to(device)
|
| 80 |
+
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, use_fast=True)
|
| 81 |
+
|
| 82 |
+
text = "The day after Monday is Tuesday."
|
| 83 |
+
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
|
| 84 |
+
|
| 85 |
+
with torch.no_grad():
|
| 86 |
+
outputs = model(**inputs)
|
| 87 |
+
|
| 88 |
+
# Get the hidden states
|
| 89 |
+
last_hidden_states = outputs.last_hidden_state
|
| 90 |
+
|
| 91 |
+
# Method 1: Use [CLS] token embedding (first token)
|
| 92 |
+
cls_embedding = last_hidden_states[0, 0, :] # Shape: [hidden_size]
|
| 93 |
+
|
| 94 |
+
# Method 2: Mean pooling over all tokens (excluding padding)
|
| 95 |
+
attention_mask = inputs['attention_mask']
|
| 96 |
+
masked_embeddings = last_hidden_states * attention_mask.unsqueeze(-1)
|
| 97 |
+
mean_embedding = masked_embeddings.sum(dim=1) / attention_mask.sum(dim=1, keepdim=True)
|
| 98 |
+
|
| 99 |
+
print(f"CLS embedding shape: {cls_embedding.shape}")
|
| 100 |
+
print(f"Mean pooled embedding shape: {mean_embedding.shape}")
|
| 101 |
+
print(f"Text: {text}")
|
| 102 |
+
print(f"Embedding (first 10 dimensions): {cls_embedding[:10].tolist()}")
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
|
| 106 |
|
| 107 |
## Citation
|