Update README.md
Browse files
README.md
CHANGED
|
@@ -26,19 +26,16 @@ Our model achieves nearly state-of-the-art performance with less than 1% of trai
|
|
| 26 |
| StoriesLM | 30 | 110 | 0.47|0.90|0.87|0.80|0.87|
|
| 27 |
| NoLBERT | 30 | 109 | 0.43|0.91|0.91|0.82|0.89|
|
| 28 |
|
| 29 |
-
|
| 30 |
|
| 31 |
-
|
| 32 |
|
| 33 |
```python
|
| 34 |
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
| 35 |
import torch
|
| 36 |
|
| 37 |
-
|
| 38 |
-
# Using GPU
|
| 39 |
device = 'cuda:0'
|
| 40 |
-
|
| 41 |
-
|
| 42 |
checkpoint_path = "alikLab/NoLBERT"
|
| 43 |
|
| 44 |
model = AutoModelForMaskedLM.from_pretrained(checkpoint_path).to(device)
|
|
@@ -68,6 +65,43 @@ for i, (token, prob) in enumerate(zip(top_10_tokens, top_10_probs)):
|
|
| 68 |
print(f"{i+1:2d}. {token:<12} (probability: {prob:.4f})")
|
| 69 |
```
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
|
| 73 |
## Citation
|
|
|
|
| 26 |
| StoriesLM | 30 | 110 | 0.47|0.90|0.87|0.80|0.87|
|
| 27 |
| NoLBERT | 30 | 109 | 0.43|0.91|0.91|0.82|0.89|
|
| 28 |
|
| 29 |
+
## Usage Examples
|
| 30 |
|
| 31 |
+
### Masked Language Modeling
|
| 32 |
|
| 33 |
```python
|
| 34 |
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
| 35 |
import torch
|
| 36 |
|
| 37 |
+
# Using GPU
|
|
|
|
| 38 |
device = 'cuda:0'
|
|
|
|
|
|
|
| 39 |
checkpoint_path = "alikLab/NoLBERT"
|
| 40 |
|
| 41 |
model = AutoModelForMaskedLM.from_pretrained(checkpoint_path).to(device)
|
|
|
|
| 65 |
print(f"{i+1:2d}. {token:<12} (probability: {prob:.4f})")
|
| 66 |
```
|
| 67 |
|
| 68 |
+
### Getting Text Embeddings
|
| 69 |
+
|
| 70 |
+
```python
|
| 71 |
+
from transformers import AutoTokenizer, AutoModel
|
| 72 |
+
import torch
|
| 73 |
+
|
| 74 |
+
# Using GPU
|
| 75 |
+
device = 'cuda:0'
|
| 76 |
+
checkpoint_path = "alikLab/NoLBERT"
|
| 77 |
+
|
| 78 |
+
# Use AutoModel instead of AutoModelForMaskedLM to get embeddings
|
| 79 |
+
model = AutoModel.from_pretrained(checkpoint_path).to(device)
|
| 80 |
+
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, use_fast=True)
|
| 81 |
+
|
| 82 |
+
text = "The day after Monday is Tuesday."
|
| 83 |
+
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
|
| 84 |
+
|
| 85 |
+
with torch.no_grad():
|
| 86 |
+
outputs = model(**inputs)
|
| 87 |
+
|
| 88 |
+
# Get the hidden states
|
| 89 |
+
last_hidden_states = outputs.last_hidden_state
|
| 90 |
+
|
| 91 |
+
# Method 1: Use [CLS] token embedding (first token)
|
| 92 |
+
cls_embedding = last_hidden_states[0, 0, :] # Shape: [hidden_size]
|
| 93 |
+
|
| 94 |
+
# Method 2: Mean pooling over all tokens (excluding padding)
|
| 95 |
+
attention_mask = inputs['attention_mask']
|
| 96 |
+
masked_embeddings = last_hidden_states * attention_mask.unsqueeze(-1)
|
| 97 |
+
mean_embedding = masked_embeddings.sum(dim=1) / attention_mask.sum(dim=1, keepdim=True)
|
| 98 |
+
|
| 99 |
+
print(f"CLS embedding shape: {cls_embedding.shape}")
|
| 100 |
+
print(f"Mean pooled embedding shape: {mean_embedding.shape}")
|
| 101 |
+
print(f"Text: {text}")
|
| 102 |
+
print(f"Embedding (first 10 dimensions): {cls_embedding[:10].tolist()}")
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
|
| 106 |
|
| 107 |
## Citation
|