ojhfklsjhl commited on
Commit
192f883
·
verified ·
1 Parent(s): bef6c1a

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +40 -6
README.md CHANGED
@@ -26,19 +26,16 @@ Our model achieves nearly state-of-the-art performance with less than 1% of trai
26
  | StoriesLM | 30 | 110 | 0.47|0.90|0.87|0.80|0.87|
27
  | NoLBERT | 30 | 109 | 0.43|0.91|0.91|0.82|0.89|
28
 
29
- # Example Usage:
30
 
31
- ## Usage Example (transformers 4.50+)
32
 
33
  ```python
34
  from transformers import AutoTokenizer, AutoModelForMaskedLM
35
  import torch
36
 
37
-
38
- #using GPU
39
  device = 'cuda:0'
40
-
41
-
42
  checkpoint_path = "alikLab/NoLBERT"
43
 
44
  model = AutoModelForMaskedLM.from_pretrained(checkpoint_path).to(device)
@@ -68,6 +65,43 @@ for i, (token, prob) in enumerate(zip(top_10_tokens, top_10_probs)):
68
  print(f"{i+1:2d}. {token:<12} (probability: {prob:.4f})")
69
  ```
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
 
73
  ## Citation
 
26
  | StoriesLM | 30 | 110 | 0.47|0.90|0.87|0.80|0.87|
27
  | NoLBERT | 30 | 109 | 0.43|0.91|0.91|0.82|0.89|
28
 
29
+ ## Usage Examples
30
 
31
+ ### Masked Language Modeling
32
 
33
  ```python
34
  from transformers import AutoTokenizer, AutoModelForMaskedLM
35
  import torch
36
 
37
+ # Using GPU
 
38
  device = 'cuda:0'
 
 
39
  checkpoint_path = "alikLab/NoLBERT"
40
 
41
  model = AutoModelForMaskedLM.from_pretrained(checkpoint_path).to(device)
 
65
  print(f"{i+1:2d}. {token:<12} (probability: {prob:.4f})")
66
  ```
67
 
68
+ ### Getting Text Embeddings
69
+
70
+ ```python
71
+ from transformers import AutoTokenizer, AutoModel
72
+ import torch
73
+
74
+ # Using GPU
75
+ device = 'cuda:0'
76
+ checkpoint_path = "alikLab/NoLBERT"
77
+
78
+ # Use AutoModel instead of AutoModelForMaskedLM to get embeddings
79
+ model = AutoModel.from_pretrained(checkpoint_path).to(device)
80
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, use_fast=True)
81
+
82
+ text = "The day after Monday is Tuesday."
83
+ inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
84
+
85
+ with torch.no_grad():
86
+ outputs = model(**inputs)
87
+
88
+ # Get the hidden states
89
+ last_hidden_states = outputs.last_hidden_state
90
+
91
+ # Method 1: Use [CLS] token embedding (first token)
92
+ cls_embedding = last_hidden_states[0, 0, :] # Shape: [hidden_size]
93
+
94
+ # Method 2: Mean pooling over all tokens (excluding padding)
95
+ attention_mask = inputs['attention_mask']
96
+ masked_embeddings = last_hidden_states * attention_mask.unsqueeze(-1)
97
+ mean_embedding = masked_embeddings.sum(dim=1) / attention_mask.sum(dim=1, keepdim=True)
98
+
99
+ print(f"CLS embedding shape: {cls_embedding.shape}")
100
+ print(f"Mean pooled embedding shape: {mean_embedding.shape}")
101
+ print(f"Text: {text}")
102
+ print(f"Embedding (first 10 dimensions): {cls_embedding[:10].tolist()}")
103
+ ```
104
+
105
 
106
 
107
  ## Citation