Adding `safetensors` variant of this model
#1
by SFconvertbot - opened
- .DS_Store +0 -0
- .gitattributes +0 -1
- README.md +14 -109
- crf_config.json → jhu-clsp-mmBERT-base-multilingual-pii/crf_config.json +0 -0
- jhu-clsp-mmBERT-base-multilingual-pii/model.safetensors +3 -0
- pytorch_model.bin → jhu-clsp-mmBERT-base-multilingual-pii/pytorch_model.bin +0 -0
- special_tokens_map.json → jhu-clsp-mmBERT-base-multilingual-pii/special_tokens_map.json +0 -0
- tokenizer.json → jhu-clsp-mmBERT-base-multilingual-pii/tokenizer.json +0 -0
- tokenizer_config.json → jhu-clsp-mmBERT-base-multilingual-pii/tokenizer_config.json +0 -0
.DS_Store
CHANGED
|
Binary files a/.DS_Store and b/.DS_Store differ
|
|
|
.gitattributes
CHANGED
|
@@ -34,4 +34,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
jhu-clsp-mmBERT-base-multilingual-pii/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
-
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
jhu-clsp-mmBERT-base-multilingual-pii/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
README.md
CHANGED
|
@@ -101,9 +101,7 @@ Evaluated on held-out test sets per language (type-aware micro scores):
|
|
| 101 |
|
| 102 |
## Usage
|
| 103 |
|
| 104 |
-
This model uses a custom CRF architecture and cannot be loaded directly with `AutoModelForTokenClassification`. You need to use the custom `ModernBertCRF` class
|
| 105 |
-
|
| 106 |
-
### Setup
|
| 107 |
|
| 108 |
```python
|
| 109 |
import torch
|
|
@@ -142,7 +140,7 @@ class ModernBertCRF(nn.Module):
|
|
| 142 |
return self.crf.decode(emissions, mask=mask)
|
| 143 |
|
| 144 |
# Load model
|
| 145 |
-
model_dir = "deryaerman/mmbert_multilingual_pii_ner"
|
| 146 |
|
| 147 |
with open(f"{model_dir}/crf_config.json") as f:
|
| 148 |
config = json.load(f)
|
|
@@ -157,116 +155,23 @@ model.load_state_dict(torch.load(f"{model_dir}/pytorch_model.bin", map_location=
|
|
| 157 |
model.eval()
|
| 158 |
|
| 159 |
tokenizer = AutoTokenizer.from_pretrained(model_dir)
|
| 160 |
-
id2label = {int(k): v for k, v in config["id2label"].items()}
|
| 161 |
-
```
|
| 162 |
-
|
| 163 |
-
### Preprocessing: Sentence Splitting
|
| 164 |
-
|
| 165 |
-
The model was trained on **sentence-level** input — each training example is a single sentence, split and tokenized using [spaCy](https://spacy.io/). For best results, split your input into sentences before inference. Passing unsplit speaker turns (multiple sentences as one input) can cause entities to be missed.
|
| 166 |
-
|
| 167 |
-
```python
|
| 168 |
-
import re
|
| 169 |
-
import spacy
|
| 170 |
-
|
| 171 |
-
nlp = spacy.blank("en") # use "de" for German, "xx" for multilingual
|
| 172 |
-
nlp.add_pipe("sentencizer")
|
| 173 |
-
|
| 174 |
-
def split_dialogue(text):
|
| 175 |
-
"""
|
| 176 |
-
Split raw dialogue text into a list of sentences (each a list of tokens).
|
| 177 |
-
Expects lines like: 'SPEAKER_00: Hello, my name is Peter.'
|
| 178 |
-
"""
|
| 179 |
-
sentences = []
|
| 180 |
-
for line in text.strip().splitlines():
|
| 181 |
-
m = re.match(r"^(SPEAKER_\d+)\s*:\s*(.*)", line.strip())
|
| 182 |
-
if m:
|
| 183 |
-
line = m.group(2)
|
| 184 |
-
if not line:
|
| 185 |
-
continue
|
| 186 |
-
doc = nlp(line)
|
| 187 |
-
for sent in doc.sents:
|
| 188 |
-
tokens = [tok.text for tok in sent if not tok.is_space]
|
| 189 |
-
if tokens:
|
| 190 |
-
sentences.append(tokens)
|
| 191 |
-
return sentences
|
| 192 |
-
|
| 193 |
-
# Example
|
| 194 |
-
raw = """SPEAKER_00: Hello, my name is Peter.
|
| 195 |
-
SPEAKER_01: Hello, my name is Peter as well. Okay, and where do you come from? I come from Chicago."""
|
| 196 |
-
|
| 197 |
-
dialogue = split_dialogue(raw)
|
| 198 |
-
# [['Hello', ',', 'my', 'name', 'is', 'Peter', '.'],
|
| 199 |
-
# ['Hello', ',', 'my', 'name', 'is', 'Peter', 'as', 'well', '.'],
|
| 200 |
-
# ['Okay', ',', 'and', 'where', 'do', 'you', 'come', 'from', '?'],
|
| 201 |
-
# ['I', 'come', 'from', 'Chicago', '.']]
|
| 202 |
-
```
|
| 203 |
-
|
| 204 |
-
### Inference
|
| 205 |
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
Returns: list of label lists, one per sentence
|
| 211 |
-
"""
|
| 212 |
-
all_labels = []
|
| 213 |
-
for tokens in sentences:
|
| 214 |
-
enc = tokenizer(tokens, is_split_into_words=True,
|
| 215 |
-
return_tensors="pt", truncation=True, max_length=512).to(device)
|
| 216 |
-
word_ids = enc.word_ids(batch_index=0)
|
| 217 |
-
|
| 218 |
-
with torch.no_grad():
|
| 219 |
-
outputs = model(**enc)
|
| 220 |
-
emissions = outputs["logits"]
|
| 221 |
-
mask = enc["attention_mask"].bool()
|
| 222 |
-
preds = model.decode(emissions, mask)[0]
|
| 223 |
-
|
| 224 |
-
word_labels = ["O"] * len(tokens)
|
| 225 |
-
seen = set()
|
| 226 |
-
for idx, wid in enumerate(word_ids):
|
| 227 |
-
if wid is None or wid in seen:
|
| 228 |
-
continue
|
| 229 |
-
seen.add(wid)
|
| 230 |
-
word_labels[wid] = id2label[preds[idx]]
|
| 231 |
-
|
| 232 |
-
all_labels.append(word_labels)
|
| 233 |
-
|
| 234 |
-
return all_labels
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
# Example: dialogue from above
|
| 238 |
-
results = predict_sentences(dialogue, model, tokenizer, id2label)
|
| 239 |
-
|
| 240 |
-
for sent_tokens, sent_labels in zip(dialogue, results):
|
| 241 |
-
for token, label in zip(sent_tokens, sent_labels):
|
| 242 |
-
if label != "O":
|
| 243 |
-
print(f"{token:20s} -> {label}")
|
| 244 |
-
```
|
| 245 |
-
|
| 246 |
-
### Single-sentence inference
|
| 247 |
-
|
| 248 |
-
If you only have isolated sentences, you can pass them directly:
|
| 249 |
-
|
| 250 |
-
```python
|
| 251 |
-
tokens = ["My", "name", "is", "John", "Smith", "and", "I", "live", "in", "Berlin", "."]
|
| 252 |
-
|
| 253 |
-
enc = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True, max_length=512)
|
| 254 |
-
word_ids = enc.word_ids(batch_index=0)
|
| 255 |
|
| 256 |
with torch.no_grad():
|
| 257 |
-
outputs = model(**
|
| 258 |
emissions = outputs["logits"]
|
| 259 |
-
mask =
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
for
|
| 264 |
-
|
| 265 |
-
continue
|
| 266 |
-
seen.add(wid)
|
| 267 |
-
label = id2label[preds[idx]]
|
| 268 |
if label != "O":
|
| 269 |
-
print(f"{
|
| 270 |
```
|
| 271 |
|
| 272 |
## Training Data
|
|
|
|
| 101 |
|
| 102 |
## Usage
|
| 103 |
|
| 104 |
+
This model uses a custom CRF architecture and cannot be loaded directly with `AutoModelForTokenClassification`. You need to use the custom `ModernBertCRF` class:
|
|
|
|
|
|
|
| 105 |
|
| 106 |
```python
|
| 107 |
import torch
|
|
|
|
| 140 |
return self.crf.decode(emissions, mask=mask)
|
| 141 |
|
| 142 |
# Load model
|
| 143 |
+
model_dir = "deryaerman/mmbert_multilingual_pii_ner/jhu-clsp-mmBERT-base-multilingual-pii"
|
| 144 |
|
| 145 |
with open(f"{model_dir}/crf_config.json") as f:
|
| 146 |
config = json.load(f)
|
|
|
|
| 155 |
model.eval()
|
| 156 |
|
| 157 |
tokenizer = AutoTokenizer.from_pretrained(model_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
+
# Inference
|
| 160 |
+
text = "My name is John Smith and I live in Berlin."
|
| 161 |
+
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
|
| 162 |
+
inputs.pop("token_type_ids", None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
|
| 164 |
with torch.no_grad():
|
| 165 |
+
outputs = model(**inputs)
|
| 166 |
emissions = outputs["logits"]
|
| 167 |
+
mask = inputs["attention_mask"].bool()
|
| 168 |
+
predictions = model.decode(emissions, mask)
|
| 169 |
+
|
| 170 |
+
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
|
| 171 |
+
for token, pred_id in zip(tokens, predictions[0]):
|
| 172 |
+
label = config["id2label"][str(pred_id)]
|
|
|
|
|
|
|
|
|
|
| 173 |
if label != "O":
|
| 174 |
+
print(f"{token:20s} -> {label}")
|
| 175 |
```
|
| 176 |
|
| 177 |
## Training Data
|
crf_config.json → jhu-clsp-mmBERT-base-multilingual-pii/crf_config.json
RENAMED
|
File without changes
|
jhu-clsp-mmBERT-base-multilingual-pii/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b9455404c7f48274f52550b29ec0510b646cb480823d0f2cd3cf18eb830b274b
|
| 3 |
+
size 1227931040
|
pytorch_model.bin → jhu-clsp-mmBERT-base-multilingual-pii/pytorch_model.bin
RENAMED
|
File without changes
|
special_tokens_map.json → jhu-clsp-mmBERT-base-multilingual-pii/special_tokens_map.json
RENAMED
|
File without changes
|
tokenizer.json → jhu-clsp-mmBERT-base-multilingual-pii/tokenizer.json
RENAMED
|
File without changes
|
tokenizer_config.json → jhu-clsp-mmBERT-base-multilingual-pii/tokenizer_config.json
RENAMED
|
File without changes
|