Adding `safetensors` variant of this model

#1
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
.gitattributes CHANGED
@@ -34,4 +34,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  jhu-clsp-mmBERT-base-multilingual-pii/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
- tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  jhu-clsp-mmBERT-base-multilingual-pii/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
README.md CHANGED
@@ -101,9 +101,7 @@ Evaluated on held-out test sets per language (type-aware micro scores):
101
 
102
  ## Usage
103
 
104
- This model uses a custom CRF architecture and cannot be loaded directly with `AutoModelForTokenClassification`. You need to use the custom `ModernBertCRF` class.
105
-
106
- ### Setup
107
 
108
  ```python
109
  import torch
@@ -142,7 +140,7 @@ class ModernBertCRF(nn.Module):
142
  return self.crf.decode(emissions, mask=mask)
143
 
144
  # Load model
145
- model_dir = "deryaerman/mmbert_multilingual_pii_ner"
146
 
147
  with open(f"{model_dir}/crf_config.json") as f:
148
  config = json.load(f)
@@ -157,116 +155,23 @@ model.load_state_dict(torch.load(f"{model_dir}/pytorch_model.bin", map_location=
157
  model.eval()
158
 
159
  tokenizer = AutoTokenizer.from_pretrained(model_dir)
160
- id2label = {int(k): v for k, v in config["id2label"].items()}
161
- ```
162
-
163
- ### Preprocessing: Sentence Splitting
164
-
165
- The model was trained on **sentence-level** input — each training example is a single sentence, split and tokenized using [spaCy](https://spacy.io/). For best results, split your input into sentences before inference. Passing unsplit speaker turns (multiple sentences as one input) can cause entities to be missed.
166
-
167
- ```python
168
- import re
169
- import spacy
170
-
171
- nlp = spacy.blank("en") # use "de" for German, "xx" for multilingual
172
- nlp.add_pipe("sentencizer")
173
-
174
- def split_dialogue(text):
175
- """
176
- Split raw dialogue text into a list of sentences (each a list of tokens).
177
- Expects lines like: 'SPEAKER_00: Hello, my name is Peter.'
178
- """
179
- sentences = []
180
- for line in text.strip().splitlines():
181
- m = re.match(r"^(SPEAKER_\d+)\s*:\s*(.*)", line.strip())
182
- if m:
183
- line = m.group(2)
184
- if not line:
185
- continue
186
- doc = nlp(line)
187
- for sent in doc.sents:
188
- tokens = [tok.text for tok in sent if not tok.is_space]
189
- if tokens:
190
- sentences.append(tokens)
191
- return sentences
192
-
193
- # Example
194
- raw = """SPEAKER_00: Hello, my name is Peter.
195
- SPEAKER_01: Hello, my name is Peter as well. Okay, and where do you come from? I come from Chicago."""
196
-
197
- dialogue = split_dialogue(raw)
198
- # [['Hello', ',', 'my', 'name', 'is', 'Peter', '.'],
199
- # ['Hello', ',', 'my', 'name', 'is', 'Peter', 'as', 'well', '.'],
200
- # ['Okay', ',', 'and', 'where', 'do', 'you', 'come', 'from', '?'],
201
- # ['I', 'come', 'from', 'Chicago', '.']]
202
- ```
203
-
204
- ### Inference
205
 
206
- ```python
207
- def predict_sentences(sentences, model, tokenizer, id2label, device="cpu"):
208
- """
209
- sentences: list of sentences, each a list of word tokens
210
- Returns: list of label lists, one per sentence
211
- """
212
- all_labels = []
213
- for tokens in sentences:
214
- enc = tokenizer(tokens, is_split_into_words=True,
215
- return_tensors="pt", truncation=True, max_length=512).to(device)
216
- word_ids = enc.word_ids(batch_index=0)
217
-
218
- with torch.no_grad():
219
- outputs = model(**enc)
220
- emissions = outputs["logits"]
221
- mask = enc["attention_mask"].bool()
222
- preds = model.decode(emissions, mask)[0]
223
-
224
- word_labels = ["O"] * len(tokens)
225
- seen = set()
226
- for idx, wid in enumerate(word_ids):
227
- if wid is None or wid in seen:
228
- continue
229
- seen.add(wid)
230
- word_labels[wid] = id2label[preds[idx]]
231
-
232
- all_labels.append(word_labels)
233
-
234
- return all_labels
235
-
236
-
237
- # Example: dialogue from above
238
- results = predict_sentences(dialogue, model, tokenizer, id2label)
239
-
240
- for sent_tokens, sent_labels in zip(dialogue, results):
241
- for token, label in zip(sent_tokens, sent_labels):
242
- if label != "O":
243
- print(f"{token:20s} -> {label}")
244
- ```
245
-
246
- ### Single-sentence inference
247
-
248
- If you only have isolated sentences, you can pass them directly:
249
-
250
- ```python
251
- tokens = ["My", "name", "is", "John", "Smith", "and", "I", "live", "in", "Berlin", "."]
252
-
253
- enc = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True, max_length=512)
254
- word_ids = enc.word_ids(batch_index=0)
255
 
256
  with torch.no_grad():
257
- outputs = model(**enc)
258
  emissions = outputs["logits"]
259
- mask = enc["attention_mask"].bool()
260
- preds = model.decode(emissions, mask)[0]
261
-
262
- seen = set()
263
- for idx, wid in enumerate(word_ids):
264
- if wid is None or wid in seen:
265
- continue
266
- seen.add(wid)
267
- label = id2label[preds[idx]]
268
  if label != "O":
269
- print(f"{tokens[wid]:20s} -> {label}")
270
  ```
271
 
272
  ## Training Data
 
101
 
102
  ## Usage
103
 
104
+ This model uses a custom CRF architecture and cannot be loaded directly with `AutoModelForTokenClassification`. You need to use the custom `ModernBertCRF` class:
 
 
105
 
106
  ```python
107
  import torch
 
140
  return self.crf.decode(emissions, mask=mask)
141
 
142
  # Load model
143
+ model_dir = "deryaerman/mmbert_multilingual_pii_ner/jhu-clsp-mmBERT-base-multilingual-pii"
144
 
145
  with open(f"{model_dir}/crf_config.json") as f:
146
  config = json.load(f)
 
155
  model.eval()
156
 
157
  tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
+ # Inference
160
+ text = "My name is John Smith and I live in Berlin."
161
+ inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
162
+ inputs.pop("token_type_ids", None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
  with torch.no_grad():
165
+ outputs = model(**inputs)
166
  emissions = outputs["logits"]
167
+ mask = inputs["attention_mask"].bool()
168
+ predictions = model.decode(emissions, mask)
169
+
170
+ tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
171
+ for token, pred_id in zip(tokens, predictions[0]):
172
+ label = config["id2label"][str(pred_id)]
 
 
 
173
  if label != "O":
174
+ print(f"{token:20s} -> {label}")
175
  ```
176
 
177
  ## Training Data
crf_config.json → jhu-clsp-mmBERT-base-multilingual-pii/crf_config.json RENAMED
File without changes
jhu-clsp-mmBERT-base-multilingual-pii/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9455404c7f48274f52550b29ec0510b646cb480823d0f2cd3cf18eb830b274b
3
+ size 1227931040
pytorch_model.bin → jhu-clsp-mmBERT-base-multilingual-pii/pytorch_model.bin RENAMED
File without changes
special_tokens_map.json → jhu-clsp-mmBERT-base-multilingual-pii/special_tokens_map.json RENAMED
File without changes
tokenizer.json → jhu-clsp-mmBERT-base-multilingual-pii/tokenizer.json RENAMED
File without changes
tokenizer_config.json → jhu-clsp-mmBERT-base-multilingual-pii/tokenizer_config.json RENAMED
File without changes