Adding `safetensors` variant of this model

by SFconvertbot - opened Apr 20

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+17

-110

Files changed (9) hide show

.DS_Store +0 -0
.gitattributes +0 -1
README.md +14 -109
crf_config.json → jhu-clsp-mmBERT-base-multilingual-pii/crf_config.json +0 -0
jhu-clsp-mmBERT-base-multilingual-pii/model.safetensors +3 -0
pytorch_model.bin → jhu-clsp-mmBERT-base-multilingual-pii/pytorch_model.bin +0 -0
special_tokens_map.json → jhu-clsp-mmBERT-base-multilingual-pii/special_tokens_map.json +0 -0
tokenizer.json → jhu-clsp-mmBERT-base-multilingual-pii/tokenizer.json +0 -0
tokenizer_config.json → jhu-clsp-mmBERT-base-multilingual-pii/tokenizer_config.json +0 -0

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

.gitattributes CHANGED Viewed

@@ -34,4 +34,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 jhu-clsp-mmBERT-base-multilingual-pii/tokenizer.json filter=lfs diff=lfs merge=lfs -text
-tokenizer.json filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 jhu-clsp-mmBERT-base-multilingual-pii/tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -101,9 +101,7 @@ Evaluated on held-out test sets per language (type-aware micro scores):
 ## Usage
-This model uses a custom CRF architecture and cannot be loaded directly with `AutoModelForTokenClassification`. You need to use the custom `ModernBertCRF` class.
-### Setup
 ```python
 import torch
@@ -142,7 +140,7 @@ class ModernBertCRF(nn.Module):
         return self.crf.decode(emissions, mask=mask)
 # Load model
-model_dir = "deryaerman/mmbert_multilingual_pii_ner"
 with open(f"{model_dir}/crf_config.json") as f:
     config = json.load(f)
@@ -157,116 +155,23 @@ model.load_state_dict(torch.load(f"{model_dir}/pytorch_model.bin", map_location=
 model.eval()
 tokenizer = AutoTokenizer.from_pretrained(model_dir)
-id2label = {int(k): v for k, v in config["id2label"].items()}
-```
-### Preprocessing: Sentence Splitting
-The model was trained on **sentence-level** input — each training example is a single sentence, split and tokenized using [spaCy](https://spacy.io/). For best results, split your input into sentences before inference. Passing unsplit speaker turns (multiple sentences as one input) can cause entities to be missed.
-```python
-import re
-import spacy
-nlp = spacy.blank("en")          # use "de" for German, "xx" for multilingual
-nlp.add_pipe("sentencizer")
-def split_dialogue(text):
-    """
-    Split raw dialogue text into a list of sentences (each a list of tokens).
-    Expects lines like: 'SPEAKER_00: Hello, my name is Peter.'
-    """
-    sentences = []
-    for line in text.strip().splitlines():
-        m = re.match(r"^(SPEAKER_\d+)\s*:\s*(.*)", line.strip())
-        if m:
-            line = m.group(2)
-        if not line:
-            continue
-        doc = nlp(line)
-        for sent in doc.sents:
-            tokens = [tok.text for tok in sent if not tok.is_space]
-            if tokens:
-                sentences.append(tokens)
-    return sentences
-# Example
-raw = """SPEAKER_00: Hello, my name is Peter.
-SPEAKER_01: Hello, my name is Peter as well. Okay, and where do you come from? I come from Chicago."""
-dialogue = split_dialogue(raw)
-# [['Hello', ',', 'my', 'name', 'is', 'Peter', '.'],
-#  ['Hello', ',', 'my', 'name', 'is', 'Peter', 'as', 'well', '.'],
-#  ['Okay', ',', 'and', 'where', 'do', 'you', 'come', 'from', '?'],
-#  ['I', 'come', 'from', 'Chicago', '.']]
-```
-### Inference
-```python
-def predict_sentences(sentences, model, tokenizer, id2label, device="cpu"):
-    """
-    sentences: list of sentences, each a list of word tokens
-    Returns:   list of label lists, one per sentence
-    """
-    all_labels = []
-    for tokens in sentences:
-        enc = tokenizer(tokens, is_split_into_words=True,
-                        return_tensors="pt", truncation=True, max_length=512).to(device)
-        word_ids = enc.word_ids(batch_index=0)
-        with torch.no_grad():
-            outputs = model(**enc)
-            emissions = outputs["logits"]
-            mask = enc["attention_mask"].bool()
-            preds = model.decode(emissions, mask)[0]
-        word_labels = ["O"] * len(tokens)
-        seen = set()
-        for idx, wid in enumerate(word_ids):
-            if wid is None or wid in seen:
-                continue
-            seen.add(wid)
-            word_labels[wid] = id2label[preds[idx]]
-        all_labels.append(word_labels)
-    return all_labels
-# Example: dialogue from above
-results = predict_sentences(dialogue, model, tokenizer, id2label)
-for sent_tokens, sent_labels in zip(dialogue, results):
-    for token, label in zip(sent_tokens, sent_labels):
-        if label != "O":
-            print(f"{token:20s} -> {label}")
-```
-### Single-sentence inference
-If you only have isolated sentences, you can pass them directly:
-```python
-tokens = ["My", "name", "is", "John", "Smith", "and", "I", "live", "in", "Berlin", "."]
-enc = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True, max_length=512)
-word_ids = enc.word_ids(batch_index=0)
 with torch.no_grad():
-    outputs = model(**enc)
     emissions = outputs["logits"]
-    mask = enc["attention_mask"].bool()
-    preds = model.decode(emissions, mask)[0]
-seen = set()
-for idx, wid in enumerate(word_ids):
-    if wid is None or wid in seen:
-        continue
-    seen.add(wid)
-    label = id2label[preds[idx]]
     if label != "O":
-        print(f"{tokens[wid]:20s} -> {label}")
 ```
 ## Training Data

 ## Usage
+This model uses a custom CRF architecture and cannot be loaded directly with `AutoModelForTokenClassification`. You need to use the custom `ModernBertCRF` class:
 ```python
 import torch
         return self.crf.decode(emissions, mask=mask)
 # Load model
+model_dir = "deryaerman/mmbert_multilingual_pii_ner/jhu-clsp-mmBERT-base-multilingual-pii"
 with open(f"{model_dir}/crf_config.json") as f:
     config = json.load(f)
 model.eval()
 tokenizer = AutoTokenizer.from_pretrained(model_dir)
+# Inference
+text = "My name is John Smith and I live in Berlin."
+inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+inputs.pop("token_type_ids", None)
 with torch.no_grad():
+    outputs = model(**inputs)
     emissions = outputs["logits"]
+    mask = inputs["attention_mask"].bool()
+    predictions = model.decode(emissions, mask)
+tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
+for token, pred_id in zip(tokens, predictions[0]):
+    label = config["id2label"][str(pred_id)]
     if label != "O":
+        print(f"{token:20s} -> {label}")
 ```
 ## Training Data

crf_config.json → jhu-clsp-mmBERT-base-multilingual-pii/crf_config.json RENAMED Viewed

File without changes

jhu-clsp-mmBERT-base-multilingual-pii/model.safetensors ADDED Viewed

@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b9455404c7f48274f52550b29ec0510b646cb480823d0f2cd3cf18eb830b274b
+size 1227931040

pytorch_model.bin → jhu-clsp-mmBERT-base-multilingual-pii/pytorch_model.bin RENAMED Viewed

File without changes

special_tokens_map.json → jhu-clsp-mmBERT-base-multilingual-pii/special_tokens_map.json RENAMED Viewed

File without changes

tokenizer.json → jhu-clsp-mmBERT-base-multilingual-pii/tokenizer.json RENAMED Viewed

File without changes

tokenizer_config.json → jhu-clsp-mmBERT-base-multilingual-pii/tokenizer_config.json RENAMED Viewed

File without changes