Upload folder using huggingface_hub

Browse files

Files changed (9) hide show

README.md +6 -12
added_tokens.json +3 -0
config.json +33 -0
label_mappings.json +0 -11
model.safetensors +2 -2
special_tokens_map.json +9 -1
spm.model +3 -0
tokenizer.json +0 -0
tokenizer_config.json +16 -13

README.md CHANGED Viewed

@@ -16,22 +16,22 @@ tags:
 - coreference-resolution
 - distilbert
 - multi-task
-base_model: distilbert-base-cased
 ---
 # Kiji PII Detection Model
-Multi-task DistilBERT model for detecting Personally Identifiable Information (PII) in text with coreference resolution. Fine-tuned from [`distilbert-base-cased`](https://huggingface.co/distilbert-base-cased).
 ## Model Summary
 | | |
 |---|---|
-| **Base model** | [distilbert-base-cased](https://huggingface.co/distilbert-base-cased) |
 | **Architecture** | Shared DistilBERT encoder + two linear classification heads |
 | **Parameters** | ~66M |
-| **Model size** | 249 MB (SafeTensors) |
-| **Tasks** | PII token classification (53 labels) + coreference detection (7 labels) |
 | **PII entity types** | 26 |
 | **Max sequence length** | 512 tokens |
@@ -45,7 +45,7 @@ Input (input_ids, attention_mask)
    +----+----+
    |         |
 PII Head  Coref Head
-(768->53)  (768->7)
 ```
 The model uses multi-task learning: a shared DistilBERT encoder feeds into two independent linear classification heads. Both tasks are trained simultaneously with equal loss weighting, which acts as regularization and improves PII detection generalization.
@@ -132,12 +132,6 @@ Each entity type has `B-` (beginning) and `I-` (inside) variants, plus `O` for n
 Trained on the [DataikuNLP/kiji-pii-training-data](https://huggingface.co/datasets/DataikuNLP/kiji-pii-training-data) dataset — a synthetic multilingual PII dataset with entity annotations and coreference resolution.
-## Derived Models
-| Variant | Format | Repository |
-|---------|--------|------------|
-| Quantized (INT8) | ONNX | [DataikuNLP/kiji-pii-model-onnx](https://huggingface.co/DataikuNLP/kiji-pii-model-onnx) |
 ## Limitations
 - Trained on **synthetically generated** data — may not generalize to all real-world text

 - coreference-resolution
 - distilbert
 - multi-task
+base_model: microsoft/deberta-v3-small
 ---
 # Kiji PII Detection Model
+Multi-task DeBERTa-v3 model for detecting Personally Identifiable Information (PII) in text with coreference resolution. Fine-tuned from [`microsoft/deberta-v3-small`](https://huggingface.co/microsoft/deberta-v3-small).
 ## Model Summary
 | | |
 |---|---|
+| **Base model** | [microsoft/deberta-v3-small](https://huggingface.co/microsoft/deberta-v3-small) |
 | **Architecture** | Shared DeBERTa-v3 encoder + two linear classification heads |
 | **Parameters** | ~184M |
+| **Model size** | 703 MB (SafeTensors) |
+| **Tasks** | PII token classification (53 labels) + coreference detection (0 labels — no coreference label mapping ships with this revision) |
 | **PII entity types** | 26 |
 | **Max sequence length** | 512 tokens |
    +----+----+
    |         |
 PII Head  Coref Head
+(768->53)  (768->0)
 ```
 The model uses multi-task learning: a shared DeBERTa-v3 encoder feeds into two independent linear classification heads. Both tasks are trained simultaneously with equal loss weighting, which acts as regularization and improves PII detection generalization.
 Trained on the [DataikuNLP/kiji-pii-training-data](https://huggingface.co/datasets/DataikuNLP/kiji-pii-training-data) dataset — a synthetic multilingual PII dataset with entity annotations and coreference resolution.
 ## Limitations
 - Trained on **synthetically generated** data — may not generalize to all real-world text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "[MASK]": 128000
+}

config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_attn_implementation_autoset": true,
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-07,
+  "legacy": true,
+  "max_position_embeddings": 512,
+  "max_relative_positions": -1,
+  "model_type": "deberta-v2",
+  "norm_rel_ebd": "layer_norm",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "pooler_dropout": 0,
+  "pooler_hidden_act": "gelu",
+  "pooler_hidden_size": 768,
+  "pos_att_type": [
+    "p2c",
+    "c2p"
+  ],
+  "position_biased_input": false,
+  "position_buckets": 256,
+  "relative_attention": true,
+  "share_att_key": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3",
+  "type_vocab_size": 0,
+  "vocab_size": 128100
+}

label_mappings.json CHANGED Viewed

@@ -111,16 +111,5 @@
       "52": "I-USERNAME",
       "-100": "IGNORE"
     }
-  },
-  "coref": {
-    "id2label": {
-      "0": "NO_COREF",
-      "1": "CLUSTER_0",
-      "2": "CLUSTER_1",
-      "3": "CLUSTER_2",
-      "4": "CLUSTER_3",
-      "5": "CLUSTER_4",
-      "6": "CLUSTER_5"
-    }
   }
 }

       "52": "I-USERNAME",
       "-100": "IGNORE"
     }
   }
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:246b7e3f3f1e0369155ad84c55efa9769ddd149861f9ce7f93a8f293ab58ee7e
-size 260960440

 version https://git-lfs.github.com/spec/v1
+oid sha256:3d41bd69ac00fbf7bb546870200feb2a6d5df4c2b04150acd35ce4b0fe8ad6e1
+size 736625504

special_tokens_map.json CHANGED Viewed

@@ -1,7 +1,15 @@
 {
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
-  "unk_token": "[UNK]"
 }

 {
+  "bos_token": "[CLS]",
   "cls_token": "[CLS]",
+  "eos_token": "[SEP]",
   "mask_token": "[MASK]",
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
 }

spm.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+size 2464616

tokenizer.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json CHANGED Viewed

@@ -8,31 +8,31 @@
       "single_word": false,
       "special": true
     },
-    "100": {
-      "content": "[UNK]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "101": {
-      "content": "[CLS]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "102": {
-      "content": "[SEP]",
       "lstrip": false,
-      "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "103": {
       "content": "[MASK]",
       "lstrip": false,
       "normalized": false,
@@ -41,16 +41,19 @@
       "special": true
     }
   },
   "clean_up_tokenization_spaces": false,
   "cls_token": "[CLS]",
   "do_lower_case": false,
   "extra_special_tokens": {},
   "mask_token": "[MASK]",
-  "model_max_length": 512,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "DistilBertTokenizer",
-  "unk_token": "[UNK]"
 }

       "single_word": false,
       "special": true
     },
+    "1": {
+      "content": "[CLS]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
+    "2": {
+      "content": "[SEP]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
+    "3": {
+      "content": "[UNK]",
       "lstrip": false,
+      "normalized": true,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
+    "128000": {
       "content": "[MASK]",
       "lstrip": false,
       "normalized": false,
       "special": true
     }
   },
+  "bos_token": "[CLS]",
   "clean_up_tokenization_spaces": false,
   "cls_token": "[CLS]",
   "do_lower_case": false,
+  "eos_token": "[SEP]",
   "extra_special_tokens": {},
   "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
+  "sp_model_kwargs": {},
+  "split_by_punct": false,
+  "tokenizer_class": "DebertaV2Tokenizer",
+  "unk_token": "[UNK]",
+  "vocab_type": "spm"
 }