hanneshapke committed on
Commit
8cc41f3
·
verified ·
1 Parent(s): 6d35bc0

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -16,22 +16,22 @@ tags:
16
  - coreference-resolution
17
  - distilbert
18
  - multi-task
19
- base_model: distilbert-base-cased
20
  ---
21
 
22
  # Kiji PII Detection Model
23
 
24
- Multi-task DistilBERT model for detecting Personally Identifiable Information (PII) in text with coreference resolution. Fine-tuned from [`distilbert-base-cased`](https://huggingface.co/distilbert-base-cased).
25
 
26
  ## Model Summary
27
 
28
  | | |
29
  |---|---|
30
- | **Base model** | [distilbert-base-cased](https://huggingface.co/distilbert-base-cased) |
31
  | **Architecture** | Shared DistilBERT encoder + two linear classification heads |
32
  | **Parameters** | ~66M |
33
- | **Model size** | 249 MB (SafeTensors) |
34
- | **Tasks** | PII token classification (53 labels) + coreference detection (7 labels) |
35
  | **PII entity types** | 26 |
36
  | **Max sequence length** | 512 tokens |
37
 
@@ -45,7 +45,7 @@ Input (input_ids, attention_mask)
45
  +----+----+
46
  | |
47
  PII Head Coref Head
48
- (768->53) (768->7)
49
  ```
50
 
51
  The model uses multi-task learning: a shared DistilBERT encoder feeds into two independent linear classification heads. Both tasks are trained simultaneously with equal loss weighting, which acts as regularization and improves PII detection generalization.
@@ -132,12 +132,6 @@ Each entity type has `B-` (beginning) and `I-` (inside) variants, plus `O` for n
132
 
133
  Trained on the [DataikuNLP/kiji-pii-training-data](https://huggingface.co/datasets/DataikuNLP/kiji-pii-training-data) dataset — a synthetic multilingual PII dataset with entity annotations and coreference resolution.
134
 
135
- ## Derived Models
136
-
137
- | Variant | Format | Repository |
138
- |---------|--------|------------|
139
- | Quantized (INT8) | ONNX | [DataikuNLP/kiji-pii-model-onnx](https://huggingface.co/DataikuNLP/kiji-pii-model-onnx) |
140
-
141
  ## Limitations
142
 
143
  - Trained on **synthetically generated** data — may not generalize to all real-world text
 
16
  - coreference-resolution
17
  - distilbert
18
  - multi-task
19
+ base_model: microsoft/deberta-v3-small
20
  ---
21
 
22
  # Kiji PII Detection Model
23
 
24
+ Multi-task DeBERTa-v3 model for detecting Personally Identifiable Information (PII) in text with coreference resolution. Fine-tuned from [`microsoft/deberta-v3-small`](https://huggingface.co/microsoft/deberta-v3-small).
25
 
26
  ## Model Summary
27
 
28
  | | |
29
  |---|---|
30
+ | **Base model** | [microsoft/deberta-v3-small](https://huggingface.co/microsoft/deberta-v3-small) |
31
  | **Architecture** | Shared DeBERTa-v3 encoder + two linear classification heads |
32
  | **Parameters** | ~184M |
33
+ | **Model size** | 703 MB (SafeTensors) |
34
+ | **Tasks** | PII token classification (53 labels) + coreference detection (0 labels) |
35
  | **PII entity types** | 26 |
36
  | **Max sequence length** | 512 tokens |
37
 
 
45
  +----+----+
46
  | |
47
  PII Head Coref Head
48
+ (768->53) (768->0)
49
  ```
50
 
51
  The model uses multi-task learning: a shared DeBERTa-v3 encoder feeds into two independent linear classification heads. Both tasks are trained simultaneously with equal loss weighting, which acts as regularization and improves PII detection generalization.
 
132
 
133
  Trained on the [DataikuNLP/kiji-pii-training-data](https://huggingface.co/datasets/DataikuNLP/kiji-pii-training-data) dataset — a synthetic multilingual PII dataset with entity annotations and coreference resolution.
134
 
 
 
 
 
 
 
135
  ## Limitations
136
 
137
  - Trained on **synthetically generated** data — may not generalize to all real-world text
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "hidden_act": "gelu",
5
+ "hidden_dropout_prob": 0.1,
6
+ "hidden_size": 768,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 3072,
9
+ "layer_norm_eps": 1e-07,
10
+ "legacy": true,
11
+ "max_position_embeddings": 512,
12
+ "max_relative_positions": -1,
13
+ "model_type": "deberta-v2",
14
+ "norm_rel_ebd": "layer_norm",
15
+ "num_attention_heads": 12,
16
+ "num_hidden_layers": 12,
17
+ "pad_token_id": 0,
18
+ "pooler_dropout": 0,
19
+ "pooler_hidden_act": "gelu",
20
+ "pooler_hidden_size": 768,
21
+ "pos_att_type": [
22
+ "p2c",
23
+ "c2p"
24
+ ],
25
+ "position_biased_input": false,
26
+ "position_buckets": 256,
27
+ "relative_attention": true,
28
+ "share_att_key": true,
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.51.3",
31
+ "type_vocab_size": 0,
32
+ "vocab_size": 128100
33
+ }
label_mappings.json CHANGED
@@ -111,16 +111,5 @@
111
  "52": "I-USERNAME",
112
  "-100": "IGNORE"
113
  }
114
- },
115
- "coref": {
116
- "id2label": {
117
- "0": "NO_COREF",
118
- "1": "CLUSTER_0",
119
- "2": "CLUSTER_1",
120
- "3": "CLUSTER_2",
121
- "4": "CLUSTER_3",
122
- "5": "CLUSTER_4",
123
- "6": "CLUSTER_5"
124
- }
125
  }
126
  }
 
111
  "52": "I-USERNAME",
112
  "-100": "IGNORE"
113
  }
 
 
 
 
 
 
 
 
 
 
 
114
  }
115
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:246b7e3f3f1e0369155ad84c55efa9769ddd149861f9ce7f93a8f293ab58ee7e
3
- size 260960440
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d41bd69ac00fbf7bb546870200feb2a6d5df4c2b04150acd35ce4b0fe8ad6e1
3
+ size 736625504
special_tokens_map.json CHANGED
@@ -1,7 +1,15 @@
1
  {
 
2
  "cls_token": "[CLS]",
 
3
  "mask_token": "[MASK]",
4
  "pad_token": "[PAD]",
5
  "sep_token": "[SEP]",
6
- "unk_token": "[UNK]"
 
 
 
 
 
 
7
  }
 
1
  {
2
+ "bos_token": "[CLS]",
3
  "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
  "mask_token": "[MASK]",
6
  "pad_token": "[PAD]",
7
  "sep_token": "[SEP]",
8
+ "unk_token": {
9
+ "content": "[UNK]",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ }
15
  }
spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -8,31 +8,31 @@
8
  "single_word": false,
9
  "special": true
10
  },
11
- "100": {
12
- "content": "[UNK]",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
16
  "single_word": false,
17
  "special": true
18
  },
19
- "101": {
20
- "content": "[CLS]",
21
  "lstrip": false,
22
  "normalized": false,
23
  "rstrip": false,
24
  "single_word": false,
25
  "special": true
26
  },
27
- "102": {
28
- "content": "[SEP]",
29
  "lstrip": false,
30
- "normalized": false,
31
  "rstrip": false,
32
  "single_word": false,
33
  "special": true
34
  },
35
- "103": {
36
  "content": "[MASK]",
37
  "lstrip": false,
38
  "normalized": false,
@@ -41,16 +41,19 @@
41
  "special": true
42
  }
43
  },
 
44
  "clean_up_tokenization_spaces": false,
45
  "cls_token": "[CLS]",
46
  "do_lower_case": false,
 
47
  "extra_special_tokens": {},
48
  "mask_token": "[MASK]",
49
- "model_max_length": 512,
50
  "pad_token": "[PAD]",
51
  "sep_token": "[SEP]",
52
- "strip_accents": null,
53
- "tokenize_chinese_chars": true,
54
- "tokenizer_class": "DistilBertTokenizer",
55
- "unk_token": "[UNK]"
 
56
  }
 
8
  "single_word": false,
9
  "special": true
10
  },
11
+ "1": {
12
+ "content": "[CLS]",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
16
  "single_word": false,
17
  "special": true
18
  },
19
+ "2": {
20
+ "content": "[SEP]",
21
  "lstrip": false,
22
  "normalized": false,
23
  "rstrip": false,
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "3": {
28
+ "content": "[UNK]",
29
  "lstrip": false,
30
+ "normalized": true,
31
  "rstrip": false,
32
  "single_word": false,
33
  "special": true
34
  },
35
+ "128000": {
36
  "content": "[MASK]",
37
  "lstrip": false,
38
  "normalized": false,
 
41
  "special": true
42
  }
43
  },
44
+ "bos_token": "[CLS]",
45
  "clean_up_tokenization_spaces": false,
46
  "cls_token": "[CLS]",
47
  "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
  "extra_special_tokens": {},
50
  "mask_token": "[MASK]",
51
+ "model_max_length": 1000000000000000019884624838656,
52
  "pad_token": "[PAD]",
53
  "sep_token": "[SEP]",
54
+ "sp_model_kwargs": {},
55
+ "split_by_punct": false,
56
+ "tokenizer_class": "DebertaV2Tokenizer",
57
+ "unk_token": "[UNK]",
58
+ "vocab_type": "spm"
59
  }