harshildarji committed · verified
Commit d39696e · 1 Parent(s): ccb4e5f

Upload 7 files

Files changed (7):
  1. README.md +48 -3
  2. config.json +109 -0
  3. f1.png +0 -0
  4. model.safetensors +3 -0
  5. special_tokens_map.json +7 -0
  6. tokenizer_config.json +58 -0
  7. vocab.txt +0 -0
README.md CHANGED
@@ -1,3 +1,48 @@
- ---
- license: mit
- ---
+ ---
+ license: mit
+ datasets:
+ - elenanereiss/german-ler
+ language:
+ - de
+ base_model:
+ - google-bert/bert-base-german-cased
+ pipeline_tag: token-classification
+ library_name: transformers
+ ---
+
+ ## German BERT for Legal NER
+
+ **F1-Score:** **`99.762`**
+
+ This model is fine-tuned on the [German LER dataset](https://huggingface.co/datasets/elenanereiss/german-ler), introduced in [this paper](https://link.springer.com/content/pdf/10.1007/978-3-030-33220-4_20.pdf). The LER dataset provides annotations across 19 fine-grained legal entity classes, capturing the complexity of legal texts in German.
+
+ ## Class-wise Performance Metrics
+
+ The table below summarizes the class-wise performance metrics of our improved model:
+ | Abbreviation | Class               | Dataset % | F1-Score |
+ |--------------|---------------------|-----------|----------|
+ | PER          | Person              | 3.26      | 94.47    |
+ | RR           | Judge               | 2.83      | 99.56    |
+ | AN           | Lawyer              | 0.21      | 92.31    |
+ | LD           | Country             | 2.66      | 96.30    |
+ | ST           | City                | 1.31      | 91.53    |
+ | STR          | Street              | 0.25      | 95.05    |
+ | LDS          | Landscape           | 0.37      | 88.24    |
+ | ORG          | Organization        | 2.17      | 93.72    |
+ | UN           | Company             | 1.97      | 98.16    |
+ | INN          | Institution         | 4.09      | 97.73    |
+ | GRT          | Court               | 5.99      | 98.32    |
+ | MRK          | Brand               | 0.53      | 98.65    |
+ | GS           | Law                 | 34.53     | 99.46    |
+ | VO           | Ordinance           | 1.49      | 95.72    |
+ | EUN          | European legal norm | 2.79      | 97.79    |
+ | VS           | Regulation          | 1.13      | 89.73    |
+ | VT           | Contract            | 5.34      | 99.22    |
+ | RS           | Court decision      | 23.46     | 99.76    |
+ | LIT          | Legal literature    | 5.60      | 98.09    |
+
+ ## Comparison of F1 Scores
+
+ Below is a comparison of F1 scores between our previous model, [gbert-legal-ner](https://huggingface.co/PaDaS-Lab/gbert-legal-ner), and JuraNER:
+
+ ![f1-comparison](https://huggingface.co/harshildarji/JuraNER/resolve/main/f1.png)
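
The new card declares `pipeline_tag: token-classification` and `library_name: transformers`, so a minimal usage sketch might look like the following. The repo id `harshildarji/JuraNER` is inferred from the `f1.png` URL above, and the example sentence is purely illustrative:

```python
# Minimal usage sketch (assumes the repo id harshildarji/JuraNER
# seen in the f1.png URL above; the input sentence is illustrative).
from transformers import pipeline

ner = pipeline(
    "token-classification",
    model="harshildarji/JuraNER",
    aggregation_strategy="simple",  # merge B-/I- subword tags into entity spans
)

text = "Der Bundesgerichtshof hat mit Urteil vom 12. Januar 2021 entschieden."
for entity in ner(text):
    print(entity["entity_group"], entity["word"], round(entity["score"], 3))
```

With `aggregation_strategy="simple"`, word pieces tagged `B-GRT`/`I-GRT` would surface as a single `GRT` (Court) span.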
config.json ADDED
@@ -0,0 +1,109 @@
+ {
+   "_name_or_path": "bert-base-german-cased",
+   "architectures": [
+     "BertForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "I-INN",
+     "1": "I-GRT",
+     "2": "I-RS",
+     "3": "B-ORG",
+     "4": "I-UN",
+     "5": "I-LDS",
+     "6": "I-GS",
+     "7": "I-LD",
+     "8": "I-ORG",
+     "9": "B-LD",
+     "10": "I-EUN",
+     "11": "B-INN",
+     "12": "I-VO",
+     "13": "B-GS",
+     "14": "B-AN",
+     "15": "I-RR",
+     "16": "I-AN",
+     "17": "B-UN",
+     "18": "B-RR",
+     "19": "I-STR",
+     "20": "B-PER",
+     "21": "I-VT",
+     "22": "I-MRK",
+     "23": "B-RS",
+     "24": "I-LIT",
+     "25": "B-VT",
+     "26": "B-STR",
+     "27": "B-VS",
+     "28": "B-VO",
+     "29": "B-EUN",
+     "30": "I-VS",
+     "31": "B-ST",
+     "32": "I-ST",
+     "33": "B-LIT",
+     "34": "O",
+     "35": "B-LDS",
+     "36": "B-GRT",
+     "37": "B-MRK",
+     "38": "I-PER",
+     "39": "PAD"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "B-AN": 14,
+     "B-EUN": 29,
+     "B-GRT": 36,
+     "B-GS": 13,
+     "B-INN": 11,
+     "B-LD": 9,
+     "B-LDS": 35,
+     "B-LIT": 33,
+     "B-MRK": 37,
+     "B-ORG": 3,
+     "B-PER": 20,
+     "B-RR": 18,
+     "B-RS": 23,
+     "B-ST": 31,
+     "B-STR": 26,
+     "B-UN": 17,
+     "B-VO": 28,
+     "B-VS": 27,
+     "B-VT": 25,
+     "I-AN": 16,
+     "I-EUN": 10,
+     "I-GRT": 1,
+     "I-GS": 6,
+     "I-INN": 0,
+     "I-LD": 7,
+     "I-LDS": 5,
+     "I-LIT": 24,
+     "I-MRK": 22,
+     "I-ORG": 8,
+     "I-PER": 38,
+     "I-RR": 15,
+     "I-RS": 2,
+     "I-ST": 32,
+     "I-STR": 19,
+     "I-UN": 4,
+     "I-VO": 12,
+     "I-VS": 30,
+     "I-VT": 21,
+     "O": 34,
+     "PAD": 39
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.47.1",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30000
+ }
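
The `id2label`/`label2id` maps above cover the 19 entity classes in BIO encoding (38 `B-`/`I-` tags) plus `O` and an extra `PAD` label, 40 entries in total. A short sketch to sanity-check this from the uploaded config (repo id assumed as above):

```python
# Sketch: inspect the label space declared in config.json
# (repo id harshildarji/JuraNER is an assumption, as above).
from transformers import AutoConfig

config = AutoConfig.from_pretrained("harshildarji/JuraNER")
assert len(config.id2label) == 40  # 19 classes x {B-, I-} + "O" + "PAD"
print(config.id2label[13])         # "B-GS" - beginning of a Law mention
print(config.label2id["I-RS"])     # 2 - inside a Court decision mention
```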
f1.png ADDED
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3de3c63fb08ef11b0b7f3c40b64084d5a2511d4c386ab6db1a51f4696f216d84
+ size 434109392
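
This entry is a Git LFS pointer rather than the weights themselves; the actual `model.safetensors` blob (434,109,392 bytes, roughly what a ~110M-parameter BERT-base in float32 occupies) is resolved on download. A fetch sketch using `huggingface_hub`, with the repo id assumed as above:

```python
# Sketch: download the LFS-backed weights (repo id assumed, as above).
from huggingface_hub import hf_hub_download

path = hf_hub_download(repo_id="harshildarji/JuraNER", filename="model.safetensors")
print(path)  # local cache path of the 434,109,392-byte file
```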
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": false,
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
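
Per this config, the tokenizer is a cased WordPiece `BertTokenizer` (`do_lower_case: false`) with `model_max_length: 512`, matching `max_position_embeddings` in `config.json`. A loading sketch, repo id assumed as above:

```python
# Sketch: load the tokenizer and check its special-token handling
# (repo id harshildarji/JuraNER is an assumption, as above).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("harshildarji/JuraNER")
enc = tokenizer("Der Bundesgerichtshof", truncation=True, max_length=512)
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
# expected shape: ['[CLS]', ..., '[SEP]'] with cased word pieces
```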
vocab.txt ADDED
The diff for this file is too large to render.