added the tokenizer
Browse files- README.md +30 -28
- tokenizer.json +2 -16
- tokenizer_config.json +43 -2
README.md
CHANGED
|
@@ -1,4 +1,34 @@
|
|
| 1 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
model-index:
|
| 3 |
- name: Sociovestix/lenu_DE
|
| 4 |
results:
|
|
@@ -20,34 +50,6 @@ model-index:
|
|
| 20 |
name: f1 macro
|
| 21 |
args:
|
| 22 |
average: macro
|
| 23 |
-
widget:
|
| 24 |
-
- text: "Licht-Express Leuchtenvertrieb Gesellschaft mit beschränkter Haftung"
|
| 25 |
-
- text: "Lux 16 Briest BO GmbH & Co. KG"
|
| 26 |
-
- text: "Stiftung für Kunst und Kunstakademie Nürtingen - Stiftung des bürgerlichen Rechts"
|
| 27 |
-
- text: "Wasserverband Eifel-Rur"
|
| 28 |
-
- text: "Altenstiftung zur Einigkeit"
|
| 29 |
-
- text: "UFH-Börsenladies GbR"
|
| 30 |
-
- text: "MunichFinancialServices AG Holding"
|
| 31 |
-
- text: "Evangelisch-Lutherischer Kirchenbezirk Leipzig"
|
| 32 |
-
- text: "Das Küchenkollektiv UG (haftungsbeschränkt)"
|
| 33 |
-
- text: "Kath. Kirchenstiftung St. Johann Baptist Unterdarching"
|
| 34 |
-
- text: "maratim eG"
|
| 35 |
-
- text: "Kunststiftung Baden-Württemberg gGmbH"
|
| 36 |
-
- text: "Tectrade 93 e.K."
|
| 37 |
-
- text: "SOLIDAR Versicherungsgemeinschaft Sterbegeldversicherung VVaG"
|
| 38 |
-
- text: "Sparkasse Bielefeld"
|
| 39 |
-
- text: "PKF WULF EGERMANN oHG Zollernalb Treuhand Wirtschaftsprüfungsgesellschaft Steuerberatungsgesellschaft"
|
| 40 |
-
- text: "Flink SE"
|
| 41 |
-
- text: "Deutsche Gesellschaft zur Rettung Schiffbrüchiger"
|
| 42 |
-
- text: "Fini Invest GmbH & Co. KGaA"
|
| 43 |
-
- text: "Lengermann Hoffmann Partnerschaft mbB, Steuerberater"
|
| 44 |
-
- text: "Rechtsanwälte Dr. Schmitt und Kollegen Partnerschaft"
|
| 45 |
-
- text: "Krefelder Frauenverein für Kinder- und Altenfürsorge gegr. 1827"
|
| 46 |
-
- text: "Urlaubs- und Lohnausgleichskasse der Bauwirtschaft"
|
| 47 |
-
- text: "Weingärtner Esslingen e.G."
|
| 48 |
-
- text: "Relaxation and Security EWIV"
|
| 49 |
-
- text: "DWS Investmentaktiengesellschaft mit Teilgesellschaftsvermögen"
|
| 50 |
-
- text: "Feldmarkinteressentschaft Broitzem"
|
| 51 |
---
|
| 52 |
# LENU - Legal Entity Name Understanding for Germany
|
| 53 |
|
|
|
|
| 1 |
---
|
| 2 |
+
widget:
|
| 3 |
+
- text: Licht-Express Leuchtenvertrieb Gesellschaft mit beschränkter Haftung
|
| 4 |
+
- text: Lux 16 Briest BO GmbH & Co. KG
|
| 5 |
+
- text: Stiftung für Kunst und Kunstakademie Nürtingen - Stiftung des bürgerlichen
|
| 6 |
+
Rechts
|
| 7 |
+
- text: Wasserverband Eifel-Rur
|
| 8 |
+
- text: Altenstiftung zur Einigkeit
|
| 9 |
+
- text: UFH-Börsenladies GbR
|
| 10 |
+
- text: MunichFinancialServices AG Holding
|
| 11 |
+
- text: Evangelisch-Lutherischer Kirchenbezirk Leipzig
|
| 12 |
+
- text: Das Küchenkollektiv UG (haftungsbeschränkt)
|
| 13 |
+
- text: Kath. Kirchenstiftung St. Johann Baptist Unterdarching
|
| 14 |
+
- text: maratim eG
|
| 15 |
+
- text: Kunststiftung Baden-Württemberg gGmbH
|
| 16 |
+
- text: Tectrade 93 e.K.
|
| 17 |
+
- text: SOLIDAR Versicherungsgemeinschaft Sterbegeldversicherung VVaG
|
| 18 |
+
- text: Sparkasse Bielefeld
|
| 19 |
+
- text: PKF WULF EGERMANN oHG Zollernalb Treuhand Wirtschaftsprüfungsgesellschaft
|
| 20 |
+
Steuerberatungsgesellschaft
|
| 21 |
+
- text: Flink SE
|
| 22 |
+
- text: Deutsche Gesellschaft zur Rettung Schiffbrüchiger
|
| 23 |
+
- text: Fini Invest GmbH & Co. KGaA
|
| 24 |
+
- text: Lengermann Hoffmann Partnerschaft mbB, Steuerberater
|
| 25 |
+
- text: Rechtsanwälte Dr. Schmitt und Kollegen Partnerschaft
|
| 26 |
+
- text: Krefelder Frauenverein für Kinder- und Altenfürsorge gegr. 1827
|
| 27 |
+
- text: Urlaubs- und Lohnausgleichskasse der Bauwirtschaft
|
| 28 |
+
- text: Weingärtner Esslingen e.G.
|
| 29 |
+
- text: Relaxation and Security EWIV
|
| 30 |
+
- text: DWS Investmentaktiengesellschaft mit Teilgesellschaftsvermögen
|
| 31 |
+
- text: Feldmarkinteressentschaft Broitzem
|
| 32 |
model-index:
|
| 33 |
- name: Sociovestix/lenu_DE
|
| 34 |
results:
|
|
|
|
| 50 |
name: f1 macro
|
| 51 |
args:
|
| 52 |
average: macro
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
---
|
| 54 |
# LENU - Legal Entity Name Understanding for Germany
|
| 55 |
|
tokenizer.json
CHANGED
|
@@ -1,21 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"version": "1.0",
|
| 3 |
-
"truncation":
|
| 4 |
-
|
| 5 |
-
"max_length": 25,
|
| 6 |
-
"strategy": "LongestFirst",
|
| 7 |
-
"stride": 0
|
| 8 |
-
},
|
| 9 |
-
"padding": {
|
| 10 |
-
"strategy": {
|
| 11 |
-
"Fixed": 25
|
| 12 |
-
},
|
| 13 |
-
"direction": "Right",
|
| 14 |
-
"pad_to_multiple_of": null,
|
| 15 |
-
"pad_id": 0,
|
| 16 |
-
"pad_type_id": 0,
|
| 17 |
-
"pad_token": "[PAD]"
|
| 18 |
-
},
|
| 19 |
"added_tokens": [
|
| 20 |
{
|
| 21 |
"id": 0,
|
|
|
|
| 1 |
{
|
| 2 |
"version": "1.0",
|
| 3 |
+
"truncation": null,
|
| 4 |
+
"padding": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"added_tokens": [
|
| 6 |
{
|
| 7 |
"id": 0,
|
tokenizer_config.json
CHANGED
|
@@ -1,15 +1,56 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"cls_token": "[CLS]",
|
| 3 |
"do_basic_tokenize": true,
|
| 4 |
"do_lower_case": true,
|
| 5 |
"mask_token": "[MASK]",
|
| 6 |
"max_len": 512,
|
| 7 |
"model_max_length": 512,
|
| 8 |
-
"name_or_path": "dbmdz/bert-base-german-uncased",
|
| 9 |
"never_split": null,
|
| 10 |
"pad_token": "[PAD]",
|
| 11 |
"sep_token": "[SEP]",
|
| 12 |
-
"special_tokens_map_file": null,
|
| 13 |
"strip_accents": null,
|
| 14 |
"tokenize_chinese_chars": true,
|
| 15 |
"tokenizer_class": "BertTokenizer",
|
|
|
|
| 1 |
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"101": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"102": {
|
| 20 |
+
"content": "[CLS]",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"103": {
|
| 28 |
+
"content": "[SEP]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"104": {
|
| 36 |
+
"content": "[MASK]",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": true,
|
| 45 |
"cls_token": "[CLS]",
|
| 46 |
"do_basic_tokenize": true,
|
| 47 |
"do_lower_case": true,
|
| 48 |
"mask_token": "[MASK]",
|
| 49 |
"max_len": 512,
|
| 50 |
"model_max_length": 512,
|
|
|
|
| 51 |
"never_split": null,
|
| 52 |
"pad_token": "[PAD]",
|
| 53 |
"sep_token": "[SEP]",
|
|
|
|
| 54 |
"strip_accents": null,
|
| 55 |
"tokenize_chinese_chars": true,
|
| 56 |
"tokenizer_class": "BertTokenizer",
|