aoiandroid committed on
Commit
ec85f22
·
verified ·
1 Parent(s): 60d6626

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Helsinki-NLP-opus-mt-en-cy/README.md +25 -0
  2. Helsinki-NLP-opus-mt-en-cy/special_tokens_map.json +5 -0
  3. Helsinki-NLP-opus-mt-en-cy/tokenizer.json +0 -0
  4. Helsinki-NLP-opus-mt-en-cy/tokenizer_config.json +39 -0
  5. Helsinki-NLP-opus-mt-en-cy/vocab.json +0 -0
  6. Helsinki-NLP-opus-mt-en-da/README.md +25 -0
  7. Helsinki-NLP-opus-mt-en-da/config.json +56 -0
  8. Helsinki-NLP-opus-mt-en-da/generation_config.json +16 -0
  9. Helsinki-NLP-opus-mt-en-da/special_tokens_map.json +5 -0
  10. Helsinki-NLP-opus-mt-en-da/tokenizer.json +0 -0
  11. Helsinki-NLP-opus-mt-en-da/tokenizer_config.json +39 -0
  12. Helsinki-NLP-opus-mt-en-da/vocab.json +0 -0
  13. Helsinki-NLP-opus-mt-en-de/README.md +109 -0
  14. Helsinki-NLP-opus-mt-en-de/config.json +57 -0
  15. Helsinki-NLP-opus-mt-en-de/generation_config.json +16 -0
  16. Helsinki-NLP-opus-mt-en-de/special_tokens_map.json +5 -0
  17. Helsinki-NLP-opus-mt-en-de/tokenizer.json +0 -0
  18. Helsinki-NLP-opus-mt-en-de/tokenizer_config.json +39 -0
  19. Helsinki-NLP-opus-mt-en-de/vocab.json +0 -0
  20. Helsinki-NLP-opus-mt-en-dra/README.md +106 -0
  21. Helsinki-NLP-opus-mt-en-dra/config.json +56 -0
  22. Helsinki-NLP-opus-mt-en-dra/generation_config.json +16 -0
  23. Helsinki-NLP-opus-mt-en-dra/special_tokens_map.json +5 -0
  24. Helsinki-NLP-opus-mt-en-dra/tokenizer.json +0 -0
  25. Helsinki-NLP-opus-mt-en-dra/tokenizer_config.json +39 -0
  26. Helsinki-NLP-opus-mt-en-dra/vocab.json +0 -0
  27. Helsinki-NLP-opus-mt-en-ee/README.md +26 -0
  28. Helsinki-NLP-opus-mt-en-ee/config.json +56 -0
  29. Helsinki-NLP-opus-mt-en-ee/generation_config.json +16 -0
  30. Helsinki-NLP-opus-mt-en-ee/special_tokens_map.json +5 -0
  31. Helsinki-NLP-opus-mt-en-ee/tokenizer.json +0 -0
  32. Helsinki-NLP-opus-mt-en-ee/tokenizer_config.json +39 -0
  33. Helsinki-NLP-opus-mt-en-ee/vocab.json +0 -0
  34. Helsinki-NLP-opus-mt-en-efi/README.md +25 -0
  35. Helsinki-NLP-opus-mt-en-efi/config.json +56 -0
  36. Helsinki-NLP-opus-mt-en-efi/generation_config.json +16 -0
  37. Helsinki-NLP-opus-mt-en-efi/special_tokens_map.json +5 -0
  38. Helsinki-NLP-opus-mt-en-efi/tokenizer.json +0 -0
  39. Helsinki-NLP-opus-mt-en-efi/tokenizer_config.json +39 -0
  40. Helsinki-NLP-opus-mt-en-efi/vocab.json +0 -0
  41. Helsinki-NLP-opus-mt-en-el/README.md +25 -0
  42. Helsinki-NLP-opus-mt-en-el/config.json +56 -0
  43. Helsinki-NLP-opus-mt-en-el/generation_config.json +16 -0
  44. Helsinki-NLP-opus-mt-en-el/special_tokens_map.json +5 -0
  45. Helsinki-NLP-opus-mt-en-el/tokenizer.json +0 -0
  46. Helsinki-NLP-opus-mt-en-el/tokenizer_config.json +39 -0
  47. Helsinki-NLP-opus-mt-en-el/vocab.json +0 -0
  48. Helsinki-NLP-opus-mt-en-eo/README.md +25 -0
  49. Helsinki-NLP-opus-mt-en-eo/config.json +56 -0
  50. Helsinki-NLP-opus-mt-en-eo/generation_config.json +16 -0
Helsinki-NLP-opus-mt-en-cy/README.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - translation
4
+ license: apache-2.0
5
+ ---
6
+
7
+ ### opus-mt-en-cy
8
+
9
+ * source languages: en
10
+ * target languages: cy
11
+ * OPUS readme: [en-cy](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/models/en-cy/README.md)
12
+
13
+ * dataset: opus
14
+ * model: transformer-align
15
+ * pre-processing: normalization + SentencePiece
16
+ * download original weights: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-cy/opus-2019-12-18.zip)
17
+ * test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-cy/opus-2019-12-18.test.txt)
18
+ * test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-cy/opus-2019-12-18.eval.txt)
19
+
20
+ ## Benchmarks
21
+
22
+ | testset | BLEU | chr-F |
23
+ |-----------------------|-------|-------|
24
+ | Tatoeba.en.cy | 25.3 | 0.487 |
25
+
Helsinki-NLP-opus-mt-en-cy/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
Helsinki-NLP-opus-mt-en-cy/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Helsinki-NLP-opus-mt-en-cy/tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "54394": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": false,
29
+ "eos_token": "</s>",
30
+ "extra_special_tokens": {},
31
+ "model_max_length": 512,
32
+ "pad_token": "<pad>",
33
+ "separate_vocabs": false,
34
+ "source_lang": "en",
35
+ "sp_model_kwargs": {},
36
+ "target_lang": "cy",
37
+ "tokenizer_class": "MarianTokenizer",
38
+ "unk_token": "<unk>"
39
+ }
Helsinki-NLP-opus-mt-en-cy/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Helsinki-NLP-opus-mt-en-da/README.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - translation
4
+ license: apache-2.0
5
+ ---
6
+
7
+ ### opus-mt-en-da
8
+
9
+ * source languages: en
10
+ * target languages: da
11
+ * OPUS readme: [en-da](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/models/en-da/README.md)
12
+
13
+ * dataset: opus
14
+ * model: transformer-align
15
+ * pre-processing: normalization + SentencePiece
16
+ * download original weights: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-da/opus-2019-12-18.zip)
17
+ * test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-da/opus-2019-12-18.test.txt)
18
+ * test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-da/opus-2019-12-18.eval.txt)
19
+
20
+ ## Benchmarks
21
+
22
+ | testset | BLEU | chr-F |
23
+ |-----------------------|-------|-------|
24
+ | Tatoeba.en.da | 60.4 | 0.745 |
25
+
Helsinki-NLP-opus-mt-en-da/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_num_labels": 3,
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "swish",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "MarianMTModel"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.0,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 512,
15
+ "decoder_attention_heads": 8,
16
+ "decoder_ffn_dim": 2048,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 58929,
20
+ "decoder_vocab_size": 58930,
21
+ "dropout": 0.1,
22
+ "encoder_attention_heads": 8,
23
+ "encoder_ffn_dim": 2048,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 0,
27
+ "forced_eos_token_id": 0,
28
+ "id2label": {
29
+ "0": "LABEL_0",
30
+ "1": "LABEL_1",
31
+ "2": "LABEL_2"
32
+ },
33
+ "init_std": 0.02,
34
+ "is_encoder_decoder": true,
35
+ "is_transformers_support_available": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_length": null,
42
+ "max_position_embeddings": 512,
43
+ "model_type": "marian",
44
+ "normalize_before": false,
45
+ "normalize_embedding": false,
46
+ "num_beams": null,
47
+ "num_hidden_layers": 6,
48
+ "pad_token_id": 58929,
49
+ "scale_embedding": true,
50
+ "share_encoder_decoder_embeddings": true,
51
+ "static_position_embeddings": true,
52
+ "torch_dtype": "float32",
53
+ "transformers_version": "4.53.3",
54
+ "use_cache": true,
55
+ "vocab_size": 58930
56
+ }
Helsinki-NLP-opus-mt-en-da/generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 58929
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 58929,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 4,
13
+ "pad_token_id": 58929,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.53.3"
16
+ }
Helsinki-NLP-opus-mt-en-da/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
Helsinki-NLP-opus-mt-en-da/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Helsinki-NLP-opus-mt-en-da/tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "58929": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": false,
29
+ "eos_token": "</s>",
30
+ "extra_special_tokens": {},
31
+ "model_max_length": 512,
32
+ "pad_token": "<pad>",
33
+ "separate_vocabs": false,
34
+ "source_lang": "en",
35
+ "sp_model_kwargs": {},
36
+ "target_lang": "da",
37
+ "tokenizer_class": "MarianTokenizer",
38
+ "unk_token": "<unk>"
39
+ }
Helsinki-NLP-opus-mt-en-da/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Helsinki-NLP-opus-mt-en-de/README.md ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - translation
4
+ license: cc-by-4.0
5
+ ---
6
+
7
+ ### opus-mt-en-de
8
+
9
+
10
+ ## Table of Contents
11
+ - [Model Details](#model-details)
12
+ - [Uses](#uses)
13
+ - [Risks, Limitations and Biases](#risks-limitations-and-biases)
14
+ - [Training](#training)
15
+ - [Evaluation](#evaluation)
16
+ - [Citation Information](#citation-information)
17
+ - [How to Get Started With the Model](#how-to-get-started-with-the-model)
18
+
19
+ ## Model Details
20
+ **Model Description:**
21
+ - **Developed by:** Language Technology Research Group at the University of Helsinki
22
+ - **Model Type:** Translation
23
+ - **Language(s):**
24
+ - Source Language: English
25
+ - Target Language: German
26
+ - **License:** CC-BY-4.0
27
+ - **Resources for more information:**
28
+ - [GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
29
+
30
+
31
+ ## Uses
32
+
33
+ #### Direct Use
34
+
35
+ This model can be used for translation and text-to-text generation.
36
+
37
+
38
+ ## Risks, Limitations and Biases
39
+
40
+
41
+
42
+ **CONTENT WARNING: Readers should be aware this section contains content that is disturbing, offensive, and can propagate historical and current stereotypes.**
43
+
44
+ Significant research has explored bias and fairness issues with language models (see, e.g., [Sheng et al. (2021)](https://aclanthology.org/2021.acl-long.330.pdf) and [Bender et al. (2021)](https://dl.acm.org/doi/pdf/10.1145/3442188.3445922)).
45
+
46
+ Further details about the dataset for this model can be found in the OPUS readme: [en-de](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/models/en-de/README.md)
47
+
48
+
49
+ #### Training Data
50
+ ##### Preprocessing
51
+ * pre-processing: normalization + SentencePiece
52
+
53
+ * dataset: [opus](https://github.com/Helsinki-NLP/Opus-MT)
54
+ * download original weights: [opus-2020-02-26.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-de/opus-2020-02-26.zip)
55
+
56
+ * test set translations: [opus-2020-02-26.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-de/opus-2020-02-26.test.txt)
57
+
58
+ ## Evaluation
59
+
60
+ #### Results
61
+
62
+ * test set scores: [opus-2020-02-26.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-de/opus-2020-02-26.eval.txt)
63
+
64
+
65
+ #### Benchmarks
66
+
67
+ | testset | BLEU | chr-F |
68
+ |-----------------------|-------|-------|
69
+ | newssyscomb2009.en.de | 23.5 | 0.540 |
70
+ | news-test2008.en.de | 23.5 | 0.529 |
71
+ | newstest2009.en.de | 22.3 | 0.530 |
72
+ | newstest2010.en.de | 24.9 | 0.544 |
73
+ | newstest2011.en.de | 22.5 | 0.524 |
74
+ | newstest2012.en.de | 23.0 | 0.525 |
75
+ | newstest2013.en.de | 26.9 | 0.553 |
76
+ | newstest2015-ende.en.de | 31.1 | 0.594 |
77
+ | newstest2016-ende.en.de | 37.0 | 0.636 |
78
+ | newstest2017-ende.en.de | 29.9 | 0.586 |
79
+ | newstest2018-ende.en.de | 45.2 | 0.690 |
80
+ | newstest2019-ende.en.de | 40.9 | 0.654 |
81
+ | Tatoeba.en.de | 47.3 | 0.664 |
82
+
83
+
84
+
85
+ ## Citation Information
86
+
87
+ ```bibtex
88
+ @InProceedings{TiedemannThottingal:EAMT2020,
89
+ author = {J{\"o}rg Tiedemann and Santhosh Thottingal},
90
+ title = {{OPUS-MT} — {B}uilding open translation services for the {W}orld},
91
+ booktitle = {Proceedings of the 22nd Annual Conference of the European Association for Machine Translation (EAMT)},
92
+ year = {2020},
93
+ address = {Lisbon, Portugal}
94
+ }
95
+ ```
96
+
97
+ ## How to Get Started With the Model
98
+ ```python
99
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
100
+
101
+ tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
102
+
103
+ model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")
104
+
105
+ ```
106
+
107
+
108
+
109
+
Helsinki-NLP-opus-mt-en-de/config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_num_labels": 3,
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "swish",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "MarianMTModel"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.0,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 512,
15
+ "decoder_attention_heads": 8,
16
+ "decoder_ffn_dim": 2048,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 58100,
20
+ "decoder_vocab_size": 58101,
21
+ "dropout": 0.1,
22
+ "encoder_attention_heads": 8,
23
+ "encoder_ffn_dim": 2048,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 0,
27
+ "forced_eos_token_id": 0,
28
+ "gradient_checkpointing": false,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1",
32
+ "2": "LABEL_2"
33
+ },
34
+ "init_std": 0.02,
35
+ "is_encoder_decoder": true,
36
+ "is_transformers_support_available": true,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1,
40
+ "LABEL_2": 2
41
+ },
42
+ "max_length": null,
43
+ "max_position_embeddings": 512,
44
+ "model_type": "marian",
45
+ "normalize_before": false,
46
+ "normalize_embedding": false,
47
+ "num_beams": null,
48
+ "num_hidden_layers": 6,
49
+ "pad_token_id": 58100,
50
+ "scale_embedding": true,
51
+ "share_encoder_decoder_embeddings": true,
52
+ "static_position_embeddings": true,
53
+ "torch_dtype": "float32",
54
+ "transformers_version": "4.53.3",
55
+ "use_cache": true,
56
+ "vocab_size": 58101
57
+ }
Helsinki-NLP-opus-mt-en-de/generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 58100
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 58100,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 4,
13
+ "pad_token_id": 58100,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.53.3"
16
+ }
Helsinki-NLP-opus-mt-en-de/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
Helsinki-NLP-opus-mt-en-de/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Helsinki-NLP-opus-mt-en-de/tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "58100": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": false,
29
+ "eos_token": "</s>",
30
+ "extra_special_tokens": {},
31
+ "model_max_length": 512,
32
+ "pad_token": "<pad>",
33
+ "separate_vocabs": false,
34
+ "source_lang": "en",
35
+ "sp_model_kwargs": {},
36
+ "target_lang": "de",
37
+ "tokenizer_class": "MarianTokenizer",
38
+ "unk_token": "<unk>"
39
+ }
Helsinki-NLP-opus-mt-en-de/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Helsinki-NLP-opus-mt-en-dra/README.md ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ - ta
5
+ - kn
6
+ - ml
7
+ - te
8
+ - dra
9
+
10
+ tags:
11
+ - translation
12
+
13
+ license: apache-2.0
14
+ ---
15
+
16
+ ### eng-dra
17
+
18
+ * source group: English
19
+ * target group: Dravidian languages
20
+ * OPUS readme: [eng-dra](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/models/eng-dra/README.md)
21
+
22
+ * model: transformer
23
+ * source language(s): eng
24
+ * target language(s): kan mal tam tel
25
+ * model: transformer
26
+ * pre-processing: normalization + SentencePiece (spm32k,spm32k)
27
+ * a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)
28
+ * download original weights: [opus-2020-07-26.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/eng-dra/opus-2020-07-26.zip)
29
+ * test set translations: [opus-2020-07-26.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/eng-dra/opus-2020-07-26.test.txt)
30
+ * test set scores: [opus-2020-07-26.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/eng-dra/opus-2020-07-26.eval.txt)
31
+
32
+ ## Benchmarks
33
+
34
+ | testset | BLEU | chr-F |
35
+ |-----------------------|-------|-------|
36
+ | Tatoeba-test.eng-kan.eng.kan | 4.7 | 0.348 |
37
+ | Tatoeba-test.eng-mal.eng.mal | 13.1 | 0.515 |
38
+ | Tatoeba-test.eng.multi | 10.7 | 0.463 |
39
+ | Tatoeba-test.eng-tam.eng.tam | 9.0 | 0.444 |
40
+ | Tatoeba-test.eng-tel.eng.tel | 7.1 | 0.363 |
41
+
42
+
43
+ ### System Info:
44
+ - hf_name: eng-dra
45
+
46
+ - source_languages: eng
47
+
48
+ - target_languages: dra
49
+
50
+ - opus_readme_url: https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/models/eng-dra/README.md
51
+
52
+ - original_repo: Tatoeba-Challenge
53
+
54
+ - tags: ['translation']
55
+
56
+ - languages: ['en', 'ta', 'kn', 'ml', 'te', 'dra']
57
+
58
+ - src_constituents: {'eng'}
59
+
60
+ - tgt_constituents: {'tam', 'kan', 'mal', 'tel'}
61
+
62
+ - src_multilingual: False
63
+
64
+ - tgt_multilingual: True
65
+
66
+ - prepro: normalization + SentencePiece (spm32k,spm32k)
67
+
68
+ - url_model: https://object.pouta.csc.fi/Tatoeba-MT-models/eng-dra/opus-2020-07-26.zip
69
+
70
+ - url_test_set: https://object.pouta.csc.fi/Tatoeba-MT-models/eng-dra/opus-2020-07-26.test.txt
71
+
72
+ - src_alpha3: eng
73
+
74
+ - tgt_alpha3: dra
75
+
76
+ - short_pair: en-dra
77
+
78
+ - chrF2_score: 0.46299999999999997
79
+
80
+ - bleu: 10.7
81
+
82
+ - brevity_penalty: 1.0
83
+
84
+ - ref_len: 7928.0
85
+
86
+ - src_name: English
87
+
88
+ - tgt_name: Dravidian languages
89
+
90
+ - train_date: 2020-07-26
91
+
92
+ - src_alpha2: en
93
+
94
+ - tgt_alpha2: dra
95
+
96
+ - prefer_old: False
97
+
98
+ - long_pair: eng-dra
99
+
100
+ - helsinki_git_sha: 480fcbe0ee1bf4774bcbe6226ad9f58e63f6c535
101
+
102
+ - transformers_git_sha: 2207e5d8cb224e954a7cba69fa4ac2309e9ff30b
103
+
104
+ - port_machine: brutasse
105
+
106
+ - port_time: 2020-08-21-14:41
Helsinki-NLP-opus-mt-en-dra/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "swish",
4
+ "add_bias_logits": false,
5
+ "add_final_layer_norm": false,
6
+ "architectures": [
7
+ "MarianMTModel"
8
+ ],
9
+ "attention_dropout": 0.0,
10
+ "bos_token_id": 0,
11
+ "classif_dropout": 0.0,
12
+ "classifier_dropout": 0.0,
13
+ "d_model": 512,
14
+ "decoder_attention_heads": 8,
15
+ "decoder_ffn_dim": 2048,
16
+ "decoder_layerdrop": 0.0,
17
+ "decoder_layers": 6,
18
+ "decoder_start_token_id": 62951,
19
+ "decoder_vocab_size": 62952,
20
+ "dropout": 0.1,
21
+ "encoder_attention_heads": 8,
22
+ "encoder_ffn_dim": 2048,
23
+ "encoder_layerdrop": 0.0,
24
+ "encoder_layers": 6,
25
+ "eos_token_id": 0,
26
+ "extra_pos_embeddings": 62952,
27
+ "forced_eos_token_id": 0,
28
+ "id2label": {
29
+ "0": "LABEL_0",
30
+ "1": "LABEL_1",
31
+ "2": "LABEL_2"
32
+ },
33
+ "init_std": 0.02,
34
+ "is_encoder_decoder": true,
35
+ "is_transformers_support_available": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_length": null,
42
+ "max_position_embeddings": 512,
43
+ "model_type": "marian",
44
+ "normalize_before": false,
45
+ "normalize_embedding": false,
46
+ "num_beams": null,
47
+ "num_hidden_layers": 6,
48
+ "pad_token_id": 62951,
49
+ "scale_embedding": true,
50
+ "share_encoder_decoder_embeddings": true,
51
+ "static_position_embeddings": true,
52
+ "torch_dtype": "float32",
53
+ "transformers_version": "4.53.3",
54
+ "use_cache": true,
55
+ "vocab_size": 62952
56
+ }
Helsinki-NLP-opus-mt-en-dra/generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 62951
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 62951,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 4,
13
+ "pad_token_id": 62951,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.53.3"
16
+ }
Helsinki-NLP-opus-mt-en-dra/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
Helsinki-NLP-opus-mt-en-dra/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Helsinki-NLP-opus-mt-en-dra/tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "62951": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": false,
29
+ "eos_token": "</s>",
30
+ "extra_special_tokens": {},
31
+ "model_max_length": 512,
32
+ "pad_token": "<pad>",
33
+ "separate_vocabs": false,
34
+ "source_lang": "eng",
35
+ "sp_model_kwargs": {},
36
+ "target_lang": "dra",
37
+ "tokenizer_class": "MarianTokenizer",
38
+ "unk_token": "<unk>"
39
+ }
Helsinki-NLP-opus-mt-en-dra/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Helsinki-NLP-opus-mt-en-ee/README.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - translation
4
+ license: apache-2.0
5
+ ---
6
+
7
+ ### opus-mt-en-ee
8
+
9
+ * source languages: en
10
+ * target languages: ee
11
+ * OPUS readme: [en-ee](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/models/en-ee/README.md)
12
+
13
+ * dataset: opus
14
+ * model: transformer-align
15
+ * pre-processing: normalization + SentencePiece
16
+ * download original weights: [opus-2020-01-08.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-ee/opus-2020-01-08.zip)
17
+ * test set translations: [opus-2020-01-08.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ee/opus-2020-01-08.test.txt)
18
+ * test set scores: [opus-2020-01-08.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ee/opus-2020-01-08.eval.txt)
19
+
20
+ ## Benchmarks
21
+
22
+ | testset | BLEU | chr-F |
23
+ |-----------------------|-------|-------|
24
+ | JW300.en.ee | 38.2 | 0.591 |
25
+ | Tatoeba.en.ee | 6.0 | 0.347 |
26
+
Helsinki-NLP-opus-mt-en-ee/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_num_labels": 3,
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "swish",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "MarianMTModel"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.0,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 512,
15
+ "decoder_attention_heads": 8,
16
+ "decoder_ffn_dim": 2048,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 57578,
20
+ "decoder_vocab_size": 57579,
21
+ "dropout": 0.1,
22
+ "encoder_attention_heads": 8,
23
+ "encoder_ffn_dim": 2048,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 0,
27
+ "forced_eos_token_id": 0,
28
+ "id2label": {
29
+ "0": "LABEL_0",
30
+ "1": "LABEL_1",
31
+ "2": "LABEL_2"
32
+ },
33
+ "init_std": 0.02,
34
+ "is_encoder_decoder": true,
35
+ "is_transformers_support_available": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_length": null,
42
+ "max_position_embeddings": 512,
43
+ "model_type": "marian",
44
+ "normalize_before": false,
45
+ "normalize_embedding": false,
46
+ "num_beams": null,
47
+ "num_hidden_layers": 6,
48
+ "pad_token_id": 57578,
49
+ "scale_embedding": true,
50
+ "share_encoder_decoder_embeddings": true,
51
+ "static_position_embeddings": true,
52
+ "torch_dtype": "float32",
53
+ "transformers_version": "4.53.3",
54
+ "use_cache": true,
55
+ "vocab_size": 57579
56
+ }
Helsinki-NLP-opus-mt-en-ee/generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 57578
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 57578,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 4,
13
+ "pad_token_id": 57578,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.53.3"
16
+ }
Helsinki-NLP-opus-mt-en-ee/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
Helsinki-NLP-opus-mt-en-ee/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Helsinki-NLP-opus-mt-en-ee/tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "57578": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": false,
29
+ "eos_token": "</s>",
30
+ "extra_special_tokens": {},
31
+ "model_max_length": 512,
32
+ "pad_token": "<pad>",
33
+ "separate_vocabs": false,
34
+ "source_lang": "en",
35
+ "sp_model_kwargs": {},
36
+ "target_lang": "ee",
37
+ "tokenizer_class": "MarianTokenizer",
38
+ "unk_token": "<unk>"
39
+ }
Helsinki-NLP-opus-mt-en-ee/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Helsinki-NLP-opus-mt-en-efi/README.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - translation
4
+ license: apache-2.0
5
+ ---
6
+
7
+ ### opus-mt-en-efi
8
+
9
+ * source languages: en
10
+ * target languages: efi
11
+ * OPUS readme: [en-efi](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/models/en-efi/README.md)
12
+
13
+ * dataset: opus
14
+ * model: transformer-align
15
+ * pre-processing: normalization + SentencePiece
16
+ * download original weights: [opus-2020-01-20.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-efi/opus-2020-01-20.zip)
17
+ * test set translations: [opus-2020-01-20.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-efi/opus-2020-01-20.test.txt)
18
+ * test set scores: [opus-2020-01-20.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-efi/opus-2020-01-20.eval.txt)
19
+
20
+ ## Benchmarks
21
+
22
+ | testset | BLEU | chr-F |
23
+ |-----------------------|-------|-------|
24
+ | JW300.en.efi | 38.0 | 0.568 |
25
+
Helsinki-NLP-opus-mt-en-efi/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_num_labels": 3,
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "swish",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "MarianMTModel"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.0,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 512,
15
+ "decoder_attention_heads": 8,
16
+ "decoder_ffn_dim": 2048,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 55089,
20
+ "decoder_vocab_size": 55090,
21
+ "dropout": 0.1,
22
+ "encoder_attention_heads": 8,
23
+ "encoder_ffn_dim": 2048,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 0,
27
+ "forced_eos_token_id": 0,
28
+ "id2label": {
29
+ "0": "LABEL_0",
30
+ "1": "LABEL_1",
31
+ "2": "LABEL_2"
32
+ },
33
+ "init_std": 0.02,
34
+ "is_encoder_decoder": true,
35
+ "is_transformers_support_available": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_length": null,
42
+ "max_position_embeddings": 512,
43
+ "model_type": "marian",
44
+ "normalize_before": false,
45
+ "normalize_embedding": false,
46
+ "num_beams": null,
47
+ "num_hidden_layers": 6,
48
+ "pad_token_id": 55089,
49
+ "scale_embedding": true,
50
+ "share_encoder_decoder_embeddings": true,
51
+ "static_position_embeddings": true,
52
+ "torch_dtype": "float32",
53
+ "transformers_version": "4.53.3",
54
+ "use_cache": true,
55
+ "vocab_size": 55090
56
+ }
Helsinki-NLP-opus-mt-en-efi/generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 55089
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 55089,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 4,
13
+ "pad_token_id": 55089,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.53.3"
16
+ }
Helsinki-NLP-opus-mt-en-efi/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
Helsinki-NLP-opus-mt-en-efi/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Helsinki-NLP-opus-mt-en-efi/tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "55089": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": false,
29
+ "eos_token": "</s>",
30
+ "extra_special_tokens": {},
31
+ "model_max_length": 512,
32
+ "pad_token": "<pad>",
33
+ "separate_vocabs": false,
34
+ "source_lang": "en",
35
+ "sp_model_kwargs": {},
36
+ "target_lang": "efi",
37
+ "tokenizer_class": "MarianTokenizer",
38
+ "unk_token": "<unk>"
39
+ }
Helsinki-NLP-opus-mt-en-efi/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Helsinki-NLP-opus-mt-en-el/README.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - translation
4
+ license: apache-2.0
5
+ ---
6
+
7
+ ### opus-mt-en-el
8
+
9
+ * source languages: en
10
+ * target languages: el
11
+ * OPUS readme: [en-el](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/models/en-el/README.md)
12
+
13
+ * dataset: opus
14
+ * model: transformer-align
15
+ * pre-processing: normalization + SentencePiece
16
+ * download original weights: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-el/opus-2019-12-18.zip)
17
+ * test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-el/opus-2019-12-18.test.txt)
18
+ * test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-el/opus-2019-12-18.eval.txt)
19
+
20
+ ## Benchmarks
21
+
22
+ | testset | BLEU | chr-F |
23
+ |-----------------------|-------|-------|
24
+ | Tatoeba.en.el | 56.4 | 0.745 |
25
+
Helsinki-NLP-opus-mt-en-el/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_num_labels": 3,
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "swish",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "MarianMTModel"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.0,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 512,
15
+ "decoder_attention_heads": 8,
16
+ "decoder_ffn_dim": 2048,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 64825,
20
+ "decoder_vocab_size": 64826,
21
+ "dropout": 0.1,
22
+ "encoder_attention_heads": 8,
23
+ "encoder_ffn_dim": 2048,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 0,
27
+ "forced_eos_token_id": 0,
28
+ "id2label": {
29
+ "0": "LABEL_0",
30
+ "1": "LABEL_1",
31
+ "2": "LABEL_2"
32
+ },
33
+ "init_std": 0.02,
34
+ "is_encoder_decoder": true,
35
+ "is_transformers_support_available": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_length": null,
42
+ "max_position_embeddings": 512,
43
+ "model_type": "marian",
44
+ "normalize_before": false,
45
+ "normalize_embedding": false,
46
+ "num_beams": null,
47
+ "num_hidden_layers": 6,
48
+ "pad_token_id": 64825,
49
+ "scale_embedding": true,
50
+ "share_encoder_decoder_embeddings": true,
51
+ "static_position_embeddings": true,
52
+ "torch_dtype": "float32",
53
+ "transformers_version": "4.53.3",
54
+ "use_cache": true,
55
+ "vocab_size": 64826
56
+ }
Helsinki-NLP-opus-mt-en-el/generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 64825
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 64825,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 4,
13
+ "pad_token_id": 64825,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.53.3"
16
+ }
Helsinki-NLP-opus-mt-en-el/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
Helsinki-NLP-opus-mt-en-el/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Helsinki-NLP-opus-mt-en-el/tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "64825": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": false,
29
+ "eos_token": "</s>",
30
+ "extra_special_tokens": {},
31
+ "model_max_length": 512,
32
+ "pad_token": "<pad>",
33
+ "separate_vocabs": false,
34
+ "source_lang": "en",
35
+ "sp_model_kwargs": {},
36
+ "target_lang": "el",
37
+ "tokenizer_class": "MarianTokenizer",
38
+ "unk_token": "<unk>"
39
+ }
Helsinki-NLP-opus-mt-en-el/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Helsinki-NLP-opus-mt-en-eo/README.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - translation
4
+ license: apache-2.0
5
+ ---
6
+
7
+ ### opus-mt-en-eo
8
+
9
+ * source languages: en
10
+ * target languages: eo
11
+ * OPUS readme: [en-eo](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/models/en-eo/README.md)
12
+
13
+ * dataset: opus
14
+ * model: transformer-align
15
+ * pre-processing: normalization + SentencePiece
16
+ * download original weights: [opus-2019-12-18.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-eo/opus-2019-12-18.zip)
17
+ * test set translations: [opus-2019-12-18.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-eo/opus-2019-12-18.test.txt)
18
+ * test set scores: [opus-2019-12-18.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-eo/opus-2019-12-18.eval.txt)
19
+
20
+ ## Benchmarks
21
+
22
+ | testset | BLEU | chr-F |
23
+ |-----------------------|-------|-------|
24
+ | Tatoeba.en.eo | 49.5 | 0.682 |
25
+
Helsinki-NLP-opus-mt-en-eo/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_num_labels": 3,
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "swish",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "MarianMTModel"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.0,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 512,
15
+ "decoder_attention_heads": 8,
16
+ "decoder_ffn_dim": 2048,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 59196,
20
+ "decoder_vocab_size": 59197,
21
+ "dropout": 0.1,
22
+ "encoder_attention_heads": 8,
23
+ "encoder_ffn_dim": 2048,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 0,
27
+ "forced_eos_token_id": 0,
28
+ "id2label": {
29
+ "0": "LABEL_0",
30
+ "1": "LABEL_1",
31
+ "2": "LABEL_2"
32
+ },
33
+ "init_std": 0.02,
34
+ "is_encoder_decoder": true,
35
+ "is_transformers_support_available": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_length": null,
42
+ "max_position_embeddings": 512,
43
+ "model_type": "marian",
44
+ "normalize_before": false,
45
+ "normalize_embedding": false,
46
+ "num_beams": null,
47
+ "num_hidden_layers": 6,
48
+ "pad_token_id": 59196,
49
+ "scale_embedding": true,
50
+ "share_encoder_decoder_embeddings": true,
51
+ "static_position_embeddings": true,
52
+ "torch_dtype": "float32",
53
+ "transformers_version": "4.53.3",
54
+ "use_cache": true,
55
+ "vocab_size": 59197
56
+ }
Helsinki-NLP-opus-mt-en-eo/generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 59196
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 59196,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 4,
13
+ "pad_token_id": 59196,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.53.3"
16
+ }