InigoHierroMuga committed on
Commit 9485e72 · 1 Parent(s): a4c9cd1
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,119 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ language:
+ - es
+ pipeline_tag: translation
+ tags:
+ - Translation
+ - Capitalization-and-punctuation
+ - Transformer
+ ---
+
+ # HiTZ's Capitalization & Punctuation model for Spanish
+
+ ## Model description
+ This model was trained from scratch using Marian NMT. The dataset used in training contains 9,784,905 sentences. The model was evaluated on the FLORES-101 dev and devtest datasets and on a set of randomly selected sentences from the CommonVoice dataset.
+ * **Developed by**: HiTZ Research Center (University of the Basque Country UPV/EHU)
+ * **Model type**: Capitalization and Punctuation
+ * **Language**: Spanish
+
+ ## Intended uses and limitations
+ You can use this model for punctuation and capitalization restoration in Spanish.
+
+ ## Usage
+ Required packages:
+ - torch
+ - transformers
+ - sentencepiece
+
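+ A quick sanity check that the required packages are importable (this snippet is only an illustration; no versions are pinned in this repository):
+
+ ```python
+ # Check that the packages listed above are installed,
+ # e.g. via `pip install torch transformers sentencepiece`.
+ import torch
+ import transformers
+ import sentencepiece  # needed by the Marian tokenizer
+
+ print("torch", torch.__version__, "| transformers", transformers.__version__)
+ ```
+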
+ ### Capitalization and punctuation using Python
+ Clone the repository to download the model:
+
+ ```bash
+ git clone https://huggingface.co/HiTZ/marianmt-cap-punct-es
+ ```
+
+ In the code below, `MODEL_PATH` is the path to the downloaded `marianmt-cap-punct-es` folder.
+
+ ```python
+ from transformers import pipeline
+
+ MODEL_PATH = "marianmt-cap-punct-es"  # path to the cloned repository folder
+ device = 0  # 0 --> GPU, -1 --> CPU
+
+ segment_list = ["hola buenos días a todos", "faktoria se escucha en la radio de e i te be", "más o menos el cuarenta y dos por ciento", "cuatro ocho quince dieciséis veintitrés cuarenta y dos", "mi año de nacimiento es mil novecientos noventa y seis", "más información en uve doble uve doble uve doble punto e hache u punto eus"]
+
+ translator = pipeline(task="translation", model=MODEL_PATH, tokenizer=MODEL_PATH, device=device)
+ result_list = translator(segment_list)
+ cp_segment_list = [result["translation_text"] for result in result_list]
+
+ for text, cp_text in zip(segment_list, cp_segment_list):
+     print(f"Normalized: {text}\n With C&P: {cp_text}\n")
+ ```
+
+ ### Expected output
+ ```bash
+ Normalized: hola buenos días a todos
+ With C&P: Hola, buenos días a todos.
+
+ Normalized: faktoria se escucha en la radio de e i te be
+ With C&P: Faktoria se escucha en la radio de EiTB
+
+ Normalized: más o menos el cuarenta y dos por ciento
+ With C&P: Más o menos el 42 %.
+
+ Normalized: cuatro ocho quince dieciséis veintitrés cuarenta y dos
+ With C&P: Cuatro, ocho, quince, dieciséis, veintitrés, cuarenta y dos.
+
+ Normalized: mi año de nacimiento es mil novecientos noventa y seis
+ With C&P: Mi año de nacimiento es 1996.
+
+ Normalized: más información en uve doble uve doble uve doble punto e hache u punto eus
+ With C&P: Más información en www.ehu.eus
+ ```
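+
+ The same result can also be obtained without the `pipeline` helper by loading the model and tokenizer directly. This is a minimal sketch using the standard `transformers` Marian classes, not an additional API of this repository:
+
+ ```python
+ import torch
+ from transformers import MarianMTModel, MarianTokenizer
+
+ MODEL_PATH = "marianmt-cap-punct-es"  # path to the cloned repository folder
+
+ tokenizer = MarianTokenizer.from_pretrained(MODEL_PATH)
+ model = MarianMTModel.from_pretrained(MODEL_PATH)
+ model.eval()
+
+ # Tokenize, generate and decode a batch of normalized sentences.
+ batch = tokenizer(["hola buenos días a todos"], return_tensors="pt", padding=True)
+ with torch.no_grad():
+     generated = model.generate(**batch)
+ print(tokenizer.batch_decode(generated, skip_special_tokens=True))
+ ```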
+
+ ## Training
+
+ ### Data preparation
+ The training data was compiled by our research group from multiple heterogeneous sources and consists of 9,784,905 sentences.
+
+ Prior to training, the data underwent preprocessing steps including cleaning, punctuation standardization, filtering, and the creation of aligned input–output sentence pairs for the capitalization and punctuation restoration task.
+
+ To generate the input–output pairs, the target sentences were lowercased, punctuation was removed, and text normalization was applied using an in-house normalization tool.
+
+ Example:
+ ```bash
+ Output (Cleaned, filtered and standardized): Esto supone pasar de los 0,22 euros por elector de las Elecciones Generales de 2011 a 0,18 euros en las de 2015.
+ Input (Lowercased, without punctuation and normalized): esto supone pasar de los cero coma veintidos euros por elector de las elecciones generales de dos mil once a cero coma dieciocho euros en las de dos mil quince
+ ```
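+
+ A minimal sketch of this pairing step, covering only the lowercasing and punctuation-removal parts (the in-house normalizer that verbalizes numbers, e.g. "0,22" → "cero coma veintidos", is not reproduced, and the function name is illustrative):
+
+ ```python
+ import string
+
+ def make_pair(target_sentence: str) -> tuple[str, str]:
+     """Build a (model input, training target) pair as described above."""
+     source = target_sentence.lower()                                              # remove capitalization
+     source = source.translate(str.maketrans("", "", string.punctuation + "¿¡"))   # strip punctuation
+     source = " ".join(source.split())                                             # collapse leftover double spaces
+     return source, target_sentence
+
+ src, tgt = make_pair("Esto supone pasar de los 0,22 euros por elector de las Elecciones Generales de 2011 a 0,18 euros en las de 2015.")
+ print("Input: ", src)
+ print("Output:", tgt)
+ ```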
+
+ ### Training procedure
+ The model was trained using the official [MarianNMT](https://marian-nmt.github.io/quickstart/) implementation.
+ Training was performed on a single NVIDIA TITAN RTX GPU.
+
+ ## Performance
+ The following table shows the model performance, measured with the Word Error Rate (WER) metric. WER-WITHOUT is the Word Error Rate of the evaluation dataset before applying capitalization and punctuation restoration; WER is the Word Error Rate of the output after processing with the model.
+ The evaluation dataset underwent the same processing as the training dataset.
+
+ | Metric      | FLORES-101 | COMMON-VOICE |
+ |-------------|------------|--------------|
+ | WER-WITHOUT | 19.55 %    | 22.42 %      |
+ | WER         | 5.99 %     | 5.75 %       |
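+
+ As an illustration, WER for a single (reference, hypothesis) pair can be computed with any standard implementation; the `jiwer` package used below is an assumption for the sketch, not a dependency declared in this repository:
+
+ ```python
+ from jiwer import wer  # assumed helper: pip install jiwer
+
+ reference  = "Más o menos el 42 %."                      # gold sentence with C&P
+ normalized = "más o menos el cuarenta y dos por ciento"  # model input, no C&P
+ restored   = "Más o menos el 42 %."                      # model output
+
+ print("WER-WITHOUT:", wer(reference, normalized))  # error rate before restoration
+ print("WER:", wer(reference, restored))            # error rate after restoration
+ ```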
+
+ # Additional Information
+ ## Author
+ HiTZ Basque Center for Language Technology - Aholab Signal Processing Laboratory, University of the Basque Country UPV/EHU.
+
+ ## Copyright
+ Copyright (c) 2025 HiTZ Basque Center for Language Technology - Aholab Signal Processing Laboratory, University of the Basque Country UPV/EHU.
+
+ ## Licensing Information
+ This work is licensed under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0).
+
+ ## Disclaimer
+ <details>
+ <summary>Click to expand</summary>
+ The models published in this repository are intended for a generalist purpose and are available to third parties. These models may have bias and/or any other undesirable distortions.
+
+ When third parties deploy or provide systems and/or services to other parties using any of these models (or using systems based on these models), or become users of the models, they should note that it is their responsibility to mitigate the risks arising from their use and, in any event, to comply with applicable regulations, including regulations regarding the use of Artificial Intelligence.
+
+ In no event shall the owner and creator of the models (HiTZ Basque Center for Language Technology - Aholab Signal Processing Laboratory, University of the Basque Country UPV/EHU) be liable for any results arising from the use made by third parties of these models.
+ </details>
config.json DELETED
@@ -1,39 +0,0 @@
- {
- "activation_dropout": 0.0,
- "activation_function": "swish",
- "architectures": [
- "MarianMTModel"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 0,
- "d_model": 512,
- "decoder_attention_heads": 8,
- "decoder_ffn_dim": 2048,
- "decoder_layerdrop": 0.0,
- "decoder_layers": 6,
- "decoder_start_token_id": 32000,
- "decoder_vocab_size": 32001,
- "dropout": 0.1,
- "dtype": "float32",
- "encoder_attention_heads": 8,
- "encoder_ffn_dim": 2048,
- "encoder_layerdrop": 0.0,
- "encoder_layers": 6,
- "eos_token_id": 0,
- "forced_eos_token_id": 0,
- "init_std": 0.02,
- "is_encoder_decoder": true,
- "max_length": null,
- "max_position_embeddings": 512,
- "model_type": "marian",
- "normalize_embedding": false,
- "num_beams": null,
- "num_hidden_layers": 6,
- "pad_token_id": 32000,
- "scale_embedding": true,
- "share_encoder_decoder_embeddings": true,
- "static_position_embeddings": true,
- "transformers_version": "4.57.3",
- "use_cache": true,
- "vocab_size": 32001
- }
generation_config.json DELETED
@@ -1,16 +0,0 @@
- {
- "_from_model_config": true,
- "bad_words_ids": [
- [
- 32000
- ]
- ],
- "bos_token_id": 0,
- "decoder_start_token_id": 32000,
- "eos_token_id": 0,
- "forced_eos_token_id": 0,
- "max_length": 512,
- "num_beams": 6,
- "pad_token_id": 32000,
- "transformers_version": "4.57.3"
- }
model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:737d3828c9f1ca12e3f4103fcccd77e82b4ff0e63dfe3c15dfdbafb9f6bdc783
- size 242249092
special_tokens_map.json DELETED
@@ -1,23 +0,0 @@
- {
- "eos_token": {
- "content": "</s>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<pad>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "unk_token": {
- "content": "<unk>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
- }
tokenizer_config.json DELETED
@@ -1,39 +0,0 @@
- {
- "added_tokens_decoder": {
- "0": {
- "content": "</s>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "1": {
- "content": "<unk>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "32000": {
- "content": "<pad>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- }
- },
- "clean_up_tokenization_spaces": false,
- "eos_token": "</s>",
- "extra_special_tokens": {},
- "model_max_length": 512,
- "pad_token": "<pad>",
- "separate_vocabs": false,
- "source_lang": "",
- "sp_model_kwargs": {},
- "target_lang": "modelCastellano",
- "tokenizer_class": "MarianTokenizer",
- "unk_token": "<unk>"
- }
vocab.json DELETED
The diff for this file is too large to render. See raw diff