mpolacek committed on
Commit
7097751
·
verified ·
1 Parent(s): fd94ee0

Upload 6 files

Browse files
Files changed (4) hide show
  1. README.md +48 -2
  2. special_tokens_map.json +6 -0
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +17 -0
README.md CHANGED
@@ -41,6 +41,32 @@ mELECTRA uses a **SentencePiece tokenizer** and requires a SentencePiece model f
41
 
42
  ### Example: Tokenization
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  ```python
45
  import sentencepiece as spm
46
 
@@ -48,7 +74,27 @@ import sentencepiece as spm
48
  sp = spm.SentencePieceProcessor()
49
  sp.load("m.model")
50
 
51
- # Tokenize input text
52
- sentence = "This is a multilingual model supporting multiple languages."
53
  tokens = sp.encode(sentence, out_type=str)
54
  print(tokens)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  ### Example: Tokenization
43
 
44
+ #### Using HuggingFace AutoTokenizer (Recommended)
45
+
46
+ ```python
47
+ from transformers import AutoTokenizer
48
+
49
+ # Load the tokenizer directly from HuggingFace Hub
50
+ tokenizer = AutoTokenizer.from_pretrained("AILabTUL/mELECTRA")
51
+
52
+ # Or load from local directory
53
+ # tokenizer = AutoTokenizer.from_pretrained("./mELECTRA")
54
+
55
+ # Tokenize input text
56
+ sentence = "This is a multilingual model supporting multiple languages."
57
+ tokens = tokenizer.tokenize(sentence)
58
+ ids = tokenizer.encode(sentence)
59
+
60
+ print(f"Tokens: {tokens}")
61
+ print(f"IDs: {ids}")
62
+
63
+ # Decode back to text
64
+ decoded = tokenizer.decode(ids)
65
+ print(f"Decoded: {decoded}")
66
+ ```
67
+
68
+ #### Using SentencePiece directly
69
+
70
  ```python
71
  import sentencepiece as spm
72
 
 
74
  sp = spm.SentencePieceProcessor()
75
  sp.load("m.model")
76
 
77
+ # Tokenize input text (note: input should be lowercase)
78
+ sentence = "this is a multilingual model supporting multiple languages."
79
  tokens = sp.encode(sentence, out_type=str)
80
  print(tokens)
81
+ ```
82
+
83
+ ---
84
+
85
+ ## Citation
86
+
87
+ This model was published as part of the research paper:
88
+
89
+ **"Study on Automatic Punctuation Restoration in Bilingual Broadcast Stream"**
90
+ *Martin Poláček, Petr Červa*
91
+ *RANLP Student Workshop 2025*
92
+
93
+ Citation information will be provided after the conference publication.
94
+
95
+ ---
96
+
97
+ ## Related Models
98
+
99
+ - **Czech-Slovak**: [AILabTUL/BiELECTRA-czech-slovak](https://huggingface.co/AILabTUL/BiELECTRA-czech-slovak)
100
+ - **Norwegian-Swedish**: [AILabTUL/BiELECTRA-norwegian-swedish](https://huggingface.co/AILabTUL/BiELECTRA-norwegian-swedish)
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "unk_token": "<unk>",
5
+ "pad_token": "<unk>"
6
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "PreTrainedTokenizerFast",
3
+ "auto_map": {
4
+ "AutoTokenizer": [
5
+ "transformers",
6
+ "PreTrainedTokenizerFast"
7
+ ]
8
+ },
9
+ "bos_token": "<s>",
10
+ "eos_token": "</s>",
11
+ "unk_token": "<unk>",
12
+ "pad_token": "<unk>",
13
+ "model_max_length": 512,
14
+ "special_tokens_map_file": null,
15
+ "name_or_path": "mELECTRA",
16
+ "tokenizer_type": "SentencePiece"
17
+ }