ULFBERTO committed on
Commit
677f594
·
verified ·
1 Parent(s): e1fdf4e

Upload tokenizer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenizer.py +36 -0
tokenizer.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
class CharacterTokenizer:
    """Character-level tokenizer with a fixed set of special tokens.

    The special tokens occupy the first vocabulary slots; the remaining
    slots are the sorted unique characters observed by fit().
    """

    def __init__(self):
        # Vocabulary state is empty until fit() is called.
        self.chars = []
        self.vocab_size = 0
        self.stoi = {}
        self.itos = {}
        self.special_tokens = ["<|pad|>", "<|user|>", "<|assistant|>", "<|end|>"]

    def fit(self, text: str):
        """Build the vocabulary from the unique characters of *text*."""
        unique_chars = sorted(set(text))
        self.chars = self.special_tokens + unique_chars
        self.vocab_size = len(self.chars)
        self.stoi = {ch: idx for idx, ch in enumerate(self.chars)}
        self.itos = dict(enumerate(self.chars))
        # Runtime message kept verbatim (Spanish: "SSM vocabulary created. Size:").
        print(f"Vocabulario SSM creado. Tamaño: {self.vocab_size}")

    def encode(self, s: str) -> list[int]:
        """Map each known character of *s* to its id; unknown chars are silently dropped."""
        ids = []
        for ch in s:
            idx = self.stoi.get(ch)
            if idx is not None:
                ids.append(idx)
        return ids

    def decode(self, l: list[int]) -> str:
        """Map ids back to characters and join them (raises KeyError on unknown ids)."""
        pieces = [self.itos[token_id] for token_id in l]
        return "".join(pieces)
24
+
25
def load_books(data_path: str) -> str:
    """Concatenate the contents of every ``.txt`` file in *data_path*.

    Files are read in sorted name order so the combined text is
    deterministic (``os.listdir`` returns entries in arbitrary order).
    A newline is appended after each file. Unreadable or badly-encoded
    files are skipped (best-effort, as in the original).

    Returns the combined text, or the Spanish error message
    "Error: Carpeta de datos no encontrada." when the directory does
    not exist (kept for backward compatibility with existing callers).
    """
    if not os.path.exists(data_path):
        return "Error: Carpeta de datos no encontrada."
    parts = []
    for filename in sorted(os.listdir(data_path)):
        if not filename.endswith(".txt"):
            continue
        try:
            with open(os.path.join(data_path, filename), 'r', encoding='utf-8') as f:
                parts.append(f.read())
                parts.append("\n")
        except (OSError, UnicodeDecodeError):
            # Narrowed from a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit; the skip-on-error behavior stays.
            continue
    # join() instead of repeated `+=` avoids quadratic string building.
    return "".join(parts)