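"""Minimal word-level and character-level tokenizer utilities.

Builds a vocabulary from raw texts, saves the string-to-index (stoi) and
index-to-string (itos) mappings as JSON, and reloads them together with
encode/decode helpers and the padding token id.
"""
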
import json

def build_tokenizer_word_by_word_from_texts(texts, save_path="tokenizer_words.json", pad_token="[PAD]"):
    """Build a word-level tokenizer from a list of texts and save it as JSON."""
    all_text = " ".join(texts)
    words = all_text.split()
    unique_words = sorted(set(words))

    # Ensure the padding token is in the vocabulary (inserted at index 0 if absent).
    if pad_token not in unique_words:
        unique_words.insert(0, pad_token)

    stoi = {word: i for i, word in enumerate(unique_words)}
    itos = {i: word for word, i in stoi.items()}

    tokenizer = {"stoi": stoi, "itos": itos, "pad_token": pad_token}

    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer, f, ensure_ascii=False, indent=2)

    print(f"Word-by-word tokenizer saved to {save_path} ({len(stoi)} tokens)")


def build_tokenizer_char_by_char_from_texts(texts, save_path="tokenizer.json", pad_token="[PAD]"):
    """Build a character-level tokenizer from a list of texts and save it as JSON."""
    all_text = " ".join(texts)
    unique_chars = sorted(set(all_text))

    # Ensure the padding token is in the vocabulary (inserted at index 0 if absent).
    if pad_token not in unique_chars:
        unique_chars.insert(0, pad_token)

    stoi = {ch: i for i, ch in enumerate(unique_chars)}
    itos = {i: ch for ch, i in stoi.items()}

    tokenizer = {"stoi": stoi, "itos": itos, "pad_token": pad_token}

    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer, f, ensure_ascii=False, indent=2)

    print(f"Character-by-character tokenizer saved to {save_path} ({len(stoi)} tokens)")
    
    
def load_tokenizer(path="tokenizer.json"):
    """Load a saved tokenizer and return (stoi, itos, encode, decode, pad_token_id)."""
    with open(path, "r", encoding="utf-8") as f:
        tokenizer = json.load(f)

    stoi = tokenizer["stoi"]
    # JSON object keys are always strings, so convert itos keys back to ints.
    itos = {int(k): v for k, v in tokenizer["itos"].items()}

    pad_token = tokenizer.get("pad_token", "[PAD]")
    pad_token_id = stoi[pad_token]

    # Heuristic: a character-level vocabulary contains only single-character
    # tokens (apart from the padding token). Pick encode/decode accordingly,
    # so this loader works for both tokenizer files saved above.
    is_char_level = all(len(tok) == 1 for tok in stoi if tok != pad_token)
    if is_char_level:
        encode = lambda s: [stoi[ch] for ch in s]
        decode = lambda ids: "".join(itos[i] for i in ids)
    else:
        encode = lambda s: [stoi[word] for word in s.split()]
        decode = lambda ids: " ".join(itos[i] for i in ids)

    return stoi, itos, encode, decode, pad_token_id
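

# Illustrative usage sketch (an assumption, not part of the original module):
# build a character-level tokenizer from two hypothetical sample strings,
# reload it, and round-trip a string through encode/decode.
if __name__ == "__main__":
    sample_texts = ["hello world", "tokenizers are simple"]
    build_tokenizer_char_by_char_from_texts(sample_texts, save_path="tokenizer.json")

    stoi, itos, encode, decode, pad_token_id = load_tokenizer("tokenizer.json")
    ids = encode("hello")
    print(ids)           # list of character ids
    print(decode(ids))   # "hello"
    print(pad_token_id)  # index of "[PAD]" in the vocabulary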