File size: 1,495 Bytes
1485644
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#pragma once


#include <stdint.h>





extern char _embedded_binary_tokenizer[];


#define MAX_WORD_LEN 24

// One vocabulary entry: a single byte plus a link to the token it extends.
// Entries form reverse-linked chains (a trie stored via parent indices);
// index 0 serves as the root / "none" sentinel throughout this file.
// Packed so the array overlays the embedded binary blob byte-for-byte
// (3 bytes per entry, no padding between fields or entries).
typedef struct __attribute__((packed)) token_t {
    uint8_t  byte;  // byte this token appends to its prefix
    uint16_t prev;  // index of the prefix token (0 = root/none)
} token_t;


// Tokenizer: a vocabulary table plus function pointers for lookup,
// encoding, and decoding.  A single global instance is defined below.
typedef struct Tokenizer Tokenizer;
struct Tokenizer {
	token_t  *vocab;       // vocabulary array; the token id is the array index
	uint16_t  vocab_size;  // number of entries in vocab

	// find: index of the token extending `prev` with `byte`, or 0 if none.
	uint16_t  (*find)   (Tokenizer *, uint8_t, uint16_t);
	// encode: greedily consume bytes from *text, return last matched token id.
	uint16_t  (*encode) (Tokenizer *, uint8_t **);
	// decode: return NUL-terminated bytes of a token (points into a static buffer).
	uint8_t  *(*decode) (Tokenizer *, uint16_t);


};





// Linear scan for the vocab entry that extends token `prev` with `byte`.
// Returns the matching token index, or 0 when no such extension exists
// (0 doubles as the root sentinel, so it can never be a legitimate match).
// The scan starts at index `prev`, which assumes every token is stored
// after its prefix token in the vocab table — TODO confirm the embedded
// blob guarantees this ordering.
static uint16_t TokenizerFind(Tokenizer *tokenizer, uint8_t byte, uint16_t prev) {
	uint16_t idx = prev;

	while (idx < tokenizer->vocab_size) {
		const token_t *entry = &tokenizer->vocab[idx];
		if (entry->byte == byte && entry->prev == prev)
			return idx;
		++idx;
	}

	return 0;
}


// Greedily match bytes from *seed_text against the vocabulary, following
// the longest chain of extensions starting from the root (token 0).
// *seed_text is advanced past every byte that matched; the first byte that
// cannot extend the current token is left unconsumed.  Returns the id of
// the last matched token (0 if the very first byte has no match).
static uint16_t TokenizerEncode(Tokenizer *tokenizer, uint8_t **seed_text) {
	uint16_t token = 0;

	while (**seed_text) {
		uint16_t extended = tokenizer->find(tokenizer, **seed_text, token);
		if (extended == 0)
			break;      // current byte cannot extend the match; leave it unconsumed
		token = extended;
		++*seed_text;   // consume the byte only after a successful match
	}

	return token;
}


// Reconstruct the byte string for `token` by walking its prev-chain back
// toward the root, filling a static buffer right-to-left.  Returns a pointer
// to the first byte of the NUL-terminated result.
// NOTE: the buffer is static — the result is overwritten by the next call,
// and the function is not thread-safe.  Chains longer than MAX_WORD_LEN - 1
// bytes are silently truncated (the root-most prefix bytes are dropped).
static uint8_t *TokenizerDecode(Tokenizer *tokenizer, uint16_t token) {

	static uint8_t dest[MAX_WORD_LEN + 1];
	dest[MAX_WORD_LEN] = '\0';

	uint16_t cur = token;
	uint16_t pos = MAX_WORD_LEN - 1;

	// Walk toward the root (token 0), writing bytes from the end backwards.
	while (cur != 0 && pos > 0) {
		dest[pos] = tokenizer->vocab[cur].byte;
		cur = tokenizer->vocab[cur].prev;
		--pos;
	}

	return dest + pos + 1;
}


// Global tokenizer instance backed by the embedded vocabulary blob.
// NOTE(review): VOCAB_SIZE is not defined anywhere in this header —
// presumably injected at build time (e.g. -DVOCAB_SIZE=...); confirm
// against the build system.
// NOTE(review): this is a non-static definition inside a #pragma once
// header; including this header from more than one translation unit will
// produce a duplicate-symbol link error — confirm it is included only once,
// or consider making it static like the functions above.
// The char[] -> token_t* cast relies on token_t being packed (3 bytes,
// alignment 1 under GCC/Clang), so the blob needs no particular alignment.
Tokenizer tokenizer = {
	.vocab 	 	= (token_t *) _embedded_binary_tokenizer,

	.vocab_size = VOCAB_SIZE,

	.find   = TokenizerFind,
	.encode = TokenizerEncode,
	.decode = TokenizerDecode
};