File size: 1,935 Bytes
6183f29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b9e1af
c543589
 
fab1845
c543589
6183f29
 
 
c543589
6183f29
 
 
 
 
 
 
 
 
 
 
 
 
2e9278d
172f6d4
 
 
 
2e9278d
6183f29
 
 
2e9278d
 
172f6d4
2e9278d
172f6d4
6183f29
d6ead5a
6183f29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6ead5a
 
6183f29
 
 
 
 
 
c543589
6183f29
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#pragma once


#include <stdint.h>




/*
 * Start and end markers of the tokenizer vocabulary blob. They are only
 * declared here; their definitions are not in this file. The start address is
 * used as the vocabulary table and (end - start) gives the blob size in bytes.
 * NOTE(review): the _binary_*_start/_end naming looks like the symbols a
 * linker emits when tokenizer.bin is linked in as a raw binary object --
 * confirm against the build.
 */
extern char _binary_tokenizer_bin_start[];
extern char _binary_tokenizer_bin_end[];



/* Capacity, in bytes and excluding the terminating NUL, of the static buffer TokenizerDecode writes into. */
#define MAX_WORD_LEN 24

/*
 * One vocabulary entry: a byte plus the index of the entry it extends.
 * The struct is packed, so its two members occupy 3 bytes with no padding,
 * which matches the divisor used by TokenizerGetVocabSize.
 */
typedef struct __attribute__((packed)) token_t {
    /* Byte contributed by this entry. */
    uint8_t  byte;
    /* Index of the preceding entry; TokenizerDecode follows this chain until it reaches 0. */
    uint16_t prev;
} token_t;


/*
 * Tokenizer object: the vocabulary table together with the operations on it,
 * stored as function pointers. Every operation except get_vocab_size takes
 * the tokenizer itself as its first argument.
 */
typedef struct Tokenizer Tokenizer;
struct Tokenizer {
	/* Table of vocabulary entries, indexed by token id. */
	token_t *vocab;
	
	/* Returns the number of entries in the vocabulary. */
	uint16_t  (*get_vocab_size) (void);

	/* (byte, prev token id) -> id of the matching entry, or 0 when none matches. */
	uint16_t  (*find)   (Tokenizer *, uint8_t, uint16_t);
	/* Consumes bytes from the text cursor and returns the last token id matched. */
	uint16_t  (*encode) (Tokenizer *, uint8_t **);
	/* Token id -> NUL-terminated byte string for that token. */
	uint8_t  *(*decode) (Tokenizer *, uint16_t);


};


/*
 * Number of entries in the embedded vocabulary.
 *
 * The bytes between _binary_tokenizer_bin_start and _binary_tokenizer_bin_end
 * are treated as an array of 3-byte records (a packed uint8_t byte followed by
 * a uint16_t prev), so the entry count is the blob size divided by 3.
 *
 * The result is narrowed to uint16_t, the width used for token ids; the
 * narrowing is spelled out with a cast instead of happening implicitly.
 *
 * Declared with (void) so the definition is a real prototype matching the
 * get_vocab_size member of Tokenizer.
 */
static uint16_t TokenizerGetVocabSize(void) {
	return (uint16_t) ((_binary_tokenizer_bin_end - _binary_tokenizer_bin_start) / 3);
}

/*
 * Disabled includes, kept for reference; no active code in this file uses them.
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
*/

/*
 * Look up the vocabulary entry whose (byte, prev) pair matches the arguments.
 *
 * tokenizer: tokenizer whose vocab table is searched.
 * byte:      byte the entry must carry.
 * prev:      token id the entry must extend.
 *
 * The scan starts at index `prev`, so only entries stored at or after the
 * parent entry are considered.
 * NOTE(review): this relies on every entry being stored after the entry it
 * extends -- confirm against the tool that generates the vocabulary.
 *
 * Returns the id of the first matching entry, or 0 when nothing matches
 * (0 doubles as the "not found" value).
 */
static uint16_t TokenizerFind(Tokenizer *tokenizer, uint8_t byte, uint16_t prev) {
	/* Loop-invariant: query the size once rather than through the function pointer on every iteration. */
	const uint16_t vocab_size = tokenizer->get_vocab_size();

	for (uint16_t i = prev; i < vocab_size; ++i) {
		if (tokenizer->vocab[i].byte == byte && tokenizer->vocab[i].prev == prev) {
			return i;
		}
	}

	return 0;
}


/*
 * Greedily match the longest token at the front of *seed_text.
 *
 * Starting from token 0, each byte of the text is passed to tokenizer->find
 * together with the token matched so far. The walk stops at the NUL
 * terminator or at the first byte for which find returns 0.
 *
 * On return *seed_text points at the first byte that was not consumed.
 * Returns the last token matched, or 0 if not even the first byte matched.
 */
static uint16_t TokenizerEncode(Tokenizer *tokenizer, uint8_t **seed_text) {
	uint16_t matched = 0;

	while (**seed_text != '\0') {
		const uint16_t candidate = tokenizer->find(tokenizer, **seed_text, matched);

		/* No entry extends the current token with this byte: leave the cursor on it. */
		if (candidate == 0) {
			return matched;
		}

		matched = candidate;
		++*seed_text;
	}

	return matched;
}


/*
 * Expand a token id into its NUL-terminated byte string.
 *
 * The string is built back to front: starting at `token`, each entry's byte
 * is stored and the walk moves to that entry's `prev` until it reaches 0 or
 * the buffer is full. All MAX_WORD_LEN slots before the terminator can be
 * used; a chain longer than that is truncated to its last MAX_WORD_LEN bytes.
 *
 * `token` is used as an index into tokenizer->vocab without a range check.
 *
 * Returns a pointer into a function-local static buffer, so the contents are
 * overwritten by the next call. Token 0 yields an empty string.
 */
static uint8_t *TokenizerDecode(Tokenizer *tokenizer, uint16_t token) {

	static uint8_t dest[MAX_WORD_LEN + 1];

	/* Cursor starts on the terminator slot and moves toward dest[0]. */
	uint8_t *out = dest + MAX_WORD_LEN;
	*out = '\0';

	/* `out > dest` lets dest[0] be written, so the full MAX_WORD_LEN bytes fit. */
	while (token != 0 && out > dest) {
		*--out = tokenizer->vocab[token].byte;
		token = tokenizer->vocab[token].prev;
	}

	return out;
}


Tokenizer tokenizer = {
	.vocab 	 	= (token_t *) _binary_tokenizer_bin_start,

	.get_vocab_size = TokenizerGetVocabSize,

	.find   = TokenizerFind,
	.encode = TokenizerEncode,
	.decode = TokenizerDecode
};