#pragma once

#include <stdint.h>

/*
 * Bounds of the tokenizer vocabulary blob linked into the image.
 * NOTE(review): the `_binary_<name>_start/_end` naming matches what GNU ld /
 * objcopy emit for a binary input named `tokenizer.bin` -- confirm against the
 * build scripts.
 */
extern char _binary_tokenizer_bin_start[];
extern char _binary_tokenizer_bin_end[];

/* Maximum number of bytes TokenizerDecode() will reconstruct for one token. */
#define MAX_WORD_LEN 24

/*
 * One vocabulary entry. A token is stored as its last byte plus the index of
 * the token it extends; following `prev` until it reaches 0 spells the token
 * backwards. The struct is packed so the array can be laid directly over the
 * embedded blob.
 */
typedef struct __attribute__((packed)) token_t {
    uint8_t  byte;  /* last byte of this token */
    uint16_t prev;  /* index of the parent token; 0 ends the chain */
} token_t;

typedef struct Tokenizer Tokenizer;

/* Vocabulary table plus the operations that work on it. */
struct Tokenizer {
    token_t *vocab;

    uint16_t (*get_vocab_size) (void);
    uint16_t (*find)           (Tokenizer *, uint8_t, uint16_t);
    uint16_t (*encode)         (Tokenizer *, uint8_t **);
    uint8_t *(*decode)         (Tokenizer *, uint16_t);
};

/*
 * Returns the number of token_t entries in the embedded blob.
 * The result is narrowed to uint16_t, so a blob holding more than 65535
 * entries would be reported truncated.
 */
static uint16_t TokenizerGetVocabSize(void)
{
    return (uint16_t) ((_binary_tokenizer_bin_end - _binary_tokenizer_bin_start)
                       / sizeof(token_t));
}

/*
 * Looks up the token that extends `prev` with `byte`.
 *
 * The scan starts at index `prev` and runs to the end of the table.
 * Returns the index of the first matching entry, or 0 when there is none.
 */
static uint16_t TokenizerFind(Tokenizer *tokenizer, uint8_t byte, uint16_t prev)
{
    /* The size does not change during the scan; query it once. */
    const uint16_t vocab_size = tokenizer->get_vocab_size();

    for (uint16_t i = prev; i < vocab_size; ++i) {
        const token_t *entry = &tokenizer->vocab[i];

        if (entry->byte == byte && entry->prev == prev) {
            return i;
        }
    }

    return 0;
}

/*
 * Consumes the longest prefix of `*seed_text` that forms a known token.
 *
 * Starting from token 0, each input byte is used to extend the current token
 * via tokenizer->find(); the walk stops at the NUL terminator or at the first
 * byte for which find() returns 0. `*seed_text` is advanced past every byte
 * that was consumed.
 *
 * Returns the token reached. A result of 0 means no byte was consumed, so a
 * caller looping over the input must handle that case to make progress.
 */
static uint16_t TokenizerEncode(Tokenizer *tokenizer, uint8_t **seed_text)
{
    uint16_t prev = 0;

    for (; **seed_text; ++*seed_text) {
        uint16_t next = tokenizer->find(tokenizer, **seed_text, prev);

        if (next == 0) {
            break;
        }

        prev = next;
    }

    return prev;
}

/*
 * Reconstructs the byte string for `token`.
 *
 * The `prev` chain yields the bytes last-to-first, so the string is built
 * right-to-left and ends at the terminator in the final slot of a static
 * buffer. At most MAX_WORD_LEN bytes are produced; a longer chain keeps only
 * its trailing MAX_WORD_LEN bytes. An index outside the vocabulary (either
 * `token` itself or a `prev` link) stops the walk instead of being
 * dereferenced.
 *
 * Returns a pointer into a static buffer: the contents are overwritten by the
 * next call, and the function is not reentrant.
 */
static uint8_t *TokenizerDecode(Tokenizer *tokenizer, uint16_t token)
{
    static uint8_t dest[MAX_WORD_LEN + 1];

    const uint16_t vocab_size = tokenizer->get_vocab_size();
    uint8_t *out = dest + MAX_WORD_LEN;

    *out = '\0';

    for (uint16_t cur = token;
         cur != 0 && cur < vocab_size && out > dest;
         cur = tokenizer->vocab[cur].prev) {
        *--out = tokenizer->vocab[cur].byte;
    }

    return out;
}

/*
 * Ready-to-use tokenizer bound to the embedded vocabulary blob.
 * NOTE(review): this is an object definition with external linkage inside a
 * header; including the header from more than one translation unit would
 * presumably produce duplicate-symbol link errors -- confirm it is included
 * from exactly one .c file.
 */
Tokenizer tokenizer = {
    .vocab          = (token_t *) _binary_tokenizer_bin_start,
    .get_vocab_size = TokenizerGetVocabSize,
    .find           = TokenizerFind,
    .encode         = TokenizerEncode,
    .decode         = TokenizerDecode
};