File size: 1,935 Bytes
6183f29 9b9e1af c543589 fab1845 c543589 6183f29 c543589 6183f29 2e9278d 172f6d4 2e9278d 6183f29 2e9278d 172f6d4 2e9278d 172f6d4 6183f29 d6ead5a 6183f29 d6ead5a 6183f29 c543589 6183f29 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
#pragma once
#include <stdint.h>
/*
 * Start and end of the tokenizer vocabulary blob; both symbols are defined
 * outside this file.
 * NOTE(review): the _binary_<name>_start/_end naming looks like the symbols
 * a linker/objcopy emits when embedding a raw "tokenizer.bin" - confirm
 * against the build scripts.
 */
extern char _binary_tokenizer_bin_start[];
extern char _binary_tokenizer_bin_end[];
/* TokenizerDecode's static output buffer holds MAX_WORD_LEN bytes plus a NUL terminator. */
#define MAX_WORD_LEN 24
/*
 * One vocabulary record. The struct is packed, so it occupies exactly
 * 3 bytes (1 + 2) with no padding; the vocabulary blob is read as an array
 * of these records, so the field order and sizes must not change.
 */
typedef struct __attribute__((packed)) token_t {
/* Byte appended by this token (emitted last when the token is decoded). */
uint8_t byte;
/* Index of the token this one extends; decoding stops when it reaches 0. */
uint16_t prev;
} token_t;
/*
 * Tokenizer object: a vocabulary table plus the operations on it, exposed as
 * function pointers. The forward typedef lets the members refer to
 * Tokenizer * in their own signatures.
 */
typedef struct Tokenizer Tokenizer;
struct Tokenizer {
/* Vocabulary table, indexed by token id. */
token_t *vocab;
/* Returns the number of entries in the vocabulary. */
uint16_t (*get_vocab_size) (void);
/* find(t, byte, prev): id of the token that extends `prev` with `byte`, or 0 if none. */
uint16_t (*find) (Tokenizer *, uint8_t, uint16_t);
/* encode(t, &text): consumes bytes from *text and returns the matched token id (0 if none). */
uint16_t (*encode) (Tokenizer *, uint8_t **);
/* decode(t, token): returns the NUL-terminated byte string spelled by `token`. */
uint8_t *(*decode) (Tokenizer *, uint16_t);
};
/*
 * Number of records in the embedded vocabulary blob.
 *
 * The blob spans [_binary_tokenizer_bin_start, _binary_tokenizer_bin_end);
 * its byte length is divided by 3, the size of one packed token_t record
 * (1-byte `byte` + 2-byte `prev`). A trailing partial record is ignored by
 * the integer division.
 *
 * Returns the record count narrowed to 16 bits, so the result is only exact
 * while the blob holds fewer than 65536 records.
 */
static uint16_t TokenizerGetVocabSize(void) {
    return (uint16_t) ((_binary_tokenizer_bin_end - _binary_tokenizer_bin_start) / 3);
}
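/*
Debug-only headers (e.g. for printf/exit while inspecting the vocabulary);
kept disabled in normal builds:
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
*/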
/*
 * Find the token that extends token `prev` with `byte`.
 *
 * Scans the vocabulary from index `prev` upward and returns the index of the
 * first record whose (byte, prev) pair matches.
 * NOTE(review): starting the scan at `prev` presumably relies on every token
 * being stored after the token it extends - confirm against the vocab builder.
 *
 * @param tokenizer  tokenizer whose vocab table is searched
 * @param byte       byte to append
 * @param prev       id of the token being extended (0 for the start of a word)
 * @return the matching token id, or 0 when no record matches (0 doubles as
 *         the "not found" value)
 */
static uint16_t TokenizerFind(Tokenizer *tokenizer, uint8_t byte, uint16_t prev) {
    /* The size cannot change during the scan; query it once instead of
     * making an indirect call on every iteration. */
    const uint16_t vocab_size = tokenizer->get_vocab_size();
    const token_t *vocab = tokenizer->vocab;

    for (uint16_t i = prev; i < vocab_size; ++i) {
        if (vocab[i].byte == byte && vocab[i].prev == prev) {
            return i;
        }
    }
    return 0;
}
/*
 * Greedily match the longest token at the front of *seed_text.
 *
 * Starting from token 0, each input byte is fed to tokenizer->find() to
 * extend the current token. The cursor *seed_text is advanced past every
 * byte that was consumed; it stops at the first byte that cannot extend the
 * token, or at the terminating NUL.
 *
 * @param tokenizer  tokenizer providing find()
 * @param seed_text  in/out cursor into a NUL-terminated byte string
 * @return the id of the matched token; 0 if not even the first byte matched
 *         (in which case the cursor is left unchanged)
 */
static uint16_t TokenizerEncode(Tokenizer *tokenizer, uint8_t **seed_text) {
    uint16_t token = 0;

    while (**seed_text != '\0') {
        const uint16_t longer = tokenizer->find(tokenizer, **seed_text, token);
        if (longer == 0) {
            /* No token continues with this byte: leave it for the next call. */
            return token;
        }
        token = longer;
        ++*seed_text;
    }
    return token;
}
/*
 * Spell out the byte string represented by `token`.
 *
 * Walks the `prev` chain from `token` back to 0, writing each record's byte
 * from the end of a static buffer toward its start, so the bytes come out in
 * forward order. At most MAX_WORD_LEN bytes are produced; for a longer chain
 * only the last MAX_WORD_LEN bytes are kept.
 *
 * @param tokenizer  tokenizer whose vocab table is walked
 * @param token      token id to decode; it and every id on its chain are used
 *                   to index vocab without a bounds check
 * @return pointer to a NUL-terminated string inside a static buffer; the
 *         contents are overwritten by the next call, so the function is not
 *         reentrant. Token 0 yields the empty string.
 */
static uint8_t *TokenizerDecode(Tokenizer *tokenizer, uint16_t token) {
    static uint8_t dest[MAX_WORD_LEN + 1];
    /* Index of the first valid byte; pre-decremented on each write so that
     * slot 0 is usable and the index never has to go below zero. */
    uint16_t start = MAX_WORD_LEN;

    dest[MAX_WORD_LEN] = '\0';
    for (uint16_t cur = token; cur != 0 && start > 0; cur = tokenizer->vocab[cur].prev) {
        dest[--start] = tokenizer->vocab[cur].byte;
    }
    return dest + start;
}
/*
 * Tokenizer instance: its vocab points at the start of the embedded blob
 * (reinterpreted as an array of token_t records) and its operations are bound
 * to the static Tokenizer* functions defined in this file.
 * NOTE(review): this is a non-static object definition in a file guarded by
 * #pragma once; if the file is included from more than one translation unit
 * the linker will see duplicate definitions - confirm it is included once.
 */
Tokenizer tokenizer = {
.vocab = (token_t *) _binary_tokenizer_bin_start,
.get_vocab_size = TokenizerGetVocabSize,
.find = TokenizerFind,
.encode = TokenizerEncode,
.decode = TokenizerDecode
};
|