flpelerin committed on
Commit
6183f29
·
1 Parent(s): c8a8f2e

Update 3 files

Browse files

- /c_tokenizer/main.c
- /c_tokenizer/tokenizer.h
- /c_tokenizer/Makefile

c_tokenizer/Makefile ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ CC := gcc
2
+ CFLAGS := -O3 -Ofast -ffast-math
3
+ CLIBS := -fopenmp
4
+
5
+ C_TOKENIZER := tokenizer.bin
6
+ SRC := *.c tokenizer.o
7
+ TARGET := a.out
8
+
9
+
10
+
11
+ all: $(TARGET)
12
+
13
+ clean:
14
+ $(RM) $(TARGET) *.o
15
+
16
+ wipe:
17
+ make clean
18
+ $(RM) *.bin *.txt
19
+
20
+
21
+ run: $(TARGET)
22
+ ./$< data.txt data.bin
23
+
24
+
25
+
26
+ $(C_TOKENIZER):
27
+ awk 'BEGIN {for (i = 0; i <= 255; i++) printf("%c%c%c", i, 0, 0)}' > $@
28
+
29
+
30
+ #elf64-littleaarch64
31
+
32
+
33
+ tokenizer.o: $(C_TOKENIZER)
34
+ objcopy --input-target binary \
35
+ --output-target elf64-x86-64 \
36
+ $< $@
37
+
38
+ #--redefine-sym _binary_tokenizer_bin_start=_embedded_binary_tokenizer \
39
+ #$< $@
40
+
41
+
42
+
43
+
44
+ $(TARGET): $(SRC)
45
+ $(CC) $(CFLAGS) -o $@ $^ $(CLIBS)
c_tokenizer/main.c ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ #include <stdio.h>
4
+ #include <fcntl.h>
5
+ #include <stdlib.h>
6
+ #include <unistd.h>
7
+ #include <string.h>
8
+
9
+
10
+
11
+
12
+
13
+ #include "tokenizer.h"
14
+
15
+
16
+
17
+
18
+
19
+
20
+ uint32_t get_file_size(const char *filename) {
21
+ FILE *file = fopen(filename, "rb");
22
+ if (!file) {
23
+ perror("fopen");
24
+ exit(EXIT_FAILURE);
25
+ }
26
+
27
+ if (fseek(file, 0, SEEK_END)!= 0) {
28
+ perror("fseek");
29
+ exit(EXIT_FAILURE);
30
+ }
31
+
32
+ uint32_t size = ftell(file);
33
+ if (size == -1) {
34
+ perror("ftell");
35
+ exit(EXIT_FAILURE);
36
+ }
37
+
38
+ fclose(file);
39
+ return size;
40
+ }
41
+
42
+
43
+ int main(int argc, char *argv[]) {
44
+ if(argc < 3) {
45
+ printf("Needs at least 2 arguments\n");
46
+ return 1;
47
+ }
48
+
49
+ char *fname[2] = {argv[1], argv[2]};
50
+ int fd[2] = {
51
+ open(fname[0], O_RDONLY),
52
+ open(fname[1], O_CREAT | O_WRONLY, 0644)
53
+ };
54
+
55
+ printf("Vocab size is %d\n", tokenizer.get_vocab_size());
56
+ printf("File names: %s, %s\n", fname[0], fname[1]);
57
+ printf("Got file descriptor: %d, %d\n", fd[0], fd[1]);
58
+
59
+ uint32_t size = get_file_size(fname[0]);
60
+ uint8_t *data = (uint8_t *) malloc(size * sizeof(uint8_t));
61
+ uint32_t n = read(fd[0], data, size * sizeof(uint8_t));
62
+
63
+ printf("Expected file size of %d bytes\n", size);
64
+ printf("Read %d bytes\n", n);
65
+ if (size != n) perror(0);
66
+
67
+ uint8_t *p = data;
68
+ uint32_t d = 0;
69
+ for (int i = 0; *p; ++i) {
70
+ uint16_t tok = tokenizer.encode(&tokenizer, &p);
71
+ write(fd[1], &tok, sizeof(uint16_t));
72
+
73
+ if (i % 10000 == 0) {
74
+ printf("Current token %d | Index position %d | Remaining %d | Tokenization delta %ld\n", tok, i, d, d - ((data + n) - p));
75
+ d = (data + n) - p;
76
+ }
77
+
78
+ }
79
+
80
+ printf("\n");
81
+ return 0;
82
+ }
c_tokenizer/tokenizer.h ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+
4
+ #include <stdint.h>
5
+
6
+
7
+
8
+
9
+ extern char _binary_tokenizer_bin_start[];
10
+ extern char _binary_tokenizer_bin_end[];
11
+
12
+
13
+
14
+ #define MAX_WORD_LEN 24
15
+
16
+ typedef struct {
17
+ struct __attribute__((packed)) {
18
+ uint8_t byte;
19
+ uint16_t prev;
20
+ } token[VOCAB_SIZE];
21
+ } vocab_t;
22
+
23
+ typedef struct Tokenizer Tokenizer;
24
+ struct Tokenizer {
25
+ vocab_t *vocab;
26
+
27
+ uint16_t (*get_vocab_size) (void);
28
+
29
+ uint16_t (*find) (Tokenizer *, uint8_t, uint16_t);
30
+ uint16_t (*encode) (Tokenizer *, uint8_t **);
31
+ uint8_t *(*decode) (Tokenizer *, uint16_t);
32
+
33
+
34
+ };
35
+
36
+
37
+ static uint16_t TokenizerGetVocabSize() { return (_binary_tokenizer_bin_end - _binary_tokenizer_bin_start) / 3; }
38
+
39
+
40
+
41
+ static uint16_t TokenizerFind(Tokenizer *tokenizer, uint8_t byte, uint16_t prev) {
42
+
43
+ for (uint16_t i = prev; i < tokenizer->get_vocab_size(); ++i)
44
+ if (tokenizer->vocab->token[i].byte == byte && tokenizer->vocab->token[i].prev == prev)
45
+ return i;
46
+
47
+ return 0;
48
+ }
49
+
50
+
51
+ static uint16_t TokenizerEncode(Tokenizer *tokenizer, uint8_t **seed_text) {
52
+
53
+ uint16_t prev = 0;
54
+ for (; **seed_text; ++*seed_text) {
55
+ uint16_t next = tokenizer->find(tokenizer, **seed_text, prev);
56
+ if (next == 0) break;
57
+ prev = next;
58
+ }
59
+
60
+ return prev;
61
+ }
62
+
63
+
64
+ static uint8_t *TokenizerDecode(Tokenizer *tokenizer, uint16_t token) {
65
+
66
+ static uint8_t dest[MAX_WORD_LEN + 1];
67
+ dest[MAX_WORD_LEN] = '\0';
68
+
69
+ uint16_t prev = token;
70
+ uint16_t i = MAX_WORD_LEN - 1;
71
+
72
+ for (; prev && i > 0; prev = tokenizer->vocab->token[prev].prev, --i)
73
+ dest[i] = tokenizer->vocab->token[prev].byte;
74
+
75
+ return dest + i + 1;
76
+ }
77
+
78
+
79
+ Tokenizer tokenizer = {
80
+ .vocab = (vocab_t *) _binary_tokenizer_bin_start,
81
+
82
+ .get_vocab_size = TokenizerGetVocabSize,
83
+
84
+ .find = TokenizerFind,
85
+ .encode = TokenizerEncode,
86
+ .decode = TokenizerDecode
87
+ };
88
+