#include #include #include #include #include #include "tokenizer.h" uint32_t get_file_size(const char *filename) { FILE *file = fopen(filename, "rb"); if (!file) { perror("fopen"); exit(EXIT_FAILURE); } if (fseek(file, 0, SEEK_END)!= 0) { perror("fseek"); exit(EXIT_FAILURE); } uint32_t size = ftell(file); if (size == -1) { perror("ftell"); exit(EXIT_FAILURE); } fclose(file); return size; } int main(int argc, char *argv[]) { if(argc < 3) { printf("Needs at least 2 arguments\n"); return 1; } char *fname[2] = {argv[1], argv[2]}; int fd[2] = { open(fname[0], O_RDONLY), open(fname[1], O_CREAT | O_WRONLY, 0644) }; printf("Vocab size is %d\n", tokenizer.get_vocab_size()); printf("File names: %s, %s\n", fname[0], fname[1]); printf("Got file descriptor: %d, %d\n", fd[0], fd[1]); uint32_t size = get_file_size(fname[0]); uint8_t *data = (uint8_t *) malloc(size * sizeof(uint8_t)); uint32_t n = read(fd[0], data, size * sizeof(uint8_t)); printf("Expected file size of %d bytes\n", size); printf("Read %d bytes\n", n); if (size != n) perror(0); uint8_t *p = data; uint32_t d = 0; for (int i = 0; *p; ++i) { uint16_t tok = tokenizer.encode(&tokenizer, &p); write(fd[1], &tok, sizeof(uint16_t)); if (i % 10000 == 0) { printf("Current token %d | Index position %d | Remaining %d | Tokenization delta %ld\n", tok, i, d, d - ((data + n) - p)); d = (data + n) - p; } } printf("\n"); return 0; }