|
|
|
|
|
|
|
|
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "tokenizer.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Return the size in bytes of the file at `filename`.
 *
 * The size is obtained by seeking to the end of the stream and reading the
 * resulting offset. Any failure (open, seek, tell, or a size that does not
 * fit in 32 bits) prints a diagnostic to stderr and terminates the process
 * with EXIT_FAILURE, so the function only returns on success.
 *
 * @param filename  Path of the file to measure.
 * @return          File size in bytes.
 */
uint32_t get_file_size(const char *filename) {
    FILE *file = fopen(filename, "rb");
    if (!file) {
        perror("fopen");
        exit(EXIT_FAILURE);
    }

    if (fseek(file, 0, SEEK_END) != 0) {
        /* Report first: fclose() may overwrite errno. */
        perror("fseek");
        fclose(file);
        exit(EXIT_FAILURE);
    }

    /* Keep ftell's result in its native signed type so the -1 error value
     * is detected directly instead of through unsigned wraparound. */
    long size = ftell(file);
    if (size < 0) {
        perror("ftell");
        fclose(file);
        exit(EXIT_FAILURE);
    }

    fclose(file);

    /* Refuse to silently truncate sizes that do not fit the return type. */
    if ((unsigned long)size > UINT32_MAX) {
        fprintf(stderr, "%s: file too large (%ld bytes)\n", filename, size);
        exit(EXIT_FAILURE);
    }

    return (uint32_t)size;
}
|
|
|
|
|
|
|
|
int main(int argc, char *argv[]) { |
|
|
if(argc < 3) { |
|
|
printf("Needs at least 2 arguments\n"); |
|
|
return 1; |
|
|
} |
|
|
|
|
|
char *fname[2] = {argv[1], argv[2]}; |
|
|
int fd[2] = { |
|
|
open(fname[0], O_RDONLY), |
|
|
open(fname[1], O_CREAT | O_WRONLY, 0644) |
|
|
}; |
|
|
|
|
|
printf("Vocab size is %d\n", tokenizer.get_vocab_size()); |
|
|
printf("File names: %s, %s\n", fname[0], fname[1]); |
|
|
printf("Got file descriptor: %d, %d\n", fd[0], fd[1]); |
|
|
|
|
|
uint32_t size = get_file_size(fname[0]); |
|
|
uint8_t *data = (uint8_t *) malloc(size * sizeof(uint8_t)); |
|
|
uint32_t n = read(fd[0], data, size * sizeof(uint8_t)); |
|
|
|
|
|
printf("Expected file size of %d bytes\n", size); |
|
|
printf("Read %d bytes\n", n); |
|
|
if (size != n) perror(0); |
|
|
|
|
|
uint8_t *p = data; |
|
|
uint32_t d = 0; |
|
|
for (int i = 0; *p; ++i) { |
|
|
uint16_t tok = tokenizer.encode(&tokenizer, &p); |
|
|
write(fd[1], &tok, sizeof(uint16_t)); |
|
|
|
|
|
if (i % 10000 == 0) { |
|
|
printf("Current token %d | Index position %d | Remaining %d | Tokenization delta %ld\n", tok, i, d, d - ((data + n) - p)); |
|
|
d = (data + n) - p; |
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
printf("\n"); |
|
|
return 0; |
|
|
} |
|
|
|