| |
| |
| |
| |
|
|
|
|
| #include <stdio.h>
|
| #include <stdlib.h>
|
| #include <math.h>
|
| #include <string.h>
|
| #include <time.h>
|
| #include <vector>
|
| #include <algorithm>
|
| #include <immintrin.h>
|
|
|
| #ifdef _OPENMP
|
| #include <omp.h>
|
| #endif
|
|
|
|
|
|
|
|
|
|
|
/*
 * Model hyper-parameters.
 *
 * main() fills this struct by reading five consecutive ints from the start
 * of model.bin, so the declaration order below is also the on-disk order
 * and must not be rearranged.
 */
typedef struct {
    int n_layer;     /* number of transformer layers iterated by forward() */
    int n_head;      /* attention heads per layer */
    int n_embd;      /* width of the hidden vector (per-head width is n_embd / n_head) */
    int block_size;  /* number of positions in the positional table and the KV cache */
    int vocab_size;  /* rows in the token embedding table / number of logits */
} Config;
|
|
|
/*
 * Pointers into the flat weight buffer, set up by map_weights().
 *
 * Plain float* members address a single tensor; float** members are
 * per-layer tables indexed by layer number.
 */
typedef struct {
    /* Embedding tables. */
    float *wte;           /* token embeddings, vocab_size rows of n_embd */
    float *wpe;           /* position embeddings, block_size rows of n_embd */

    /* First layer norm of each layer. */
    float **ln1_w;
    float **ln1_b;

    /* Fused q/k/v projection (3 * n_embd outputs). */
    float **c_attn_w;
    float **c_attn_b;

    /* Attention output projection. */
    float **c_proj_w;
    float **c_proj_b;

    /* Second layer norm of each layer. */
    float **ln2_w;
    float **ln2_b;

    /* Feed-forward expansion (4 * n_embd outputs). */
    float **fc_w;
    float **fc_b;

    /* Feed-forward contraction back to n_embd. */
    float **mlp_proj_w;
    float **mlp_proj_b;

    /* Final layer norm. */
    float *ln_f_w;
    float *ln_f_b;

    /* Output projection producing vocab_size logits. */
    float *lm_head_w;
} Weights;
|
|
|
/*
 * Key/value cache for attention.
 *
 * forward() addresses each buffer as [layer][position][n_embd], i.e. the slot
 * for (layer, pos) starts at (layer * block_size + pos) * n_embd floats.
 */
typedef struct {
    float *k_cache;  /* cached key vectors */
    float *v_cache;  /* cached value vectors */
} KVCache;
|
|
|
| static Config cfg;
|
| static Weights W;
|
| static float* model_data_buffer = NULL;
|
|
|
|
|
|
|
|
|
|
|
/*
 * Layer normalisation of a single vector.
 *
 * Computes out[i] = (x[i] - mean) / sqrt(var + 1e-5) * w[i] + b[i], where mean
 * and var are the mean and the population variance of x[0..size).
 *
 * out  - destination, `size` floats
 * x    - input vector, `size` floats
 * w, b - per-element scale and shift, `size` floats each
 * size - number of elements; the sums are divided by it, so it must be non-zero
 */
static void layer_norm(float* out, const float* x, const float* w, const float* b, int size) {
    /* Pass 1: mean. */
    float total = 0.0f;
    for (int k = 0; k < size; k++) {
        total += x[k];
    }
    const float mean = total / size;

    /* Pass 2: sum of squared deviations -> population variance. */
    float sq_dev = 0.0f;
    for (int k = 0; k < size; k++) {
        const float centered = x[k] - mean;
        sq_dev += centered * centered;
    }
    const float variance = sq_dev / size;

    /* The epsilon keeps the reciprocal finite when the input is constant. */
    const float inv_std = 1.0f / sqrtf(variance + 1e-5f);

    /* Pass 3: normalise, then apply the learned scale and shift. */
    for (int k = 0; k < size; k++) {
        out[k] = (x[k] - mean) * inv_std * w[k] + b[k];
    }
}
|
|
|
|
|
| static void matmul_vec(float* out, const float* mat, const float* x, int M, int K) {
|
|
|
| #pragma omp parallel for
|
| for (int i = 0; i < M; i++) {
|
| const float* row = mat + (long long)i * K;
|
|
|
|
|
| __m256 sum_vec = _mm256_setzero_ps();
|
|
|
| int j = 0;
|
|
|
| for (; j <= K - 8; j += 8) {
|
|
|
| __m256 m_val = _mm256_loadu_ps(&row[j]);
|
| __m256 x_val = _mm256_loadu_ps(&x[j]);
|
|
|
|
|
| sum_vec = _mm256_fmadd_ps(m_val, x_val, sum_vec);
|
| }
|
|
|
|
|
| float sum_arr[8];
|
| _mm256_storeu_ps(sum_arr, sum_vec);
|
| float sum = sum_arr[0] + sum_arr[1] + sum_arr[2] + sum_arr[3] +
|
| sum_arr[4] + sum_arr[5] + sum_arr[6] + sum_arr[7];
|
|
|
|
|
| for (; j < K; j++) {
|
| sum += row[j] * x[j];
|
| }
|
|
|
| out[i] = sum;
|
| }
|
| }
|
|
|
/*
 * Adds a bias vector in place: x[i] += b[i] for i in [0, N).
 * Iterations are independent and run in parallel when OpenMP is enabled.
 */
static void add_bias(float* x, const float* b, int N) {
    #pragma omp parallel for
    for (int idx = 0; idx < N; idx++) {
        x[idx] = x[idx] + b[idx];
    }
}
|
|
|
/*
 * Accumulates a branch output into the running hidden state in place:
 * x[i] += y[i] for i in [0, N). Parallelised with OpenMP when enabled.
 */
static void residual_add(float* x, const float* y, int N) {
    #pragma omp parallel for
    for (int idx = 0; idx < N; idx++) {
        x[idx] = x[idx] + y[idx];
    }
}
|
|
|
/*
 * Applies the tanh approximation of GELU to x[0..N) in place:
 *   gelu(v) = 0.5 * v * (1 + tanh(sqrt(2/pi) * (v + 0.044715 * v^3)))
 * Elements are independent and are processed in parallel under OpenMP.
 */
static void gelu_inplace(float* x, int N) {
    const float sqrt_2_over_pi = 0.7978845608f;

    #pragma omp parallel for
    for (int idx = 0; idx < N; idx++) {
        const float v = x[idx];
        const float inner = sqrt_2_over_pi * (v + 0.044715f * v * v * v);
        const float gate = tanhf(inner);
        x[idx] = 0.5f * v * (1.0f + gate);
    }
}
|
|
|
/*
 * Converts x[0..N) to a probability distribution in place.
 *
 * The largest element is subtracted before exponentiation so expf never sees
 * a positive argument. x[0] is read unconditionally, so N must be at least 1.
 */
static void softmax_inplace(float* x, int N) {
    /* Find the peak value. */
    float peak = x[0];
    for (int k = 1; k < N; k++) {
        peak = (x[k] > peak) ? x[k] : peak;
    }

    /* Exponentiate relative to the peak and accumulate the normaliser. */
    float total = 0.0f;
    for (int k = 0; k < N; k++) {
        const float e = expf(x[k] - peak);
        x[k] = e;
        total += e;
    }

    /* Normalise so the entries sum to one. */
    for (int k = 0; k < N; k++) {
        x[k] = x[k] / total;
    }
}
|
|
|
|
|
|
|
|
|
|
|
| static void forward(
|
| int token_id,
|
| int pos,
|
| KVCache* kv,
|
| float* x,
|
| float* buf,
|
| float* qkv_buf,
|
| float* attn_buf,
|
| float* ff_buf,
|
| float* logits
|
| ) {
|
| const int C = cfg.n_embd;
|
| const int H = cfg.n_head;
|
| const int hs = C / H;
|
|
|
| float* content_row = W.wte + (long long)token_id * C;
|
| float* pos_row = W.wpe + (long long)pos * C;
|
|
|
| #pragma omp parallel for
|
| for (int i = 0; i < C; i++)
|
| x[i] = content_row[i] + pos_row[i];
|
|
|
| for (int l = 0; l < cfg.n_layer; l++) {
|
|
|
| layer_norm(buf, x, W.ln1_w[l], W.ln1_b[l], C);
|
|
|
| matmul_vec(qkv_buf, W.c_attn_w[l], buf, 3 * C, C);
|
| add_bias(qkv_buf, W.c_attn_b[l], 3 * C);
|
|
|
| float* q = qkv_buf;
|
| float* k = qkv_buf + C;
|
| float* v = qkv_buf + 2 * C;
|
|
|
| float* k_cache = kv->k_cache + (long long)l * cfg.block_size * C;
|
| float* v_cache = kv->v_cache + (long long)l * cfg.block_size * C;
|
|
|
| memcpy(k_cache + (long long)pos * C, k, C * sizeof(float));
|
| memcpy(v_cache + (long long)pos * C, v, C * sizeof(float));
|
|
|
| #pragma omp parallel for
|
| for (int h = 0; h < H; h++) {
|
|
|
| float* q_h = q + h * hs;
|
| float scale = 1.0f / sqrtf((float)hs);
|
|
|
|
|
| float* local_attn = attn_buf + h * cfg.block_size;
|
|
|
| for (int t = 0; t <= pos; t++) {
|
| float* k_h = k_cache + (long long)t * C + h * hs;
|
| float dot = 0.0f;
|
| for (int d = 0; d < hs; d++)
|
| dot += q_h[d] * k_h[d];
|
|
|
| local_attn[t] = dot * scale;
|
| }
|
|
|
| softmax_inplace(local_attn, pos + 1);
|
|
|
| float* out_h = buf + h * hs;
|
| memset(out_h, 0, hs * sizeof(float));
|
|
|
| for (int t = 0; t <= pos; t++) {
|
| float* v_h = v_cache + (long long)t * C + h * hs;
|
| float a = local_attn[t];
|
| for (int d = 0; d < hs; d++)
|
| out_h[d] += a * v_h[d];
|
| }
|
| }
|
|
|
| float* attn_out = qkv_buf;
|
| matmul_vec(attn_out, W.c_proj_w[l], buf, C, C);
|
| add_bias(attn_out, W.c_proj_b[l], C);
|
| residual_add(x, attn_out, C);
|
|
|
| layer_norm(buf, x, W.ln2_w[l], W.ln2_b[l], C);
|
|
|
| matmul_vec(ff_buf, W.fc_w[l], buf, 4 * C, C);
|
| add_bias(ff_buf, W.fc_b[l], 4 * C);
|
| gelu_inplace(ff_buf, 4 * C);
|
|
|
| matmul_vec(buf, W.mlp_proj_w[l], ff_buf, C, 4 * C);
|
| add_bias(buf, W.mlp_proj_b[l], C);
|
| residual_add(x, buf, C);
|
| }
|
|
|
| layer_norm(buf, x, W.ln_f_w, W.ln_f_b, C);
|
| matmul_vec(logits, W.lm_head_w, buf, cfg.vocab_size, C);
|
| }
|
|
|
|
|
|
|
|
|
|
|
| static void map_weights(float* data) {
|
|
|
| float* ptr = data;
|
| const int C = cfg.n_embd;
|
| const int L = cfg.n_layer;
|
|
|
| W.wte = ptr; ptr += (long long)cfg.vocab_size * C;
|
| W.wpe = ptr; ptr += (long long)cfg.block_size * C;
|
|
|
| W.ln1_w = (float**)malloc(L * sizeof(float*));
|
| W.ln1_b = (float**)malloc(L * sizeof(float*));
|
| W.c_attn_w = (float**)malloc(L * sizeof(float*));
|
| W.c_attn_b = (float**)malloc(L * sizeof(float*));
|
| W.c_proj_w = (float**)malloc(L * sizeof(float*));
|
| W.c_proj_b = (float**)malloc(L * sizeof(float*));
|
| W.ln2_w = (float**)malloc(L * sizeof(float*));
|
| W.ln2_b = (float**)malloc(L * sizeof(float*));
|
| W.fc_w = (float**)malloc(L * sizeof(float*));
|
| W.fc_b = (float**)malloc(L * sizeof(float*));
|
| W.mlp_proj_w = (float**)malloc(L * sizeof(float*));
|
| W.mlp_proj_b = (float**)malloc(L * sizeof(float*));
|
|
|
| for (int l = 0; l < L; l++) {
|
| W.ln1_w[l] = ptr; ptr += C;
|
| W.ln1_b[l] = ptr; ptr += C;
|
|
|
| W.c_attn_w[l] = ptr; ptr += 3LL * C * C;
|
| W.c_attn_b[l] = ptr; ptr += 3LL * C;
|
|
|
| W.c_proj_w[l] = ptr; ptr += 1LL * C * C;
|
| W.c_proj_b[l] = ptr; ptr += C;
|
|
|
| W.ln2_w[l] = ptr; ptr += C;
|
| W.ln2_b[l] = ptr; ptr += C;
|
|
|
| W.fc_w[l] = ptr; ptr += 4LL * C * C;
|
| W.fc_b[l] = ptr; ptr += 4LL * C;
|
|
|
| W.mlp_proj_w[l] = ptr; ptr += 1LL * C * 4 * C;
|
| W.mlp_proj_b[l] = ptr; ptr += C;
|
| }
|
|
|
| W.ln_f_w = ptr; ptr += C;
|
| W.ln_f_b = ptr; ptr += C;
|
|
|
| W.lm_head_w = ptr;
|
| }
|
|
|
|
|
|
|
|
|
|
|
| int main(int argc, char* argv[]) {
|
|
|
| if (argc < 3) {
|
| printf("ERROR_ARGS");
|
| return 1;
|
| }
|
|
|
| FILE* f = fopen("model.bin", "rb");
|
| if (!f) {
|
| printf("ERROR_MODEL_NOT_FOUND");
|
| return 1;
|
| }
|
|
|
| fread(&cfg, sizeof(int), 5, f);
|
| fseek(f, 0, SEEK_END);
|
| long file_size = ftell(f);
|
| fseek(f, 5 * sizeof(int), SEEK_SET);
|
|
|
| model_data_buffer = (float*)malloc(file_size - 5 * sizeof(int));
|
| fread(model_data_buffer, 1, file_size - 5 * sizeof(int), f);
|
| fclose(f);
|
|
|
| map_weights(model_data_buffer);
|
|
|
| std::vector<int> input_ids;
|
| char* token = strtok(argv[1], ",");
|
| while (token) {
|
| input_ids.push_back(atoi(token));
|
| token = strtok(NULL, ",");
|
| }
|
|
|
| if (input_ids.size() >= (size_t)cfg.block_size)
|
| input_ids.resize(cfg.block_size - 1);
|
|
|
| int max_new_tokens = atoi(argv[2]);
|
|
|
| float temperature = (argc > 3) ? atof(argv[3]) : 0.8f;
|
| int top_k = (argc > 4) ? atoi(argv[4]) : 40;
|
| if (temperature < 0.01f) temperature = 0.01f;
|
| if (top_k < 1) top_k = 1;
|
| if (top_k > cfg.vocab_size) top_k = cfg.vocab_size;
|
|
|
| srand((unsigned int)time(NULL));
|
|
|
| const int C = cfg.n_embd;
|
|
|
| KVCache kv;
|
| kv.k_cache = (float*)calloc((long long)cfg.n_layer * cfg.block_size * C, sizeof(float));
|
| kv.v_cache = (float*)calloc((long long)cfg.n_layer * cfg.block_size * C, sizeof(float));
|
|
|
| float* x = (float*)malloc(C * sizeof(float));
|
| float* buf = (float*)malloc(C * sizeof(float));
|
| float* qkv_buf = (float*)malloc(3 * C * sizeof(float));
|
|
|
|
|
| float* attn_buf = (float*)malloc(cfg.n_head * cfg.block_size * sizeof(float));
|
|
|
| float* ff_buf = (float*)malloc(4 * C * sizeof(float));
|
| float* logits = (float*)malloc(cfg.vocab_size * sizeof(float));
|
|
|
| for (int i = 0; i < (int)input_ids.size(); i++)
|
| forward(input_ids[i], i, &kv, x, buf, qkv_buf, attn_buf, ff_buf, logits);
|
|
|
| int pos = input_ids.size();
|
|
|
| for (int i = 0; i < max_new_tokens; i++) {
|
|
|
| if (pos >= cfg.block_size)
|
| break;
|
|
|
| for (int v = 0; v < cfg.vocab_size; v++)
|
| logits[v] /= temperature;
|
|
|
| std::vector<std::pair<float, int>> pairs(cfg.vocab_size);
|
| for (int v = 0; v < cfg.vocab_size; v++)
|
| pairs[v] = {logits[v], v};
|
|
|
| std::partial_sort(pairs.begin(), pairs.begin() + top_k, pairs.end(),
|
| [](const std::pair<float,int>& a, const std::pair<float,int>& b) {
|
| return a.first > b.first;
|
| });
|
|
|
| float sum = 0.0f;
|
| for (int j = 0; j < top_k; j++) {
|
| pairs[j].first = expf(pairs[j].first);
|
| sum += pairs[j].first;
|
| }
|
| for (int j = 0; j < top_k; j++)
|
| pairs[j].first /= sum;
|
|
|
| float r = (float)rand() / ((float)RAND_MAX + 1.0f);
|
| float cum = 0.0f;
|
| int best = pairs[0].second;
|
| for (int j = 0; j < top_k; j++) {
|
| cum += pairs[j].first;
|
| if (r < cum) {
|
| best = pairs[j].second;
|
| break;
|
| }
|
| }
|
|
|
| printf("%d ", best);
|
|
|
| if (best == 50256)
|
| break;
|
|
|
| forward(best, pos, &kv, x, buf, qkv_buf, attn_buf, ff_buf, logits);
|
| pos++;
|
| }
|
|
|
| free(model_data_buffer);
|
| return 0;
|
| } |