| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #include <cstdio> |
| #include <cstdlib> |
| #include <cmath> |
| #include <cstring> |
| #include <ctime> |
| #include <cassert> |
| #include <algorithm> |
| #include <string> |
| #include <vector> |
| #include <iostream> |
| #include <unordered_map> |
| #include <unordered_set> |
| #include <immintrin.h> |
| #ifdef _OPENMP |
| #include <omp.h> |
| #endif |
#ifdef _WIN32
#include <windows.h>
// Wall-clock time in milliseconds via QueryPerformanceCounter (high-resolution).
static double get_ms(){LARGE_INTEGER f,c;QueryPerformanceFrequency(&f);QueryPerformanceCounter(&c);return(double)c.QuadPart/f.QuadPart*1000.0;}
#else
#include <sys/time.h>
// Wall-clock time in milliseconds via gettimeofday (microsecond resolution).
static double get_ms(){struct timeval tv;gettimeofday(&tv,NULL);return tv.tv_sec*1000.0+tv.tv_usec/1000.0;}
#endif
|
|
| |
| |
| |
// Model hyperparameters, parsed from the model_llama.bin header in main().
struct Config {
    int n_layer, n_head, n_kv_head, n_embd, n_intermediate, vocab_size, max_seq_len;
    float rope_theta;  // RoPE frequency base (e.g. 10000 or 500000)
};
|
|
| |
| |
| |
// Pointers into the single weight slab (g_data); layout set by map_weights().
// The float** members are malloc'd pointer tables of length n_layer.
// Shapes below follow the strides used in map_weights().
struct Weights {
    float* embed_tokens;   // [vocab_size, n_embd] token embedding table

    float** rms_att;       // per layer: [n_embd] pre-attention RMSNorm weight
    float** q_proj;        // per layer: [n_embd, n_embd]
    float** k_proj;        // per layer: [kv_dim, n_embd]
    float** v_proj;        // per layer: [kv_dim, n_embd]
    float** o_proj;        // per layer: [n_embd, n_embd]
    float** rms_ffn;       // per layer: [n_embd] pre-FFN RMSNorm weight
    float** gate_proj;     // per layer: [n_intermediate, n_embd]
    float** up_proj;       // per layer: [n_intermediate, n_embd]
    float** down_proj;     // per layer: [n_embd, n_intermediate]

    float* rms_final;      // [n_embd] final RMSNorm weight
    float* lm_head;        // [vocab_size, n_embd] output projection
};
|
|
// ---- global model state ----
static Config cfg;                // hyperparameters (filled from file header in main)
static Weights W;                 // weight pointers into g_data
static float* g_data = nullptr;   // one malloc'd slab holding every weight tensor

// Per-conversation state: a KV cache plus the next position to fill.
struct Session {
    float* k_cache = nullptr;   // layout: [n_layer][max_seq_len][kv_dim] (see forward())
    float* v_cache = nullptr;   // same layout as k_cache
    int pos = 0;                // number of cached positions so far
    double last_use = 0.0;      // get_ms() timestamp, used for LRU eviction
};
static const int MAX_SESSIONS = 8;   // capacity before evict_oldest() kicks in
static std::unordered_map<std::string, Session> g_sessions;

// Scratch buffers reused by every forward() call; allocated once in main().
// g_x: residual stream, g_xb: normed copy, g_q/g_k/g_v: projections,
// g_attn: per-head attention scores, g_ff_*: FFN intermediates, g_logits: output.
static float *g_x, *g_xb, *g_q, *g_k, *g_v, *g_attn, *g_ff_gate, *g_ff_up, *g_logits;
|
|
| |
| |
| |
|
|
| |
// RMSNorm: out = x / rms(x) * w, with a 1e-5 epsilon inside the sqrt.
static void rmsnorm(float* out, const float* x, const float* w, int N) {
    // Mean of squares, then one reciprocal square root shared by all elements.
    float sumsq = 0.0f;
    for (int i = 0; i < N; ++i) sumsq += x[i] * x[i];
    const float inv_rms = 1.0f / sqrtf(sumsq / N + 1e-5f);
    for (int i = 0; i < N; ++i) out[i] = x[i] * inv_rms * w[i];
}
|
|
| |
// out[M] = mat[M x K] * x[K], with mat stored row-major.
// Rows are independent dot products, split across OpenMP threads; the inner
// loop is an 8-wide AVX FMA accumulation with a scalar tail for K % 8.
// NOTE(review): requires AVX2+FMA at compile time (-mavx2 -mfma); there is no
// runtime CPU-feature check.
static void matmul(float* out, const float* mat, const float* x, int M, int K) {
#pragma omp parallel for schedule(static)
    for (int i = 0; i < M; i++) {
        const float* row = mat + (long long)i * K;
        __m256 acc = _mm256_setzero_ps();
        int j = 0;
        for (; j <= K - 8; j += 8)
            acc = _mm256_fmadd_ps(_mm256_loadu_ps(row + j),
                                  _mm256_loadu_ps(x + j), acc);
        // Horizontal reduction of the 8 vector lanes.
        float tmp[8]; _mm256_storeu_ps(tmp, acc);
        float s = tmp[0]+tmp[1]+tmp[2]+tmp[3]+tmp[4]+tmp[5]+tmp[6]+tmp[7];
        for (; j < K; j++) s += row[j] * x[j];
        out[i] = s;
    }
}
|
|
| |
| |
// SwiGLU activation: out[i] = silu(gate[i]) * up[i], silu(g) = g * sigmoid(g).
static void swiglu(float* out, const float* gate, const float* up, int N) {
#pragma omp parallel for
    for (int idx = 0; idx < N; ++idx) {
        const float g = gate[idx];
        const float silu = g / (1.0f + expf(-g));
        out[idx] = silu * up[idx];
    }
}
|
|
| |
// In-place softmax over x[0..n-1]; subtracts the max first for stability.
static void softmax(float* x, int n) {
    float peak = x[0];
    for (int i = 1; i < n; ++i) {
        if (x[i] > peak) peak = x[i];
    }
    float total = 0.0f;
    for (int i = 0; i < n; ++i) {
        x[i] = expf(x[i] - peak);
        total += x[i];
    }
    for (int i = 0; i < n; ++i) x[i] /= total;
}
|
|
| |
| |
| |
| |
// Rotary position embedding: rotate consecutive (even, odd) pairs of `x` by
// pos * theta^(-i/dim), i.e. treat each pair as a complex number and multiply
// by e^(i*angle).
static void rope(float* x, int pos, int dim, float theta) {
    for (int i = 0; i + 1 < dim; i += 2) {
        const float inv_freq = 1.0f / powf(theta, (float)i / dim);
        const float ang = pos * inv_freq;
        const float ca = cosf(ang);
        const float sa = sinf(ang);
        const float re = x[i];
        const float im = x[i + 1];
        x[i]     = re * ca - im * sa;
        x[i + 1] = re * sa + im * ca;
    }
}
|
|
| |
| |
| |
| static void forward(int token_id, int pos, float* k_cache, float* v_cache) { |
| const int C = cfg.n_embd; |
| const int H = cfg.n_head; |
| const int KVH = cfg.n_kv_head; |
| const int hs = C / H; |
| const int kv_dim = KVH * hs; |
| const int GRP = H / KVH; |
|
|
| |
| memcpy(g_x, W.embed_tokens + (long long)token_id * C, C * sizeof(float)); |
|
|
| for (int l = 0; l < cfg.n_layer; l++) { |
|
|
| |
|
|
| |
| rmsnorm(g_xb, g_x, W.rms_att[l], C); |
|
|
| |
| matmul(g_q, W.q_proj[l], g_xb, C, C); |
| matmul(g_k, W.k_proj[l], g_xb, kv_dim, C); |
| matmul(g_v, W.v_proj[l], g_xb, kv_dim, C); |
|
|
| |
| for (int h = 0; h < H; h++) rope(g_q + h*hs, pos, hs, cfg.rope_theta); |
| for (int h = 0; h < KVH; h++) rope(g_k + h*hs, pos, hs, cfg.rope_theta); |
|
|
| |
| float* kc = k_cache + (long long)l * cfg.max_seq_len * kv_dim; |
| float* vc = v_cache + (long long)l * cfg.max_seq_len * kv_dim; |
| memcpy(kc + (long long)pos * kv_dim, g_k, kv_dim * sizeof(float)); |
| memcpy(vc + (long long)pos * kv_dim, g_v, kv_dim * sizeof(float)); |
|
|
| |
| #pragma omp parallel for schedule(static) |
| for (int h = 0; h < H; h++) { |
| int kv_h = h / GRP; |
| float scale = 1.0f / sqrtf((float)hs); |
| float* qh = g_q + h * hs; |
|
|
| |
| for (int t = 0; t <= pos; t++) { |
| float* kh = kc + (long long)t * kv_dim + kv_h * hs; |
| float dot = 0.0f; |
| for (int d = 0; d < hs; d++) dot += qh[d] * kh[d]; |
| g_attn[h * cfg.max_seq_len + t] = dot * scale; |
| } |
| softmax(g_attn + h * cfg.max_seq_len, pos + 1); |
|
|
| |
| float* out_h = g_xb + h * hs; |
| memset(out_h, 0, hs * sizeof(float)); |
| for (int t = 0; t <= pos; t++) { |
| float* vh = vc + (long long)t * kv_dim + kv_h * hs; |
| float a = g_attn[h * cfg.max_seq_len + t]; |
| for (int d = 0; d < hs; d++) out_h[d] += a * vh[d]; |
| } |
| } |
|
|
| |
| float tmp_o[C]; |
| matmul(tmp_o, W.o_proj[l], g_xb, C, C); |
| #pragma omp parallel for |
| for (int i = 0; i < C; i++) g_x[i] += tmp_o[i]; |
|
|
| |
|
|
| rmsnorm(g_xb, g_x, W.rms_ffn[l], C); |
|
|
| |
| matmul(g_ff_gate, W.gate_proj[l], g_xb, cfg.n_intermediate, C); |
| matmul(g_ff_up, W.up_proj[l], g_xb, cfg.n_intermediate, C); |
|
|
| |
| swiglu(g_ff_gate, g_ff_gate, g_ff_up, cfg.n_intermediate); |
|
|
| |
| float tmp_d[C]; |
| matmul(tmp_d, W.down_proj[l], g_ff_gate, C, cfg.n_intermediate); |
| #pragma omp parallel for |
| for (int i = 0; i < C; i++) g_x[i] += tmp_d[i]; |
| } |
|
|
| |
| rmsnorm(g_xb, g_x, W.rms_final, C); |
| matmul(g_logits, W.lm_head, g_xb, cfg.vocab_size, C); |
| } |
|
|
| |
| |
| |
// Carve the contiguous weight slab `data` into the named tensors in W.
// Order must match the exporter exactly: embed_tokens, then per layer
// (rms_att, q, k, v, o, rms_ffn, gate, up, down), then rms_final, lm_head.
// `p` walks forward through the slab; strides define the tensor shapes.
static void map_weights(float* data) {
    const int C = cfg.n_embd;
    const int L = cfg.n_layer;
    const int KVH = cfg.n_kv_head;
    const int hs = C / cfg.n_head;     // per-head dimension
    const int kv_dim = KVH * hs;       // K/V projection rows
    const int F = cfg.n_intermediate;

    float* p = data;

    W.embed_tokens = p; p += (long long)cfg.vocab_size * C;

    // Allocate the per-layer pointer tables (never freed; live for the process).
#define MK(f) W.f = (float**)malloc(L * sizeof(float*))
    MK(rms_att); MK(q_proj); MK(k_proj); MK(v_proj); MK(o_proj);
    MK(rms_ffn); MK(gate_proj); MK(up_proj); MK(down_proj);
#undef MK

    for (int l = 0; l < L; l++) {
        W.rms_att[l] = p; p += C;
        W.q_proj[l] = p; p += (long long)C * C;
        W.k_proj[l] = p; p += (long long)kv_dim * C;
        W.v_proj[l] = p; p += (long long)kv_dim * C;
        W.o_proj[l] = p; p += (long long)C * C;
        W.rms_ffn[l] = p; p += C;
        W.gate_proj[l] = p; p += (long long)F * C;
        W.up_proj[l] = p; p += (long long)F * C;
        W.down_proj[l] = p; p += (long long)C * F;
    }

    W.rms_final = p; p += C;
    W.lm_head = p;
}
|
|
| |
| |
| |
| static long long kv_bytes() { |
| int kv_dim = cfg.n_kv_head * (cfg.n_embd / cfg.n_head); |
| return (long long)cfg.n_layer * cfg.max_seq_len * kv_dim * sizeof(float); |
| } |
|
|
| static void free_sess(Session& s) { |
| free(s.k_cache); free(s.v_cache); |
| s.k_cache = nullptr; s.v_cache = nullptr; s.pos = 0; |
| } |
|
|
| static void evict_oldest() { |
| if (g_sessions.empty()) return; |
| std::string oid; double ot = 1e300; |
| for (auto& kv : g_sessions) |
| if (kv.second.last_use < ot) { ot = kv.second.last_use; oid = kv.first; } |
| free_sess(g_sessions[oid]); |
| g_sessions.erase(oid); |
| } |
|
|
| static Session& get_or_create(const std::string& id) { |
| auto it = g_sessions.find(id); |
| if (it != g_sessions.end()) { it->second.last_use = get_ms(); return it->second; } |
| if ((int)g_sessions.size() >= MAX_SESSIONS) evict_oldest(); |
| Session s; |
| long long nb = kv_bytes(); |
| s.k_cache = (float*)calloc(nb, 1); |
| s.v_cache = (float*)calloc(nb, 1); |
| s.pos = 0; |
| s.last_use = get_ms(); |
| g_sessions[id] = s; |
| return g_sessions[id]; |
| } |
|
|
| |
| |
| |
| static int sample_topk(float temperature, int top_k) { |
| for (int v = 0; v < cfg.vocab_size; v++) g_logits[v] /= temperature; |
| int K = std::min(top_k, cfg.vocab_size); |
| std::vector<std::pair<float,int>> pairs(cfg.vocab_size); |
| for (int v = 0; v < cfg.vocab_size; v++) pairs[v] = {g_logits[v], v}; |
| std::partial_sort(pairs.begin(), pairs.begin() + K, pairs.end(), |
| [](const auto& a, const auto& b){ return a.first > b.first; }); |
| float sum = 0.0f; |
| for (int j = 0; j < K; j++) { pairs[j].first = expf(pairs[j].first); sum += pairs[j].first; } |
| for (int j = 0; j < K; j++) pairs[j].first /= sum; |
| float r = (float)rand() / ((float)RAND_MAX + 1.0f), cum = 0.0f, best = pairs[0].second; |
| for (int j = 0; j < K; j++) { cum += pairs[j].first; if (r < cum) { best = pairs[j].second; break; } } |
| return (int)best; |
| } |
|
|
| |
| |
| |
// Split `s` on delimiter `d`. Always returns at least one element; empty
// fields (leading, trailing, or doubled delimiters) are preserved as "".
static std::vector<std::string> split(const std::string& s, char d) {
    std::vector<std::string> parts;
    size_t start = 0;
    for (;;) {
        size_t hit = s.find(d, start);
        if (hit == std::string::npos) {
            parts.push_back(s.substr(start));
            return parts;
        }
        parts.push_back(s.substr(start, hit - start));
        start = hit + 1;
    }
}
| static std::vector<int> parse_ints(const std::string& s) { |
| std::vector<int> out; |
| for (auto& t : split(s, ',')) if (!t.empty()) out.push_back(atoi(t.c_str())); |
| return out; |
| } |
|
|
| |
| |
| |
// Handle one "REQUEST|<session>|<prompt_ids>|<max_new>|<temp>|<top_k>|<stop_ids>"
// line: prefill the prompt tokens into the session's KV cache, then sample up
// to max_new tokens, streaming each as "TOKEN <id> <ms>" and finishing with
// "DONE <count> <ms>". Errors are reported as "ERROR <reason>".
static void handle_request(const std::string& line) {
    auto parts = split(line, '|');
    if (parts.size() < 7) { printf("ERROR bad_format\n"); fflush(stdout); return; }

    std::string sess_id = parts[1];
    auto new_toks = parse_ints(parts[2]);   // comma-separated prompt token ids
    int max_new = atoi(parts[3].c_str());
    float temp = (float)atof(parts[4].c_str());
    int top_k = atoi(parts[5].c_str());
    auto stop_lst = parse_ints(parts[6]);   // caller-supplied stop token ids

    // Clamp user-supplied sampling parameters to safe ranges.
    temp = std::max(temp, 0.01f);
    top_k = std::clamp(top_k, 1, cfg.vocab_size);
    max_new = std::max(max_new, 1);

    std::unordered_set<int> stop_ids(stop_lst.begin(), stop_lst.end());
    stop_ids.insert(128009);   // NOTE(review): hard-coded ids; look like Llama-3 <|eot_id|>
    stop_ids.insert(128001);   // and <|end_of_text|> — confirm for other vocabularies
    
    Session& sess = get_or_create(sess_id);

    // Prefill: run each prompt token through the model, growing the KV cache.
    // Bails out mid-prompt if the context fills (already-processed tokens stay cached).
    for (int tok : new_toks) {
        if (sess.pos >= cfg.max_seq_len) {
            printf("ERROR context_full\n"); fflush(stdout); return;
        }
        forward(tok, sess.pos, sess.k_cache, sess.v_cache);
        sess.pos++;
    }

    // Decode loop: sample from the logits of the last forward() call, emit
    // the token, then feed it back in. A stop token is emitted before breaking.
    double t0 = get_ms();
    int gen = 0;
    for (int i = 0; i < max_new; i++) {
        if (sess.pos >= cfg.max_seq_len) break;
        int next = sample_topk(temp, top_k);
        printf("TOKEN %d %.2f\n", next, get_ms() - t0);
        fflush(stdout);
        gen++;
        if (stop_ids.count(next)) break;
        forward(next, sess.pos, sess.k_cache, sess.v_cache);
        sess.pos++;
    }

    printf("DONE %d %.2f\n", gen, get_ms() - t0);
    fflush(stdout);
}
|
|
| static void handle_reset(const std::string& line) { |
| auto parts = split(line, '|'); |
| if (parts.size() >= 2) { |
| auto it = g_sessions.find(parts[1]); |
| if (it != g_sessions.end()) { free_sess(it->second); g_sessions.erase(it); } |
| } |
| printf("RESET_OK\n"); fflush(stdout); |
| } |
|
|
| |
| |
| |
| int main() { |
| FILE* f = fopen("model_llama.bin", "rb"); |
| if (!f) { printf("ERROR model_llama.bin not found\n"); fflush(stdout); return 1; } |
|
|
| |
| int hdr[7]; float theta; |
| fread(hdr, sizeof(int), 7, f); |
| fread(&theta, sizeof(float), 1, f); |
| cfg = {hdr[0], hdr[1], hdr[2], hdr[3], hdr[4], hdr[5], hdr[6], theta}; |
|
|
| printf("[engine] Layers=%d Heads=%d KVHeads=%d Embd=%d Inter=%d Vocab=%d Seq=%d Theta=%.0f\n", |
| cfg.n_layer, cfg.n_head, cfg.n_kv_head, cfg.n_embd, |
| cfg.n_intermediate, cfg.vocab_size, cfg.max_seq_len, cfg.rope_theta); |
| fflush(stdout); |
|
|
| |
| fseek(f, 0, SEEK_END); long fsize = ftell(f); |
| long woff = 7 * sizeof(int) + sizeof(float); |
| fseek(f, woff, SEEK_SET); |
| long wbytes = fsize - woff; |
|
|
| g_data = (float*)malloc(wbytes); |
| if (!g_data) { printf("ERROR oom\n"); fflush(stdout); return 1; } |
| fread(g_data, 1, wbytes, f); |
| fclose(f); |
| map_weights(g_data); |
|
|
| |
| const int C = cfg.n_embd; |
| const int F = cfg.n_intermediate; |
| const int S = cfg.max_seq_len; |
| const int H = cfg.n_head; |
| g_x = (float*)malloc(C * sizeof(float)); |
| g_xb = (float*)malloc(C * sizeof(float)); |
| g_q = (float*)malloc(C * sizeof(float)); |
| g_k = (float*)malloc(cfg.n_kv_head * (C/H) * sizeof(float)); |
| g_v = (float*)malloc(cfg.n_kv_head * (C/H) * sizeof(float)); |
| g_attn = (float*)malloc((long long)H * S * sizeof(float)); |
| g_ff_gate = (float*)malloc(F * sizeof(float)); |
| g_ff_up = (float*)malloc(F * sizeof(float)); |
| g_logits = (float*)malloc((long long)cfg.vocab_size * sizeof(float)); |
|
|
| srand((unsigned)time(NULL)); |
| printf("READY\n"); fflush(stdout); |
|
|
| std::string line; |
| while (std::getline(std::cin, line)) { |
| if (!line.empty() && line.back() == '\r') line.pop_back(); |
| if (line.empty()) continue; |
| if (line == "QUIT") break; |
| else if (line.rfind("RESET|", 0) == 0) handle_reset(line); |
| else if (line.rfind("REQUEST|", 0) == 0) handle_request(line); |
| else { printf("ERROR unknown_cmd\n"); fflush(stdout); } |
| } |
|
|
| for (auto& kv : g_sessions) free_sess(kv.second); |
| free(g_data); |
| free(g_x); free(g_xb); free(g_q); free(g_k); free(g_v); |
| free(g_attn); free(g_ff_gate); free(g_ff_up); free(g_logits); |
| return 0; |
| } |
|
|