Spaces:

NOT-OMEGA
/

KVInfer

Sleeping

App Files Files Community

NOT-OMEGA committed on Mar 21

Commit

c11b4f4

verified ·

1 Parent(s): ba21f66

Update inference.cpp

Browse files

Files changed (1) hide show

inference.cpp +41 -73

inference.cpp CHANGED Viewed

@@ -3,44 +3,27 @@
  * KVInfer — PERSISTENT DAEMON INFERENCE ENGINE  v2.0
  * ============================================================
  *
- * FIX #1  Persistent process: model loads ONCE at startup.
- *         Handles unlimited requests over stdin/stdout pipe.
- *         No more subprocess-per-request overhead.
- *
- * FIX #3  Session KV-cache reuse: each session_id keeps its
- *         own KV cache + position. New chat turns only run
- *         forward() on NEW tokens — full history stays cached.
- *         Massive TTFT reduction on multi-turn conversations.
- *
- * FIX #4  Stop-token list: caller passes extra stop IDs (e.g.
- *         the encoded <|user|> token) so the model cannot bleed
- *         into the next speaker's turn.
- *
  * ── STDIN PROTOCOL ──────────────────────────────────────────
  *   REQUEST|<sess>|<new_tokens_csv>|<max_new>|<temp>|<top_k>|<stop_csv>
  *   RESET|<sess>
  *   QUIT
  *
  * ── STDOUT PROTOCOL ─────────────────────────────────────────
- *   READY                           (once, after model loads)
- *   TOKEN <id> <elapsed_ms>         (one per generated token)
- *   DONE <count> <total_ms>         (end of one request)
- *   RESET_OK                        (ack for RESET)
  *   ERROR <message>
  *
- * ── COMPILE (MSVC, Developer Prompt) ────────────────────────
- *   cl /O2 /openmp /arch:AVX2 /fp:fast /std:c++17 /EHsc /Fe:inference.exe inference.cpp
- *
- * ── COMPILE (GCC / MinGW) ───────────────────────────────────
- *   g++ -O3 -march=native -fopenmp -ffast-math -std=c++17 -o inference.exe inference.cpp
  * ============================================================
  */
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
 #include <string.h>
-#include <iostream>
 #include <time.h>
 #include <algorithm>
 #include <string>
@@ -48,11 +31,9 @@
 #include <unordered_set>
 #include <vector>
 #include <immintrin.h>   // AVX2 + FMA
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 #ifdef _WIN32
   #include <windows.h>
   static double get_time_ms() {
@@ -72,9 +53,7 @@
 // ─────────────────────────────────────────────────────────────────────────
 // Model Structures
 // ─────────────────────────────────────────────────────────────────────────
 // Model hyperparameters. As used in this file: n_layer bounds the per-layer
 // loops, n_head and n_embd give the head size (n_embd / n_head), block_size is
 // the number of position-embedding rows and the per-session context limit, and
 // vocab_size is the number of token-embedding rows and of logits.
 typedef struct { int n_layer, n_head, n_embd, block_size, vocab_size; } Config;
 typedef struct {
     float *wte, *wpe;
     float **ln1_w, **ln1_b;
@@ -90,7 +69,7 @@ typedef struct {
 // Per-session state; instances live in g_sessions, keyed by session id string.
 struct SessionState {
     // Key / value cache buffers passed to forward(); both default to nullptr.
     float*  k_cache   = nullptr;
     float*  v_cache   = nullptr;
-    int     pos       = 0;       // tokens already in KV cache
     // NOTE(review): presumably a get_time_ms() timestamp used to pick the
     // least-recently-used session for eviction — confirm in get_or_create.
     double  last_used = 0.0;
 };
@@ -98,17 +77,20 @@ static Config  cfg;
 static Weights W;
 static float*  g_model_data = nullptr;
-// LRU session store
-static const int MAX_SESSIONS = 4;
 static std::unordered_map<std::string, SessionState> g_sessions;
 // Shared per-request working buffers
 static float *g_x, *g_buf, *g_qkv, *g_attn, *g_ff, *g_logits;
 // ─────────────────────────────────────────────────────────────────────────
-// Math Kernels   (AVX2 + FMA + OpenMP)
 // ─────────────────────────────────────────────────────────────────────────
 static void layer_norm(float* out, const float* x, const float* w,
                        const float* b, int N) {
     float mean = 0.f, var = 0.f;
@@ -165,20 +147,16 @@ static void softmax_inplace(float* x, int N) {
 }
 // ─────────────────────────────────────────────────────────────────────────
-// Transformer Forward   (single token at position `pos`)
-// Writes next-token log-probs into g_logits.
 // ─────────────────────────────────────────────────────────────────────────
 static void forward(int token_id, int pos, float* k_cache, float* v_cache) {
     const int C = cfg.n_embd, H = cfg.n_head, hs = C/H;
     float* te = W.wte + (long long)token_id*C;
     float* pe = W.wpe + (long long)pos*C;
 #pragma omp parallel for
     for (int i = 0; i < C; i++) g_x[i] = te[i]+pe[i];
     for (int l = 0; l < cfg.n_layer; l++) {
         // Self-attention
         layer_norm(g_buf, g_x, W.ln1_w[l], W.ln1_b[l], C);
         matmul_vec(g_qkv, W.c_attn_w[l], g_buf, 3*C, C);
@@ -233,34 +211,30 @@ static void forward(int token_id, int pos, float* k_cache, float* v_cache) {
 // ─────────────────────────────────────────────────────────────────────────
 // Weight Mapping
 // ─────────────────────────────────────────────────────────────────────────
 // Carves the flat float buffer `data` into the tensor pointers stored in W.
 // Pointers are handed out in a fixed order and `p` is advanced by each
 // tensor's element count, so this order must match the layout of `data`.
 // NOTE(review): presumably that layout is whatever the model.bin exporter
 // writes — confirm against the export script.
 static void map_weights(float* data) {
     float* p = data;
     const int C = cfg.n_embd, L = cfg.n_layer;
     // Embedding tables: vocab_size x C (indexed by token id in forward()),
     // then block_size x C (indexed by position in forward()).
     W.wte=p; p+=(long long)cfg.vocab_size*C;
     W.wpe=p; p+=(long long)cfg.block_size*C;
     // Allocate one pointer slot per layer for every per-layer tensor.
     // NOTE: the malloc results are not checked for NULL.
     #define ARR(f) W.f=(float**)malloc(L*sizeof(float*))
     ARR(ln1_w); ARR(ln1_b); ARR(c_attn_w); ARR(c_attn_b);
     ARR(c_proj_w); ARR(c_proj_b); ARR(ln2_w); ARR(ln2_b);
     ARR(fc_w); ARR(fc_b); ARR(mlp_proj_w); ARR(mlp_proj_b);
     #undef ARR
     // Per-layer tensors, in order: ln1 (C, C), c_attn (3*C*C, 3*C),
     // c_proj (C*C, C), ln2 (C, C), fc (4*C*C, 4*C), mlp_proj (C*4*C, C).
     // The LL literals force 64-bit arithmetic for the element counts.
     for (int l = 0; l < L; l++) {
-        W.ln1_w[l]=p; p+=C;     W.ln1_b[l]=p; p+=C;
         W.c_attn_w[l]=p; p+=3LL*C*C; W.c_attn_b[l]=p; p+=3LL*C;
         W.c_proj_w[l]=p; p+=1LL*C*C; W.c_proj_b[l]=p; p+=C;
-        W.ln2_w[l]=p; p+=C;     W.ln2_b[l]=p; p+=C;
-        W.fc_w[l]=p; p+=4LL*C*C; W.fc_b[l]=p; p+=4LL*C;
         W.mlp_proj_w[l]=p; p+=1LL*C*4*C; W.mlp_proj_b[l]=p; p+=C;
     }
     // Final layer norm (C, C); lm_head_w takes whatever follows in the buffer.
     W.ln_f_w=p; p+=C; W.ln_f_b=p; p+=C; W.lm_head_w=p;
 }
 // ─────────────────────────────────────────────────────────────────────────
-// Session Management   (LRU, max MAX_SESSIONS)
 // ─────────────────────────────────────────────────────────────────────────
 // Returns n_layer * block_size * n_embd * sizeof(float) as a byte count.
 // The first factor is cast to long long so the product is not computed in int.
 // NOTE(review): presumably the size of one session's K (or V) cache buffer —
 // confirm at the allocation site.
 static long long kv_alloc_bytes() {
     return (long long)cfg.n_layer * cfg.block_size * cfg.n_embd * sizeof(float);
 }
@@ -297,16 +271,16 @@ static SessionState& get_or_create(const std::string& id) {
 }
 // ─────────────────────────────────────────────────────────────────────────
-// Sampler
 // ─────────────────────────────────────────────────────────────────────────
 static int sample_topk(float temperature, int top_k) {
     for (int v = 0; v < cfg.vocab_size; v++) g_logits[v] /= temperature;
     std::vector<std::pair<float,int>> pairs(cfg.vocab_size);
     for (int v = 0; v < cfg.vocab_size; v++) pairs[v]={g_logits[v],v};
     std::partial_sort(pairs.begin(), pairs.begin()+top_k, pairs.end(),
-        [](const std::pair<float,int>& a,const std::pair<float,int>& b){
-            return a.first>b.first;});
     float sum=0.f;
     for (int j=0; j<top_k; j++) { pairs[j].first=expf(pairs[j].first); sum+=pairs[j].first; }
     for (int j=0; j<top_k; j++) pairs[j].first /= sum;
@@ -319,7 +293,6 @@ static int sample_topk(float temperature, int top_k) {
 // ─────────────────────────────────────────────────────────────────────────
 // Helpers
 // ─────────────────────────────────────────────────────────────────────────
 static std::vector<std::string> split(const std::string& s, char d) {
     std::vector<std::string> out; std::string cur;
     for (char c:s){ if(c==d){out.push_back(cur);cur.clear();}else cur+=c; }
@@ -336,19 +309,17 @@ static std::vector<int> parse_ints(const std::string& s) {
 // ─────────────────────────────────────────────────────────────────────────
 // Command Handlers
 // ─────────────────────────────────────────────────────────────────────────
-// REQUEST|<sess>|<new_tokens_csv>|<max_new>|<temp>|<top_k>|<stop_csv>
 static void handle_request(const std::string& line) {
     auto parts = split(line, '|');
     if (parts.size() < 7) {
         printf("ERROR bad_request_format\n"); fflush(stdout); return;
     }
-    std::string sess_id   = parts[1];
-    auto new_tokens       = parse_ints(parts[2]);
-    int  max_new          = atoi(parts[3].c_str());
-    float temp            = (float)atof(parts[4].c_str());
-    int  top_k            = atoi(parts[5].c_str());
-    auto stop_list        = parse_ints(parts[6]);
     if (temp  < 0.01f) temp  = 0.01f;
     if (top_k < 1)     top_k = 1;
@@ -356,11 +327,11 @@ static void handle_request(const std::string& line) {
     if (max_new < 1)   max_new = 1;
     std::unordered_set<int> stop_ids(stop_list.begin(), stop_list.end());
-    stop_ids.insert(50256);   // <|endoftext|> always a stop
     SessionState& sess = get_or_create(sess_id);
-    // ── Prefill new tokens (updates session KV cache) ─────────────────
     for (int tok : new_tokens) {
         if (sess.pos >= cfg.block_size) {
             printf("ERROR context_window_full\n"); fflush(stdout); return;
@@ -369,10 +340,9 @@ static void handle_request(const std::string& line) {
         sess.pos++;
     }
-    // ── Autoregressive generation ─────────────────────────────────────
     double t0  = get_time_ms();
     int    gen = 0;
     for (int i = 0; i < max_new; i++) {
         if (sess.pos >= cfg.block_size) break;
         int best = sample_topk(temp, top_k);
@@ -388,7 +358,6 @@ static void handle_request(const std::string& line) {
     fflush(stdout);
 }
-// RESET|<sess>
 static void handle_reset(const std::string& line) {
     auto parts = split(line, '|');
     if (parts.size() < 2) { printf("RESET_OK\n"); fflush(stdout); return; }
@@ -401,9 +370,8 @@ static void handle_reset(const std::string& line) {
 }
 // ─────────────────────────────────────────────────────────────────────────
-// MAIN — load model once, then serve from stdin forever
 // ─────────────────────────────────────────────────────────────────────────
 int main() {
     FILE* f = fopen("model.bin", "rb");
     if (!f) { printf("ERROR model.bin_not_found\n"); fflush(stdout); return 1; }
@@ -412,12 +380,13 @@ int main() {
     fseek(f, 0, SEEK_END);
     long fsize = ftell(f);
     fseek(f, 5*(long)sizeof(int), SEEK_SET);
     long wbytes = fsize - 5*(long)sizeof(int);
     g_model_data = (float*)malloc(wbytes);
     if (!g_model_data) { printf("ERROR oom_loading_model\n"); fflush(stdout); return 1; }
     fread(g_model_data, 1, wbytes, f);
     fclose(f);
     map_weights(g_model_data);
     const int C = cfg.n_embd;
@@ -429,16 +398,15 @@ int main() {
     g_logits = (float*)malloc(cfg.vocab_size*sizeof(float));
     srand((unsigned int)time(NULL));
-    printf("READY\n"); fflush(stdout);   // Python waits for this
     std::string line;
     while (std::getline(std::cin, line)) {
         if (!line.empty() && line.back()=='\r') line.pop_back();
         if (line.empty()) continue;
-        if (line == "QUIT")                     break;
-        else if (line.rfind("RESET|",0)==0)     handle_reset(line);
-        else if (line.rfind("REQUEST|",0)==0)   handle_request(line);
         else { printf("ERROR unknown_cmd\n"); fflush(stdout); }
     }
@@ -446,4 +414,4 @@ int main() {
     free(g_model_data);
     free(g_x); free(g_buf); free(g_qkv); free(g_attn); free(g_ff); free(g_logits);
     return 0;
-}

  * KVInfer — PERSISTENT DAEMON INFERENCE ENGINE  v2.0
  * ============================================================
  *
  * ── STDIN PROTOCOL ──────────────────────────────────────────
  *   REQUEST|<sess>|<new_tokens_csv>|<max_new>|<temp>|<top_k>|<stop_csv>
  *   RESET|<sess>
  *   QUIT
  *
  * ── STDOUT PROTOCOL ─────────────────────────────────────────
+ *   READY
+ *   TOKEN <id> <elapsed_ms>
+ *   DONE <count> <total_ms>
+ *   RESET_OK
  *   ERROR <message>
  *
+ * ── COMPILE (GCC / Linux) ───────────────────────────────────
+ *   g++ -O3 -march=native -fopenmp -ffast-math -std=c++17 -o inference inference.cpp
  * ============================================================
  */
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
 #include <string.h>
+#include <iostream>
 #include <time.h>
 #include <algorithm>
 #include <string>
 #include <unordered_set>
 #include <vector>
 #include <immintrin.h>   // AVX2 + FMA
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 #ifdef _WIN32
   #include <windows.h>
   static double get_time_ms() {
 // ─────────────────────────────────────────────────────────────────────────
 // Model Structures
 // ─────────────────────────────────────────────────────────────────────────
 // Model hyperparameters. As used in this file: n_layer bounds the per-layer
 // loops, n_head and n_embd give the head size (n_embd / n_head), block_size is
 // the number of position-embedding rows and the per-session context limit, and
 // vocab_size is the number of token-embedding rows and of logits.
 typedef struct { int n_layer, n_head, n_embd, block_size, vocab_size; } Config;
 typedef struct {
     float *wte, *wpe;
     float **ln1_w, **ln1_b;
 // Per-session state; instances live in g_sessions, keyed by session id string.
 struct SessionState {
     // Key / value cache buffers passed to forward(); both default to nullptr.
     float*  k_cache   = nullptr;
     float*  v_cache   = nullptr;
     // Next position to process: handle_request compares it against
     // cfg.block_size and increments it once per token it processes.
+    int     pos       = 0;
     // NOTE(review): presumably a get_time_ms() timestamp used to pick the
     // least-recently-used session for eviction — confirm in get_or_create.
     double  last_used = 0.0;
 };
 static Weights W;
 static float*  g_model_data = nullptr;
+// ─────────────────────────────────────────────────────────────────────────
+// MAX_SESSIONS — 3 engines × 14 sessions × 96MB = ~4GB KV cache
+// Total RAM: ~6.57GB (safe under HF 8GB)
+// ─────────────────────────────────────────────────────────────────────────
+static const int MAX_SESSIONS = 14;
 static std::unordered_map<std::string, SessionState> g_sessions;
 // Shared per-request working buffers
 static float *g_x, *g_buf, *g_qkv, *g_attn, *g_ff, *g_logits;
 // ─────────────────────────────────────────────────────────────────────────
+// Math Kernels  (AVX2 + FMA + OpenMP)
 // ─────────────────────────────────────────────────────────────────────────
 static void layer_norm(float* out, const float* x, const float* w,
                        const float* b, int N) {
     float mean = 0.f, var = 0.f;
 }
 // ─────────────────────────────────────────────────────────────────────────
+// Transformer Forward  (single token at position `pos`)
 // ─────────────────────────────────────────────────────────────────────────
 static void forward(int token_id, int pos, float* k_cache, float* v_cache) {
     const int C = cfg.n_embd, H = cfg.n_head, hs = C/H;
     float* te = W.wte + (long long)token_id*C;
     float* pe = W.wpe + (long long)pos*C;
 #pragma omp parallel for
     for (int i = 0; i < C; i++) g_x[i] = te[i]+pe[i];
     for (int l = 0; l < cfg.n_layer; l++) {
         // Self-attention
         layer_norm(g_buf, g_x, W.ln1_w[l], W.ln1_b[l], C);
         matmul_vec(g_qkv, W.c_attn_w[l], g_buf, 3*C, C);
 // ─────────────────────────────────────────────────────────────────────────
 // Weight Mapping
 // ─────────────────────────────────────────────────────────────────────────
 // Carves the flat float buffer `data` into the tensor pointers stored in W.
 // Pointers are handed out in a fixed order and `p` is advanced by each
 // tensor's element count, so this order must match the layout of `data`.
 // NOTE(review): presumably that layout is whatever the model.bin exporter
 // writes — confirm against the export script.
 static void map_weights(float* data) {
     float* p = data;
     const int C = cfg.n_embd, L = cfg.n_layer;
     // Embedding tables: vocab_size x C (indexed by token id in forward()),
     // then block_size x C (indexed by position in forward()).
     W.wte=p; p+=(long long)cfg.vocab_size*C;
     W.wpe=p; p+=(long long)cfg.block_size*C;
     // Allocate one pointer slot per layer for every per-layer tensor.
     // NOTE: the malloc results are not checked for NULL.
     #define ARR(f) W.f=(float**)malloc(L*sizeof(float*))
     ARR(ln1_w); ARR(ln1_b); ARR(c_attn_w); ARR(c_attn_b);
     ARR(c_proj_w); ARR(c_proj_b); ARR(ln2_w); ARR(ln2_b);
     ARR(fc_w); ARR(fc_b); ARR(mlp_proj_w); ARR(mlp_proj_b);
     #undef ARR
     // Per-layer tensors, in order: ln1 (C, C), c_attn (3*C*C, 3*C),
     // c_proj (C*C, C), ln2 (C, C), fc (4*C*C, 4*C), mlp_proj (C*4*C, C).
     // The LL literals force 64-bit arithmetic for the element counts.
     for (int l = 0; l < L; l++) {
+        W.ln1_w[l]=p; p+=C;         W.ln1_b[l]=p; p+=C;
         W.c_attn_w[l]=p; p+=3LL*C*C; W.c_attn_b[l]=p; p+=3LL*C;
         W.c_proj_w[l]=p; p+=1LL*C*C; W.c_proj_b[l]=p; p+=C;
+        W.ln2_w[l]=p; p+=C;          W.ln2_b[l]=p; p+=C;
+        W.fc_w[l]=p; p+=4LL*C*C;     W.fc_b[l]=p; p+=4LL*C;
         W.mlp_proj_w[l]=p; p+=1LL*C*4*C; W.mlp_proj_b[l]=p; p+=C;
     }
     // Final layer norm (C, C); lm_head_w takes whatever follows in the buffer.
     W.ln_f_w=p; p+=C; W.ln_f_b=p; p+=C; W.lm_head_w=p;
 }
 // ─────────────────────────────────────────────────────────────────────────
+// Session Management  (LRU eviction when MAX_SESSIONS reached)
 // ─────────────────────────────────────────────────────────────────────────
 // Returns n_layer * block_size * n_embd * sizeof(float) as a byte count.
 // The first factor is cast to long long so the product is not computed in int.
 // NOTE(review): presumably the size of one session's K (or V) cache buffer —
 // confirm at the allocation site.
 static long long kv_alloc_bytes() {
     return (long long)cfg.n_layer * cfg.block_size * cfg.n_embd * sizeof(float);
 }
 }
 // ─────────────────────────────────────────────────────────────────────────
+// Sampler  (Top-K)
 // ─────────────────────────────────────────────────────────────────────────
 static int sample_topk(float temperature, int top_k) {
     for (int v = 0; v < cfg.vocab_size; v++) g_logits[v] /= temperature;
     std::vector<std::pair<float,int>> pairs(cfg.vocab_size);
     for (int v = 0; v < cfg.vocab_size; v++) pairs[v]={g_logits[v],v};
     std::partial_sort(pairs.begin(), pairs.begin()+top_k, pairs.end(),
+        [](const std::pair<float,int>& a, const std::pair<float,int>& b){
+            return a.first > b.first;
+        });
     float sum=0.f;
     for (int j=0; j<top_k; j++) { pairs[j].first=expf(pairs[j].first); sum+=pairs[j].first; }
     for (int j=0; j<top_k; j++) pairs[j].first /= sum;
 // ─────────────────────────────────────────────────────────────────────────
 // Helpers
 // ─────────────────────────────────────────────────────────────────────────
 static std::vector<std::string> split(const std::string& s, char d) {
     std::vector<std::string> out; std::string cur;
     for (char c:s){ if(c==d){out.push_back(cur);cur.clear();}else cur+=c; }
 // ─────────────────────────────────────────────────────────────────────────
 // Command Handlers
 // ─────────────────────────────────────────────────────────────────────────
 static void handle_request(const std::string& line) {
     auto parts = split(line, '|');
     if (parts.size() < 7) {
         printf("ERROR bad_request_format\n"); fflush(stdout); return;
     }
+    std::string sess_id = parts[1];
+    auto new_tokens     = parse_ints(parts[2]);
+    int  max_new        = atoi(parts[3].c_str());
+    float temp          = (float)atof(parts[4].c_str());
+    int  top_k          = atoi(parts[5].c_str());
+    auto stop_list      = parse_ints(parts[6]);
     if (temp  < 0.01f) temp  = 0.01f;
     if (top_k < 1)     top_k = 1;
     if (max_new < 1)   max_new = 1;
     std::unordered_set<int> stop_ids(stop_list.begin(), stop_list.end());
+    stop_ids.insert(50256);  // <|endoftext|> always stop
     SessionState& sess = get_or_create(sess_id);
+    // Prefill new tokens into KV cache
     for (int tok : new_tokens) {
         if (sess.pos >= cfg.block_size) {
             printf("ERROR context_window_full\n"); fflush(stdout); return;
         sess.pos++;
     }
+    // Autoregressive generation
     double t0  = get_time_ms();
     int    gen = 0;
     for (int i = 0; i < max_new; i++) {
         if (sess.pos >= cfg.block_size) break;
         int best = sample_topk(temp, top_k);
     fflush(stdout);
 }
 static void handle_reset(const std::string& line) {
     auto parts = split(line, '|');
     if (parts.size() < 2) { printf("RESET_OK\n"); fflush(stdout); return; }
 }
 // ─────────────────────────────────────────────────────────────────────────
+// MAIN — load the model once, then serve commands from stdin
 // ─────────────────────────────────────────────────────────────────────────
 int main() {
     FILE* f = fopen("model.bin", "rb");
     if (!f) { printf("ERROR model.bin_not_found\n"); fflush(stdout); return 1; }
     fseek(f, 0, SEEK_END);
     long fsize = ftell(f);
     fseek(f, 5*(long)sizeof(int), SEEK_SET);
     long wbytes = fsize - 5*(long)sizeof(int);
     g_model_data = (float*)malloc(wbytes);
     if (!g_model_data) { printf("ERROR oom_loading_model\n"); fflush(stdout); return 1; }
     fread(g_model_data, 1, wbytes, f);
     fclose(f);
     map_weights(g_model_data);
     const int C = cfg.n_embd;
     g_logits = (float*)malloc(cfg.vocab_size*sizeof(float));
     srand((unsigned int)time(NULL));
+    printf("READY\n"); fflush(stdout);  // Python waits for this
     std::string line;
     while (std::getline(std::cin, line)) {
         if (!line.empty() && line.back()=='\r') line.pop_back();
         if (line.empty()) continue;
+        if (line == "QUIT")                   break;
+        else if (line.rfind("RESET|",0)==0)   handle_reset(line);
+        else if (line.rfind("REQUEST|",0)==0) handle_request(line);
         else { printf("ERROR unknown_cmd\n"); fflush(stdout); }
     }
     free(g_model_data);
     free(g_x); free(g_buf); free(g_qkv); free(g_attn); free(g_ff); free(g_logits);
     return 0;
+}