File size: 16,076 Bytes
f52586c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
/*
 * SpinalCord LLM — C++ Inference Engine
 * =======================================
 * This program uses llama.cpp to:
 *   1. Load a GGUF model (your converted SpinalCord Draft model)
 *   2. Walk through all the transformer layers
 *   3. *** Print confidence/logit statistics at each layer ***
 *      (This is the first "spark" of the Spinal Cord invention!)
 *
 * Build instructions:
 *   cmake -B build -DLLAMA_CUDA=ON
 *   cmake --build build --config Release
 *   ./spinalcord path/to/model.gguf "Your input prompt here"
 *
 * Author: Shivansh Darji | AppDice
 */

#include <cstdio>
#include <cstring>
#include <cmath>
#include <string>
#include <vector>
#include <chrono>
#include <algorithm>

// llama.cpp headers (must be after cloning and building llama.cpp)
#include "llama.h"
#include "common.h"


// ─────────────────────────────────────────────────────────────────────────────
// UTILITY: Calculate entropy of a probability distribution
// Entropy = measure of "uncertainty" or "confidence"
// Low entropy = model is very confident (spinal cord reflex working)
// High entropy = model is uncertain (brain needs to think)
//
// Returns the Shannon entropy, in nats, of softmax(logits[0 .. vocab_size)).
// Returns 0 when `logits` is null, `vocab_size` is not positive, or the
// softmax normalizer is not a positive finite-comparable number.
// ─────────────────────────────────────────────────────────────────────────────
float compute_entropy(const float* logits, int vocab_size) {
    // Nothing to measure; also avoids reading logits[0] from an empty buffer.
    if (logits == nullptr || vocab_size <= 0) {
        return 0.0f;
    }

    // Shift by the maximum so that exp() cannot overflow.
    const float max_logit = *std::max_element(logits, logits + vocab_size);

    // With x_i = logit_i - max, e_i = exp(x_i) and S = sum(e_i):
    //   H = -sum(p_i * log p_i) = log(S) - sum(e_i * x_i) / S
    // so a single pass suffices and no per-call probability buffer is needed.
    double sum_exp      = 0.0;
    double weighted_sum = 0.0;
    for (int i = 0; i < vocab_size; ++i) {
        const double shifted = static_cast<double>(logits[i]) - max_logit;
        const double e       = std::exp(shifted);
        // Skip underflowed (and NaN) terms: 0 * -inf would poison the sum.
        if (e > 0.0) {
            sum_exp      += e;
            weighted_sum += e * shifted;
        }
    }

    if (!(sum_exp > 0.0)) {
        return 0.0f;  // only reachable with non-finite input
    }

    const double entropy = std::log(sum_exp) - weighted_sum / sum_exp;
    // Rounding can leave a tiny negative value for one-hot distributions.
    return entropy > 0.0 ? static_cast<float>(entropy) : 0.0f;
}


// ─────────────────────────────────────────────────────────────────────────────
// UTILITY: Get top-k token IDs and their probabilities
// ─────────────────────────────────────────────────────────────────────────────
// One (token, probability) entry as returned by get_top_k().
struct TokenProb {
    llama_token id;   // index of the token in the logits array
    float prob;       // softmax probability that get_top_k() computed for it
};

std::vector<TokenProb> get_top_k(const float* logits, int vocab_size, int k = 5) {
    std::vector<std::pair<float, int>> scored(vocab_size);
    
    // Find max for softmax
    float max_logit = logits[0];
    for (int i = 1; i < vocab_size; ++i) {
        if (logits[i] > max_logit) max_logit = logits[i];
    }
    
    float sum = 0.0f;
    for (int i = 0; i < vocab_size; ++i) {
        scored[i] = {expf(logits[i] - max_logit), i};
        sum += scored[i].first;
    }
    
    // Normalize
    for (auto& [prob, id] : scored) prob /= sum;
    
    // Sort descending
    std::partial_sort(scored.begin(), scored.begin() + k, scored.end(),
                      [](const auto& a, const auto& b) { return a.first > b.first; });
    
    std::vector<TokenProb> result(k);
    for (int i = 0; i < k; ++i) {
        result[i] = {scored[i].second, scored[i].first};
    }
    return result;
}


// ─────────────────────────────────────────────────────────────────────────────
// SPINAL CORD LAYER PROBE
// This is the "spark" — we print confidence at each transformer layer.
// In a real implementation, you'd hook into llama_decode's internal
// layer callbacks. Here we demonstrate with final logits.
//
// Prints a single line to stdout: the 1-based layer number, a confidence
// percentage with a 30-cell bar, and the entropy of softmax(logits).
//   ctx        unused; kept so existing callers keep compiling
//   logits     n_vocab logit values to analyze
//   layer_idx  0-based index of the layer the logits are attributed to
//   n_layers   total layer count, used only for display
// ─────────────────────────────────────────────────────────────────────────────
void spinalcord_layer_probe(
    llama_context* ctx,
    const float* logits,
    int n_vocab,
    int layer_idx,
    int n_layers
) {
    (void)ctx;

    const float entropy = compute_entropy(logits, n_vocab);

    // Confidence = 1 - normalized entropy, where the maximum entropy of a
    // distribution over n_vocab outcomes is log(n_vocab). For n_vocab <= 1
    // that maximum is not positive, so no ratio can be formed; report 0
    // instead of dividing by zero.
    const float max_entropy = n_vocab > 1 ? logf((float)n_vocab) : 0.0f;
    const float confidence  = max_entropy > 0.0f
                            ? 1.0f - (entropy / max_entropy)
                            : 0.0f;

    // Print the Spinal Cord "confidence bar". The fill level is clamped so
    // rounding noise just outside [0, 1] cannot produce an out-of-range count.
    const int bar_width = 30;
    const int filled    = (int)(std::clamp(confidence, 0.0f, 1.0f) * bar_width);

    printf("[Layer %2d/%2d] ", layer_idx + 1, n_layers);
    printf("Confidence: %5.1f%% [", confidence * 100.0f);
    for (int i = 0; i < bar_width; ++i) {
        // fputs: the glyph is data, not a format string.
        fputs(i < filled ? "█" : "░", stdout);
    }
    printf("] Entropy: %.3f", entropy);

    // 🚨 SPINAL CORD DECISION POINT
    // This is where your invention triggers:
    // If confidence is HIGH early → don't need all layers!
    if (layer_idx <= 3 && confidence > 0.90f) {
        printf(" ← ⚑ REFLEX! (Early exit possible)");
    }
    printf("\n");
}


// ─────────────────────────────────────────────────────────────────────────────
// SPECULATIVE DRAFT GENERATION
// Uses the model in "draft mode" — fast greedy/sample generation
// ─────────────────────────────────────────────────────────────────────────────
// Outcome of one generate_draft_tokens() call. Scalar members carry default
// initializers so a default-constructed result never holds indeterminate values.
struct DraftResult {
    std::vector<llama_token> tokens;               // drafted tokens, in generation order
    std::vector<float>       confidences;          // one confidence value per drafted token
    float                    avg_confidence = 0.0f; // mean of `confidences`, 0 when empty
    long long                time_ms        = 0;    // wall-clock duration of the draft run
};

// Greedily draft up to `gamma` tokens that continue `context_tokens`.
//
// The context is decoded once; every later step feeds only the token that
// was just drafted, because `ctx` already holds the earlier tokens after
// the previous llama_decode call. Each step prints the chosen token and
// its confidence (1 - entropy / log(n_vocab)).
//
//   ctx             context used for decoding
//   model           model whose vocabulary is used for text lookup and the
//                   end-of-generation check
//   context_tokens  tokens to continue from; must not be empty
//   gamma           maximum number of tokens to draft
//   n_vocab         number of logits to inspect per step; replaced by the
//                   model's vocabulary size when it is non-positive or larger
//                   than that size, so the logits buffer is never over-read
//
// Returns the drafted tokens, their per-step confidences, the average
// confidence, and the elapsed wall-clock time. Drafting stops early on a
// decode failure, on missing logits, or on an end-of-generation token.
//
// NOTE(review): if the caller has already decoded `context_tokens` on this
// same `ctx`, the first step presumably appends a second copy to the
// context's cache — confirm and clear the cache beforehand if so.
DraftResult generate_draft_tokens(
    llama_context* ctx,
    llama_model*   model,
    const std::vector<llama_token>& context_tokens,
    int gamma = 4,
    int n_vocab = 32000
) {
    DraftResult result;
    result.avg_confidence = 0.0f;
    result.time_ms        = 0;

    const auto t_start = std::chrono::high_resolution_clock::now();

    if (ctx == nullptr || model == nullptr || context_tokens.empty()) {
        fprintf(stderr, "[Error] generate_draft_tokens: missing context, model or input tokens\n");
        return result;
    }

    const struct llama_vocab * vocab = llama_model_get_vocab(model);

    // Never read more logits than the model actually produces.
    const int model_vocab = llama_vocab_n_tokens(vocab);
    if (n_vocab <= 0 || n_vocab > model_vocab) {
        n_vocab = model_vocab;
    }
    if (n_vocab <= 0) {
        fprintf(stderr, "[Error] generate_draft_tokens: model reports an empty vocabulary\n");
        return result;
    }

    // log(1) == 0 would make the normalization below divide by zero.
    const float max_entropy = n_vocab > 1 ? logf((float)n_vocab) : 0.0f;

    // Tokens that still have to be fed to the model. Starts as the whole
    // context and shrinks to the single newest token after the first step.
    std::vector<llama_token> pending = context_tokens;

    for (int step = 0; step < gamma; ++step) {
        llama_batch batch = llama_batch_get_one(
            pending.data(), static_cast<int32_t>(pending.size())
        );

        if (llama_decode(ctx, batch) != 0) {
            fprintf(stderr, "[Error] llama_decode failed at draft step %d\n", step);
            break;
        }

        // Get logits of the last token
        const float* logits = llama_get_logits_ith(ctx, -1);
        if (logits == nullptr) {
            fprintf(stderr, "[Error] No logits available at draft step %d\n", step);
            break;
        }

        // Compute confidence at this step
        const float entropy    = compute_entropy(logits, n_vocab);
        const float confidence = max_entropy > 0.0f
                               ? 1.0f - (entropy / max_entropy)
                               : 0.0f;

        // Greedy sampling (fastest) - using our custom argmax
        const auto top_tokens = get_top_k(logits, n_vocab, 1);
        if (top_tokens.empty()) {
            fprintf(stderr, "[Error] No candidate token at draft step %d\n", step);
            break;
        }
        const llama_token next_token = top_tokens[0].id;

        result.confidences.push_back(confidence);
        result.tokens.push_back(next_token);

        printf("[Draft Step %d] Token: %-15s | Confidence: %.1f%%\n",
               step + 1,
               llama_vocab_get_text(vocab, next_token),
               confidence * 100.0f);

        // EOS check
        if (llama_vocab_is_eog(vocab, next_token)) {
            break;
        }

        // Everything before next_token is already inside the context.
        pending.assign(1, next_token);
    }

    // Compute average confidence
    float total = 0.0f;
    for (float c : result.confidences) total += c;
    result.avg_confidence = result.confidences.empty()
                          ? 0.0f
                          : total / result.confidences.size();

    const auto t_end = std::chrono::high_resolution_clock::now();
    result.time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
        t_end - t_start
    ).count();

    return result;
}


// ─────────────────────────────────────────────────────────────────────────────
// MAIN
// ─────────────────────────────────────────────────────────────────────────────
int main(int argc, char** argv) {
    if (argc < 3) {
        printf("Usage: %s <model.gguf> \"<prompt>\"\n", argv[0]);
        printf("Example: %s models/phi-3-mini.gguf \"The spinal cord\"\n", argv[0]);
        return 1;
    }

    const char* model_path = argv[1];
    const char* prompt     = argv[2];

    printf("\n");
    printf("╔══════════════════════════════════════════════════════╗\n");
    printf("β•‘       🧠 SpinalCord LLM β€” Inference Engine          β•‘\n");
    printf("β•‘            AppDice | Shivansh Darji                 β•‘\n");
    printf("β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•\n\n");

    // ── Initialize model ────────────────────────────────────────────────────
    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = 35;   // offload 35 layers to RTX 2050

    printf("[Init] Loading model: %s\n", model_path);
    llama_model* model = llama_model_load_from_file(model_path, model_params);
    if (!model) {
        fprintf(stderr, "[Error] Failed to load model: %s\n", model_path);
        return 1;
    }
    const struct llama_vocab * vocab = llama_model_get_vocab(model);
    printf("[Init] Model loaded! Layers: %d, Vocab: %d\n",
           llama_model_n_layer(model),
           llama_vocab_n_tokens(vocab));

    // ── Initialize context ───────────────────────────────────────────────────
    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx    = 2048;
    ctx_params.n_batch  = 512;
    ctx_params.n_threads = 4;

    llama_context* ctx = llama_init_from_model(model, ctx_params);
    if (!ctx) {
        fprintf(stderr, "[Error] Failed to create context\n");
        llama_model_free(model);
        return 1;
    }

    int n_vocab  = llama_vocab_n_tokens(vocab);
    int n_layers = llama_model_n_layer(model);

    // ── Tokenize prompt ──────────────────────────────────────────────────────
    std::vector<llama_token> tokens(512);
    int n_tokens = llama_tokenize(
        vocab, prompt, strlen(prompt),
        tokens.data(), tokens.size(),
        /* add_bos */ true, /* special */ false
    );
    tokens.resize(n_tokens);

    printf("\n[Prompt] \"%s\"\n", prompt);
    printf("[Prompt] Token count: %d\n\n", n_tokens);

    // ── Run Initial Forward Pass + Layer Probe ────────────────────────────────
    printf("━━━ πŸ”¬ SPINAL CORD LAYER ANALYSIS ━━━━━━━━━━━━━━━━━━━━━━━━━━\n");
    printf("Watching confidence build up through the transformer layers...\n\n");
    
    llama_batch batch = llama_batch_get_one(tokens.data(), n_tokens);
    if (llama_decode(ctx, batch) != 0) {
        fprintf(stderr, "[Error] Initial llama_decode failed\n");
        return 1;
    }

    // Get final logits (from the last position)
    const float* final_logits = llama_get_logits_ith(ctx, -1);

    // Simulate layer-by-layer analysis using the final logits
    // (In a full implementation, you'd hook into intermediate layer outputs)
    printf("NOTE: Full layer hooks require custom llama.cpp modification.\n");
    printf("      Below shows final-layer analysis. See llama-model.cpp to add hooks.\n\n");
    
    spinalcord_layer_probe(ctx, final_logits, n_vocab, n_layers - 1, n_layers);

    // ── Show Top Predictions ─────────────────────────────────────────────────
    printf("\n━━━ 🎯 TOP TOKEN PREDICTIONS ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n");
    auto top_tokens = get_top_k(final_logits, n_vocab, 5);
    for (int i = 0; i < (int)top_tokens.size(); ++i) {
        printf("  Top %d: %-20s | Prob: %.2f%%\n",
               i + 1, llama_vocab_get_text(vocab, top_tokens[i].id), top_tokens[i].prob * 100.0f);
    }

    // ── Speculative Draft Generation ─────────────────────────────────────────
    printf("\n━━━ ⚑ SPINAL CORD SPECULATIVE GENERATION (gamma=4) ━━━━━━━━━━\n");
    printf("The Draft Model generating next 4 tokens speculatively...\n\n");
    
    DraftResult draft = generate_draft_tokens(ctx, model, tokens, 4, n_vocab);
    
    printf("\n[Draft Summary]\n");
    printf("  Tokens generated:    %zu\n", draft.tokens.size());
    printf("  Average confidence:  %.1f%%\n", draft.avg_confidence * 100.0f);
    printf("  Time taken:          %lld ms\n", draft.time_ms);
    
    // Decode and print the drafted sequence
    printf("  Drafted text:        \"");
    for (llama_token tok : draft.tokens) {
        printf("%s", llama_vocab_get_text(vocab, tok));
    }
    printf("\"\n");

    // ── Final Summary ────────────────────────────────────────────────────────
    printf("\n━━━ πŸ“Š SPINALCORD REPORT ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n");
    float entropy     = compute_entropy(final_logits, n_vocab);
    float max_entropy = logf((float)n_vocab);
    float confidence  = 1.0f - (entropy / max_entropy);
    
    printf("  Model:              %s\n", model_path);
    printf("  Prompt:             \"%s\"\n", prompt);
    printf("  Final Confidence:   %.1f%%\n", confidence * 100.0f);
    printf("  Final Entropy:      %.4f nats (max=%.2f)\n", entropy, max_entropy);
    printf("  Speculative Speed:  %zu tokens in %lld ms (%.1f tok/s)\n",
           draft.tokens.size(), draft.time_ms,
           draft.time_ms > 0 ? draft.tokens.size() * 1000.0f / draft.time_ms : 0.0f);
    printf("\n╔══════════════════════════════════════════════╗\n");
    printf("β•‘  πŸš€ SpinalCord Engine complete. AppDice.     β•‘\n");
    printf("β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•\n\n");

    // ── Cleanup ───────────────────────────────────────────────────────────────
    llama_free(ctx);
    llama_model_free(model);

    return 0;
}