Spinal-CordAI / inference /spinalcord.cpp
shivansh1709's picture
SpinalCord LLM: training, dashboard, speculative decoding, deploy docs, early-exit brain (PyTorch)
f52586c
/*
* SpinalCord LLM β€” C++ Inference Engine
* =======================================
* This program uses llama.cpp to:
* 1. Load a GGUF model (your converted SpinalCord Draft model)
* 2. Walk through all the transformer layers
* 3. *** Print confidence/logit statistics at each layer ***
* (This is the first "spark" of the Spinal Cord invention!)
*
* Build instructions:
* cmake -B build -DLLAMA_CUDA=ON
* cmake --build build --config Release
* ./spinalcord path/to/model.gguf "Your input prompt here"
*
* Author: Shivansh Darji | AppDice
*/
#include <cstdio>
#include <cstring>
#include <cmath>
#include <string>
#include <vector>
#include <chrono>
#include <algorithm>
// llama.cpp headers (must be after cloning and building llama.cpp)
#include "llama.h"
#include "common.h"
// ─────────────────────────────────────────────────────────────────────────────
// UTILITY: Calculate entropy of a probability distribution
// Entropy = measure of "uncertainty" or "confidence"
// Low entropy = model is very confident (spinal cord reflex working)
// High entropy = model is uncertain (brain needs to think)
// ─────────────────────────────────────────────────────────────────────────────
float compute_entropy(const float* logits, int vocab_size) {
// Find max for numerical stability (log-sum-exp trick)
float max_logit = logits[0];
for (int i = 1; i < vocab_size; ++i) {
if (logits[i] > max_logit) max_logit = logits[i];
}
// Compute softmax probabilities
float sum = 0.0f;
std::vector<float> probs(vocab_size);
for (int i = 0; i < vocab_size; ++i) {
probs[i] = expf(logits[i] - max_logit);
sum += probs[i];
}
for (int i = 0; i < vocab_size; ++i) {
probs[i] /= sum;
}
// Compute entropy: H = -sum(p * log(p))
float entropy = 0.0f;
for (int i = 0; i < vocab_size; ++i) {
if (probs[i] > 1e-9f) {
entropy -= probs[i] * logf(probs[i]);
}
}
return entropy;
}
// ─────────────────────────────────────────────────────────────────────────────
// UTILITY: Get top-k token IDs and their probabilities
// ─────────────────────────────────────────────────────────────────────────────
struct TokenProb {
llama_token id;
float prob;
};
std::vector<TokenProb> get_top_k(const float* logits, int vocab_size, int k = 5) {
std::vector<std::pair<float, int>> scored(vocab_size);
// Find max for softmax
float max_logit = logits[0];
for (int i = 1; i < vocab_size; ++i) {
if (logits[i] > max_logit) max_logit = logits[i];
}
float sum = 0.0f;
for (int i = 0; i < vocab_size; ++i) {
scored[i] = {expf(logits[i] - max_logit), i};
sum += scored[i].first;
}
// Normalize
for (auto& [prob, id] : scored) prob /= sum;
// Sort descending
std::partial_sort(scored.begin(), scored.begin() + k, scored.end(),
[](const auto& a, const auto& b) { return a.first > b.first; });
std::vector<TokenProb> result(k);
for (int i = 0; i < k; ++i) {
result[i] = {scored[i].second, scored[i].first};
}
return result;
}
// ─────────────────────────────────────────────────────────────────────────────
// SPINAL CORD LAYER PROBE
// This is the "spark" β€” we print confidence at each transformer layer.
// In a real implementation, you'd hook into llama_decode's internal
// layer callbacks. Here we demonstrate with final logits.
// ─────────────────────────────────────────────────────────────────────────────
void spinalcord_layer_probe(
llama_context* ctx,
const float* logits,
int n_vocab,
int layer_idx,
int n_layers
) {
float entropy = compute_entropy(logits, n_vocab);
// Confidence = 1 - normalized entropy
// max entropy for vocab_size = log(vocab_size)
float max_entropy = logf((float)n_vocab);
float confidence = 1.0f - (entropy / max_entropy);
// Print the Spinal Cord "confidence bar"
int bar_width = 30;
int filled = (int)(confidence * bar_width);
printf("[Layer %2d/%2d] ", layer_idx + 1, n_layers);
printf("Confidence: %5.1f%% [", confidence * 100.0f);
for (int i = 0; i < bar_width; ++i) {
printf(i < filled ? "β–ˆ" : "β–‘");
}
printf("] Entropy: %.3f", entropy);
// 🚨 SPINAL CORD DECISION POINT
// This is where your invention triggers:
// If confidence is HIGH early β†’ don't need all layers!
if (layer_idx <= 3 && confidence > 0.90f) {
printf(" ← ⚑ REFLEX! (Early exit possible)");
}
printf("\n");
}
// ─────────────────────────────────────────────────────────────────────────────
// SPECULATIVE DRAFT GENERATION
// Uses the model in "draft mode" β€” fast greedy/sample generation
// ─────────────────────────────────────────────────────────────────────────────
struct DraftResult {
std::vector<llama_token> tokens;
std::vector<float> confidences;
float avg_confidence;
long long time_ms;
};
DraftResult generate_draft_tokens(
llama_context* ctx,
llama_model* model,
const std::vector<llama_token>& context_tokens,
int gamma = 4,
int n_vocab = 32000
) {
DraftResult result;
auto t_start = std::chrono::high_resolution_clock::now();
std::vector<llama_token> current = context_tokens;
for (int step = 0; step < gamma; ++step) {
// Decode current sequence
llama_batch batch = llama_batch_get_one(
current.data(), current.size()
);
if (llama_decode(ctx, batch) != 0) {
fprintf(stderr, "[Error] llama_decode failed at draft step %d\n", step);
break;
}
// Get logits of the last token
const float* logits = llama_get_logits_ith(ctx, -1);
// Compute confidence at this step
float entropy = compute_entropy(logits, n_vocab);
float max_entropy = logf((float)n_vocab);
float confidence = 1.0f - (entropy / max_entropy);
result.confidences.push_back(confidence);
// Greedy sampling (fastest) - using our custom argmax
auto top_tokens = get_top_k(logits, n_vocab, 1);
llama_token next_token = top_tokens[0].id;
result.tokens.push_back(next_token);
current.push_back(next_token);
const struct llama_vocab * vocab = llama_model_get_vocab(model);
printf("[Draft Step %d] Token: %-15s | Confidence: %.1f%%\n",
step + 1,
llama_vocab_get_text(vocab, next_token),
confidence * 100.0f);
// EOS check
if (llama_vocab_is_eog(vocab, next_token)) {
break;
}
}
// Compute average confidence
float avg = 0.0f;
for (float c : result.confidences) avg += c;
result.avg_confidence = result.confidences.empty() ? 0.0f
: avg / result.confidences.size();
auto t_end = std::chrono::high_resolution_clock::now();
result.time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
t_end - t_start
).count();
return result;
}
// ─────────────────────────────────────────────────────────────────────────────
// MAIN
// ─────────────────────────────────────────────────────────────────────────────
int main(int argc, char** argv) {
if (argc < 3) {
printf("Usage: %s <model.gguf> \"<prompt>\"\n", argv[0]);
printf("Example: %s models/phi-3-mini.gguf \"The spinal cord\"\n", argv[0]);
return 1;
}
const char* model_path = argv[1];
const char* prompt = argv[2];
printf("\n");
printf("╔══════════════════════════════════════════════════════╗\n");
printf("β•‘ 🧠 SpinalCord LLM β€” Inference Engine β•‘\n");
printf("β•‘ AppDice | Shivansh Darji β•‘\n");
printf("β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•\n\n");
// ── Initialize model ────────────────────────────────────────────────────
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = 35; // offload 35 layers to RTX 2050
printf("[Init] Loading model: %s\n", model_path);
llama_model* model = llama_model_load_from_file(model_path, model_params);
if (!model) {
fprintf(stderr, "[Error] Failed to load model: %s\n", model_path);
return 1;
}
const struct llama_vocab * vocab = llama_model_get_vocab(model);
printf("[Init] Model loaded! Layers: %d, Vocab: %d\n",
llama_model_n_layer(model),
llama_vocab_n_tokens(vocab));
// ── Initialize context ───────────────────────────────────────────────────
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = 2048;
ctx_params.n_batch = 512;
ctx_params.n_threads = 4;
llama_context* ctx = llama_init_from_model(model, ctx_params);
if (!ctx) {
fprintf(stderr, "[Error] Failed to create context\n");
llama_model_free(model);
return 1;
}
int n_vocab = llama_vocab_n_tokens(vocab);
int n_layers = llama_model_n_layer(model);
// ── Tokenize prompt ──────────────────────────────────────────────────────
std::vector<llama_token> tokens(512);
int n_tokens = llama_tokenize(
vocab, prompt, strlen(prompt),
tokens.data(), tokens.size(),
/* add_bos */ true, /* special */ false
);
tokens.resize(n_tokens);
printf("\n[Prompt] \"%s\"\n", prompt);
printf("[Prompt] Token count: %d\n\n", n_tokens);
// ── Run Initial Forward Pass + Layer Probe ────────────────────────────────
printf("━━━ πŸ”¬ SPINAL CORD LAYER ANALYSIS ━━━━━━━━━━━━━━━━━━━━━━━━━━\n");
printf("Watching confidence build up through the transformer layers...\n\n");
llama_batch batch = llama_batch_get_one(tokens.data(), n_tokens);
if (llama_decode(ctx, batch) != 0) {
fprintf(stderr, "[Error] Initial llama_decode failed\n");
return 1;
}
// Get final logits (from the last position)
const float* final_logits = llama_get_logits_ith(ctx, -1);
// Simulate layer-by-layer analysis using the final logits
// (In a full implementation, you'd hook into intermediate layer outputs)
printf("NOTE: Full layer hooks require custom llama.cpp modification.\n");
printf(" Below shows final-layer analysis. See llama-model.cpp to add hooks.\n\n");
spinalcord_layer_probe(ctx, final_logits, n_vocab, n_layers - 1, n_layers);
// ── Show Top Predictions ─────────────────────────────────────────────────
printf("\n━━━ 🎯 TOP TOKEN PREDICTIONS ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n");
auto top_tokens = get_top_k(final_logits, n_vocab, 5);
for (int i = 0; i < (int)top_tokens.size(); ++i) {
printf(" Top %d: %-20s | Prob: %.2f%%\n",
i + 1, llama_vocab_get_text(vocab, top_tokens[i].id), top_tokens[i].prob * 100.0f);
}
// ── Speculative Draft Generation ─────────────────────────────────────────
printf("\n━━━ ⚑ SPINAL CORD SPECULATIVE GENERATION (gamma=4) ━━━━━━━━━━\n");
printf("The Draft Model generating next 4 tokens speculatively...\n\n");
DraftResult draft = generate_draft_tokens(ctx, model, tokens, 4, n_vocab);
printf("\n[Draft Summary]\n");
printf(" Tokens generated: %zu\n", draft.tokens.size());
printf(" Average confidence: %.1f%%\n", draft.avg_confidence * 100.0f);
printf(" Time taken: %lld ms\n", draft.time_ms);
// Decode and print the drafted sequence
printf(" Drafted text: \"");
for (llama_token tok : draft.tokens) {
printf("%s", llama_vocab_get_text(vocab, tok));
}
printf("\"\n");
// ── Final Summary ────────────────────────────────────────────────────────
printf("\n━━━ πŸ“Š SPINALCORD REPORT ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n");
float entropy = compute_entropy(final_logits, n_vocab);
float max_entropy = logf((float)n_vocab);
float confidence = 1.0f - (entropy / max_entropy);
printf(" Model: %s\n", model_path);
printf(" Prompt: \"%s\"\n", prompt);
printf(" Final Confidence: %.1f%%\n", confidence * 100.0f);
printf(" Final Entropy: %.4f nats (max=%.2f)\n", entropy, max_entropy);
printf(" Speculative Speed: %zu tokens in %lld ms (%.1f tok/s)\n",
draft.tokens.size(), draft.time_ms,
draft.time_ms > 0 ? draft.tokens.size() * 1000.0f / draft.time_ms : 0.0f);
printf("\n╔══════════════════════════════════════════════╗\n");
printf("β•‘ πŸš€ SpinalCord Engine complete. AppDice. β•‘\n");
printf("β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•\n\n");
// ── Cleanup ───────────────────────────────────────────────────────────────
llama_free(ctx);
llama_model_free(model);
return 0;
}