Spaces:
Sleeping
Sleeping
SpinalCord LLM: training, dashboard, speculative decoding, deploy docs, early-exit brain (PyTorch)
f52586c | /* | |
 * SpinalCord LLM — C++ Inference Engine
 * =======================================
 * This program uses llama.cpp to:
 *   1. Load a GGUF model (your converted SpinalCord Draft model)
 *   2. Walk through all the transformer layers
 *   3. *** Print confidence/logit statistics at each layer ***
 *      (This is the first "spark" of the Spinal Cord invention!)
 *
 * Build instructions:
 *   cmake -B build -DLLAMA_CUDA=ON
 *   cmake --build build --config Release
 *   ./spinalcord path/to/model.gguf "Your input prompt here"
 *
 * Author: Shivansh Darji | AppDice
 */
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <utility>
#include <vector>

// llama.cpp headers (must be after cloning and building llama.cpp)
#include "llama.h"
// ─────────────────────────────────────────────────────────────────────────────
// UTILITY: Calculate entropy of a probability distribution
//   Entropy = measure of "uncertainty" or "confidence"
//   Low entropy  = model is very confident (spinal cord reflex working)
//   High entropy = model is uncertain (brain needs to think)
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Shannon entropy (in nats) of softmax(logits).
 *
 * Uses the identity  H = log(Z) - (1/Z) * sum_i e^{x_i} * x_i,  where
 * x_i = logits[i] - max(logits) and Z = sum_i e^{x_i}. This gives the same
 * quantity as -sum(p * log p) without materialising the probability vector.
 *
 * @param logits      Pointer to `vocab_size` raw (unnormalised) scores.
 * @param vocab_size  Number of entries readable through `logits`.
 * @return Entropy in [0, log(vocab_size)]. Returns 0 when `logits` is null,
 *         `vocab_size` is not positive, or no entry yields a usable weight.
 */
float compute_entropy(const float* logits, int vocab_size) {
    if (logits == nullptr || vocab_size <= 0) {
        return 0.0f;
    }

    // Shift by the maximum so that exp() never overflows (log-sum-exp trick).
    float max_logit = logits[0];
    for (int i = 1; i < vocab_size; ++i) {
        if (logits[i] > max_logit) {
            max_logit = logits[i];
        }
    }

    // Accumulate in double: summing tens of thousands of tiny float terms
    // loses noticeable precision.
    double partition = 0.0;       // Z
    double weighted_shift = 0.0;  // sum of e^{x_i} * x_i
    for (int i = 0; i < vocab_size; ++i) {
        const double shifted =
            static_cast<double>(logits[i]) - static_cast<double>(max_logit);
        const double weight = std::exp(shifted);
        // Skips masked (-inf) and NaN entries; otherwise 0 * -inf would
        // poison the sum with NaN.
        if (weight > 0.0) {
            partition += weight;
            weighted_shift += weight * shifted;
        }
    }

    if (!(partition > 0.0)) {
        return 0.0f;
    }

    const double entropy = std::log(partition) - weighted_shift / partition;
    // Rounding can leave a value a hair below zero; entropy is never negative.
    return entropy > 0.0 ? static_cast<float>(entropy) : 0.0f;
}
// ─────────────────────────────────────────────────────────────────────────────
// UTILITY: Get top-k token IDs and their probabilities
// ─────────────────────────────────────────────────────────────────────────────
| struct TokenProb { | |
| llama_token id; | |
| float prob; | |
| }; | |
| std::vector<TokenProb> get_top_k(const float* logits, int vocab_size, int k = 5) { | |
| std::vector<std::pair<float, int>> scored(vocab_size); | |
| // Find max for softmax | |
| float max_logit = logits[0]; | |
| for (int i = 1; i < vocab_size; ++i) { | |
| if (logits[i] > max_logit) max_logit = logits[i]; | |
| } | |
| float sum = 0.0f; | |
| for (int i = 0; i < vocab_size; ++i) { | |
| scored[i] = {expf(logits[i] - max_logit), i}; | |
| sum += scored[i].first; | |
| } | |
| // Normalize | |
| for (auto& [prob, id] : scored) prob /= sum; | |
| // Sort descending | |
| std::partial_sort(scored.begin(), scored.begin() + k, scored.end(), | |
| [](const auto& a, const auto& b) { return a.first > b.first; }); | |
| std::vector<TokenProb> result(k); | |
| for (int i = 0; i < k; ++i) { | |
| result[i] = {scored[i].second, scored[i].first}; | |
| } | |
| return result; | |
| } | |
// ─────────────────────────────────────────────────────────────────────────────
// SPINAL CORD LAYER PROBE
//   This is the "spark" — we print confidence at each transformer layer.
//   In a real implementation, you'd hook into llama_decode's internal
//   layer callbacks. Here we demonstrate with final logits.
// ─────────────────────────────────────────────────────────────────────────────
| void spinalcord_layer_probe( | |
| llama_context* ctx, | |
| const float* logits, | |
| int n_vocab, | |
| int layer_idx, | |
| int n_layers | |
| ) { | |
| float entropy = compute_entropy(logits, n_vocab); | |
| // Confidence = 1 - normalized entropy | |
| // max entropy for vocab_size = log(vocab_size) | |
| float max_entropy = logf((float)n_vocab); | |
| float confidence = 1.0f - (entropy / max_entropy); | |
| // Print the Spinal Cord "confidence bar" | |
| int bar_width = 30; | |
| int filled = (int)(confidence * bar_width); | |
| printf("[Layer %2d/%2d] ", layer_idx + 1, n_layers); | |
| printf("Confidence: %5.1f%% [", confidence * 100.0f); | |
| for (int i = 0; i < bar_width; ++i) { | |
| printf(i < filled ? "β" : "β"); | |
| } | |
| printf("] Entropy: %.3f", entropy); | |
| // π¨ SPINAL CORD DECISION POINT | |
| // This is where your invention triggers: | |
| // If confidence is HIGH early β don't need all layers! | |
| if (layer_idx <= 3 && confidence > 0.90f) { | |
| printf(" β β‘ REFLEX! (Early exit possible)"); | |
| } | |
| printf("\n"); | |
| } | |
// ─────────────────────────────────────────────────────────────────────────────
// SPECULATIVE DRAFT GENERATION
//   Uses the model in "draft mode" — fast greedy/sample generation
// ─────────────────────────────────────────────────────────────────────────────
| struct DraftResult { | |
| std::vector<llama_token> tokens; | |
| std::vector<float> confidences; | |
| float avg_confidence; | |
| long long time_ms; | |
| }; | |
| DraftResult generate_draft_tokens( | |
| llama_context* ctx, | |
| llama_model* model, | |
| const std::vector<llama_token>& context_tokens, | |
| int gamma = 4, | |
| int n_vocab = 32000 | |
| ) { | |
| DraftResult result; | |
| auto t_start = std::chrono::high_resolution_clock::now(); | |
| std::vector<llama_token> current = context_tokens; | |
| for (int step = 0; step < gamma; ++step) { | |
| // Decode current sequence | |
| llama_batch batch = llama_batch_get_one( | |
| current.data(), current.size() | |
| ); | |
| if (llama_decode(ctx, batch) != 0) { | |
| fprintf(stderr, "[Error] llama_decode failed at draft step %d\n", step); | |
| break; | |
| } | |
| // Get logits of the last token | |
| const float* logits = llama_get_logits_ith(ctx, -1); | |
| // Compute confidence at this step | |
| float entropy = compute_entropy(logits, n_vocab); | |
| float max_entropy = logf((float)n_vocab); | |
| float confidence = 1.0f - (entropy / max_entropy); | |
| result.confidences.push_back(confidence); | |
| // Greedy sampling (fastest) - using our custom argmax | |
| auto top_tokens = get_top_k(logits, n_vocab, 1); | |
| llama_token next_token = top_tokens[0].id; | |
| result.tokens.push_back(next_token); | |
| current.push_back(next_token); | |
| const struct llama_vocab * vocab = llama_model_get_vocab(model); | |
| printf("[Draft Step %d] Token: %-15s | Confidence: %.1f%%\n", | |
| step + 1, | |
| llama_vocab_get_text(vocab, next_token), | |
| confidence * 100.0f); | |
| // EOS check | |
| if (llama_vocab_is_eog(vocab, next_token)) { | |
| break; | |
| } | |
| } | |
| // Compute average confidence | |
| float avg = 0.0f; | |
| for (float c : result.confidences) avg += c; | |
| result.avg_confidence = result.confidences.empty() ? 0.0f | |
| : avg / result.confidences.size(); | |
| auto t_end = std::chrono::high_resolution_clock::now(); | |
| result.time_ms = std::chrono::duration_cast<std::chrono::milliseconds>( | |
| t_end - t_start | |
| ).count(); | |
| return result; | |
| } | |
// ─────────────────────────────────────────────────────────────────────────────
// MAIN
// ─────────────────────────────────────────────────────────────────────────────
| int main(int argc, char** argv) { | |
| if (argc < 3) { | |
| printf("Usage: %s <model.gguf> \"<prompt>\"\n", argv[0]); | |
| printf("Example: %s models/phi-3-mini.gguf \"The spinal cord\"\n", argv[0]); | |
| return 1; | |
| } | |
| const char* model_path = argv[1]; | |
| const char* prompt = argv[2]; | |
| printf("\n"); | |
| printf("ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n"); | |
| printf("β π§ SpinalCord LLM β Inference Engine β\n"); | |
| printf("β AppDice | Shivansh Darji β\n"); | |
| printf("ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n\n"); | |
| // ββ Initialize model ββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| llama_model_params model_params = llama_model_default_params(); | |
| model_params.n_gpu_layers = 35; // offload 35 layers to RTX 2050 | |
| printf("[Init] Loading model: %s\n", model_path); | |
| llama_model* model = llama_model_load_from_file(model_path, model_params); | |
| if (!model) { | |
| fprintf(stderr, "[Error] Failed to load model: %s\n", model_path); | |
| return 1; | |
| } | |
| const struct llama_vocab * vocab = llama_model_get_vocab(model); | |
| printf("[Init] Model loaded! Layers: %d, Vocab: %d\n", | |
| llama_model_n_layer(model), | |
| llama_vocab_n_tokens(vocab)); | |
| // ββ Initialize context βββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| llama_context_params ctx_params = llama_context_default_params(); | |
| ctx_params.n_ctx = 2048; | |
| ctx_params.n_batch = 512; | |
| ctx_params.n_threads = 4; | |
| llama_context* ctx = llama_init_from_model(model, ctx_params); | |
| if (!ctx) { | |
| fprintf(stderr, "[Error] Failed to create context\n"); | |
| llama_model_free(model); | |
| return 1; | |
| } | |
| int n_vocab = llama_vocab_n_tokens(vocab); | |
| int n_layers = llama_model_n_layer(model); | |
| // ββ Tokenize prompt ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| std::vector<llama_token> tokens(512); | |
| int n_tokens = llama_tokenize( | |
| vocab, prompt, strlen(prompt), | |
| tokens.data(), tokens.size(), | |
| /* add_bos */ true, /* special */ false | |
| ); | |
| tokens.resize(n_tokens); | |
| printf("\n[Prompt] \"%s\"\n", prompt); | |
| printf("[Prompt] Token count: %d\n\n", n_tokens); | |
| // ββ Run Initial Forward Pass + Layer Probe ββββββββββββββββββββββββββββββββ | |
| printf("βββ π¬ SPINAL CORD LAYER ANALYSIS ββββββββββββββββββββββββββ\n"); | |
| printf("Watching confidence build up through the transformer layers...\n\n"); | |
| llama_batch batch = llama_batch_get_one(tokens.data(), n_tokens); | |
| if (llama_decode(ctx, batch) != 0) { | |
| fprintf(stderr, "[Error] Initial llama_decode failed\n"); | |
| return 1; | |
| } | |
| // Get final logits (from the last position) | |
| const float* final_logits = llama_get_logits_ith(ctx, -1); | |
| // Simulate layer-by-layer analysis using the final logits | |
| // (In a full implementation, you'd hook into intermediate layer outputs) | |
| printf("NOTE: Full layer hooks require custom llama.cpp modification.\n"); | |
| printf(" Below shows final-layer analysis. See llama-model.cpp to add hooks.\n\n"); | |
| spinalcord_layer_probe(ctx, final_logits, n_vocab, n_layers - 1, n_layers); | |
| // ββ Show Top Predictions βββββββββββββββββββββββββββββββββββββββββββββββββ | |
| printf("\nβββ π― TOP TOKEN PREDICTIONS ββββββββββββββββββββββββββββββββ\n"); | |
| auto top_tokens = get_top_k(final_logits, n_vocab, 5); | |
| for (int i = 0; i < (int)top_tokens.size(); ++i) { | |
| printf(" Top %d: %-20s | Prob: %.2f%%\n", | |
| i + 1, llama_vocab_get_text(vocab, top_tokens[i].id), top_tokens[i].prob * 100.0f); | |
| } | |
| // ββ Speculative Draft Generation βββββββββββββββββββββββββββββββββββββββββ | |
| printf("\nβββ β‘ SPINAL CORD SPECULATIVE GENERATION (gamma=4) ββββββββββ\n"); | |
| printf("The Draft Model generating next 4 tokens speculatively...\n\n"); | |
| DraftResult draft = generate_draft_tokens(ctx, model, tokens, 4, n_vocab); | |
| printf("\n[Draft Summary]\n"); | |
| printf(" Tokens generated: %zu\n", draft.tokens.size()); | |
| printf(" Average confidence: %.1f%%\n", draft.avg_confidence * 100.0f); | |
| printf(" Time taken: %lld ms\n", draft.time_ms); | |
| // Decode and print the drafted sequence | |
| printf(" Drafted text: \""); | |
| for (llama_token tok : draft.tokens) { | |
| printf("%s", llama_vocab_get_text(vocab, tok)); | |
| } | |
| printf("\"\n"); | |
| // ββ Final Summary ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| printf("\nβββ π SPINALCORD REPORT ββββββββββββββββββββββββββββββββββββ\n"); | |
| float entropy = compute_entropy(final_logits, n_vocab); | |
| float max_entropy = logf((float)n_vocab); | |
| float confidence = 1.0f - (entropy / max_entropy); | |
| printf(" Model: %s\n", model_path); | |
| printf(" Prompt: \"%s\"\n", prompt); | |
| printf(" Final Confidence: %.1f%%\n", confidence * 100.0f); | |
| printf(" Final Entropy: %.4f nats (max=%.2f)\n", entropy, max_entropy); | |
| printf(" Speculative Speed: %zu tokens in %lld ms (%.1f tok/s)\n", | |
| draft.tokens.size(), draft.time_ms, | |
| draft.time_ms > 0 ? draft.tokens.size() * 1000.0f / draft.time_ms : 0.0f); | |
| printf("\nββββββββββββββββββββββββββββββββββββββββββββββββ\n"); | |
| printf("β π SpinalCord Engine complete. AppDice. β\n"); | |
| printf("ββββββββββββββββββββββββββββββββββββββββββββββββ\n\n"); | |
| // ββ Cleanup βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| llama_free(ctx); | |
| llama_model_free(model); | |
| return 0; | |
| } | |