Spaces:
Running
Running
Cache Monarch matrix + causal mask for faster inference
Browse files
- checkpoint.jl +6 -1
- model.jl +41 -20
- server.jl +4 -1
checkpoint.jl
CHANGED
|
@@ -95,5 +95,10 @@ function load_inference_model(ckpt_path::String, config_path::String,
|
|
| 95 |
println(" Adjusted vocab_size to $(config.vocab_size) from embedding weight")
|
| 96 |
end
|
| 97 |
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
end
|
|
|
|
| 95 |
println(" Adjusted vocab_size to $(config.vocab_size) from embedding weight")
|
| 96 |
end
|
| 97 |
|
| 98 |
+
# Pre-compute inference caches (Monarch matrices + causal mask)
|
| 99 |
+
println("Pre-computing inference caches ...")
|
| 100 |
+
caches = precompute_inference_caches(config, ps)
|
| 101 |
+
println(" Cached $(config.n_layers) Monarch matrices ($(config.context_length)x$(config.context_length))")
|
| 102 |
+
|
| 103 |
+
return (; config, ps, tokenizer, step, val_loss, caches)
|
| 104 |
end
|
model.jl
CHANGED
|
@@ -379,28 +379,46 @@ function organelle_gate_forward(organelle_outputs, logits)
|
|
| 379 |
end
|
| 380 |
|
| 381 |
# ───────────────────────────────────────────────────────────────────
|
| 382 |
-
#
|
| 383 |
# ───────────────────────────────────────────────────────────────────
|
| 384 |
|
| 385 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 386 |
D, T, B = size(x)
|
| 387 |
-
p = isqrt(context_length)
|
| 388 |
|
| 389 |
# ── Organelle 1: CausalConv (local n-gram patterns) ──
|
| 390 |
conv_out = causal_depthwise_conv1d(x, ps.conv.kernel)
|
| 391 |
|
| 392 |
# ── Organelle 2: MonarchMatrix (global structured mixing, single-head) ──
|
| 393 |
-
#
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
# Apply causal mask (multiplicative 0/1)
|
| 397 |
-
M_causal = M .* mask
|
| 398 |
-
|
| 399 |
-
# Slice to actual sequence length (handles generation where T < context_length)
|
| 400 |
-
M_t = M_causal[1:T, 1:T]
|
| 401 |
|
| 402 |
# Single-head: apply Monarch to ALL channels at once
|
| 403 |
-
# x: (D, T, B) → permute to (T, D, B) → flatten → matmul → reshape back
|
| 404 |
x_seq = reshape(permutedims(x, (2, 1, 3)), T, D * B) # (T, D*B)
|
| 405 |
y_monarch = M_t * x_seq # (T, D*B)
|
| 406 |
monarch_out = permutedims(reshape(y_monarch, T, D, B), (2, 1, 3)) # (D, T, B)
|
|
@@ -415,18 +433,15 @@ function symbio_sequence_mixer_forward(x, ps, context_length::Int, mask)
|
|
| 415 |
end
|
| 416 |
|
| 417 |
# ───────────────────────────────────────────────────────────────────
|
| 418 |
-
# Full model forward pass
|
| 419 |
# ───────────────────────────────────────────────────────────────────
|
| 420 |
|
| 421 |
-
function model_forward(config::ModelConfig, ps, x)
|
| 422 |
T = size(x, 1) # x: (seq_len, batch) of integer token IDs
|
| 423 |
|
| 424 |
# Token embedding: (seq_len, batch) → (embed_dim, seq_len, batch)
|
| 425 |
h = ps.tok_emb.weight[:, x]
|
| 426 |
|
| 427 |
-
# Causal mask (multiplicative 0/1 for symbiogenesis)
|
| 428 |
-
mask = make_causal_mask(config.context_length)
|
| 429 |
-
|
| 430 |
# Symbiogenesis blocks
|
| 431 |
for i in 1:config.n_layers
|
| 432 |
name = Symbol("block_$i")
|
|
@@ -435,7 +450,7 @@ function model_forward(config::ModelConfig, ps, x)
|
|
| 435 |
# Pre-norm sequence mixing + residual
|
| 436 |
normed = rmsnorm_forward(h, bp.ln1.weight)
|
| 437 |
mixed = symbio_sequence_mixer_forward(normed, bp.seq_mixer,
|
| 438 |
-
|
| 439 |
h = h .+ mixed
|
| 440 |
|
| 441 |
# Pre-norm FFN + residual
|
|
@@ -502,12 +517,18 @@ function generate_streaming(config::ModelConfig, ps,
|
|
| 502 |
temperature::Float64=0.8,
|
| 503 |
top_k::Int=0,
|
| 504 |
top_p::Float64=1.0,
|
| 505 |
-
on_token=nothing
|
|
|
|
| 506 |
tokens = encode(tokenizer, prompt)
|
| 507 |
if isempty(tokens)
|
| 508 |
tokens = [rand(1:tokenizer_vocab_size(tokenizer))]
|
| 509 |
end
|
| 510 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
generated = String[]
|
| 512 |
|
| 513 |
for _ in 1:max_tokens
|
|
@@ -518,7 +539,7 @@ function generate_streaming(config::ModelConfig, ps,
|
|
| 518 |
end
|
| 519 |
|
| 520 |
x = reshape(ctx, :, 1)
|
| 521 |
-
logits = model_forward(config, ps, x)
|
| 522 |
next_logits = Vector{Float32}(logits[:, end, 1])
|
| 523 |
|
| 524 |
if temperature != 1.0
|
|
|
|
| 379 |
end
|
| 380 |
|
| 381 |
# ───────────────────────────────────────────────────────────────────
|
| 382 |
+
# Pre-compute inference caches (Monarch matrices + causal mask)
|
| 383 |
# ───────────────────────────────────────────────────────────────────
|
| 384 |
|
| 385 |
+
"""
|
| 386 |
+
precompute_inference_caches(config, ps) -> NamedTuple
|
| 387 |
+
|
| 388 |
+
Pre-realize Monarch matrices and apply causal mask once at startup.
|
| 389 |
+
Avoids recomputing them on every forward pass during generation.
|
| 390 |
+
"""
|
| 391 |
+
function precompute_inference_caches(config::ModelConfig, ps)
|
| 392 |
+
p = isqrt(config.context_length)
|
| 393 |
+
mask = make_causal_mask(config.context_length)
|
| 394 |
+
|
| 395 |
+
# Pre-realize Monarch matrix per layer (single-head): monarch_ms[layer] = masked T×T
|
| 396 |
+
monarch_ms = Vector{Matrix{Float32}}(undef, config.n_layers)
|
| 397 |
+
for i in 1:config.n_layers
|
| 398 |
+
name = Symbol("block_$i")
|
| 399 |
+
bp = getproperty(ps.blocks, name)
|
| 400 |
+
M = monarch_realize(bp.seq_mixer.monarch.L1, bp.seq_mixer.monarch.L2, p) .* mask
|
| 401 |
+
monarch_ms[i] = M
|
| 402 |
+
end
|
| 403 |
+
|
| 404 |
+
return (; mask, monarch_ms)
|
| 405 |
+
end
|
| 406 |
+
|
| 407 |
+
# ───────────────────────────────────────────────────────────────────
|
| 408 |
+
# Symbiogenesis Sequence Mixer β 3 organelles + gate (uses caches)
|
| 409 |
+
# ───────────────────────────────────────────────────────────────────
|
| 410 |
+
|
| 411 |
+
function symbio_sequence_mixer_forward(x, ps, monarch_M)
|
| 412 |
D, T, B = size(x)
|
|
|
|
| 413 |
|
| 414 |
# ── Organelle 1: CausalConv (local n-gram patterns) ──
|
| 415 |
conv_out = causal_depthwise_conv1d(x, ps.conv.kernel)
|
| 416 |
|
| 417 |
# ── Organelle 2: MonarchMatrix (global structured mixing, single-head) ──
|
| 418 |
+
# Use pre-realized + masked matrix, slice to actual T
|
| 419 |
+
M_t = monarch_M[1:T, 1:T]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
|
| 421 |
# Single-head: apply Monarch to ALL channels at once
|
|
|
|
| 422 |
x_seq = reshape(permutedims(x, (2, 1, 3)), T, D * B) # (T, D*B)
|
| 423 |
y_monarch = M_t * x_seq # (T, D*B)
|
| 424 |
monarch_out = permutedims(reshape(y_monarch, T, D, B), (2, 1, 3)) # (D, T, B)
|
|
|
|
| 433 |
end
|
| 434 |
|
| 435 |
# ───────────────────────────────────────────────────────────────────
|
| 436 |
+
# Full model forward pass (uses cached data)
|
| 437 |
# ───────────────────────────────────────────────────────────────────
|
| 438 |
|
| 439 |
+
function model_forward(config::ModelConfig, ps, x, caches)
|
| 440 |
T = size(x, 1) # x: (seq_len, batch) of integer token IDs
|
| 441 |
|
| 442 |
# Token embedding: (seq_len, batch) → (embed_dim, seq_len, batch)
|
| 443 |
h = ps.tok_emb.weight[:, x]
|
| 444 |
|
|
|
|
|
|
|
|
|
|
| 445 |
# Symbiogenesis blocks
|
| 446 |
for i in 1:config.n_layers
|
| 447 |
name = Symbol("block_$i")
|
|
|
|
| 450 |
# Pre-norm sequence mixing + residual
|
| 451 |
normed = rmsnorm_forward(h, bp.ln1.weight)
|
| 452 |
mixed = symbio_sequence_mixer_forward(normed, bp.seq_mixer,
|
| 453 |
+
caches.monarch_ms[i])
|
| 454 |
h = h .+ mixed
|
| 455 |
|
| 456 |
# Pre-norm FFN + residual
|
|
|
|
| 517 |
temperature::Float64=0.8,
|
| 518 |
top_k::Int=0,
|
| 519 |
top_p::Float64=1.0,
|
| 520 |
+
on_token=nothing,
|
| 521 |
+
caches=nothing)
|
| 522 |
tokens = encode(tokenizer, prompt)
|
| 523 |
if isempty(tokens)
|
| 524 |
tokens = [rand(1:tokenizer_vocab_size(tokenizer))]
|
| 525 |
end
|
| 526 |
|
| 527 |
+
# Use provided caches or compute them once
|
| 528 |
+
if caches === nothing
|
| 529 |
+
caches = precompute_inference_caches(config, ps)
|
| 530 |
+
end
|
| 531 |
+
|
| 532 |
generated = String[]
|
| 533 |
|
| 534 |
for _ in 1:max_tokens
|
|
|
|
| 539 |
end
|
| 540 |
|
| 541 |
x = reshape(ctx, :, 1)
|
| 542 |
+
logits = model_forward(config, ps, x, caches)
|
| 543 |
next_logits = Vector{Float32}(logits[:, end, 1])
|
| 544 |
|
| 545 |
if temperature != 1.0
|
server.jl
CHANGED
|
@@ -73,6 +73,7 @@ const INF_MODEL = load_inference_model(CKPT_PATH, CONFIG_PATH, VOCAB_PATH, MERGE
|
|
| 73 |
const CONFIG = INF_MODEL.config
|
| 74 |
const PS = INF_MODEL.ps
|
| 75 |
const TOKENIZER = INF_MODEL.tokenizer
|
|
|
|
| 76 |
const MODEL_CREATED_AT = Int(floor(time()))
|
| 77 |
|
| 78 |
println("\nModel ready: arch=$(CONFIG.arch), vocab=$(CONFIG.vocab_size), embd=$(CONFIG.embed_dim), " *
|
|
@@ -225,6 +226,7 @@ function handle_request(request::HTTP.Request)
|
|
| 225 |
|
| 226 |
generate_streaming(CONFIG, PS, TOKENIZER, prompt_text;
|
| 227 |
max_tokens, temperature, top_k=top_k_val, top_p=top_p_val,
|
|
|
|
| 228 |
on_token = function(token_str)
|
| 229 |
token_count[] += 1
|
| 230 |
chunk = Dict(
|
|
@@ -277,7 +279,8 @@ function handle_request(request::HTTP.Request)
|
|
| 277 |
total_completion_tokens = 0
|
| 278 |
for i in 1:n_completions
|
| 279 |
text = generate_streaming(CONFIG, PS, TOKENIZER, prompt_text;
|
| 280 |
-
max_tokens, temperature, top_k=top_k_val, top_p=top_p_val
|
|
|
|
| 281 |
finish_reason = "length" # generate_streaming always produces exactly max_tokens tokens
|
| 282 |
push!(choices, Dict(
|
| 283 |
"index" => i - 1,
|
|
|
|
| 73 |
const CONFIG = INF_MODEL.config
|
| 74 |
const PS = INF_MODEL.ps
|
| 75 |
const TOKENIZER = INF_MODEL.tokenizer
|
| 76 |
+
const CACHES = INF_MODEL.caches
|
| 77 |
const MODEL_CREATED_AT = Int(floor(time()))
|
| 78 |
|
| 79 |
println("\nModel ready: arch=$(CONFIG.arch), vocab=$(CONFIG.vocab_size), embd=$(CONFIG.embed_dim), " *
|
|
|
|
| 226 |
|
| 227 |
generate_streaming(CONFIG, PS, TOKENIZER, prompt_text;
|
| 228 |
max_tokens, temperature, top_k=top_k_val, top_p=top_p_val,
|
| 229 |
+
caches=CACHES,
|
| 230 |
on_token = function(token_str)
|
| 231 |
token_count[] += 1
|
| 232 |
chunk = Dict(
|
|
|
|
| 279 |
total_completion_tokens = 0
|
| 280 |
for i in 1:n_completions
|
| 281 |
text = generate_streaming(CONFIG, PS, TOKENIZER, prompt_text;
|
| 282 |
+
max_tokens, temperature, top_k=top_k_val, top_p=top_p_val,
|
| 283 |
+
caches=CACHES)
|
| 284 |
finish_reason = "length" # generate_streaming always produces exactly max_tokens tokens
|
| 285 |
push!(choices, Dict(
|
| 286 |
"index" => i - 1,
|