Spaces:

LisaMegaWatts
/

MonarchSLM

Running

App Files Community

LisaMegaWatts committed on Feb 27

Commit

76b7110

verified ·

1 Parent(s): f0aedd4

Cache Monarch matrices + causal mask for faster inference

Browse files

Files changed (3) hide show

checkpoint.jl +7 -1
model.jl +49 -24
server.jl +4 -1

checkpoint.jl CHANGED Viewed

@@ -94,5 +94,11 @@ function load_inference_model(ckpt_path::String, config_path::String,
         println("  Adjusted vocab_size to $(config.vocab_size) from embedding weight")
     end
-    return (; config, ps, tokenizer, step, val_loss)
 end

         println("  Adjusted vocab_size to $(config.vocab_size) from embedding weight")
     end
+    # Pre-compute inference caches (Monarch matrices + causal mask)
+    println("Pre-computing inference caches ...")
+    caches = precompute_inference_caches(config, ps)
+    n_cached = config.n_layers * config.n_monarch_heads
+    println("  Cached $n_cached Monarch matrices ($(config.context_length)x$(config.context_length))")
+    return (; config, ps, tokenizer, step, val_loss, caches)
 end

model.jl CHANGED Viewed

@@ -305,31 +305,53 @@ function causal_depthwise_conv1d(x, kernel)
 end
 # ═══════════════════════════════════════════════════════════════════
-# Monarch Sequence Mixer forward pass
 # ═══════════════════════════════════════════════════════════════════
-function monarch_sequence_mixer_forward(x, ps, n_heads::Int, context_length::Int, mask)
     D, T, B = size(x)
     H = n_heads
-    HD = D ÷ H  # channels per head
-    p = isqrt(context_length)
     # 1. Causal depthwise conv for local context
     conv_out = causal_depthwise_conv1d(x, ps.conv.kernel)
-    # 2. Multi-head Monarch mixing for global context
     monarch_slices = map(1:H) do i
-        name = Symbol("head_$i")
-        ps_m = getproperty(ps.monarchs, name)
-        # Realize full T_max × T_max Monarch matrix
-        M = monarch_realize(ps_m.L1, ps_m.L2, p)
-        # Apply causal mask
-        M_causal = M .* mask
-        # Slice to actual sequence length T (for generation where T < context_length)
-        M_t = M_causal[1:T, 1:T]
         # Extract this head's channel slice: (HD, T, B)
         ch_start = (i - 1) * HD + 1
@@ -356,18 +378,15 @@ function monarch_sequence_mixer_forward(x, ps, n_heads::Int, context_length::Int
 end
 # ═══════════════════════════════════════════════════════════════════
-# Full model forward pass
 # ═══════════════════════════════════════════════════════════════════
-function model_forward(config::ModelConfig, ps, x)
     T = size(x, 1)  # x: (seq_len, batch) of integer token IDs
     # Token embedding: (seq_len, batch) → (embed_dim, seq_len, batch)
     h = ps.tok_emb.weight[:, x]
-    # Causal mask (multiplicative 0/1 for Monarch)
-    mask = make_causal_mask(config.context_length)
     # Monarch blocks
     for i in 1:config.n_layers
         name = Symbol("block_$i")
@@ -377,7 +396,7 @@ function model_forward(config::ModelConfig, ps, x)
         normed = rmsnorm_forward(h, bp.ln1.weight)
         mixed = monarch_sequence_mixer_forward(normed, bp.seq_mixer,
                                                 config.n_monarch_heads,
-                                                config.context_length, mask)
         h = h .+ mixed
         # Pre-norm FFN + residual
@@ -444,12 +463,18 @@ function generate_streaming(config::ModelConfig, ps,
                             temperature::Float64=0.8,
                             top_k::Int=0,
                             top_p::Float64=1.0,
-                            on_token=nothing)
     tokens = encode(tokenizer, prompt)
     if isempty(tokens)
         tokens = [rand(1:tokenizer_vocab_size(tokenizer))]
     end
     generated = String[]
     for _ in 1:max_tokens
@@ -460,7 +485,7 @@ function generate_streaming(config::ModelConfig, ps,
         end
         x = reshape(ctx, :, 1)
-        logits = model_forward(config, ps, x)
         next_logits = Vector{Float32}(logits[:, end, 1])
         if temperature != 1.0

 end
 # ═══════════════════════════════════════════════════════════════════
+# Pre-compute inference caches (Monarch matrices + causal mask)
 # ═══════════════════════════════════════════════════════════════════
+"""
+    precompute_inference_caches(config, ps) -> NamedTuple
+Pre-realize all Monarch matrices and apply causal mask once at startup.
+Avoids recomputing them on every forward pass during generation.
+
+# Arguments
+- `config::ModelConfig`: reads `context_length`, `n_layers`, and `n_monarch_heads`.
+- `ps`: parameter tree; reads `L1` and `L2` from
+  `ps.blocks.block_<i>.seq_mixer.monarchs.head_<j>` for every layer `i` and head `j`.
+
+# Returns
+A NamedTuple `(; mask, monarch_ms)`: `mask` is the result of
+`make_causal_mask(config.context_length)`, and `monarch_ms[i][j]` is the realized,
+mask-multiplied matrix for head `j` of block `i`.
+"""
+function precompute_inference_caches(config::ModelConfig, ps)
+    p = isqrt(config.context_length)  # integer sqrt, handed to monarch_realize; NOTE(review): presumably requires a perfect-square context_length — confirm
+    mask = make_causal_mask(config.context_length)  # built once and reused for every head below
+    # Pre-realize all Monarch matrices: monarch_ms[layer][head] = masked T×T matrix
+    monarch_ms = Vector{Vector{Matrix{Float32}}}(undef, config.n_layers)  # outer index = layer; every slot is assigned in the loop
+    for i in 1:config.n_layers
+        name = Symbol("block_$i")  # blocks are looked up by generated name, 1-based
+        bp = getproperty(ps.blocks, name)
+        layer_ms = Vector{Matrix{Float32}}(undef, config.n_monarch_heads)  # inner index = head
+        for j in 1:config.n_monarch_heads
+            head_name = Symbol("head_$j")  # heads are looked up by generated name, 1-based
+            ps_m = getproperty(bp.seq_mixer.monarchs, head_name)
+            M = monarch_realize(ps_m.L1, ps_m.L2, p) .* mask  # elementwise multiply by the causal mask
+            layer_ms[j] = M  # NOTE(review): stored in a Matrix{Float32} slot, so a non-Float32 M would be converted here — confirm weights are Float32
+        end
+        monarch_ms[i] = layer_ms
+    end
+    return (; mask, monarch_ms)
+end
+# ═══════════════════════════════════════════════════════════════════
+# Monarch Sequence Mixer forward pass (uses cached matrices)
+# ═══════════════════════════════════════════════════════════════════
+function monarch_sequence_mixer_forward(x, ps, n_heads::Int, monarch_ms_layer)
     D, T, B = size(x)
     H = n_heads
+    HD = D ÷ H
     # 1. Causal depthwise conv for local context
     conv_out = causal_depthwise_conv1d(x, ps.conv.kernel)
+    # 2. Multi-head Monarch mixing (pre-realized matrices)
     monarch_slices = map(1:H) do i
+        # Slice cached matrix to actual sequence length
+        M_t = monarch_ms_layer[i][1:T, 1:T]
         # Extract this head's channel slice: (HD, T, B)
         ch_start = (i - 1) * HD + 1
 end
 # ═══════════════════════════════════════════════════════════════════
+# Full model forward pass (uses cached data)
 # ═══════════════════════════════════════════════════════════════════
+function model_forward(config::ModelConfig, ps, x, caches)
     T = size(x, 1)  # x: (seq_len, batch) of integer token IDs
     # Token embedding: (seq_len, batch) → (embed_dim, seq_len, batch)
     h = ps.tok_emb.weight[:, x]
     # Monarch blocks
     for i in 1:config.n_layers
         name = Symbol("block_$i")
         normed = rmsnorm_forward(h, bp.ln1.weight)
         mixed = monarch_sequence_mixer_forward(normed, bp.seq_mixer,
                                                 config.n_monarch_heads,
+                                                caches.monarch_ms[i])
         h = h .+ mixed
         # Pre-norm FFN + residual
                             temperature::Float64=0.8,
                             top_k::Int=0,
                             top_p::Float64=1.0,
+                            on_token=nothing,
+                            caches=nothing)
     tokens = encode(tokenizer, prompt)
     if isempty(tokens)
         tokens = [rand(1:tokenizer_vocab_size(tokenizer))]
     end
+    # Use provided caches or compute them once
+    if caches === nothing
+        caches = precompute_inference_caches(config, ps)
+    end
     generated = String[]
     for _ in 1:max_tokens
         end
         x = reshape(ctx, :, 1)
+        logits = model_forward(config, ps, x, caches)
         next_logits = Vector{Float32}(logits[:, end, 1])
         if temperature != 1.0

server.jl CHANGED Viewed

@@ -72,6 +72,7 @@ const INF_MODEL = load_inference_model(CKPT_PATH, CONFIG_PATH, VOCAB_PATH, MERGE
 const CONFIG = INF_MODEL.config
 const PS = INF_MODEL.ps
 const TOKENIZER = INF_MODEL.tokenizer
 const MODEL_CREATED_AT = Int(floor(time()))
 println("\nModel ready: arch=$(CONFIG.arch), vocab=$(CONFIG.vocab_size), embd=$(CONFIG.embed_dim), " *
@@ -217,6 +218,7 @@ function handle_request(request::HTTP.Request)
             generate_streaming(CONFIG, PS, TOKENIZER, prompt_text;
                                max_tokens, temperature, top_k=top_k_val, top_p=top_p_val,
                                on_token = function(token_str)
                                    token_count[] += 1
                                    chunk = Dict(
@@ -269,7 +271,8 @@ function handle_request(request::HTTP.Request)
             total_completion_tokens = 0
             for i in 1:n_completions
                 text = generate_streaming(CONFIG, PS, TOKENIZER, prompt_text;
-                                          max_tokens, temperature, top_k=top_k_val, top_p=top_p_val)
                 finish_reason = "length"  # generate_streaming always produces exactly max_tokens tokens
                 push!(choices, Dict(
                     "index" => i - 1,

 const CONFIG = INF_MODEL.config
 const PS = INF_MODEL.ps
 const TOKENIZER = INF_MODEL.tokenizer
+const CACHES = INF_MODEL.caches
 const MODEL_CREATED_AT = Int(floor(time()))
 println("\nModel ready: arch=$(CONFIG.arch), vocab=$(CONFIG.vocab_size), embd=$(CONFIG.embed_dim), " *
             generate_streaming(CONFIG, PS, TOKENIZER, prompt_text;
                                max_tokens, temperature, top_k=top_k_val, top_p=top_p_val,
+                               caches=CACHES,
                                on_token = function(token_str)
                                    token_count[] += 1
                                    chunk = Dict(
             total_completion_tokens = 0
             for i in 1:n_completions
                 text = generate_streaming(CONFIG, PS, TOKENIZER, prompt_text;
+                                          max_tokens, temperature, top_k=top_k_val, top_p=top_p_val,
+                                          caches=CACHES)
                 finish_reason = "length"  # generate_streaming always produces exactly max_tokens tokens
                 push!(choices, Dict(
                     "index" => i - 1,