LisaMegaWatts committed on
Commit 0624a08 · verified · 1 Parent(s): 45c0e0a

Initial space setup: Distilled LLaMA-style OpenAI-compatible server

Files changed (6)
  1. Dockerfile +35 -0
  2. Project.toml +7 -0
  3. README.md +45 -5
  4. checkpoint.jl +222 -0
  5. model.jl +290 -0
  6. server.jl +312 -0
Dockerfile ADDED
@@ -0,0 +1,35 @@
+ FROM julia:1.10-bookworm
+
+ # HuggingFace Spaces requires user ID 1000
+ RUN useradd -m -u 1000 user
+
+ # Shared Julia depot for package caching
+ ENV JULIA_DEPOT_PATH=/opt/julia-depot
+ RUN mkdir -p /opt/julia-depot && chmod 777 /opt/julia-depot
+
+ # Copy project file first for dependency caching
+ COPY --chown=user Project.toml /home/user/app/
+
+ # Install and precompile Julia packages
+ RUN julia --project=/home/user/app -e ' \
+     using Pkg; \
+     Pkg.instantiate(); \
+     Pkg.precompile(); \
+     println("Precompile done")'
+
+ # Copy application code
+ COPY --chown=user model.jl /home/user/app/
+ COPY --chown=user checkpoint.jl /home/user/app/
+ COPY --chown=user server.jl /home/user/app/
+
+ # Create checkpoints directory (model downloads from HF at runtime)
+ RUN mkdir -p /home/user/app/checkpoints && chown user:user /home/user/app/checkpoints
+
+ # Switch to non-root user
+ USER user
+ ENV HOME=/home/user
+ WORKDIR /home/user/app
+
+ EXPOSE 7860
+
+ CMD ["julia", "--project=/home/user/app", "/home/user/app/server.jl"]
Project.toml ADDED
@@ -0,0 +1,7 @@
+ [deps]
+ Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
+ Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+ HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
+ JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
+ JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
+ NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
README.md CHANGED
@@ -1,10 +1,50 @@
  ---
- title: JuliaGPTDistill Space
- emoji: 👍
- colorFrom: pink
- colorTo: purple
+ title: JuliaGPTDistill
+ emoji: "🧬"
+ colorFrom: green
+ colorTo: blue
  sdk: docker
+ app_port: 7860
  pinned: false
+ license: mit
+ tags:
+ - julia
+ - flux-jl
+ - llama-style
+ - rope
+ - swiglu
+ - gqa
+ - rmsnorm
+ - bpe
+ - distillation
+ - philosophy
+ - openai-compatible
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # JuliaGPTDistill Space
+
+ Distilled LLaMA-style decoder model (256d, 4L, 4Q/2KV) trained via knowledge distillation from JuliaFluxGPT. BPE tokenizer (2000 tokens). Trained on classical philosophy and mathematics.
+
+ ## Endpoints
+
+ - `GET /` — Health check and model info
+ - `GET /v1/models` — List available models
+ - `POST /v1/chat/completions` — Generate text (supports streaming, top-k, top-p)
+
+ ## Usage
+
+ ```bash
+ curl -X POST https://LisaMegaWatts-JuliaGPTDistill-space.hf.space/v1/chat/completions \
+   -H "Content-Type: application/json" \
+   -d '{"messages": [{"role": "user", "content": "the nature of"}], "max_tokens": 200}'
+ ```
+
+ ## Architecture
+
+ - **Model**: 256d embed, 4 layers, 4Q/2KV heads (GQA), ~1.5M params
+ - **Tokenizer**: BPE (2000 tokens)
+ - **Normalization**: RMSNorm (pre-norm)
+ - **Feed-forward**: SwiGLU activation
+ - **Weight tying**: Shared embedding/output projection
+ - **Training**: Knowledge distillation from JuliaFluxGPT (10M params)
+ - **Framework**: Flux.jl
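The Usage section above shows curl; the same request can be made from Julia with the HTTP.jl and JSON3 packages already listed in Project.toml. A minimal sketch (non-streaming; the Space URL is the one given in the Usage section):

```julia
using HTTP, JSON3

# POST a chat completion request and print the generated text.
resp = HTTP.post(
    "https://LisaMegaWatts-JuliaGPTDistill-space.hf.space/v1/chat/completions",
    ["Content-Type" => "application/json"],
    JSON3.write(Dict(
        "messages"    => [Dict("role" => "user", "content" => "the nature of")],
        "max_tokens"  => 200,
        "temperature" => 0.8)))

body = JSON3.read(String(resp.body))
println(body.choices[1].message.content)
```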
checkpoint.jl ADDED
@@ -0,0 +1,222 @@
+ #=
+ checkpoint.jl — Load Flux model checkpoints for JuliaFluxGPT
+
+ Loads JLD2 checkpoints saved by the juliaflux_v2 training notebook.
+ Supports BPE tokenizer (tokenizer.json format) with character-level fallback.
+
+ NOTE: The GPT struct no longer has TiedDense — weight tying is done in the
+ forward pass. This simplifies checkpoint loading: we load all components
+ normally and skip any lm_head key in the checkpoint (it's redundant since
+ the output projection uses wte.weight directly).
+ =#
+
+ include("model.jl")
+ using JLD2
+ using JSON3
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # BPE Tokenizer (loaded from tokenizer.json — HuggingFace format)
+ # ═══════════════════════════════════════════════════════════════════════════════
+
+ struct BPETokenizer
+     vocab::Dict{String, Int}
+     id_to_token::Dict{Int, String}
+     merges::Vector{Tuple{String, String}}
+     merge_rank::Dict{Tuple{String, String}, Int}
+     byte_to_unicode::Dict{UInt8, String}
+     unicode_to_byte::Dict{Char, UInt8}
+     word_cache::Dict{String, Vector{Int}}
+     gpt2_pattern::Regex
+ end
+
+ function build_byte_to_unicode()
+     bs = UInt8[]
+     cs = Char[]
+     for r in [0x21:0x7e, 0xa1:0xac, 0xae:0xff]
+         for b in r
+             push!(bs, b)
+             push!(cs, Char(b))
+         end
+     end
+     n = 0
+     for b in 0x00:0xff
+         if b ∉ bs
+             push!(bs, b)
+             push!(cs, Char(256 + n))
+             n += 1
+         end
+     end
+     b2u = Dict(bs[i] => string(cs[i]) for i in eachindex(bs))
+     u2b = Dict(v[1] => k for (k, v) in b2u)
+     return b2u, u2b
+ end
+
+ function load_bpe_tokenizer(path::String)
+     tok_json = JSON3.read(read(path, String))
+
+     vocab = Dict{String, Int}()
+     for (tok_str, id) in pairs(tok_json.model.vocab)
+         vocab[string(tok_str)] = Int(id) + 1  # +1 for Julia 1-indexing
+     end
+
+     merges = Tuple{String, String}[]
+     for merge_entry in tok_json.model.merges
+         if merge_entry isa AbstractVector && length(merge_entry) >= 2
+             push!(merges, (String(merge_entry[1]), String(merge_entry[2])))
+         else
+             parts = split(string(merge_entry), " ", limit=2)
+             if length(parts) == 2
+                 push!(merges, (String(parts[1]), String(parts[2])))
+             end
+         end
+     end
+
+     id_to_token = Dict{Int, String}(id => tok for (tok, id) in vocab)
+     merge_rank = Dict{Tuple{String, String}, Int}(
+         (a, b) => i for (i, (a, b)) in enumerate(merges)
+     )
+     b2u, u2b = build_byte_to_unicode()
+     gpt2_pat = r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
+
+     BPETokenizer(vocab, id_to_token, merges, merge_rank, b2u, u2b,
+                  Dict{String, Vector{Int}}(), gpt2_pat)
+ end
+
+ function bpe_encode_word(tok::BPETokenizer, word::Vector{String})
+     tokens = copy(word)
+     while length(tokens) >= 2
+         best_rank = typemax(Int)
+         best_pair = ("", "")
+         for i in 1:length(tokens)-1
+             rank = get(tok.merge_rank, (tokens[i], tokens[i+1]), typemax(Int))
+             if rank < best_rank
+                 best_rank = rank
+                 best_pair = (tokens[i], tokens[i+1])
+             end
+         end
+         best_rank == typemax(Int) && break
+         a, b = best_pair
+         new_tokens = String[]
+         i = 1
+         while i <= length(tokens)
+             if i < length(tokens) && tokens[i] == a && tokens[i+1] == b
+                 push!(new_tokens, a * b)
+                 i += 2
+             else
+                 push!(new_tokens, tokens[i])
+                 i += 1
+             end
+         end
+         tokens = new_tokens
+     end
+     return tokens
+ end
+
+ function encode_bpe(tok::BPETokenizer, s::String)
+     ids = Int[]
+     for m in eachmatch(tok.gpt2_pattern, s)
+         word = m.match
+         cached = get(tok.word_cache, word, nothing)
+         if cached !== nothing
+             append!(ids, cached)
+         else
+             word_bytes = Vector{UInt8}(word)
+             chars = [tok.byte_to_unicode[b] for b in word_bytes]
+             tokens = bpe_encode_word(tok, chars)
+             word_ids = Int[]
+             for t in tokens
+                 id = get(tok.vocab, t, nothing)
+                 id !== nothing && push!(word_ids, id)
+             end
+             tok.word_cache[word] = word_ids
+             append!(ids, word_ids)
+         end
+     end
+     return ids
+ end
+
+ function decode_bpe(tok::BPETokenizer, ids::Vector{Int})
+     text = join(get(tok.id_to_token, id, "") for id in ids)
+     bytes = UInt8[tok.unicode_to_byte[c] for c in text if haskey(tok.unicode_to_byte, c)]
+     return String(bytes)
+ end
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # Checkpoint loading
+ # ═══════════════════════════════════════════════════════════════════════════════
+
+ function load_flux_checkpoint(checkpoint_path::String; tokenizer_path::String="")
+     println("Loading checkpoint from $checkpoint_path ...")
+     data = JLD2.load(checkpoint_path)
+
+     hp = data["hyperparams"]
+     vocab_size = Int(hp["vocab_size"])
+     n_embd = Int(hp["n_embd"])
+     block_size = Int(hp["block_size"])
+     n_layer = Int(hp["n_layer"])
+     n_head = Int(hp["n_head"])
+     n_kv_head = Int(get(hp, "n_kv_head", hp["n_head"]))
+     dropout_val = Float64(get(hp, "dropout", 0.0))
+
+     model = GPT(;
+         vocab_size = vocab_size,
+         n_embd = n_embd,
+         block_size = block_size,
+         n_layer = n_layer,
+         n_head = n_head,
+         n_kv_head = n_kv_head,
+         dropout = 0.0  # No dropout at inference
+     )
+
+     # Load weights component-by-component
+     ms = data["model_state"]
+     Flux.loadmodel!(model.wte, ms[:wte])
+     Flux.loadmodel!(model.drop, ms[:drop])
+     Flux.loadmodel!(model.blocks, ms[:blocks])
+     Flux.loadmodel!(model.ln_f, ms[:ln_f])
+
+     # Set to test mode (disables dropout)
+     Flux.testmode!(model)
+
+     step = get(data, "step", 0)
+     best_val = get(data, "best_val_loss", Inf)
+
+     println(" Model loaded: vocab=$vocab_size, embd=$n_embd, layers=$n_layer, " *
+             "heads=$(n_head)Q/$(n_kv_head)KV, block=$block_size")
+     println(" Step=$step, best_val=$(round(best_val, digits=4))")
+
+     # Load tokenizer
+     encode_fn = nothing
+     decode_fn = nothing
+
+     if !isempty(tokenizer_path) && isfile(tokenizer_path)
+         println(" Loading BPE tokenizer from $tokenizer_path")
+         bpe = load_bpe_tokenizer(tokenizer_path)
+         tok_vocab_size = length(bpe.vocab)
+
+         if tok_vocab_size != vocab_size
+             @warn "Vocab mismatch! Model expects vocab_size=$vocab_size but tokenizer has $tok_vocab_size tokens. " *
+                   "Token IDs above $vocab_size will be clamped."
+         end
+
+         encode_fn = function(s)
+             ids = encode_bpe(bpe, s)
+             return [clamp(id, 1, vocab_size) for id in ids]
+         end
+         decode_fn = ids -> decode_bpe(bpe, ids)
+         println(" BPE tokenizer loaded: $(tok_vocab_size) tokens (model vocab: $vocab_size)")
+     else
+         # Character-level fallback
+         chars = vcat(collect('a':'z'), [' ', '.'])
+         stoi = Dict(c => i for (i, c) in enumerate(chars))
+         itos = Dict(i => c for (i, c) in enumerate(chars))
+         encode_fn = s -> [get(stoi, c, 1) for c in s]
+         decode_fn = ids -> join(get(itos, id, '?') for id in ids)
+         println(" No tokenizer.json found, using character-level fallback ($(length(chars)) chars)")
+     end
+
+     return (;
+         model, vocab_size, n_embd, block_size, n_layer, n_head, n_kv_head,
+         step, best_val, encode_fn, decode_fn
+     )
+ end
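As a quick sanity check of the tokenizer utilities above, the encode/decode pair should round-trip ordinary text. A small sketch, assuming tokenizer.json has already been downloaded to checkpoints/ (the path server.jl uses):

```julia
include("checkpoint.jl")

# Round-trip a phrase through the byte-level BPE defined above.
tok = load_bpe_tokenizer("checkpoints/tokenizer.json")
ids = encode_bpe(tok, "the nature of the good")
println(ids)                   # 1-indexed token ids
println(decode_bpe(tok, ids))  # should print the original phrase
```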
model.jl ADDED
@@ -0,0 +1,290 @@
+ #=
+ model.jl — LLaMA-style GPT model in Flux.jl for JuliaFluxGPT
+
+ Contains: RMSNorm, SwiGLU, CausalSelfAttention (GQA + RoPE),
+ TransformerBlock, GPT, and generation utilities.
+
+ Same architecture as juliaflux_v2.ipynb — extracted for inference serving.
+
+ NOTE: Weight tying is done by computing the output projection directly using
+ m.wte.weight in the forward pass. This matches the training notebooks and
+ ensures Flux.loadmodel! works without needing to skip lm_head.
+ =#
+
+ using Flux
+ using NNlib
+ using NNlib: batched_mul
+ using Statistics
+ using Random
+ using LinearAlgebra
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # RoPE — Rotary Positional Embeddings
+ # ═══════════════════════════════════════════════════════════════════════════════
+
+ function precompute_rope_freqs(head_dim::Int, max_seq_len::Int; base::Float32 = 10000.0f0)
+     half_dim = head_dim ÷ 2
+     freqs = Float32[1.0f0 / (base ^ (Float32(2 * (i - 1)) / Float32(head_dim))) for i in 1:half_dim]
+     positions = Float32.(collect(0:max_seq_len-1))
+     angles = freqs * positions'
+     return cos.(angles), sin.(angles)
+ end
+
+ function apply_rope(x, cos_f, sin_f, T::Int)
+     d = size(x, 1) ÷ 2
+     x1 = x[1:d, :, :]
+     x2 = x[d+1:2d, :, :]
+     c = cos_f[:, 1:T]
+     s = sin_f[:, 1:T]
+     return vcat(x1 .* c .- x2 .* s, x1 .* s .+ x2 .* c)
+ end
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # Model components
+ # ═══════════════════════════════════════════════════════════════════════════════
+
+ struct RMSNorm{W <: AbstractVector}
+     weight::W
+     eps::Float32
+ end
+
+ Flux.@layer RMSNorm
+
+ RMSNorm(dim::Int; eps::Float32 = 1.0f-6) = RMSNorm(ones(Float32, dim), eps)
+
+ function (rn::RMSNorm)(x)
+     rms = sqrt.(mean(x .^ 2, dims=1) .+ rn.eps)
+     return (x ./ rms) .* rn.weight
+ end
+
+ struct SwiGLUFFN
+     w_gate::Dense
+     w_up::Dense
+     w_down::Dense
+     drop::Dropout
+ end
+
+ Flux.@layer SwiGLUFFN
+
+ function SwiGLUFFN(n_embd::Int; bias=false, dropout=0.0)
+     raw_inner = Int(floor(4 * n_embd * 2 / 3))
+     inner_dim = max(64, 64 * div(raw_inner + 32, 64))
+     SwiGLUFFN(
+         Dense(n_embd => inner_dim; bias),
+         Dense(n_embd => inner_dim; bias),
+         Dense(inner_dim => n_embd; bias),
+         Dropout(dropout)
+     )
+ end
+
+ function (ff::SwiGLUFFN)(x)
+     ff.drop(ff.w_down(NNlib.swish(ff.w_gate(x)) .* ff.w_up(x)))
+ end
+
+ struct CausalSelfAttention
+     wq::Dense
+     wkv::Dense
+     proj::Dense
+     n_head::Int
+     n_kv_head::Int
+ end
+
+ Flux.@layer CausalSelfAttention trainable=(wq, wkv, proj)
+
+ function CausalSelfAttention(n_embd::Int, n_head::Int, n_kv_head::Int; bias=false)
+     head_dim = n_embd ÷ n_head
+     kv_dim = head_dim * n_kv_head
+     CausalSelfAttention(
+         Dense(n_embd => n_embd; bias),
+         Dense(n_embd => 2 * kv_dim; bias),
+         Dense(n_embd => n_embd; bias),
+         n_head,
+         n_kv_head
+     )
+ end
+
+ function (attn::CausalSelfAttention)(x, causal_mask, rope_cos, rope_sin)
+     C, T, B = size(x)
+     nh = attn.n_head
+     nkv = attn.n_kv_head
+     hs = C ÷ nh
+     kv_dim = hs * nkv
+     groups = nh ÷ nkv
+
+     q = attn.wq(x)
+     kv = attn.wkv(x)
+     k = kv[1:kv_dim, :, :]
+     v = kv[kv_dim+1:2*kv_dim, :, :]
+
+     q = reshape(permutedims(reshape(q, hs, nh, T, B), (1, 3, 2, 4)), hs, T, nh * B)
+     k = reshape(permutedims(reshape(k, hs, nkv, T, B), (1, 3, 2, 4)), hs, T, nkv * B)
+     v = reshape(permutedims(reshape(v, hs, nkv, T, B), (1, 3, 2, 4)), hs, T, nkv * B)
+
+     q = apply_rope(q, rope_cos, rope_sin, T)
+     k = apply_rope(k, rope_cos, rope_sin, T)
+
+     if groups > 1
+         k_4d = reshape(k, hs, T, nkv, B)
+         v_4d = reshape(v, hs, T, nkv, B)
+         k_rep = repeat(reshape(k_4d, hs, T, nkv, 1, B), 1, 1, 1, groups, 1)
+         v_rep = repeat(reshape(v_4d, hs, T, nkv, 1, B), 1, 1, 1, groups, 1)
+         k = reshape(permutedims(k_rep, (1, 2, 4, 3, 5)), hs, T, nh * B)
+         v = reshape(permutedims(v_rep, (1, 2, 4, 3, 5)), hs, T, nh * B)
+     end
+
+     scale = Float32(1 / sqrt(hs))
+     wei = batched_mul(permutedims(q, (2, 1, 3)), k) .* scale
+     wei = wei .+ causal_mask[1:T, 1:T]
+     wei = softmax(wei; dims=2)
+
+     out = batched_mul(v, permutedims(wei, (2, 1, 3)))
+     out = reshape(permutedims(reshape(out, hs, T, nh, B), (1, 3, 2, 4)), C, T, B)
+
+     attn.proj(out)
+ end
+
+ struct TransformerBlock
+     ln1::RMSNorm
+     attn::CausalSelfAttention
+     ln2::RMSNorm
+     ffwd::SwiGLUFFN
+ end
+
+ Flux.@layer TransformerBlock
+
+ function TransformerBlock(n_embd::Int, n_head::Int, n_kv_head::Int; dropout=0.0)
+     TransformerBlock(
+         RMSNorm(n_embd),
+         CausalSelfAttention(n_embd, n_head, n_kv_head),
+         RMSNorm(n_embd),
+         SwiGLUFFN(n_embd; dropout)
+     )
+ end
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # GPT — weight-tied output projection (matches training notebooks)
+ # ═══════════════════════════════════════════════════════════════════════════════
+
+ struct GPT
+     wte::Embedding
+     drop::Dropout
+     blocks::Chain
+     ln_f::RMSNorm
+     # Precomputed constants (not trainable)
+     causal_mask::Matrix{Float32}
+     rope_cos::Matrix{Float32}
+     rope_sin::Matrix{Float32}
+     n_head::Int
+     n_kv_head::Int
+     block_size::Int
+ end
+
+ Flux.@layer GPT trainable=(wte, drop, blocks, ln_f)
+
+ function GPT(; vocab_size, n_embd, block_size, n_layer, n_head, n_kv_head, dropout=0.0)
+     head_dim = n_embd ÷ n_head
+     wte = Embedding(vocab_size => n_embd)
+     causal_mask = triu(fill(typemin(Float32), block_size, block_size), 1)
+     rope_cos, rope_sin = precompute_rope_freqs(head_dim, block_size)
+     GPT(
+         wte,
+         Dropout(dropout),
+         Chain([TransformerBlock(n_embd, n_head, n_kv_head; dropout) for _ in 1:n_layer]...),
+         RMSNorm(n_embd),
+         causal_mask,
+         rope_cos,
+         rope_sin,
+         n_head,
+         n_kv_head,
+         block_size
+     )
+ end
+
+ function (m::GPT)(idx)
+     B, T = size(idx)
+     tok = permutedims(m.wte(idx), (1, 3, 2))  # (C, T, B)
+     x = m.drop(tok)
+     for block in m.blocks
+         x = x .+ block.attn(block.ln1(x), m.causal_mask, m.rope_cos, m.rope_sin)
+         x = x .+ block.ffwd(block.ln2(x))
+     end
+     x = m.ln_f(x)
+     # Weight-tied output projection — same weight as embedding
+     W = m.wte.weight
+     C = size(x, 1)
+     x_flat = reshape(x, C, T * B)
+     out = W' * x_flat
+     reshape(out, size(W, 2), T, B)
+ end
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # Text generation with streaming support
+ # ═══════════════════════════════════════════════════════════════════════════════
+
+ function generate_streaming(model, encode_fn, decode_fn, vocab_size::Int, block_size::Int;
+                             prompt::String="", max_tokens::Int=200, temperature::Float64=0.8,
+                             top_k::Int=40, top_p::Float64=1.0, on_token=nothing)
+     if !isempty(prompt)
+         prompt_ids = encode_fn(prompt)
+         idx = reshape(prompt_ids, 1, :)
+     else
+         idx = reshape([rand(1:vocab_size)], 1, 1)
+     end
+
+     generated_ids = Int[]
+
+     for _ in 1:max_tokens
+         idx_cond = idx[:, max(1, end-block_size+1):end]
+         logits = model(idx_cond)
+         logits_last = Vector{Float32}(logits[:, end, 1])
+
+         # Temperature scaling
+         logits_last ./= Float32(max(temperature, 0.01))
+
+         # Top-k filtering
+         if top_k > 0 && top_k < length(logits_last)
+             threshold = partialsort(logits_last, top_k; rev=true)
+             for i in eachindex(logits_last)
+                 if logits_last[i] < threshold
+                     logits_last[i] = -Inf32
+                 end
+             end
+         end
+
+         # Top-p (nucleus) filtering
+         if top_p < 1.0
+             sorted_indices = sortperm(logits_last; rev=true)
+             sorted_logits = logits_last[sorted_indices]
+             probs_sorted = NNlib.softmax(sorted_logits)
+             cumprobs = cumsum(Array(probs_sorted))
+             cutoff = something(findfirst(>=(Float32(top_p)), cumprobs), length(probs_sorted))
+             for i in (cutoff+1):length(sorted_indices)
+                 logits_last[sorted_indices[i]] = -Inf32
+             end
+         end
+
+         probs = NNlib.softmax(logits_last)
+         probs_cpu = Float64.(probs)
+
+         r = rand()
+         cum = 0.0
+         next_id = length(probs_cpu)
+         for (i, p) in enumerate(probs_cpu)
+             cum += p
+             if r <= cum
+                 next_id = i
+                 break
+             end
+         end
+
+         push!(generated_ids, next_id)
+         idx = hcat(idx, reshape([next_id], 1, 1))
+
+         if on_token !== nothing
+             token_str = decode_fn([next_id])
+             on_token(token_str)
+         end
+     end
+
+     return decode_fn(generated_ids)
+ end
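The forward pass returns logits shaped (vocab, T, B), which generate_streaming relies on when it reads logits[:, end, 1]. A shape check on a freshly initialized model is a cheap way to verify this; the sketch below uses the configuration from the README (256d, 4 layers, 4Q/2KV heads) with an assumed block size of 256 and vocab of 2000:

```julia
include("model.jl")

# Random-weight model with the README's configuration (block_size is assumed here).
m = GPT(; vocab_size=2000, n_embd=256, block_size=256,
          n_layer=4, n_head=4, n_kv_head=2)
Flux.testmode!(m)                      # disable dropout

idx = rand(1:2000, 1, 16)              # (B, T) matrix of token ids
logits = m(idx)
@assert size(logits) == (2000, 16, 1)  # (vocab, T, B), as generate_streaming expects
```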
server.jl ADDED
@@ -0,0 +1,312 @@
+ #=
+ server.jl — OpenAI-compatible inference server for JuliaGPTDistill
+
+ Serves a Flux.jl trained LLaMA-style GPT model (RoPE, GQA, RMSNorm, SwiGLU).
+ Downloads checkpoint and tokenizer from HuggingFace model repo on first run.
+
+ Endpoints:
+   GET  /                    -> health check / API info
+   GET  /v1/models           -> list available models
+   POST /v1/chat/completions -> generate text (OpenAI format, streaming supported)
+ =#
+
+ include("checkpoint.jl")
+ using HTTP
+ using UUIDs
+ using Downloads
+
+ # ═══════════════════════════════════════════════════════════════════
+ # Download artifacts from HuggingFace
+ # ═══════════════════════════════════════════════════════════════════
+
+ const CKPT_DIR = "checkpoints"
+ const CKPT_PATH = joinpath(CKPT_DIR, "best_model.jld2")
+ const TOKENIZER_PATH = joinpath(CKPT_DIR, "tokenizer.json")
+ const HF_REPO = get(ENV, "HF_REPO", "LisaMegaWatts/JuliaGPTDistill")
+ const PORT = parse(Int, get(ENV, "PORT", "7860"))
+
+ function download_from_hf(repo::String, filename::String, local_path::String)
+     url = "https://huggingface.co/$repo/resolve/main/$filename"
+     println("Downloading $url ...")
+     mkpath(dirname(local_path))
+     Downloads.download(url, local_path)
+     sz = round(filesize(local_path) / 1024^2, digits=1)
+     println("  -> $local_path ($sz MB)")
+ end
+
+ function ensure_artifacts()
+     for (localpath, remote) in [(CKPT_PATH, "best_model.jld2"),
+                                 (TOKENIZER_PATH, "tokenizer.json")]
+         if !isfile(localpath)
+             println("No local $remote found, downloading from $HF_REPO ...")
+             try
+                 download_from_hf(HF_REPO, remote, localpath)
+             catch e
+                 println("Download failed for $remote: $e")
+                 println("Place $remote at $localpath manually.")
+                 exit(1)
+             end
+         end
+     end
+ end
+
+ # ═══════════════════════════════════════════════════════════════════
+ # Download and load model
+ # ═══════════════════════════════════════════════════════════════════
+
+ ensure_artifacts()
+
+ println("\nLoading model...")
+ const CKPT = load_flux_checkpoint(CKPT_PATH; tokenizer_path=TOKENIZER_PATH)
+ const MODEL = CKPT.model
+ const VOCAB_SIZE = CKPT.vocab_size
+ const BLOCK_SIZE = CKPT.block_size
+ const ENCODE_FN = CKPT.encode_fn
+ const DECODE_FN = CKPT.decode_fn
+ const MODEL_CREATED_AT = Int(floor(time()))
+
+ println("\nModel ready: vocab=$(VOCAB_SIZE), embd=$(CKPT.n_embd), " *
+         "layers=$(CKPT.n_layer), heads=$(CKPT.n_head)Q/$(CKPT.n_kv_head)KV, " *
+         "block=$(BLOCK_SIZE)")
+
+ # ═══════════════════════════════════════════════════════════════════
+ # HTTP helpers
+ # ═══════════════════════════════════════════════════════════════════
+
+ const CORS_HEADERS = [
+     "Access-Control-Allow-Origin" => "*",
+     "Access-Control-Allow-Methods" => "GET, POST, OPTIONS",
+     "Access-Control-Allow-Headers" => "Content-Type, Authorization",
+ ]
+
+ function json_response(status::Int, body; extra_headers=[])
+     json_bytes = JSON3.write(body)
+     headers = [
+         "Content-Type" => "application/json",
+         CORS_HEADERS...,
+         extra_headers...
+     ]
+     return HTTP.Response(status, headers, json_bytes)
+ end
+
+ function cors_preflight()
+     return HTTP.Response(204, CORS_HEADERS)
+ end
+
+ # ═══════════════════════════════════════════════════════════════════
+ # Extract prompt from OpenAI chat messages
+ # ═══════════════════════════════════════════════════════════════════
+
+ function extract_prompt(messages)
+     if isempty(messages)
+         return ""
+     end
+     for i in length(messages):-1:1
+         role = string(get(messages[i], :role, ""))
+         if role == "user"
+             return string(get(messages[i], :content, ""))
+         end
+     end
+     return string(get(messages[end], :content, ""))
+ end
+
+ # ═══════════════════════════════════════════════════════════════════
+ # SSE helpers
+ # ═══════════════════════════════════════════════════════════════════
+
+ function sse_line(data)
+     return "data: $(JSON3.write(data))\n\n"
+ end
+
+ # ═══════════════════════════════════════════════════════════════════
+ # Request handler
+ # ═══════════════════════════════════════════════════════════════════
+
+ function handle_request(request::HTTP.Request)
+     method = request.method
+     target = request.target
+
+     # CORS preflight
+     if method == "OPTIONS"
+         return cors_preflight()
+     end
+
+     # GET / — health check and model info
+     if method == "GET" && target == "/"
+         return json_response(200, Dict(
+             "name" => "JuliaGPTDistill",
+             "version" => "1.0.0",
+             "description" => "Distilled LLaMA-style GPT in Flux.jl — knowledge distillation from JuliaFluxGPT",
+             "architecture" => "RoPE + SwiGLU + GQA + RMSNorm + weight tying",
+             "model" => Dict(
+                 "vocab_size" => VOCAB_SIZE,
+                 "n_embd" => CKPT.n_embd,
+                 "n_layer" => CKPT.n_layer,
+                 "n_head" => CKPT.n_head,
+                 "n_kv_head" => CKPT.n_kv_head,
+                 "block_size" => BLOCK_SIZE
+             ),
+             "endpoints" => ["/v1/models", "/v1/chat/completions"],
+             "features" => ["streaming", "OpenAI-compatible", "top-k", "top-p"],
+             "compatible_with" => ["OpenAI API", "OpenRouter"]
+         ))
+     end
+
+     # GET /v1/models — list available models
+     if method == "GET" && target == "/v1/models"
+         return json_response(200, Dict(
+             "object" => "list",
+             "data" => [Dict(
+                 "id" => "juliagptdistill-philosophy",
+                 "object" => "model",
+                 "created" => MODEL_CREATED_AT,
+                 "owned_by" => "juliagptdistill"
+             )]
+         ))
+     end
+
+     # POST /v1/chat/completions — generate text
+     if method == "POST" && target == "/v1/chat/completions"
+         local body
+         try
+             body = JSON3.read(String(request.body))
+         catch e
+             return json_response(400, Dict("error" => Dict(
+                 "message" => "Invalid JSON in request body",
+                 "type" => "invalid_request_error",
+                 "code" => "invalid_json")))
+         end
+
+         temperature = Float64(clamp(get(body, :temperature, 0.8), 0.01, 2.0))
+         max_tokens = Int(clamp(get(body, :max_tokens, 200), 1, BLOCK_SIZE))
+         top_k_val = Int(clamp(get(body, :top_k, 40), 0, VOCAB_SIZE))
+         top_p_val = Float64(clamp(get(body, :top_p, 1.0), 0.0, 1.0))
+         stream = Bool(get(body, :stream, false))
+
+         messages = get(body, :messages, [])
+         prompt_text = extract_prompt(messages)
+
+         if stream
+             # ── SSE streaming response (buffered) ──
+             completion_id = "chatcmpl-" * string(uuid4())
+             created = Int(floor(time()))
+
+             buf = IOBuffer()
+
+             # Initial chunk with role
+             initial_chunk = Dict(
+                 "id" => completion_id,
+                 "object" => "chat.completion.chunk",
+                 "created" => created,
+                 "model" => "juliagptdistill-philosophy",
+                 "choices" => [Dict(
+                     "index" => 0,
+                     "delta" => Dict("role" => "assistant", "content" => ""),
+                     "finish_reason" => nothing
+                 )]
+             )
+             write(buf, sse_line(initial_chunk))
+
+             token_count = Ref(0)
+
+             generate_streaming(MODEL, ENCODE_FN, DECODE_FN, VOCAB_SIZE, BLOCK_SIZE;
+                 prompt=prompt_text, max_tokens=max_tokens,
+                 temperature=temperature, top_k=top_k_val, top_p=top_p_val,
+                 on_token = function(token_str)
+                     token_count[] += 1
+                     chunk = Dict(
+                         "id" => completion_id,
+                         "object" => "chat.completion.chunk",
+                         "created" => created,
+                         "model" => "juliagptdistill-philosophy",
+                         "choices" => [Dict(
+                             "index" => 0,
+                             "delta" => Dict("content" => token_str),
+                             "finish_reason" => nothing
+                         )]
+                     )
+                     write(buf, sse_line(chunk))
+                 end)
+
+             # Final chunk with finish_reason
+             prompt_tokens = length(ENCODE_FN(prompt_text))
+             finish_chunk = Dict(
+                 "id" => completion_id,
+                 "object" => "chat.completion.chunk",
+                 "created" => created,
+                 "model" => "juliagptdistill-philosophy",
+                 "choices" => [Dict(
+                     "index" => 0,
+                     "delta" => Dict(),
+                     "finish_reason" => token_count[] >= max_tokens ? "length" : "stop"
+                 )],
+                 "usage" => Dict(
+                     "prompt_tokens" => prompt_tokens,
+                     "completion_tokens" => token_count[],
+                     "total_tokens" => prompt_tokens + token_count[]
+                 )
+             )
+             write(buf, sse_line(finish_chunk))
+             write(buf, "data: [DONE]\n\n")
+
+             sse_body = take!(buf)
+             headers = [
+                 "Content-Type" => "text/event-stream",
+                 "Cache-Control" => "no-cache",
+                 "X-Accel-Buffering" => "no",
+                 CORS_HEADERS...
+             ]
+             return HTTP.Response(200, headers, sse_body)
+
+         else
+             # ── Standard (non-streaming) response ──
+             n_completions = Int(clamp(get(body, :n, 1), 1, 4))
+
+             choices = []
+             total_completion_tokens = 0
+             for i in 1:n_completions
+                 text = generate_streaming(MODEL, ENCODE_FN, DECODE_FN, VOCAB_SIZE, BLOCK_SIZE;
+                     prompt=prompt_text, max_tokens=max_tokens,
+                     temperature=temperature, top_k=top_k_val, top_p=top_p_val)
+                 finish_reason = length(text) >= max_tokens ? "length" : "stop"
+                 push!(choices, Dict(
+                     "index" => i - 1,
+                     "message" => Dict("role" => "assistant", "content" => text),
+                     "finish_reason" => finish_reason))
+                 total_completion_tokens += length(text)
+             end
+
+             prompt_tokens = length(ENCODE_FN(prompt_text))
+             return json_response(200, Dict(
+                 "id" => "chatcmpl-" * string(uuid4()),
+                 "object" => "chat.completion",
+                 "created" => Int(floor(time())),
+                 "model" => "juliagptdistill-philosophy",
+                 "choices" => choices,
+                 "usage" => Dict(
+                     "prompt_tokens" => prompt_tokens,
+                     "completion_tokens" => total_completion_tokens,
+                     "total_tokens" => prompt_tokens + total_completion_tokens),
+                 "system_fingerprint" => "juliagptdistill-flux-v1"))
+         end
+     end
+
+     # 404 fallback
+     return json_response(404, Dict("error" => Dict(
+         "message" => "Not found: $method $target",
+         "type" => "invalid_request_error",
+         "code" => "not_found")))
+ end
+
+ # ═══════════════════════════════════════════════════════════════════
+ # Start server
+ # ═══════════════════════════════════════════════════════════════════
+
+ println("\nJuliaGPTDistill server starting on 0.0.0.0:$PORT ...")
+ println("  GET  http://localhost:$PORT/")
+ println("  GET  http://localhost:$PORT/v1/models")
+ println("  POST http://localhost:$PORT/v1/chat/completions")
+ println("  POST http://localhost:$PORT/v1/chat/completions (stream=true)")
+ println()
+
+ HTTP.serve(handle_request, "0.0.0.0", PORT)
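Once the server is running locally (e.g. `julia --project server.jl`), the streaming path can be exercised with HTTP.jl as well. Because the SSE response is buffered server-side, the whole event stream arrives in a single body; a minimal sketch that reassembles the streamed deltas (localhost and port 7860 match the defaults above):

```julia
using HTTP, JSON3

payload = JSON3.write(Dict(
    "messages"   => [Dict("role" => "user", "content" => "the nature of")],
    "max_tokens" => 50,
    "stream"     => true))

resp = HTTP.post("http://localhost:7860/v1/chat/completions",
                 ["Content-Type" => "application/json"], payload)

# Each SSE event is a "data: {json}" line; [DONE] terminates the stream.
for line in split(String(resp.body), "\n")
    startswith(line, "data: ") || continue
    chunk_json = strip(line[length("data: ")+1:end])
    chunk_json == "[DONE]" && break
    chunk = JSON3.read(chunk_json)
    delta = chunk.choices[1].delta
    haskey(delta, :content) && print(delta.content)
end
println()
```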