DavinciDreams and Claude Opus 4.6 committed on
Commit 492f84f · 1 Parent(s): ba27b12

Switch to Python/FastAPI server (RandyGPT pattern)

Replace Julia HTTP.jl server with FastAPI/uvicorn for reliable
HF Spaces streaming. Loads Flux JLD2 weights via h5py into PyTorch.

- server.py: FastAPI + StreamingResponse, JLD2→PyTorch weight loader
- GPT-2 architecture: LayerNorm, GELU, combined QKV, 6L×384D, 10.7M params
- Downloads best_model.jld2 + vocab.json from HF Hub at startup
- Dockerfile: python:3.11-slim + uvicorn
- Remove Julia files

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
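
A quick usage sketch for the new OpenAI-compatible endpoint (request and response fields come from server.py in this diff; the localhost:7860 address is an assumption for a locally running container):

    # Hypothetical local smoke test; requires `pip install requests`.
    import requests

    r = requests.post(
        "http://localhost:7860/v1/chat/completions",
        json={"messages": [{"role": "user", "content": "what is the good life"}],
              "max_tokens": 150, "temperature": 0.3},
    )
    print(r.json()["choices"][0]["message"]["content"])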

Files changed (8)
  1. Dockerfile +6 -28
  2. Project.toml +0 -7
  3. checkpoint.jl +0 -53
  4. checkpoints/best_model.json +0 -0
  5. model.jl +0 -188
  6. requirements.txt +6 -0
  7. server.jl +0 -236
  8. server.py +429 -0
Dockerfile CHANGED
@@ -1,38 +1,16 @@
-FROM julia:1.10-bookworm
+FROM python:3.11-slim
 
-# HuggingFace Spaces requires user ID 1000
 RUN useradd -m -u 1000 user
 
-# Shared Julia depot for package caching
-ENV JULIA_DEPOT_PATH=/opt/julia-depot
-RUN mkdir -p /opt/julia-depot && chmod 777 /opt/julia-depot
-
-# Copy project file first for dependency caching
-COPY --chown=user Project.toml /home/user/app/
-
-# Install and precompile Julia packages (Flux + JLD2 + HTTP)
-RUN julia --project=/home/user/app -e ' \
-    using Pkg; \
-    Pkg.instantiate(); \
-    Pkg.precompile(); \
-    println("Precompile done")'
-
-# Copy application code
-COPY --chown=user model.jl /home/user/app/
-COPY --chown=user checkpoint.jl /home/user/app/
-COPY --chown=user server.jl /home/user/app/
-
-# Create checkpoints directory (model downloads from HF at runtime)
-RUN mkdir -p /home/user/app/checkpoints && chown user:user /home/user/app/checkpoints
-
-# If a local checkpoint or vocab is provided, copy it
-COPY --chown=user checkpoints/ /home/user/app/checkpoints/
-
-# Switch to non-root user
+WORKDIR /home/user/app
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY --chown=user server.py .
+
 USER user
 ENV HOME=/home/user
-WORKDIR /home/user/app
 
 EXPOSE 7860
 
-CMD ["julia", "--project=/home/user/app", "/home/user/app/server.jl"]
+CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "7860"]
Project.toml DELETED
@@ -1,7 +0,0 @@
-[deps]
-Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
-Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
-JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
-JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
-NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
checkpoint.jl DELETED
@@ -1,53 +0,0 @@
-#=
-checkpoint.jl — Load JLD2 Flux checkpoint for JuliaGPT Space
-
-Loads model_state + hyperparams from JLD2, creates the Flux GPT model,
-and returns everything needed for inference.
-=#
-
-include("model.jl")
-using JLD2
-using JSON3
-
-function load_flux_checkpoint(path::String)
-    println("Loading Flux checkpoint from $path ...")
-    data = JLD2.load(path)
-
-    hp = data["hyperparams"]
-    vocab_size = Int(hp["vocab_size"])
-    n_embd = Int(hp["n_embd"])
-    block_size = Int(hp["block_size"])
-    n_layer = Int(hp["n_layer"])
-    n_head = Int(hp["n_head"])
-    dropout = Float64(get(hp, "dropout", 0.0))
-
-    init_model_constants!(block_size)
-
-    m = GPT(; vocab_size, n_embd, block_size, n_layer, n_head, dropout=0.0)
-    Flux.loadmodel!(m, data["model_state"])
-    Flux.testmode!(m)
-
-    step = get(data, "step", 0)
-    best_val = get(data, "best_val_loss", Inf)
-    println("  vocab=$vocab_size, embd=$n_embd, layers=$n_layer, heads=$n_head, block=$block_size")
-    println("  step=$step, best_val_loss=$(round(best_val, digits=4))")
-
-    return (; model=m, vocab_size, n_embd, block_size, n_layer, n_head, step, best_val_loss=best_val)
-end
-
-function load_char_vocab(path::String)
-    if !isfile(path)
-        return nothing
-    end
-    raw = JSON3.read(read(path, String))
-    uchars = [only(String(s)) for s in raw]
-    stoi = Dict(c => i for (i, c) in enumerate(uchars))
-    itos = Dict(i => c for (i, c) in enumerate(uchars))
-    return (; uchars, stoi, itos, vocab_size=length(uchars))
-end
-
-function save_char_vocab(path::String, chars::Vector{Char})
-    open(path, "w") do f
-        JSON3.write(f, [string(c) for c in chars])
-    end
-end
checkpoints/best_model.json DELETED
The diff for this file is too large to render. See raw diff
 
model.jl DELETED
@@ -1,188 +0,0 @@
-#=
-model.jl — JuliaFlux v1 GPT architecture for JuliaGPT Space
-
-GPT-2 style: LayerNorm, GELU, standard MHA, learned position embeddings.
-Matches juliaflux.ipynb training notebook exactly so Flux.loadmodel! works.
-CPU-only inference.
-=#
-
-using Flux
-using NNlib
-using NNlib: batched_mul
-using Statistics
-using LinearAlgebra
-
-# ═══════════════════════════════════════════════════════════════════
-# Pre-computed constants — set by init_model_constants!()
-# ═══════════════════════════════════════════════════════════════════
-
-CAUSAL_MASK = Matrix{Float32}(undef, 0, 0)
-POS_RANGE = Vector{Int32}()
-
-function init_model_constants!(block_size::Int)
-    global CAUSAL_MASK = triu(fill(typemin(Float32), block_size, block_size), 1)
-    global POS_RANGE = collect(Int32, 1:block_size)
-    println("  Constants: mask=$(size(CAUSAL_MASK)), pos_range=$(length(POS_RANGE))")
-end
-
-# ═══════════════════════════════════════════════════════════════════
-# Model structs — matches juliaflux.ipynb exactly
-# ═══════════════════════════════════════════════════════════════════
-
-struct CausalSelfAttention
-    qkv::Dense   # n_embd -> 3*n_embd
-    proj::Dense  # n_embd -> n_embd
-    n_head::Int
-end
-Flux.@layer CausalSelfAttention trainable=(qkv, proj)
-
-function CausalSelfAttention(n_embd::Int, n_head::Int; bias=false)
-    CausalSelfAttention(Dense(n_embd => 3 * n_embd; bias), Dense(n_embd => n_embd; bias), n_head)
-end
-
-function (attn::CausalSelfAttention)(x)
-    C, T, B = size(x)
-    hs = C ÷ attn.n_head; nh = attn.n_head
-
-    qkv = attn.qkv(x)
-    q = qkv[1:C, :, :]; k = qkv[C+1:2C, :, :]; v = qkv[2C+1:3C, :, :]
-
-    q = reshape(permutedims(reshape(q, hs, nh, T, B), (1, 3, 2, 4)), hs, T, nh * B)
-    k = reshape(permutedims(reshape(k, hs, nh, T, B), (1, 3, 2, 4)), hs, T, nh * B)
-    v = reshape(permutedims(reshape(v, hs, nh, T, B), (1, 3, 2, 4)), hs, T, nh * B)
-
-    scale = Float32(1 / sqrt(hs))
-    wei = batched_mul(permutedims(q, (2, 1, 3)), k) .* scale
-    mask = CAUSAL_MASK[1:T, 1:T]
-    wei = softmax(wei .+ mask; dims=2)
-
-    out = batched_mul(v, permutedims(wei, (2, 1, 3)))
-    out = reshape(permutedims(reshape(out, hs, T, nh, B), (1, 3, 2, 4)), C, T, B)
-    attn.proj(out)
-end
-
-struct FeedForward
-    net::Chain
-end
-Flux.@layer FeedForward
-function FeedForward(n_embd::Int; bias=false, dropout=0.0)
-    FeedForward(Chain(Dense(n_embd => 4 * n_embd; bias), gelu, Dense(4 * n_embd => n_embd; bias), Dropout(dropout)))
-end
-(ff::FeedForward)(x) = ff.net(x)
-
-struct TransformerBlock
-    ln1::LayerNorm
-    attn::CausalSelfAttention
-    ln2::LayerNorm
-    ffwd::FeedForward
-end
-Flux.@layer TransformerBlock
-function TransformerBlock(n_embd::Int, n_head::Int; dropout=0.0)
-    TransformerBlock(LayerNorm(n_embd), CausalSelfAttention(n_embd, n_head), LayerNorm(n_embd), FeedForward(n_embd; dropout))
-end
-function (block::TransformerBlock)(x)
-    x = x .+ block.attn(block.ln1(x))
-    x = x .+ block.ffwd(block.ln2(x))
-    x
-end
-
-struct GPT
-    wte::Embedding
-    wpe::Embedding
-    drop::Dropout
-    blocks::Chain
-    ln_f::LayerNorm
-    lm_head::Dense
-end
-Flux.@layer GPT
-
-function GPT(; vocab_size, n_embd, block_size, n_layer, n_head, dropout=0.0)
-    GPT(Embedding(vocab_size => n_embd), Embedding(block_size => n_embd), Dropout(dropout),
-        Chain([TransformerBlock(n_embd, n_head; dropout) for _ in 1:n_layer]...),
-        LayerNorm(n_embd), Dense(n_embd => vocab_size; bias=false))
-end
-
-function (m::GPT)(idx)
-    B, T = size(idx)
-    tok = permutedims(m.wte(idx), (1, 3, 2))
-    pos_ids = repeat(reshape(POS_RANGE[1:T], 1, T), B, 1)
-    pos = permutedims(m.wpe(pos_ids), (1, 3, 2))
-    x = m.drop(tok .+ pos)
-    x = m.blocks(x)
-    x = m.ln_f(x)
-    m.lm_head(x)
-end
-
-# ═══════════════════════════════════════════════════════════════════
-# Text generation — top-k sampling with repetition penalty
-# ═══════════════════════════════════════════════════════════════════
-
-function generate_text(model, vocab_size::Int, itos::Dict{Int,Char}, block_size::Int;
-                       seed_ids::Vector{Int}=Int[], max_tokens=200, temperature=0.1,
-                       top_k=8, repetition_penalty=1.3)
-    Flux.testmode!(model)
-
-    # Start from seed (encoded prompt) or random token
-    if !isempty(seed_ids)
-        idx = reshape(seed_ids, 1, length(seed_ids))
-    else
-        idx = reshape([rand(1:vocab_size)], 1, 1)
-    end
-
-    generated = Int[]
-    for _ in 1:max_tokens
-        idx_cond = idx[:, max(1, end-block_size+1):end]
-        logits = model(idx_cond)
-        logits_last = Vector{Float32}(logits[:, end, 1])
-
-        # Repetition penalty: reduce logits for recently generated tokens
-        if repetition_penalty != 1.0
-            seen = Set{Int}()
-            # Penalise tokens from context window
-            for j in max(1, length(generated)-block_size+1):length(generated)
-                push!(seen, generated[j])
-            end
-            for j in 1:size(idx_cond, 2)
-                push!(seen, idx_cond[1, j])
-            end
-            for id in seen
-                if 1 <= id <= length(logits_last)
-                    if logits_last[id] > 0
-                        logits_last[id] /= Float32(repetition_penalty)
-                    else
-                        logits_last[id] *= Float32(repetition_penalty)
-                    end
-                end
-            end
-        end
-
-        # Temperature scaling
-        logits_last ./= Float32(max(temperature, 0.01))
-
-        # Top-k filtering
-        k = min(top_k, length(logits_last))
-        if k < length(logits_last)
-            threshold = partialsort(logits_last, k; rev=true)
-            for i in eachindex(logits_last)
-                if logits_last[i] < threshold
-                    logits_last[i] = typemin(Float32)
-                end
-            end
-        end
-
-        probs = softmax(logits_last)
-        probs_cpu = Float64.(probs)
-
-        # Categorical sample
-        r = rand()
-        cum = 0.0
-        next_id = length(probs_cpu)
-        for (i, p) in enumerate(probs_cpu)
-            cum += p
-            if r <= cum; next_id = i; break; end
-        end
-        push!(generated, next_id)
-        idx = hcat(idx, reshape([next_id], 1, 1))
-    end
-    return join(get(itos, id, '?') for id in generated)
-end
requirements.txt ADDED
@@ -0,0 +1,6 @@
+fastapi>=0.110.0
+uvicorn>=0.29.0
+torch>=2.0.0
+h5py>=3.10.0
+huggingface_hub>=0.20.0
+pydantic>=2.0.0
server.jl DELETED
@@ -1,236 +0,0 @@
-#=
-server.jl — OpenAI-compatible inference server for JuliaGPT
-
-Serves a JuliaFlux v1 GPT model (Flux.jl, GPT-2 style architecture).
-Downloads checkpoint from HuggingFace model repo on first run.
-
-Endpoints:
-  GET  /                    -> health check / API info
-  GET  /v1/models           -> list available models
-  POST /v1/chat/completions -> generate philosophy text (OpenAI format)
-=#
-
-include("checkpoint.jl")
-using HTTP
-using UUIDs
-using Sockets
-using Downloads
-
-# ═══════════════════════════════════════════════════════════════════
-# Download checkpoint from HuggingFace if needed
-# ═══════════════════════════════════════════════════════════════════
-
-const CKPT_DIR = "checkpoints"
-const CKPT_PATH = joinpath(CKPT_DIR, "best_model.jld2")
-const VOCAB_PATH = joinpath(CKPT_DIR, "vocab.json")
-const HF_REPO = get(ENV, "HF_REPO", "LisaMegaWatts/JuliaGPT")
-const PORT = parse(Int, get(ENV, "PORT", "7860"))
-
-function download_from_hf(repo::String, filename::String, local_path::String)
-    url = "https://huggingface.co/$repo/resolve/main/$filename"
-    println("Downloading $url ...")
-    mkpath(dirname(local_path))
-    Downloads.download(url, local_path)
-    sz = round(filesize(local_path) / 1024^2, digits=1)
-    println("  -> $local_path ($sz MB)")
-end
-
-if !isfile(CKPT_PATH)
-    println("No local checkpoint, downloading from $HF_REPO ...")
-    try
-        download_from_hf(HF_REPO, "best_model.jld2", CKPT_PATH)
-    catch e
-        println("Download failed: $e")
-        println("Place a checkpoint at $CKPT_PATH manually.")
-        exit(1)
-    end
-end
-
-# Also download vocab.json if missing
-if !isfile(VOCAB_PATH)
-    println("No local vocab, downloading from $HF_REPO ...")
-    try
-        download_from_hf(HF_REPO, "vocab.json", VOCAB_PATH)
-    catch e
-        println("vocab.json download failed (will use fallback): $e")
-    end
-end
-
-# ═══════════════════════════════════════════════════════════════════
-# Load model
-# ═══════════════════════════════════════════════════════════════════
-
-println("\nLoading model...")
-const CKPT = load_flux_checkpoint(CKPT_PATH)
-const MODEL = CKPT.model
-const VOCAB_SIZE = CKPT.vocab_size
-const BLOCK_SIZE = CKPT.block_size
-
-# Character vocab: try vocab.json, else build from embedded quotes
-const ITOS, STOI = let
-    vdata = load_char_vocab(VOCAB_PATH)
-    if vdata !== nothing && vdata.vocab_size >= VOCAB_SIZE
-        println("Loaded char vocab from $VOCAB_PATH ($(vdata.vocab_size) chars, model expects $VOCAB_SIZE)")
-        vdata.itos, vdata.stoi
-    elseif vdata !== nothing
-        println("WARN: vocab.json has $(vdata.vocab_size) chars but model expects $VOCAB_SIZE, using vocab.json anyway")
-        vdata.itos, vdata.stoi
-    else
-        # Build from training data charset: a-z + space + period (28 chars)
-        chars = sort(collect(Set("abcdefghijklmnopqrstuvwxyz .")))
-        if length(chars) < VOCAB_SIZE
-            # Pad with additional printable chars if model expects more
-            for c in ",0123456789!?;:'-\n"
-                length(chars) >= VOCAB_SIZE && break
-                c in chars || push!(chars, c)
-            end
-            sort!(chars)
-        end
-        chars = chars[1:min(end, VOCAB_SIZE)]
-        itos = Dict(i => c for (i, c) in enumerate(chars))
-        stoi = Dict(c => i for (i, c) in enumerate(chars))
-        println("Built char vocab: $(length(chars)) chars -> [$(join(chars))]")
-        itos, stoi
-    end
-end
-
-const MODEL_CREATED_AT = Int(floor(time()))
-println("\nModel ready: vocab=$VOCAB_SIZE, embd=$(CKPT.n_embd), layers=$(CKPT.n_layer), block=$BLOCK_SIZE")
-
-# ═══════════════════════════════════════════════════════════════════
-# API handlers
-# ═══════════════════════════════════════════════════════════════════
-
-function json_response(status::Int, body)
-    HTTP.Response(status,
-        ["Content-Type" => "application/json",
-         "Access-Control-Allow-Origin" => "*",
-         "Access-Control-Allow-Methods" => "GET, POST, OPTIONS",
-         "Access-Control-Allow-Headers" => "Content-Type, Authorization"],
-        JSON3.write(body))
-end
-
-function handle_root(req::HTTP.Request)
-    json_response(200, Dict(
-        "name" => "JuliaGPT",
-        "version" => "2.0.0",
-        "description" => "A Flux.jl GPT trained on classical philosophy texts",
-        "architecture" => "GPT-2 style (LayerNorm, GELU, MHA)",
-        "model" => Dict(
-            "vocab_size" => VOCAB_SIZE,
-            "n_embd" => CKPT.n_embd,
-            "n_layer" => CKPT.n_layer,
-            "n_head" => CKPT.n_head,
-            "block_size" => BLOCK_SIZE
-        ),
-        "endpoints" => ["/v1/models", "/v1/chat/completions"],
-        "compatible_with" => ["OpenAI API", "OpenRouter"]
-    ))
-end
-
-function handle_models(req::HTTP.Request)
-    json_response(200, Dict(
-        "object" => "list",
-        "data" => [Dict(
-            "id" => "juliagpt-philosophy",
-            "object" => "model",
-            "created" => MODEL_CREATED_AT,
-            "owned_by" => "juliagpt"
-        )]
-    ))
-end
-
-function handle_chat_completions(req::HTTP.Request)
-    local body
-    try
-        body = JSON3.read(String(req.body))
-    catch e
-        return json_response(400, Dict("error" => Dict(
-            "message" => "Invalid JSON in request body",
-            "type" => "invalid_request_error",
-            "code" => "invalid_json")))
-    end
-
-    temperature = Float64(clamp(get(body, :temperature, 0.1), 0.01, 2.0))
-    max_tokens = Int(clamp(get(body, :max_tokens, 200), 1, BLOCK_SIZE))
-    n_completions = Int(clamp(get(body, :n, 1), 1, 4))
-    top_k = Int(clamp(get(body, :top_k, 8), 1, VOCAB_SIZE))
-    rep_penalty = Float64(clamp(get(body, :repetition_penalty, 1.3), 1.0, 3.0))
-
-    messages = get(body, :messages, [])
-    prompt_text = ""
-    if !isempty(messages)
-        prompt_text = string(get(messages[end], :content, ""))
-    end
-
-    # Encode prompt as seed token IDs (char-level)
-    seed_ids = Int[]
-    if !isempty(prompt_text)
-        prompt_lower = lowercase(prompt_text)
-        for c in prompt_lower
-            id = get(STOI, c, nothing)
-            id !== nothing && push!(seed_ids, id)
-        end
-        # Truncate to fit block_size (leave room for generation)
-        if length(seed_ids) > BLOCK_SIZE ÷ 2
-            seed_ids = seed_ids[end - BLOCK_SIZE ÷ 2 + 1:end]
-        end
-    end
-
-    choices = []
-    total_completion_tokens = 0
-    for i in 1:n_completions
-        text = generate_text(MODEL, VOCAB_SIZE, ITOS, BLOCK_SIZE;
-                             seed_ids=seed_ids, max_tokens=max_tokens,
-                             temperature=temperature, top_k=top_k,
-                             repetition_penalty=rep_penalty)
-        finish_reason = length(text) >= max_tokens ? "length" : "stop"
-        push!(choices, Dict(
-            "index" => i - 1,
-            "message" => Dict("role" => "assistant", "content" => text),
-            "finish_reason" => finish_reason))
-        total_completion_tokens += length(text)
-    end
-
-    json_response(200, Dict(
-        "id" => "chatcmpl-" * string(uuid4()),
-        "object" => "chat.completion",
-        "created" => Int(floor(time())),
-        "model" => "juliagpt-philosophy",
-        "choices" => choices,
-        "usage" => Dict(
-            "prompt_tokens" => length(prompt_text),
-            "completion_tokens" => total_completion_tokens,
-            "total_tokens" => length(prompt_text) + total_completion_tokens),
-        "system_fingerprint" => "juliagpt-fluxv1"))
-end
-
-# ═══════════════════════════════════════════════════════════════════
-# Router + CORS
-# ═══════════════════════════════════════════════════════════════════
-
-function cors_preflight(req::HTTP.Request)
-    HTTP.Response(204,
-        ["Access-Control-Allow-Origin" => "*",
-         "Access-Control-Allow-Methods" => "GET, POST, OPTIONS",
-         "Access-Control-Allow-Headers" => "Content-Type, Authorization"])
-end
-
-const ROUTER = HTTP.Router()
-HTTP.register!(ROUTER, "GET", "/", handle_root)
-HTTP.register!(ROUTER, "GET", "/v1/models", handle_models)
-HTTP.register!(ROUTER, "POST", "/v1/chat/completions", handle_chat_completions)
-HTTP.register!(ROUTER, "OPTIONS", "/v1/chat/completions", cors_preflight)
-HTTP.register!(ROUTER, "OPTIONS", "/v1/models", cors_preflight)
-
-# ═══════════════════════════════════════════════════════════════════
-# Start server
-# ═══════════════════════════════════════════════════════════════════
-
-println("\nJuliaGPT server starting on 0.0.0.0:$PORT ...")
-println("  GET  http://localhost:$PORT/")
-println("  GET  http://localhost:$PORT/v1/models")
-println("  POST http://localhost:$PORT/v1/chat/completions")
-println()
-
-HTTP.serve(ROUTER, "0.0.0.0", PORT)
server.py ADDED
@@ -0,0 +1,429 @@
+"""
+server.py — JuliaGPT OpenAI-compatible inference server
+Serves POST /v1/chat/completions (streaming + non-streaming) and GET /v1/models.
+
+Loads the Flux.jl GPT-2 model from best_model.jld2 on HF Hub.
+Architecture: GPT-2 style — LayerNorm, GELU, combined QKV, learned position embeddings.
+6 layers, 384-dim, 6 heads, 38-char vocab, val_loss=2.91.
+
+Weights are extracted from JLD2 (HDF5-based) via h5py, loaded into PyTorch.
+Follows the RandyGPT FastAPI/uvicorn pattern for proven HF Spaces compatibility.
+"""
+
+import json
+import math
+import time
+import uuid
+import os
+import h5py
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pathlib import Path
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.responses import JSONResponse, StreamingResponse
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.exceptions import RequestValidationError
+from pydantic import BaseModel
+from typing import List, Optional
+from huggingface_hub import hf_hub_download
+
+
+# ── Model definition (GPT-2 style, matches Flux training) ────────────────────
+
+class CausalSelfAttention(nn.Module):
+    def __init__(self, n_embd, n_head):
+        super().__init__()
+        self.n_head = n_head
+        self.head_dim = n_embd // n_head
+        self.scale = 1.0 / math.sqrt(self.head_dim)
+        self.qkv = nn.Linear(n_embd, 3 * n_embd, bias=False)
+        self.proj = nn.Linear(n_embd, n_embd, bias=False)
+
+    def forward(self, x):
+        B, T, C = x.shape
+        qkv = self.qkv(x)
+        q, k, v = qkv.split(C, dim=-1)
+        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        scores = q @ k.transpose(-2, -1) * self.scale
+        mask = torch.full((T, T), float('-inf'), device=x.device).triu(1)
+        attn = F.softmax(scores + mask, dim=-1)
+        out = (attn @ v).transpose(1, 2).contiguous().view(B, T, C)
+        return self.proj(out)
+
+
+class FeedForward(nn.Module):
+    def __init__(self, n_embd):
+        super().__init__()
+        self.fc1 = nn.Linear(n_embd, 4 * n_embd, bias=False)
+        self.fc2 = nn.Linear(4 * n_embd, n_embd, bias=False)
+
+    def forward(self, x):
+        return self.fc2(F.gelu(self.fc1(x)))
+
+
+class TransformerBlock(nn.Module):
+    def __init__(self, n_embd, n_head):
+        super().__init__()
+        self.ln1 = nn.LayerNorm(n_embd)
+        self.attn = CausalSelfAttention(n_embd, n_head)
+        self.ln2 = nn.LayerNorm(n_embd)
+        self.ffwd = FeedForward(n_embd)
+
+    def forward(self, x):
+        x = x + self.attn(self.ln1(x))
+        x = x + self.ffwd(self.ln2(x))
+        return x
+
+
+class GPT(nn.Module):
+    def __init__(self, vocab_size, n_embd, n_head, n_layer, block_size):
+        super().__init__()
+        self.block_size = block_size
+        self.wte = nn.Embedding(vocab_size, n_embd)
+        self.wpe = nn.Embedding(block_size, n_embd)
+        self.blocks = nn.ModuleList([TransformerBlock(n_embd, n_head) for _ in range(n_layer)])
+        self.ln_f = nn.LayerNorm(n_embd)
+        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
+
+    def forward(self, ids):
+        B, T = ids.shape
+        x = self.wte(ids) + self.wpe(torch.arange(T, device=ids.device).unsqueeze(0))
+        for block in self.blocks:
+            x = block(x)
+        x = self.ln_f(x)
+        return self.lm_head(x)
+
+    @torch.no_grad()
+    def generate_stream(self, ids, max_new_tokens=200, temperature=0.1,
+                        top_k=8, repetition_penalty=1.3):
+        self.eval()
+        generated = []
+        for i in range(max_new_tokens):
+            ctx = ids[:, -self.block_size:]
+            logits = self(ctx)[:, -1, :]
+            logits = logits[0]
+
+            if repetition_penalty > 1.0:
+                seen = set()
+                for t in generated[-self.block_size:]:
+                    seen.add(t)
+                for t in ctx[0].tolist():
+                    seen.add(t)
+                for t in seen:
+                    if 0 <= t < logits.shape[0]:
+                        if logits[t] > 0:
+                            logits[t] /= repetition_penalty
+                        else:
+                            logits[t] *= repetition_penalty
+
+            logits = logits / max(temperature, 0.01)
+
+            if top_k > 0 and top_k < logits.shape[0]:
+                topk_vals, _ = torch.topk(logits, top_k)
+                logits[logits < topk_vals[-1]] = float('-inf')
+
+            probs = F.softmax(logits, dim=-1)
+            nxt = torch.multinomial(probs, 1)
+            ids = torch.cat([ids, nxt.view(1, 1)], dim=1)
+            token_id = nxt.item()
+            generated.append(token_id)
+            is_last = (i == max_new_tokens - 1)
+            yield token_id, is_last
+
+    @torch.no_grad()
+    def generate(self, ids, max_new_tokens=200, temperature=0.1,
+                 top_k=8, repetition_penalty=1.3):
+        self.eval()
+        generated = []
+        for token_id, _ in self.generate_stream(ids, max_new_tokens, temperature,
+                                                top_k, repetition_penalty):
+            generated.append(token_id)
+        return generated
+
+
+# ── Char-level tokenizer ──────────────────────────────────────────────────────
+
+class CharTokenizer:
+    def __init__(self, uchars):
+        self.uchars = uchars
+        self.stoi = {c: i for i, c in enumerate(uchars)}
+        self.itos = {i: c for i, c in enumerate(uchars)}
+        self.vocab_size = len(uchars)
+
+    def encode(self, text):
+        return [self.stoi[c] for c in text.lower() if c in self.stoi]
+
+    def decode(self, ids):
+        return "".join(self.itos.get(i, "?") for i in ids)
+
+
+# ── Load JLD2 weights via h5py ───────────────────────────────────────────────
+
+def load_jld2_gpt2(jld2_path, vocab_path=None):
+    """Load Flux GPT-2 weights from JLD2, build PyTorch model."""
+    print(f"Loading JLD2 from {jld2_path} ...")
+    f = h5py.File(jld2_path, "r")
+    ms = f["model_state"][()]
+
+    def deref(ref):
+        return np.array(f[ref])
+
+    # Get architecture params
+    b1 = ms["blocks"]["layers"]["1"]
+    n_head = int(b1["attn"]["n_head"])
+    wte_w = deref(ms["wte"]["weight"])
+    vocab_size, n_embd = wte_w.shape
+    wpe_w = deref(ms["wpe"]["weight"])
+    block_size = wpe_w.shape[0]
+
+    layer_names = sorted(ms["blocks"]["layers"].dtype.names, key=int)
+    n_layer = len(layer_names)
+
+    step = int(f["step"][()])
+    best_val = float(f["best_val_loss"][()])
+
+    print(f"  vocab={vocab_size}, embd={n_embd}, heads={n_head}, layers={n_layer}, block={block_size}")
+    print(f"  step={step}, best_val_loss={best_val:.4f}")
+
+    # Build PyTorch model
+    model = GPT(vocab_size, n_embd, n_head, n_layer, block_size)
+
+    state = {}
+    # Embeddings: h5py (vocab, embd) = PyTorch (vocab, embd), no transpose
+    state["wte.weight"] = torch.tensor(wte_w, dtype=torch.float32)
+    state["wpe.weight"] = torch.tensor(wpe_w, dtype=torch.float32)
+
+    # Dense weights: h5py gives (in, out) due to Julia column-major → need .T for PyTorch (out, in)
+    for i, lname in enumerate(layer_names):
+        layer = ms["blocks"]["layers"][lname]
+
+        # LayerNorm (1D, no transpose)
+        state[f"blocks.{i}.ln1.weight"] = torch.tensor(deref(layer["ln1"]["diag"]["scale"]), dtype=torch.float32)
+        state[f"blocks.{i}.ln1.bias"] = torch.tensor(deref(layer["ln1"]["diag"]["bias"]), dtype=torch.float32)
+        state[f"blocks.{i}.ln2.weight"] = torch.tensor(deref(layer["ln2"]["diag"]["scale"]), dtype=torch.float32)
+        state[f"blocks.{i}.ln2.bias"] = torch.tensor(deref(layer["ln2"]["diag"]["bias"]), dtype=torch.float32)
+
+        # Attention QKV + proj (transpose Dense weights)
+        state[f"blocks.{i}.attn.qkv.weight"] = torch.tensor(deref(layer["attn"]["qkv"]["weight"]).T.copy(), dtype=torch.float32)
+        state[f"blocks.{i}.attn.proj.weight"] = torch.tensor(deref(layer["attn"]["proj"]["weight"]).T.copy(), dtype=torch.float32)
+
+        # FeedForward (transpose Dense weights)
+        state[f"blocks.{i}.ffwd.fc1.weight"] = torch.tensor(deref(layer["ffwd"]["net"]["layers"]["1"]["weight"]).T.copy(), dtype=torch.float32)
+        state[f"blocks.{i}.ffwd.fc2.weight"] = torch.tensor(deref(layer["ffwd"]["net"]["layers"]["3"]["weight"]).T.copy(), dtype=torch.float32)
+
+    # Final LayerNorm
+    state["ln_f.weight"] = torch.tensor(deref(ms["ln_f"]["diag"]["scale"]), dtype=torch.float32)
+    state["ln_f.bias"] = torch.tensor(deref(ms["ln_f"]["diag"]["bias"]), dtype=torch.float32)
+
+    # Output projection (transpose Dense weight)
+    state["lm_head.weight"] = torch.tensor(deref(ms["lm_head"]["weight"]).T.copy(), dtype=torch.float32)
+
+    model.load_state_dict(state)
+    model.eval()
+    f.close()
+
+    params = sum(p.numel() for p in model.parameters())
+    print(f"  PyTorch model loaded: {params:,} params")
+
+    # Load char vocab
+    tok = None
+    if vocab_path and os.path.exists(vocab_path):
+        uchars = json.loads(Path(vocab_path).read_text())
+        tok = CharTokenizer(uchars)
+        print(f"  Loaded char vocab: {tok.vocab_size} chars")
+
+    return model, tok, {
+        "vocab_size": vocab_size, "n_embd": n_embd, "n_head": n_head,
+        "n_layer": n_layer, "block_size": block_size, "step": step,
+        "best_val_loss": best_val, "params": params,
+    }
+
+
+# ── Load model at startup ────────────────────────────────────────────────────
+
+REPO = os.environ.get("HF_REPO", "LisaMegaWatts/JuliaGPT")
+MODEL_ID = "juliagpt-philosophy"
+
+print(f"Downloading model from {REPO} ...")
+jld2_path = hf_hub_download(repo_id=REPO, filename="best_model.jld2")
+try:
+    vocab_path = hf_hub_download(repo_id=REPO, filename="vocab.json")
+except Exception:
+    vocab_path = None
+
+model, tok, hp = load_jld2_gpt2(jld2_path, vocab_path)
+n_embd = hp["n_embd"]
+n_head = hp["n_head"]
+n_layer = hp["n_layer"]
+block_size = hp["block_size"]
+vocab_size = hp["vocab_size"]
+
+# Fallback tokenizer if vocab.json missing
+if tok is None:
+    chars = sorted(set("abcdefghijklmnopqrstuvwxyz ."))
+    tok = CharTokenizer(chars)
+    print(f"  Built fallback char vocab: {tok.vocab_size} chars")
+
+print(f"\nModel ready — {hp['params']:,} params, vocab={tok.vocab_size}, val_loss={hp['best_val_loss']:.4f}")
+
+
+# ── FastAPI app ───────────────────────────────────────────────────────────────
+
+app = FastAPI(title="JuliaGPT", version="2.0.0")
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+def _openai_error(status, message, err_type="invalid_request_error", code=None):
+    body = {"error": {"message": message, "type": err_type}}
+    if code:
+        body["error"]["code"] = code
+    return JSONResponse(status_code=status, content=body)
+
+
+@app.exception_handler(HTTPException)
+async def http_exc(request, exc):
+    return _openai_error(exc.status_code, str(exc.detail))
+
+
+@app.exception_handler(RequestValidationError)
+async def val_exc(request, exc):
+    msg = "; ".join(f"{e['loc'][-1]}: {e['msg']}" for e in exc.errors())
+    return _openai_error(422, msg, code="invalid_request_error")
+
+
+@app.get("/")
+def root():
+    return {
+        "name": "JuliaGPT",
+        "version": "2.0.0",
+        "description": "Flux.jl GPT-2 trained on classical philosophy (served via PyTorch)",
+        "architecture": "GPT-2 (LayerNorm, GELU, combined QKV)",
+        "model": {
+            "vocab_size": tok.vocab_size, "n_embd": n_embd,
+            "n_layer": n_layer, "n_head": n_head,
+            "block_size": block_size, "params": hp["params"],
+        },
+        "endpoints": ["/v1/models", "/v1/chat/completions"],
+        "features": ["streaming", "OpenAI-compatible"],
+    }
+
+
+@app.get("/v1/models")
+def list_models():
+    return {
+        "object": "list",
+        "data": [{"id": MODEL_ID, "object": "model",
+                  "created": 1700000000, "owned_by": "juliagpt"}]
+    }
+
+
+class Message(BaseModel):
+    role: str
+    content: str
+
+class ChatRequest(BaseModel):
+    model: Optional[str] = MODEL_ID
+    messages: List[Message]
+    max_tokens: Optional[int] = 200
+    temperature: Optional[float] = 0.1
+    top_k: Optional[int] = 8
+    repetition_penalty: Optional[float] = 1.3
+    n: Optional[int] = 1
+    stream: Optional[bool] = False
+
+
+def _sse(data):
+    return f"data: {json.dumps(data)}\n\n"
+
+
+def _stream_completion(ids, max_tokens, temperature, top_k, rep_penalty,
+                       completion_id, _model, _tok):
+    yield _sse({
+        "id": completion_id, "object": "chat.completion.chunk",
+        "created": int(time.time()), "model": MODEL_ID,
+        "choices": [{"index": 0, "delta": {"role": "assistant", "content": ""},
+                     "finish_reason": None}],
+    })
+
+    token_count = 0
+    for token_id, is_last in _model.generate_stream(
+        ids, max_new_tokens=max_tokens, temperature=temperature,
+        top_k=top_k, repetition_penalty=rep_penalty
+    ):
+        token_text = _tok.decode([token_id])
+        token_count += 1
+        finish_reason = ("length" if token_count >= max_tokens else "stop") if is_last else None
+        yield _sse({
+            "id": completion_id, "object": "chat.completion.chunk",
+            "created": int(time.time()), "model": MODEL_ID,
+            "choices": [{"index": 0, "delta": {"content": token_text},
+                         "finish_reason": finish_reason}],
+        })
+
+    yield "data: [DONE]\n\n"
+
+
+@app.post("/v1/chat/completions")
+def chat_completions(req: ChatRequest):
+    _m, _t = model, tok
+
+    prompt = req.messages[-1].content.strip() if req.messages else ""
+    if not prompt:
+        raise HTTPException(status_code=400, detail="No content in messages")
+
+    ids = _t.encode(prompt)
+    if not ids:
+        ids = [0]
+
+    max_tokens = max(1, min(req.max_tokens or 200, block_size))
+    temperature = max(0.01, min(req.temperature or 0.1, 2.0))
+    top_k = max(1, min(req.top_k or 8, tok.vocab_size))
+    rep_penalty = max(1.0, min(req.repetition_penalty or 1.3, 3.0))
+    n = max(1, min(req.n or 1, 4))
+    completion_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
+
+    tensor = torch.tensor([ids], dtype=torch.long)
+
+    if req.stream:
+        return StreamingResponse(
+            _stream_completion(tensor, max_tokens, temperature, top_k,
+                               rep_penalty, completion_id, _m, _t),
+            media_type="text/event-stream",
+            headers={"X-Accel-Buffering": "no"},
+        )
+
+    choices = []
+    total_completion_tokens = 0
+    for i in range(n):
+        generated = _m.generate(tensor.clone(), max_new_tokens=max_tokens,
+                                temperature=temperature, top_k=top_k,
+                                repetition_penalty=rep_penalty)
+        text = _t.decode(generated)
+        total_completion_tokens += len(generated)
+        choices.append({
+            "index": i,
+            "message": {"role": "assistant", "content": text},
+            "finish_reason": "length" if len(generated) >= max_tokens else "stop",
+        })
+
+    return {
+        "id": completion_id, "object": "chat.completion",
+        "created": int(time.time()), "model": MODEL_ID,
+        "system_fingerprint": "juliagpt-v2",
+        "choices": choices,
+        "usage": {
+            "prompt_tokens": len(ids),
+            "completion_tokens": total_completion_tokens,
+            "total_tokens": len(ids) + total_completion_tokens,
+        },
+    }
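
A matching streaming-client sketch (SSE framing as emitted by _stream_completion above; the localhost:7860 address is an assumption for a local run):

    import json
    import requests  # `pip install requests`

    resp = requests.post(
        "http://localhost:7860/v1/chat/completions",
        json={"messages": [{"role": "user", "content": "what is virtue"}],
              "max_tokens": 120, "stream": True},
        stream=True,  # read the event stream incrementally
    )
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip blank separators between SSE events
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break  # server signals end of stream
        chunk = json.loads(payload)
        print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)
    print()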