DavinciDreams and Claude Opus 4.6 committed on
Commit 492f84f · 1 Parent(s): ba27b12

Switch to Python/FastAPI server (RandyGPT pattern)

Replace Julia HTTP.jl server with FastAPI/uvicorn for reliable
HF Spaces streaming. Loads Flux JLD2 weights via h5py into PyTorch.

- server.py: FastAPI + StreamingResponse, JLD2→PyTorch weight loader
- GPT-2 architecture: LayerNorm, GELU, combined QKV, 6L×384D, 10.7M params
- Downloads best_model.jld2 + vocab.json from HF Hub at startup
- Dockerfile: python:3.11-slim + uvicorn
- Remove Julia files

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
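
A quick usage sketch for the new OpenAI-compatible endpoint (request and response fields come from server.py in this diff; the localhost:7860 address is an assumption for a locally running container):

    # Hypothetical local smoke test; requires `pip install requests`.
    import requests

    r = requests.post(
        "http://localhost:7860/v1/chat/completions",
        json={"messages": [{"role": "user", "content": "what is the good life"}],
              "max_tokens": 150, "temperature": 0.3},
    )
    print(r.json()["choices"][0]["message"]["content"])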

Files changed (8)
  1. Dockerfile +6 -28
  2. Project.toml +0 -7
  3. checkpoint.jl +0 -53
  4. checkpoints/best_model.json +0 -0
  5. model.jl +0 -188
  6. requirements.txt +6 -0
  7. server.jl +0 -236
  8. server.py +429 -0
Dockerfile CHANGED
@@ -1,38 +1,16 @@
-FROM julia:1.10-bookworm
+FROM python:3.11-slim
 
-# HuggingFace Spaces requires user ID 1000
 RUN useradd -m -u 1000 user
 
-# Shared Julia depot for package caching
-ENV JULIA_DEPOT_PATH=/opt/julia-depot
-RUN mkdir -p /opt/julia-depot && chmod 777 /opt/julia-depot
-
-# Copy project file first for dependency caching
-COPY --chown=user Project.toml /home/user/app/
-
-# Install and precompile Julia packages (Flux + JLD2 + HTTP)
-RUN julia --project=/home/user/app -e ' \
-    using Pkg; \
-    Pkg.instantiate(); \
-    Pkg.precompile(); \
-    println("Precompile done")'
-
-# Copy application code
-COPY --chown=user model.jl /home/user/app/
-COPY --chown=user checkpoint.jl /home/user/app/
-COPY --chown=user server.jl /home/user/app/
-
-# Create checkpoints directory (model downloads from HF at runtime)
-RUN mkdir -p /home/user/app/checkpoints && chown user:user /home/user/app/checkpoints
-
-# If a local checkpoint or vocab is provided, copy it
-COPY --chown=user checkpoints/ /home/user/app/checkpoints/
-
-# Switch to non-root user
+WORKDIR /home/user/app
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY --chown=user server.py .
+
 USER user
 ENV HOME=/home/user
-WORKDIR /home/user/app
 
 EXPOSE 7860
 
-CMD ["julia", "--project=/home/user/app", "/home/user/app/server.jl"]
+CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "7860"]
Project.toml DELETED
@@ -1,7 +0,0 @@
-[deps]
-Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
-Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
-JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
-JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
-NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
checkpoint.jl DELETED
@@ -1,53 +0,0 @@
-#=
-checkpoint.jl — Load JLD2 Flux checkpoint for JuliaGPT Space
-
-Loads model_state + hyperparams from JLD2, creates the Flux GPT model,
-and returns everything needed for inference.
-=#
-
-include("model.jl")
-using JLD2
-using JSON3
-
-function load_flux_checkpoint(path::String)
-    println("Loading Flux checkpoint from $path ...")
-    data = JLD2.load(path)
-
-    hp = data["hyperparams"]
-    vocab_size = Int(hp["vocab_size"])
-    n_embd = Int(hp["n_embd"])
-    block_size = Int(hp["block_size"])
-    n_layer = Int(hp["n_layer"])
-    n_head = Int(hp["n_head"])
-    dropout = Float64(get(hp, "dropout", 0.0))
-
-    init_model_constants!(block_size)
-
-    m = GPT(; vocab_size, n_embd, block_size, n_layer, n_head, dropout=0.0)
-    Flux.loadmodel!(m, data["model_state"])
-    Flux.testmode!(m)
-
-    step = get(data, "step", 0)
-    best_val = get(data, "best_val_loss", Inf)
-    println("  vocab=$vocab_size, embd=$n_embd, layers=$n_layer, heads=$n_head, block=$block_size")
-    println("  step=$step, best_val_loss=$(round(best_val, digits=4))")
-
-    return (; model=m, vocab_size, n_embd, block_size, n_layer, n_head, step, best_val_loss=best_val)
-end
-
-function load_char_vocab(path::String)
-    if !isfile(path)
-        return nothing
-    end
-    raw = JSON3.read(read(path, String))
-    uchars = [only(String(s)) for s in raw]
-    stoi = Dict(c => i for (i, c) in enumerate(uchars))
-    itos = Dict(i => c for (i, c) in enumerate(uchars))
-    return (; uchars, stoi, itos, vocab_size=length(uchars))
-end
-
-function save_char_vocab(path::String, chars::Vector{Char})
-    open(path, "w") do f
-        JSON3.write(f, [string(c) for c in chars])
-    end
-end
checkpoints/best_model.json DELETED
The diff for this file is too large to render. See raw diff
 
model.jl DELETED
@@ -1,188 +0,0 @@
-#=
-model.jl — JuliaFlux v1 GPT architecture for JuliaGPT Space
-
-GPT-2 style: LayerNorm, GELU, standard MHA, learned position embeddings.
-Matches juliaflux.ipynb training notebook exactly so Flux.loadmodel! works.
-CPU-only inference.
-=#
-
-using Flux
-using NNlib
-using NNlib: batched_mul
-using Statistics
-using LinearAlgebra
-
-# ═══════════════════════════════════════════════════════════════════
-# Pre-computed constants — set by init_model_constants!()
-# ═══════════════════════════════════════════════════════════════════
-
-CAUSAL_MASK = Matrix{Float32}(undef, 0, 0)
-POS_RANGE = Vector{Int32}()
-
-function init_model_constants!(block_size::Int)
-    global CAUSAL_MASK = triu(fill(typemin(Float32), block_size, block_size), 1)
-    global POS_RANGE = collect(Int32, 1:block_size)
-    println("  Constants: mask=$(size(CAUSAL_MASK)), pos_range=$(length(POS_RANGE))")
-end
-
-# ═══════════════════════════════════════════════════════════════════
-# Model structs — matches juliaflux.ipynb exactly
-# ═══════════════════════════════════════════════════════════════════
-
-struct CausalSelfAttention
-    qkv::Dense   # n_embd -> 3*n_embd
-    proj::Dense  # n_embd -> n_embd
-    n_head::Int
-end
-Flux.@layer CausalSelfAttention trainable=(qkv, proj)
-
-function CausalSelfAttention(n_embd::Int, n_head::Int; bias=false)
-    CausalSelfAttention(Dense(n_embd => 3 * n_embd; bias), Dense(n_embd => n_embd; bias), n_head)
-end
-
-function (attn::CausalSelfAttention)(x)
-    C, T, B = size(x)
-    hs = C ÷ attn.n_head; nh = attn.n_head
-
-    qkv = attn.qkv(x)
-    q = qkv[1:C, :, :]; k = qkv[C+1:2C, :, :]; v = qkv[2C+1:3C, :, :]
-
-    q = reshape(permutedims(reshape(q, hs, nh, T, B), (1, 3, 2, 4)), hs, T, nh * B)
-    k = reshape(permutedims(reshape(k, hs, nh, T, B), (1, 3, 2, 4)), hs, T, nh * B)
-    v = reshape(permutedims(reshape(v, hs, nh, T, B), (1, 3, 2, 4)), hs, T, nh * B)
-
-    scale = Float32(1 / sqrt(hs))
-    wei = batched_mul(permutedims(q, (2, 1, 3)), k) .* scale
-    mask = CAUSAL_MASK[1:T, 1:T]
-    wei = softmax(wei .+ mask; dims=2)
-
-    out = batched_mul(v, permutedims(wei, (2, 1, 3)))
-    out = reshape(permutedims(reshape(out, hs, T, nh, B), (1, 3, 2, 4)), C, T, B)
-    attn.proj(out)
-end
-
-struct FeedForward
-    net::Chain
-end
-Flux.@layer FeedForward
-function FeedForward(n_embd::Int; bias=false, dropout=0.0)
-    FeedForward(Chain(Dense(n_embd => 4 * n_embd; bias), gelu, Dense(4 * n_embd => n_embd; bias), Dropout(dropout)))
-end
-(ff::FeedForward)(x) = ff.net(x)
-
-struct TransformerBlock
-    ln1::LayerNorm
-    attn::CausalSelfAttention
-    ln2::LayerNorm
-    ffwd::FeedForward
-end
-Flux.@layer TransformerBlock
-function TransformerBlock(n_embd::Int, n_head::Int; dropout=0.0)
-    TransformerBlock(LayerNorm(n_embd), CausalSelfAttention(n_embd, n_head), LayerNorm(n_embd), FeedForward(n_embd; dropout))
-end
-function (block::TransformerBlock)(x)
-    x = x .+ block.attn(block.ln1(x))
-    x = x .+ block.ffwd(block.ln2(x))
-    x
-end
-
-struct GPT
-    wte::Embedding
-    wpe::Embedding
-    drop::Dropout
-    blocks::Chain
-    ln_f::LayerNorm
-    lm_head::Dense
-end
-Flux.@layer GPT
-
-function GPT(; vocab_size, n_embd, block_size, n_layer, n_head, dropout=0.0)
-    GPT(Embedding(vocab_size => n_embd), Embedding(block_size => n_embd), Dropout(dropout),
-        Chain([TransformerBlock(n_embd, n_head; dropout) for _ in 1:n_layer]...),
-        LayerNorm(n_embd), Dense(n_embd => vocab_size; bias=false))
-end
-
-function (m::GPT)(idx)
-    B, T = size(idx)
-    tok = permutedims(m.wte(idx), (1, 3, 2))
-    pos_ids = repeat(reshape(POS_RANGE[1:T], 1, T), B, 1)
-    pos = permutedims(m.wpe(pos_ids), (1, 3, 2))
-    x = m.drop(tok .+ pos)
-    x = m.blocks(x)
-    x = m.ln_f(x)
-    m.lm_head(x)
-end
-
-# ═══════════════════════════════════════════════════════════════════
-# Text generation — top-k sampling with repetition penalty
-# ═══════════════════════════════════════════════════════════════════
-
-function generate_text(model, vocab_size::Int, itos::Dict{Int,Char}, block_size::Int;
-                       seed_ids::Vector{Int}=Int[], max_tokens=200, temperature=0.1,
-                       top_k=8, repetition_penalty=1.3)
-    Flux.testmode!(model)
-
-    # Start from seed (encoded prompt) or random token
-    if !isempty(seed_ids)
-        idx = reshape(seed_ids, 1, length(seed_ids))
-    else
-        idx = reshape([rand(1:vocab_size)], 1, 1)
-    end
-
-    generated = Int[]
-    for _ in 1:max_tokens
-        idx_cond = idx[:, max(1, end-block_size+1):end]
-        logits = model(idx_cond)
-        logits_last = Vector{Float32}(logits[:, end, 1])
-
-        # Repetition penalty: reduce logits for recently generated tokens
-        if repetition_penalty != 1.0
-            seen = Set{Int}()
-            # Penalise tokens from context window
-            for j in max(1, length(generated)-block_size+1):length(generated)
-                push!(seen, generated[j])
-            end
-            for j in 1:size(idx_cond, 2)
-                push!(seen, idx_cond[1, j])
-            end
-            for id in seen
-                if 1 <= id <= length(logits_last)
-                    if logits_last[id] > 0
-                        logits_last[id] /= Float32(repetition_penalty)
-                    else
-                        logits_last[id] *= Float32(repetition_penalty)
-                    end
-                end
-            end
-        end
-
-        # Temperature scaling
-        logits_last ./= Float32(max(temperature, 0.01))
-
-        # Top-k filtering
-        k = min(top_k, length(logits_last))
-        if k < length(logits_last)
-            threshold = partialsort(logits_last, k; rev=true)
-            for i in eachindex(logits_last)
-                if logits_last[i] < threshold
-                    logits_last[i] = typemin(Float32)
-                end
-            end
-        end
-
-        probs = softmax(logits_last)
-        probs_cpu = Float64.(probs)
-
-        # Categorical sample
-        r = rand()
-        cum = 0.0
-        next_id = length(probs_cpu)
-        for (i, p) in enumerate(probs_cpu)
-            cum += p
-            if r <= cum; next_id = i; break; end
-        end
-        push!(generated, next_id)
-        idx = hcat(idx, reshape([next_id], 1, 1))
-    end
-    return join(get(itos, id, '?') for id in generated)
-end
requirements.txt ADDED
@@ -0,0 +1,6 @@
+fastapi>=0.110.0
+uvicorn>=0.29.0
+torch>=2.0.0
+h5py>=3.10.0
+huggingface_hub>=0.20.0
+pydantic>=2.0.0
server.jl DELETED
@@ -1,236 +0,0 @@
-#=
-server.jl — OpenAI-compatible inference server for JuliaGPT
-
-Serves a JuliaFlux v1 GPT model (Flux.jl, GPT-2 style architecture).
-Downloads checkpoint from HuggingFace model repo on first run.
-
-Endpoints:
-  GET  /                    -> health check / API info
-  GET  /v1/models           -> list available models
-  POST /v1/chat/completions -> generate philosophy text (OpenAI format)
-=#
-
-include("checkpoint.jl")
-using HTTP
-using UUIDs
-using Sockets
-using Downloads
-
-# ═══════════════════════════════════════════════════════════════════
-# Download checkpoint from HuggingFace if needed
-# ═══════════════════════════════════════════════════════════════════
-
-const CKPT_DIR = "checkpoints"
-const CKPT_PATH = joinpath(CKPT_DIR, "best_model.jld2")
-const VOCAB_PATH = joinpath(CKPT_DIR, "vocab.json")
-const HF_REPO = get(ENV, "HF_REPO", "LisaMegaWatts/JuliaGPT")
-const PORT = parse(Int, get(ENV, "PORT", "7860"))
-
-function download_from_hf(repo::String, filename::String, local_path::String)
-    url = "https://huggingface.co/$repo/resolve/main/$filename"
-    println("Downloading $url ...")
-    mkpath(dirname(local_path))
-    Downloads.download(url, local_path)
-    sz = round(filesize(local_path) / 1024^2, digits=1)
-    println("  -> $local_path ($sz MB)")
-end
-
-if !isfile(CKPT_PATH)
-    println("No local checkpoint, downloading from $HF_REPO ...")
-    try
-        download_from_hf(HF_REPO, "best_model.jld2", CKPT_PATH)
-    catch e
-        println("Download failed: $e")
-        println("Place a checkpoint at $CKPT_PATH manually.")
-        exit(1)
-    end
-end
-
-# Also download vocab.json if missing
-if !isfile(VOCAB_PATH)
-    println("No local vocab, downloading from $HF_REPO ...")
-    try
-        download_from_hf(HF_REPO, "vocab.json", VOCAB_PATH)
-    catch e
-        println("vocab.json download failed (will use fallback): $e")
-    end
-end
-
-# ═══════════════════════════════════════════════════════════════════
-# Load model
-# ═══════════════════════════════════════════════════════════════════
-
-println("\nLoading model...")
-const CKPT = load_flux_checkpoint(CKPT_PATH)
-const MODEL = CKPT.model
-const VOCAB_SIZE = CKPT.vocab_size
-const BLOCK_SIZE = CKPT.block_size
-
-# Character vocab: try vocab.json, else build from embedded quotes
-const ITOS, STOI = let
-    vdata = load_char_vocab(VOCAB_PATH)
-    if vdata !== nothing && vdata.vocab_size >= VOCAB_SIZE
-        println("Loaded char vocab from $VOCAB_PATH ($(vdata.vocab_size) chars, model expects $VOCAB_SIZE)")
-        vdata.itos, vdata.stoi
-    elseif vdata !== nothing
-        println("WARN: vocab.json has $(vdata.vocab_size) chars but model expects $VOCAB_SIZE, using vocab.json anyway")
-        vdata.itos, vdata.stoi
-    else
-        # Build from training data charset: a-z + space + period (28 chars)
-        chars = sort(collect(Set("abcdefghijklmnopqrstuvwxyz .")))
-        if length(chars) < VOCAB_SIZE
-            # Pad with additional printable chars if model expects more
-            for c in ",0123456789!?;:'-\n"
-                length(chars) >= VOCAB_SIZE && break
-                c in chars || push!(chars, c)
-            end
-            sort!(chars)
-        end
-        chars = chars[1:min(end, VOCAB_SIZE)]
-        itos = Dict(i => c for (i, c) in enumerate(chars))
-        stoi = Dict(c => i for (i, c) in enumerate(chars))
-        println("Built char vocab: $(length(chars)) chars -> [$(join(chars))]")
-        itos, stoi
-    end
-end
-
-const MODEL_CREATED_AT = Int(floor(time()))
-println("\nModel ready: vocab=$VOCAB_SIZE, embd=$(CKPT.n_embd), layers=$(CKPT.n_layer), block=$BLOCK_SIZE")
-
-# ═══════════════════════════════════════════════════════════════════
-# API handlers
-# ═══════════════════════════════════════════════════════════════════
-
-function json_response(status::Int, body)
-    HTTP.Response(status,
-        ["Content-Type" => "application/json",
-         "Access-Control-Allow-Origin" => "*",
-         "Access-Control-Allow-Methods" => "GET, POST, OPTIONS",
-         "Access-Control-Allow-Headers" => "Content-Type, Authorization"],
-        JSON3.write(body))
-end
-
-function handle_root(req::HTTP.Request)
-    json_response(200, Dict(
-        "name" => "JuliaGPT",
-        "version" => "2.0.0",
-        "description" => "A Flux.jl GPT trained on classical philosophy texts",
-        "architecture" => "GPT-2 style (LayerNorm, GELU, MHA)",
-        "model" => Dict(
-            "vocab_size" => VOCAB_SIZE,
-            "n_embd" => CKPT.n_embd,
-            "n_layer" => CKPT.n_layer,
-            "n_head" => CKPT.n_head,
-            "block_size" => BLOCK_SIZE
-        ),
-        "endpoints" => ["/v1/models", "/v1/chat/completions"],
-        "compatible_with" => ["OpenAI API", "OpenRouter"]
-    ))
-end
-
-function handle_models(req::HTTP.Request)
-    json_response(200, Dict(
-        "object" => "list",
-        "data" => [Dict(
-            "id" => "juliagpt-philosophy",
-            "object" => "model",
-            "created" => MODEL_CREATED_AT,
-            "owned_by" => "juliagpt"
-        )]
-    ))
-end
-
-function handle_chat_completions(req::HTTP.Request)
-    local body
-    try
-        body = JSON3.read(String(req.body))
-    catch e
-        return json_response(400, Dict("error" => Dict(
-            "message" => "Invalid JSON in request body",
-            "type" => "invalid_request_error",
-            "code" => "invalid_json")))
-    end
-
-    temperature = Float64(clamp(get(body, :temperature, 0.1), 0.01, 2.0))
-    max_tokens = Int(clamp(get(body, :max_tokens, 200), 1, BLOCK_SIZE))
-    n_completions = Int(clamp(get(body, :n, 1), 1, 4))
-    top_k = Int(clamp(get(body, :top_k, 8), 1, VOCAB_SIZE))
-    rep_penalty = Float64(clamp(get(body, :repetition_penalty, 1.3), 1.0, 3.0))
-
-    messages = get(body, :messages, [])
-    prompt_text = ""
-    if !isempty(messages)
-        prompt_text = string(get(messages[end], :content, ""))
-    end
-
-    # Encode prompt as seed token IDs (char-level)
-    seed_ids = Int[]
-    if !isempty(prompt_text)
-        prompt_lower = lowercase(prompt_text)
-        for c in prompt_lower
-            id = get(STOI, c, nothing)
-            id !== nothing && push!(seed_ids, id)
-        end
-        # Truncate to fit block_size (leave room for generation)
-        if length(seed_ids) > BLOCK_SIZE ÷ 2
-            seed_ids = seed_ids[end - BLOCK_SIZE ÷ 2 + 1:end]
-        end
-    end
-
-    choices = []
-    total_completion_tokens = 0
-    for i in 1:n_completions
-        text = generate_text(MODEL, VOCAB_SIZE, ITOS, BLOCK_SIZE;
-                             seed_ids=seed_ids, max_tokens=max_tokens,
-                             temperature=temperature, top_k=top_k,
-                             repetition_penalty=rep_penalty)
-        finish_reason = length(text) >= max_tokens ? "length" : "stop"
-        push!(choices, Dict(
-            "index" => i - 1,
-            "message" => Dict("role" => "assistant", "content" => text),
-            "finish_reason" => finish_reason))
-        total_completion_tokens += length(text)
-    end
-
-    json_response(200, Dict(
-        "id" => "chatcmpl-" * string(uuid4()),
-        "object" => "chat.completion",
-        "created" => Int(floor(time())),
-        "model" => "juliagpt-philosophy",
-        "choices" => choices,
-        "usage" => Dict(
-            "prompt_tokens" => length(prompt_text),
-            "completion_tokens" => total_completion_tokens,
-            "total_tokens" => length(prompt_text) + total_completion_tokens),
-        "system_fingerprint" => "juliagpt-fluxv1"))
-end
-
-# ═══════════════════════════════════════════════════════════════════
-# Router + CORS
-# ═══════════════════════════════════════════════════════════════════
-
-function cors_preflight(req::HTTP.Request)
-    HTTP.Response(204,
-        ["Access-Control-Allow-Origin" => "*",
-         "Access-Control-Allow-Methods" => "GET, POST, OPTIONS",
-         "Access-Control-Allow-Headers" => "Content-Type, Authorization"])
-end
-
-const ROUTER = HTTP.Router()
-HTTP.register!(ROUTER, "GET", "/", handle_root)
-HTTP.register!(ROUTER, "GET", "/v1/models", handle_models)
-HTTP.register!(ROUTER, "POST", "/v1/chat/completions", handle_chat_completions)
-HTTP.register!(ROUTER, "OPTIONS", "/v1/chat/completions", cors_preflight)
-HTTP.register!(ROUTER, "OPTIONS", "/v1/models", cors_preflight)
-
-# ═══════════════════════════════════════════════════════════════════
-# Start server
-# ═══════════════════════════════════════════════════════════════════
-
-println("\nJuliaGPT server starting on 0.0.0.0:$PORT ...")
-println("  GET  http://localhost:$PORT/")
-println("  GET  http://localhost:$PORT/v1/models")
-println("  POST http://localhost:$PORT/v1/chat/completions")
-println()
-
-HTTP.serve(ROUTER, "0.0.0.0", PORT)
server.py ADDED
@@ -0,0 +1,429 @@
+"""
+server.py — JuliaGPT OpenAI-compatible inference server
+Serves POST /v1/chat/completions (streaming + non-streaming) and GET /v1/models.
+
+Loads the Flux.jl GPT-2 model from best_model.jld2 on HF Hub.
+Architecture: GPT-2 style — LayerNorm, GELU, combined QKV, learned position embeddings.
+6 layers, 384-dim, 6 heads, 38-char vocab, val_loss=2.91.
+
+Weights are extracted from JLD2 (HDF5-based) via h5py, loaded into PyTorch.
+Follows the RandyGPT FastAPI/uvicorn pattern for proven HF Spaces compatibility.
+"""
+
+import json
+import math
+import time
+import uuid
+import os
+import h5py
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pathlib import Path
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.responses import JSONResponse, StreamingResponse
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.exceptions import RequestValidationError
+from pydantic import BaseModel
+from typing import List, Optional
+from huggingface_hub import hf_hub_download
+
+
+# ── Model definition (GPT-2 style, matches Flux training) ────────────────────
+
+class CausalSelfAttention(nn.Module):
+    def __init__(self, n_embd, n_head):
+        super().__init__()
+        self.n_head = n_head
+        self.head_dim = n_embd // n_head
+        self.scale = 1.0 / math.sqrt(self.head_dim)
+        self.qkv = nn.Linear(n_embd, 3 * n_embd, bias=False)
+        self.proj = nn.Linear(n_embd, n_embd, bias=False)
+
+    def forward(self, x):
+        B, T, C = x.shape
+        qkv = self.qkv(x)
+        q, k, v = qkv.split(C, dim=-1)
+        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        scores = q @ k.transpose(-2, -1) * self.scale
+        mask = torch.full((T, T), float('-inf'), device=x.device).triu(1)
+        attn = F.softmax(scores + mask, dim=-1)
+        out = (attn @ v).transpose(1, 2).contiguous().view(B, T, C)
+        return self.proj(out)
+
+
+class FeedForward(nn.Module):
+    def __init__(self, n_embd):
+        super().__init__()
+        self.fc1 = nn.Linear(n_embd, 4 * n_embd, bias=False)
+        self.fc2 = nn.Linear(4 * n_embd, n_embd, bias=False)
+
+    def forward(self, x):
+        return self.fc2(F.gelu(self.fc1(x)))
+
+
+class TransformerBlock(nn.Module):
+    def __init__(self, n_embd, n_head):
+        super().__init__()
+        self.ln1 = nn.LayerNorm(n_embd)
+        self.attn = CausalSelfAttention(n_embd, n_head)
+        self.ln2 = nn.LayerNorm(n_embd)
+        self.ffwd = FeedForward(n_embd)
+
+    def forward(self, x):
+        x = x + self.attn(self.ln1(x))
+        x = x + self.ffwd(self.ln2(x))
+        return x
+
+
+class GPT(nn.Module):
+    def __init__(self, vocab_size, n_embd, n_head, n_layer, block_size):
+        super().__init__()
+        self.block_size = block_size
+        self.wte = nn.Embedding(vocab_size, n_embd)
+        self.wpe = nn.Embedding(block_size, n_embd)
+        self.blocks = nn.ModuleList([TransformerBlock(n_embd, n_head) for _ in range(n_layer)])
+        self.ln_f = nn.LayerNorm(n_embd)
+        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
+
+    def forward(self, ids):
+        B, T = ids.shape
+        x = self.wte(ids) + self.wpe(torch.arange(T, device=ids.device).unsqueeze(0))
+        for block in self.blocks:
+            x = block(x)
+        x = self.ln_f(x)
+        return self.lm_head(x)
+
+    @torch.no_grad()
+    def generate_stream(self, ids, max_new_tokens=200, temperature=0.1,
+                        top_k=8, repetition_penalty=1.3):
+        self.eval()
+        generated = []
+        for i in range(max_new_tokens):
+            ctx = ids[:, -self.block_size:]
+            logits = self(ctx)[:, -1, :]
+            logits = logits[0]
+
+            if repetition_penalty > 1.0:
+                seen = set()
+                for t in generated[-self.block_size:]:
+                    seen.add(t)
+                for t in ctx[0].tolist():
+                    seen.add(t)
+                for t in seen:
+                    if 0 <= t < logits.shape[0]:
+                        if logits[t] > 0:
+                            logits[t] /= repetition_penalty
+                        else:
+                            logits[t] *= repetition_penalty
+
+            logits = logits / max(temperature, 0.01)
+
+            if top_k > 0 and top_k < logits.shape[0]:
+                topk_vals, _ = torch.topk(logits, top_k)
+                logits[logits < topk_vals[-1]] = float('-inf')
+
+            probs = F.softmax(logits, dim=-1)
+            nxt = torch.multinomial(probs, 1)
+            ids = torch.cat([ids, nxt.view(1, 1)], dim=1)
+            token_id = nxt.item()
+            generated.append(token_id)
+            is_last = (i == max_new_tokens - 1)
+            yield token_id, is_last
+
+    @torch.no_grad()
+    def generate(self, ids, max_new_tokens=200, temperature=0.1,
+                 top_k=8, repetition_penalty=1.3):
+        self.eval()
+        generated = []
+        for token_id, _ in self.generate_stream(ids, max_new_tokens, temperature,
+                                                top_k, repetition_penalty):
+            generated.append(token_id)
+        return generated
+
+
+# ── Char-level tokenizer ──────────────────────────────────────────────────────
+
+class CharTokenizer:
+    def __init__(self, uchars):
+        self.uchars = uchars
+        self.stoi = {c: i for i, c in enumerate(uchars)}
+        self.itos = {i: c for i, c in enumerate(uchars)}
+        self.vocab_size = len(uchars)
+
+    def encode(self, text):
+        return [self.stoi[c] for c in text.lower() if c in self.stoi]
+
+    def decode(self, ids):
+        return "".join(self.itos.get(i, "?") for i in ids)
+
+
+# ── Load JLD2 weights via h5py ───────────────────────────────────────────────
+
+def load_jld2_gpt2(jld2_path, vocab_path=None):
+    """Load Flux GPT-2 weights from JLD2, build PyTorch model."""
+    print(f"Loading JLD2 from {jld2_path} ...")
+    f = h5py.File(jld2_path, "r")
+    ms = f["model_state"][()]
+
+    def deref(ref):
+        return np.array(f[ref])
+
+    # Get architecture params
+    b1 = ms["blocks"]["layers"]["1"]
+    n_head = int(b1["attn"]["n_head"])
+    wte_w = deref(ms["wte"]["weight"])
+    vocab_size, n_embd = wte_w.shape
+    wpe_w = deref(ms["wpe"]["weight"])
+    block_size = wpe_w.shape[0]
+
+    layer_names = sorted(ms["blocks"]["layers"].dtype.names, key=int)
+    n_layer = len(layer_names)
+
+    step = int(f["step"][()])
+    best_val = float(f["best_val_loss"][()])
+
+    print(f"  vocab={vocab_size}, embd={n_embd}, heads={n_head}, layers={n_layer}, block={block_size}")
+    print(f"  step={step}, best_val_loss={best_val:.4f}")
+
+    # Build PyTorch model
+    model = GPT(vocab_size, n_embd, n_head, n_layer, block_size)
+
+    state = {}
+    # Embeddings: h5py (vocab, embd) = PyTorch (vocab, embd), no transpose
+    state["wte.weight"] = torch.tensor(wte_w, dtype=torch.float32)
+    state["wpe.weight"] = torch.tensor(wpe_w, dtype=torch.float32)
+
+    # Dense weights: h5py gives (in, out) due to Julia column-major → need .T for PyTorch (out, in)
+    for i, lname in enumerate(layer_names):
+        layer = ms["blocks"]["layers"][lname]
+
+        # LayerNorm (1D, no transpose)
+        state[f"blocks.{i}.ln1.weight"] = torch.tensor(deref(layer["ln1"]["diag"]["scale"]), dtype=torch.float32)
+        state[f"blocks.{i}.ln1.bias"] = torch.tensor(deref(layer["ln1"]["diag"]["bias"]), dtype=torch.float32)
+        state[f"blocks.{i}.ln2.weight"] = torch.tensor(deref(layer["ln2"]["diag"]["scale"]), dtype=torch.float32)
+        state[f"blocks.{i}.ln2.bias"] = torch.tensor(deref(layer["ln2"]["diag"]["bias"]), dtype=torch.float32)
+
+        # Attention QKV + proj (transpose Dense weights)
+        state[f"blocks.{i}.attn.qkv.weight"] = torch.tensor(deref(layer["attn"]["qkv"]["weight"]).T.copy(), dtype=torch.float32)
+        state[f"blocks.{i}.attn.proj.weight"] = torch.tensor(deref(layer["attn"]["proj"]["weight"]).T.copy(), dtype=torch.float32)
+
+        # FeedForward (transpose Dense weights)
+        state[f"blocks.{i}.ffwd.fc1.weight"] = torch.tensor(deref(layer["ffwd"]["net"]["layers"]["1"]["weight"]).T.copy(), dtype=torch.float32)
+        state[f"blocks.{i}.ffwd.fc2.weight"] = torch.tensor(deref(layer["ffwd"]["net"]["layers"]["3"]["weight"]).T.copy(), dtype=torch.float32)
+
+    # Final LayerNorm
+    state["ln_f.weight"] = torch.tensor(deref(ms["ln_f"]["diag"]["scale"]), dtype=torch.float32)
+    state["ln_f.bias"] = torch.tensor(deref(ms["ln_f"]["diag"]["bias"]), dtype=torch.float32)
+
+    # Output projection (transpose Dense weight)
+    state["lm_head.weight"] = torch.tensor(deref(ms["lm_head"]["weight"]).T.copy(), dtype=torch.float32)
+
+    model.load_state_dict(state)
+    model.eval()
+    f.close()
+
+    params = sum(p.numel() for p in model.parameters())
+    print(f"  PyTorch model loaded: {params:,} params")
+
+    # Load char vocab
+    tok = None
+    if vocab_path and os.path.exists(vocab_path):
+        uchars = json.loads(Path(vocab_path).read_text())
+        tok = CharTokenizer(uchars)
+        print(f"  Loaded char vocab: {tok.vocab_size} chars")
+
+    return model, tok, {
+        "vocab_size": vocab_size, "n_embd": n_embd, "n_head": n_head,
+        "n_layer": n_layer, "block_size": block_size, "step": step,
+        "best_val_loss": best_val, "params": params,
+    }
+
+
+# ── Load model at startup ────────────────────────────────────────────────────
+
+REPO = os.environ.get("HF_REPO", "LisaMegaWatts/JuliaGPT")
+MODEL_ID = "juliagpt-philosophy"
+
+print(f"Downloading model from {REPO} ...")
+jld2_path = hf_hub_download(repo_id=REPO, filename="best_model.jld2")
+try:
+    vocab_path = hf_hub_download(repo_id=REPO, filename="vocab.json")
+except Exception:
+    vocab_path = None
+
+model, tok, hp = load_jld2_gpt2(jld2_path, vocab_path)
+n_embd = hp["n_embd"]
+n_head = hp["n_head"]
+n_layer = hp["n_layer"]
+block_size = hp["block_size"]
+vocab_size = hp["vocab_size"]
+
+# Fallback tokenizer if vocab.json missing
+if tok is None:
+    chars = sorted(set("abcdefghijklmnopqrstuvwxyz ."))
+    tok = CharTokenizer(chars)
+    print(f"  Built fallback char vocab: {tok.vocab_size} chars")
+
+print(f"\nModel ready — {hp['params']:,} params, vocab={tok.vocab_size}, val_loss={hp['best_val_loss']:.4f}")
+
+
+# ── FastAPI app ───────────────────────────────────────────────────────────────
+
+app = FastAPI(title="JuliaGPT", version="2.0.0")
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+def _openai_error(status, message, err_type="invalid_request_error", code=None):
+    body = {"error": {"message": message, "type": err_type}}
+    if code:
+        body["error"]["code"] = code
+    return JSONResponse(status_code=status, content=body)
+
+
+@app.exception_handler(HTTPException)
+async def http_exc(request, exc):
+    return _openai_error(exc.status_code, str(exc.detail))
+
+
+@app.exception_handler(RequestValidationError)
+async def val_exc(request, exc):
+    msg = "; ".join(f"{e['loc'][-1]}: {e['msg']}" for e in exc.errors())
+    return _openai_error(422, msg, code="invalid_request_error")
+
+
+@app.get("/")
+def root():
+    return {
+        "name": "JuliaGPT",
+        "version": "2.0.0",
+        "description": "Flux.jl GPT-2 trained on classical philosophy (served via PyTorch)",
+        "architecture": "GPT-2 (LayerNorm, GELU, combined QKV)",
+        "model": {
+            "vocab_size": tok.vocab_size, "n_embd": n_embd,
+            "n_layer": n_layer, "n_head": n_head,
+            "block_size": block_size, "params": hp["params"],
+        },
+        "endpoints": ["/v1/models", "/v1/chat/completions"],
+        "features": ["streaming", "OpenAI-compatible"],
+    }
+
+
+@app.get("/v1/models")
+def list_models():
+    return {
+        "object": "list",
+        "data": [{"id": MODEL_ID, "object": "model",
+                  "created": 1700000000, "owned_by": "juliagpt"}]
+    }
+
+
+class Message(BaseModel):
+    role: str
+    content: str
+
+class ChatRequest(BaseModel):
+    model: Optional[str] = MODEL_ID
+    messages: List[Message]
+    max_tokens: Optional[int] = 200
+    temperature: Optional[float] = 0.1
+    top_k: Optional[int] = 8
+    repetition_penalty: Optional[float] = 1.3
+    n: Optional[int] = 1
+    stream: Optional[bool] = False
+
+
+def _sse(data):
+    return f"data: {json.dumps(data)}\n\n"
+
+
+def _stream_completion(ids, max_tokens, temperature, top_k, rep_penalty,
+                       completion_id, _model, _tok):
+    yield _sse({
+        "id": completion_id, "object": "chat.completion.chunk",
+        "created": int(time.time()), "model": MODEL_ID,
+        "choices": [{"index": 0, "delta": {"role": "assistant", "content": ""},
+                     "finish_reason": None}],
+    })
+
+    token_count = 0
+    for token_id, is_last in _model.generate_stream(
+        ids, max_new_tokens=max_tokens, temperature=temperature,
+        top_k=top_k, repetition_penalty=rep_penalty
+    ):
+        token_text = _tok.decode([token_id])
+        token_count += 1
+        finish_reason = ("length" if token_count >= max_tokens else "stop") if is_last else None
+        yield _sse({
+            "id": completion_id, "object": "chat.completion.chunk",
+            "created": int(time.time()), "model": MODEL_ID,
+            "choices": [{"index": 0, "delta": {"content": token_text},
+                         "finish_reason": finish_reason}],
+        })
+
+    yield "data: [DONE]\n\n"
+
+
+@app.post("/v1/chat/completions")
+def chat_completions(req: ChatRequest):
+    _m, _t = model, tok
+
+    prompt = req.messages[-1].content.strip() if req.messages else ""
+    if not prompt:
+        raise HTTPException(status_code=400, detail="No content in messages")
+
+    ids = _t.encode(prompt)
+    if not ids:
+        ids = [0]
+
+    max_tokens = max(1, min(req.max_tokens or 200, block_size))
+    temperature = max(0.01, min(req.temperature or 0.1, 2.0))
+    top_k = max(1, min(req.top_k or 8, tok.vocab_size))
+    rep_penalty = max(1.0, min(req.repetition_penalty or 1.3, 3.0))
+    n = max(1, min(req.n or 1, 4))
+    completion_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
+
+    tensor = torch.tensor([ids], dtype=torch.long)
+
+    if req.stream:
+        return StreamingResponse(
+            _stream_completion(tensor, max_tokens, temperature, top_k,
+                               rep_penalty, completion_id, _m, _t),
+            media_type="text/event-stream",
+            headers={"X-Accel-Buffering": "no"},
+        )
+
+    choices = []
+    total_completion_tokens = 0
+    for i in range(n):
+        generated = _m.generate(tensor.clone(), max_new_tokens=max_tokens,
+                                temperature=temperature, top_k=top_k,
+                                repetition_penalty=rep_penalty)
+        text = _t.decode(generated)
+        total_completion_tokens += len(generated)
+        choices.append({
+            "index": i,
+            "message": {"role": "assistant", "content": text},
+            "finish_reason": "length" if len(generated) >= max_tokens else "stop",
+        })
+
+    return {
+        "id": completion_id, "object": "chat.completion",
+        "created": int(time.time()), "model": MODEL_ID,
+        "system_fingerprint": "juliagpt-v2",
+        "choices": choices,
+        "usage": {
+            "prompt_tokens": len(ids),
+            "completion_tokens": total_completion_tokens,
+            "total_tokens": len(ids) + total_completion_tokens,
+        },
+    }
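
A matching streaming-client sketch (SSE framing as emitted by _stream_completion above; the localhost:7860 address is an assumption for a local run):

    import json
    import requests  # `pip install requests`

    resp = requests.post(
        "http://localhost:7860/v1/chat/completions",
        json={"messages": [{"role": "user", "content": "what is virtue"}],
              "max_tokens": 120, "stream": True},
        stream=True,  # read the event stream incrementally
    )
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip blank separators between SSE events
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break  # server signals end of stream
        chunk = json.loads(payload)
        print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)
    print()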