FROM ./Janus-35B-A3B.Q4_K_M.gguf

# Chat template — Qwen 3.6 ChatML in Ollama Go-template form, with the
# tool-calling blocks Ollama's capability detector looks for. Without a
# TEMPLATE that references .Tools and .ToolCalls, /api/chat and
# /v1/chat/completions reject any request carrying a `tools` array with
# `<model> does not support tools`. Same template as the 27B dense sibling
# (FoolDev/janus-27b) — both share the Qwen 3.6 chat format.
#
# Layout of the rendered prompt:
#   * system turn  — emitted when a system prompt or tools are present; the
#                    tool signatures are listed inside <tools>...</tools>.
#   * user turn    — message content verbatim.
#   * assistant    — optional <think>...</think> replay (only when thinking is
#                    enabled and the message is the last one or comes after
#                    the last user turn), then content, then one
#                    <tool_call>...</tool_call> block per tool call. The final
#                    assistant message is left without <|im_end|> so the model
#                    can continue it.
#   * tool result  — rendered as a user turn wrapped in <tool_response>.
#   * If the last message is not from the assistant, an assistant turn is
#     opened for generation.
#
# The <tools>, <tool_call>, <tool_response> and <think> tags are literal text
# the prompt itself refers to ("within <tools></tools> XML tags"); they must
# stay in the template verbatim.
TEMPLATE """{{- $lastUserIdx := -1 -}}
{{- range $idx, $msg := .Messages -}}
{{- if eq $msg.Role "user" }}{{ $lastUserIdx = $idx }}{{ end -}}
{{- end }}
{{- if or .System .Tools }}<|im_start|>system
{{ if .System }}{{ .System }}

{{ end }}
{{- if .Tools }}# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{{- range .Tools }}
{"type": "function", "function": {{ .Function }}}
{{- end }}
</tools>

For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call>
{{- end -}}<|im_end|>
{{ end }}
{{- range $i, $_ := .Messages }}
{{- $last := eq (len (slice $.Messages $i)) 1 -}}
{{- if eq .Role "user" }}<|im_start|>user
{{ .Content }}<|im_end|>
{{ else if eq .Role "assistant" }}<|im_start|>assistant
{{ if (and $.IsThinkSet (and .Thinking (or $last (gt $i $lastUserIdx)))) -}}
<think>{{ .Thinking }}</think>
{{ end -}}
{{ if .Content }}{{ .Content }}{{ end }}
{{- if .ToolCalls }}
{{- range .ToolCalls }}
<tool_call>
{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
</tool_call>
{{- end }}
{{- end }}{{ if not $last }}<|im_end|>
{{ end }}
{{- else if eq .Role "tool" }}<|im_start|>user
<tool_response>
{{ .Content }}
</tool_response><|im_end|>
{{ end }}
{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant
{{ end }}
{{- end }}"""

# Sampling tuned for reasoning + general use. See README "Recommended sampling"
# for creative/RP alternatives.
# --- Sampling -----------------------------------------------------------
PARAMETER temperature 0.6
PARAMETER top_p 0.95
PARAMETER top_k 20
PARAMETER repeat_penalty 1.05

# --- Context window -----------------------------------------------------
# Default context length; the hardware notes at the end of this file list
# smaller overrides for memory-constrained machines.
PARAMETER num_ctx 16384

# --- Stop tokens --------------------------------------------------------
# Without these, Ollama only honors <|im_end|> from the GGUF metadata; the
# model occasionally emits <|endoftext|> instead and Ollama keeps generating
# past it (synthesising a fake new user turn). Listing both — plus
# <|im_start|> as a belt-and-braces guard against the same loop — keeps
# responses cleanly terminated. Same fix the 27B sibling (FoolDev/janus-27b)
# shipped in commit 6672746.
PARAMETER stop "<|im_end|>"
PARAMETER stop "<|endoftext|>"
PARAMETER stop "<|im_start|>"

# --- Default system prompt ----------------------------------------------
SYSTEM """You are Janus, a precise and capable assistant for reasoning, writing, coding, and long-form dialogue.

Behavior rules:
- Answer the user's actual request directly.
- Be accurate, complete, and structured.
- Think before answering, but do not get stuck in repetitive loops or meta-commentary.
- If the request is ambiguous or incomplete, state what is missing and make the smallest reasonable assumption needed to continue.
- If the user wants creative writing, preserve tone, continuity, and character consistency.
- If the user wants analysis or technical help, prefer concrete steps, examples, and decisions over fluff.
- Finish with a usable answer, not just planning."""

# Hardware notes
# --------------
# This Q4_K_M is ~19 GB on disk.
# Real footprint at runtime:
#   weights mmap          ~19 GB
#   compute graph alloc   ~19 GB  (Ollama log: device.go:272 "total memory")
#   KV cache @ 16K ctx    ~1 GB   (with OLLAMA_KV_CACHE_TYPE=q8_0)
#   total minimum         ~38 GB
#
# Working configurations (verified or documented):
#   ✓ Single H100 80GB / A100 80GB             — full GPU offload
#   ✓ RTX 5090 32GB / RTX 4090 24GB            — partial offload, ~15-25 tok/s
#   ✓ Mac Studio M2/M3 Ultra 64GB+             — unified memory, ~20+ tok/s
#   ✓ Linux box with 64GB+ RAM (CPU-only)      — ~3-6 tok/s
#   ⚠ ASUS ROG Flow Z13 (Ryzen AI Max+, 32GB)  — OOMs at default num_ctx 16384;
#                                                fits with num_ctx ≤ 4096 and
#                                                num_batch ≤ 256 (verified)
#
# Measured data point (ASUS ROG Flow Z13 GZ302EA-RU004W, Ryzen AI Max+ 395 +
# Radeon 8060S iGPU, 32 GB unified, ROCm gfx1151, OLLAMA_FLASH_ATTENTION=1,
# OLLAMA_KV_CACHE_TYPE=q8_0, num_ctx 4096, num_batch 256):
#   Q4_K_M, 3-prompt mix → 28.71 tok/s aggregate
#   (717 tokens / 25.0 s; 29.55 / 29.24 / 28.57 short/medium/long).
#   ~97% of layers offload to the iGPU via ROCm. Compute split per
#   `ollama ps` shows 3% CPU / 97% GPU at 4096 ctx.
#
# To run on a 32 GB unified-memory laptop, override these in your local
# Modelfile copy (or pass via -o on `ollama run`):
#   PARAMETER num_ctx 4096
#   PARAMETER num_batch 256
#
# If you have ≥48 GB RAM but want partial GPU offload, set the following
# (offloads most layers; the model has 40):
#   PARAMETER num_gpu 24