FoolDev committed on
Commit
6f2884f
·
1 Parent(s): b564869

Add usage scaffolding: examples, build/smoke scripts, Z13 profile, LICENSE

Browse files

- examples/: ollama_chat.py, transformers_quickstart.py (4-bit bnb),
llama_cpp_quickstart.py, plus a README explaining when to use each
- scripts/build.sh: one-shot GGUF pull + 'ollama create' for default and
z13 profiles; scripts/smoke_test.sh: server/model/round-trip check
- Modelfile.z13: Q3_K_S, 8K ctx, FA + q8_0 KV cache profile that fits in
the Ryzen AI Max+ unified pool
- LICENSE (Apache-2.0), CITATION.cff, .gitignore
- README updated to reference the new files and replace the 'borderline'
Z13 caveat with the working profile

.gitignore ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .venv/
6
+ venv/
7
+
8
+ # Local model weights (we don't redistribute these)
9
+ *.gguf
10
+ *.safetensors
11
+ *.bin
12
+
13
+ # Editor / OS
14
+ .DS_Store
15
+ .idea/
16
+ .vscode/
17
+ *.swp
CITATION.cff ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cff-version: 1.2.0
2
+ title: "Janus-27B: A Dense Distillation Wrapper for Qwen 3.6 27B"
3
+ message: "If you use this model card or its accompanying files, please cite as below."
4
+ type: software
5
+ authors:
6
+ - name: FoolDev
7
+ website: "https://huggingface.co/FoolDev"
8
+ repository-code: "https://huggingface.co/FoolDev/janus-27b"
9
+ url: "https://huggingface.co/FoolDev/janus-27b"
10
+ abstract: >-
11
+ Janus-27B is a personal repackaging of the dense Qwen 3.6 27B base model
12
+ with Claude Opus 4.7 in the reasoning teacher slot. The repository ships
13
+ an Ollama Modelfile, sampling defaults, and usage examples; weights are
14
+ pulled from upstream (Qwen/Qwen3.6-27B safetensors or
15
+ unsloth/Qwen3.6-27B-GGUF quants) rather than redistributed.
16
+ keywords:
17
+ - qwen
18
+ - qwen3.6
19
+ - dense
20
+ - distillation
21
+ - reasoning
22
+ - llm
23
+ license: Apache-2.0
24
+ references:
25
+ - type: software
26
+ title: "Qwen3.6-27B"
27
+ authors:
28
+ - name: Alibaba Qwen Team
29
+ url: "https://huggingface.co/Qwen/Qwen3.6-27B"
30
+ - type: software
31
+ title: "Janus-35B-A3B (MoE sibling)"
32
+ authors:
33
+ - name: FoolDev
34
+ url: "https://huggingface.co/FoolDev/janus"
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for describing the origin of the Work and
141
+ reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may accept and charge a
167
+ fee for acceptance of support, warranty, indemnity, or other liability
168
+ obligations and/or rights consistent with this License. However, in
169
+ accepting such obligations, You may act only on Your own behalf and
170
+ on Your sole responsibility, not on behalf of any other Contributor,
171
+ and only if You agree to indemnify, defend, and hold each Contributor
172
+ harmless for any liability incurred by, or claims asserted against,
173
+ such Contributor by reason of your accepting any such warranty or
174
+ additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2025 FoolDev
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
Modelfile.z13 ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Janus-27B — Z13 variant for ASUS ROG Flow Z13 (Ryzen AI Max+ 395, 32 GB unified memory)
2
+ #
3
+ # This Modelfile is tuned for an iGPU with a shared/unified memory pool.
4
+ # Defaults differ from the main Modelfile in three ways:
5
+ # 1. Smaller context (8K instead of 16K) to keep KV cache slim.
6
+ # 2. Q3_K_S GGUF assumed (~12 GB) so weights + compute graph fit under 20 GB.
7
+ # 3. Slightly lower repeat_penalty (1.03): smaller quants are more loop-prone,
8
+ # but we counter that with a wider top_k (30) instead of a harsher penalty.
9
+ #
10
+ # Recommended base GGUF for this profile:
11
+ # https://huggingface.co/unsloth/Qwen3.6-27B-GGUF -> Qwen3.6-27B.Q3_K_S.gguf
12
+ #
13
+ # Usage:
14
+ # ollama create janus-27b-z13 -f Modelfile.z13
15
+ # ollama run janus-27b-z13
16
+ #
17
+ # Environment variables that help on the Z13 (set before `ollama serve`):
18
+ # export OLLAMA_KV_CACHE_TYPE=q8_0 # halve KV cache memory
19
+ # export OLLAMA_FLASH_ATTENTION=1 # tighter attention working set
20
+ # export OLLAMA_NUM_PARALLEL=1 # don't fan out across requests
21
+ # export HSA_OVERRIDE_GFX_VERSION=11.5.1 # if ROCm doesn't auto-detect gfx1151
22
+
23
+ FROM ./Qwen3.6-27B.Q3_K_S.gguf
24
+
25
+ PARAMETER temperature 0.6
26
+ PARAMETER top_p 0.95
27
+ PARAMETER top_k 30
28
+ PARAMETER repeat_penalty 1.03
29
+ PARAMETER num_ctx 8192
30
+
31
+ SYSTEM """You are Janus, a precise and capable assistant for reasoning, writing, coding, and long-form dialogue.
32
+
33
+ Behavior rules:
34
+ - Answer the user's actual request directly.
35
+ - Be accurate, complete, and structured.
36
+ - Think before answering, but do not get stuck in repetitive loops or meta-commentary.
37
+ - If the request is ambiguous or incomplete, state what is missing and make the smallest reasonable assumption needed to continue.
38
+ - If the user wants creative writing, preserve tone, continuity, and character consistency.
39
+ - If the user wants analysis or technical help, prefer concrete steps, examples, and decisions over fluff.
40
+ - Finish with a usable answer, not just planning."""
41
+
42
+ # Footprint estimate on Z13 (Ryzen AI Max+ 395, 32 GB unified pool, gfx1151):
43
+ # weights mmap (Q3_K_S) ~12 GB
44
+ # compute graph alloc ~4 GB (with FA + 8K ctx)
45
+ # KV cache @ 8K, q8_0 ~0.7 GB
46
+ # total ~17 GB -> fits under 20 GB GTT cap
47
+ #
48
+ # If you have headroom and want better quality, swap to Q4_K_S (~14 GB).
49
+ # Q4_K_M (~16 GB) will work but leaves almost no slack.
README.md CHANGED
@@ -81,7 +81,12 @@ The 27B is **dense**: every parameter participates in every forward pass. It's s
81
  | File | Use |
82
  |---|---|
83
  | `banner.svg` / `banner.png` | Repo header, Tokyo Night themed |
84
- | `Modelfile` | Ollama wrapper around the upstream Qwen3.6-27B GGUF |
 
 
 
 
 
85
  | `README.md` | This file |
86
 
87
  This repo does **not** redistribute weights. Pull the upstream GGUF from [`unsloth/Qwen3.6-27B-GGUF`](https://huggingface.co/unsloth/Qwen3.6-27B-GGUF) or any other community quant, point the Modelfile at it, and `ollama create janus-27b -f Modelfile`.
@@ -99,15 +104,30 @@ If you want the safetensors for `transformers`, fetch them from [`Qwen/Qwen3.6-2
99
 
100
  ## Quick start
101
 
102
- ### Ollama
103
 
104
- A ready-to-use `Modelfile` is included. Edit the `FROM` line to point at your local GGUF copy:
 
 
 
 
 
 
 
 
 
105
 
106
  ```bash
107
- # After pulling unsloth/Qwen3.6-27B-GGUF or another quant locally:
108
  ollama create janus-27b -f Modelfile && ollama run janus-27b
109
  ```
110
 
 
 
 
 
 
 
 
111
  ### Inference (OpenAI-compatible)
112
 
113
  ```bash
@@ -159,7 +179,7 @@ The dense 27B is the easier of the two Janus models to deploy.
159
  | RTX 3090 / 4090 24 GB | Works, full Q4 offload, ~25-40 tok/s |
160
  | RTX 5090 32 GB | Works, full offload at higher quant (Q5/Q6), ~30-50 tok/s |
161
  | Mac Studio M2/M3 32 GB+ unified | Works, ~15-25 tok/s |
162
- | ASUS ROG Flow Z13 (Ryzen AI Max+, 32 GB unified) | Borderline 16 GB Q4 GGUF + ~16 GB compute graph crowds the 20 GB iGPU pool. Try Q3_K_S (~12 GB) for headroom. |
163
 
164
  ## Chat template
165
 
 
81
  | File | Use |
82
  |---|---|
83
  | `banner.svg` / `banner.png` | Repo header, Tokyo Night themed |
84
+ | `Modelfile` | Ollama wrapper around the upstream Qwen 3.6 27B GGUF (default profile, Q4_K_M) |
85
+ | `Modelfile.z13` | Tighter profile for ASUS ROG Flow Z13 (Q3_K_S, 8K ctx, FA + q8_0 KV cache) |
86
+ | `examples/` | Ready-to-run Python clients for Ollama, Transformers, and llama-cpp-python |
87
+ | `scripts/build.sh` | One-shot helper: pulls a GGUF and runs `ollama create` for you |
88
+ | `scripts/smoke_test.sh` | Verifies an Ollama daemon + model and runs a round-trip |
89
+ | `LICENSE`, `CITATION.cff` | Apache-2.0 license and citation metadata |
90
  | `README.md` | This file |
91
 
92
  This repo does **not** redistribute weights. Pull the upstream GGUF from [`unsloth/Qwen3.6-27B-GGUF`](https://huggingface.co/unsloth/Qwen3.6-27B-GGUF) or any other community quant, point the Modelfile at it, and `ollama create janus-27b -f Modelfile`.
 
104
 
105
  ## Quick start
106
 
107
+ ### Ollama (one-liner)
108
 
109
+ `scripts/build.sh` will download the GGUF and create the Ollama model in one shot:
110
+
111
+ ```bash
112
+ ./scripts/build.sh # Q4_K_M, default profile -> janus-27b
113
+ ./scripts/build.sh Q3_K_S z13 # Z13 profile (Modelfile.z13) -> janus-27b-z13
114
+ ./scripts/build.sh Q5_K_M # higher-quality quant -> janus-27b
115
+ ollama run janus-27b
116
+ ```
117
+
118
+ Or do it manually if you already have a GGUF on disk — edit the `FROM` line in `Modelfile` and run:
119
 
120
  ```bash
 
121
  ollama create janus-27b -f Modelfile && ollama run janus-27b
122
  ```
123
 
124
+ Confirm everything works:
125
+
126
+ ```bash
127
+ ./scripts/smoke_test.sh # checks server, model, round-trip
128
+ python examples/ollama_chat.py # full demo: chat, streaming, tools, OpenAI-compat
129
+ ```
130
+
131
  ### Inference (OpenAI-compatible)
132
 
133
  ```bash
 
179
  | RTX 3090 / 4090 24 GB | Works, full Q4 offload, ~25-40 tok/s |
180
  | RTX 5090 32 GB | Works, full offload at higher quant (Q5/Q6), ~30-50 tok/s |
181
  | Mac Studio M2/M3 32 GB+ unified | Works, ~15-25 tok/s |
182
+ | ASUS ROG Flow Z13 (Ryzen AI Max+, 32 GB unified) | Borderline at Q4. Use the included `Modelfile.z13` (Q3_K_S, 8K ctx, FA + q8_0 KV cache), which fits in ~17 GB. |
183
 
184
  ## Chat template
185
 
examples/README.md ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Janus-27B examples
2
+
3
+ Three minimal entry points. Pick the one that matches how you run models.
4
+
5
+ | File | Backend | When to use |
6
+ |---|---|---|
7
+ | `ollama_chat.py` | Ollama HTTP API | You already have `ollama serve` running and the `janus-27b` model created from the project `Modelfile`. |
8
+ | `transformers_quickstart.py` | Hugging Face Transformers | You want to run the upstream safetensors (`Qwen/Qwen3.6-27B`) on GPU, optionally in 4-bit via bitsandbytes. |
9
+ | `llama_cpp_quickstart.py` | llama-cpp-python | You want to invoke a local GGUF directly without a daemon (CI, batch jobs, scripts). |
10
+
11
+ All three apply the same Janus system prompt and sampling defaults
12
+ (`temp=0.6, top_p=0.95, top_k=20, repeat_penalty=1.05`) so behavior should
13
+ be consistent across backends modulo quantization noise.
14
+
15
+ ## Setup
16
+
17
+ ### Ollama
18
+
19
+ ```bash
20
+ # 1. Pull a Qwen 3.6 27B GGUF, e.g. unsloth/Qwen3.6-27B-GGUF
21
+ hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B.Q4_K_M.gguf --local-dir .
22
+
23
+ # 2. Edit ../Modelfile -> FROM ./Qwen3.6-27B.Q4_K_M.gguf
24
+
25
+ # 3. Build the model
26
+ ollama create janus-27b -f ../Modelfile
27
+
28
+ # 4. Run the demo
29
+ pip install requests
30
+ python ollama_chat.py
31
+ ```
32
+
33
+ ### Transformers (safetensors)
34
+
35
+ ```bash
36
+ pip install --upgrade "transformers>=4.45" accelerate sentencepiece bitsandbytes
37
+ python transformers_quickstart.py # 4-bit, ~16 GB VRAM
38
+ python transformers_quickstart.py --no-4bit # bf16, ~54 GB VRAM
39
+ ```
40
+
41
+ ### llama-cpp-python (GGUF, no daemon)
42
+
43
+ ```bash
44
+ pip install llama-cpp-python # CPU-only build
45
+ python llama_cpp_quickstart.py /path/to/Qwen3.6-27B.Q4_K_M.gguf --gpu-layers 99
46
+ ```
47
+
48
+ `--gpu-layers` has no effect on the CPU-only build above. For GPU offload, rebuild
49
+ llama-cpp-python with the matching backend — see the script header for `CMAKE_ARGS` recipes (CUDA, Metal, ROCm/HIP).
examples/llama_cpp_quickstart.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Janus-27B — llama-cpp-python quickstart.
4
+
5
+ Skip Ollama entirely and call the GGUF directly through llama-cpp-python.
6
+ Useful for batch jobs, CI, or environments where you don't want a daemon.
7
+
8
+ Install:
9
+ pip install llama-cpp-python
10
+
11
+ For GPU offload (CUDA / Metal / ROCm), build from source with the matching CMAKE_ARGS:
12
+ CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --no-binary :all:
13
+ CMAKE_ARGS="-DGGML_METAL=on" pip install llama-cpp-python --no-binary :all:
14
+ CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install llama-cpp-python --no-binary :all:
15
+
16
+ Usage:
17
+ python llama_cpp_quickstart.py /path/to/Qwen3.6-27B.Q4_K_M.gguf
18
+ python llama_cpp_quickstart.py /path/to/file.gguf --gpu-layers 99
19
+ python llama_cpp_quickstart.py /path/to/file.gguf --prompt "..."
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ import sys
25
+
26
+ try:
27
+ from llama_cpp import Llama
28
+ except ImportError: # pragma: no cover
29
+ sys.exit("Missing llama-cpp-python. Install with: pip install llama-cpp-python")
30
+
31
+
32
# System prompt sent as the first message of every chat completion.
# Kept word-for-word identical to the SYSTEM block shipped in Modelfile.z13 so
# that this backend and the Ollama model are steered by the same instructions.
JANUS_SYSTEM = (
    "You are Janus, a precise and capable assistant for reasoning, writing, "
    "coding, and long-form dialogue.\n\n"
    "Behavior rules:\n"
    "- Answer the user's actual request directly.\n"
    "- Be accurate, complete, and structured.\n"
    "- Think before answering, but do not get stuck in repetitive loops or "
    "meta-commentary.\n"
    "- If the request is ambiguous or incomplete, state what is missing and "
    "make the smallest reasonable assumption needed to continue.\n"
    "- If the user wants creative writing, preserve tone, continuity, and "
    "character consistency.\n"
    "- If the user wants analysis or technical help, prefer concrete steps, "
    "examples, and decisions over fluff.\n"
    "- Finish with a usable answer, not just planning."
)
43
+
44
+
45
def main() -> None:
    """Load a GGUF with llama-cpp-python, run one chat turn, print the reply.

    Command-line arguments:
        gguf          Path to the GGUF file (positional, required).
        --prompt      User message to send.
        --ctx         Context window passed to ``Llama`` as ``n_ctx``.
        --gpu-layers  Passed to ``Llama`` as ``n_gpu_layers``.
        --max-tokens  Upper bound on generated tokens.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("gguf", help="Path to Qwen3.6-27B GGUF (e.g. Q4_K_M).")
    parser.add_argument(
        "--prompt", default="Explain the Burrows-Wheeler transform in 200 words."
    )
    parser.add_argument("--ctx", type=int, default=16384, help="Context window.")
    parser.add_argument(
        "--gpu-layers",
        type=int,
        default=0,
        help="Layers to offload to GPU (-1 or 99 = all).",
    )
    parser.add_argument("--max-tokens", type=int, default=512)
    opts = parser.parse_args()

    model = Llama(
        model_path=opts.gguf,
        n_ctx=opts.ctx,
        n_gpu_layers=opts.gpu_layers,
        verbose=False,
    )

    # System prompt first, then the single user turn.
    conversation = [
        {"role": "system", "content": JANUS_SYSTEM},
        {"role": "user", "content": opts.prompt},
    ]
    # Fixed sampling settings; only the token budget comes from the CLI.
    sampling = {
        "temperature": 0.6,
        "top_p": 0.95,
        "top_k": 20,
        "repeat_penalty": 1.05,
        "max_tokens": opts.max_tokens,
    }
    completion = model.create_chat_completion(messages=conversation, **sampling)
    print(completion["choices"][0]["message"]["content"])
81
+
82
+
83
+ if __name__ == "__main__":
84
+ main()
examples/ollama_chat.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Janus-27B — Ollama chat examples.
4
+
5
+ Prerequisites:
6
+ 1. Pull a Qwen 3.6 27B GGUF (e.g. unsloth/Qwen3.6-27B-GGUF).
7
+ 2. Edit ../Modelfile so the FROM line points at the GGUF path.
8
+ 3. ollama create janus-27b -f ../Modelfile
9
+ 4. ollama serve (usually already running)
10
+ 5. python ollama_chat.py
11
+
12
+ The model emits <think>...</think> reasoning blocks before its answer.
13
+ Ollama (as of 0.22) does not always split these into a separate field for
14
+ qwen3_6, so the reasoning lands inside `content`. Helpers below strip it
15
+ when you only want the final answer.
16
+
17
+ Endpoints used:
18
+ - Native Ollama: http://localhost:11434/api/chat
19
+ - OpenAI-compat: http://localhost:11434/v1/chat/completions
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import json
24
+ import re
25
+ import sys
26
+ from typing import Any, Iterator
27
+
28
+ import requests
29
+
30
+ MODEL = "janus-27b"
31
+ HOST = "http://localhost:11434"
32
+
33
# Matches one complete <think>...</think> block plus any whitespace after it.
_THINK_RE = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
# Captures only the text inside a complete <think>...</think> block.
_THINK_BODY_RE = re.compile(r"<think>(.*?)</think>", re.DOTALL)
# Captures everything after a <think> tag that is never closed.
_THINK_OPEN_RE = re.compile(r"<think>(.*)\Z", re.DOTALL)


def split_thinking(content: str) -> tuple[str, str]:
    """Return (thinking, final_answer) from a content string.

    Every complete ``<think>...</think>`` block is removed from the answer and
    its inner text is collected; multiple blocks are joined with newlines.
    If the text ends inside an unterminated ``<think>`` block (for example a
    reply cut off mid-reasoning), that trailing text is treated as thinking
    too, rather than being left in the answer.

    Args:
        content: Raw message content that may embed reasoning blocks.

    Returns:
        A ``(thinking, answer)`` tuple, both stripped of surrounding
        whitespace. ``thinking`` is ``""`` when no reasoning is present.
    """
    pieces = _THINK_BODY_RE.findall(content)
    remainder = _THINK_RE.sub("", content)

    # Anything still starting with <think> here has no closing tag.
    dangling = _THINK_OPEN_RE.search(remainder)
    if dangling:
        pieces.append(dangling.group(1))
        remainder = remainder[: dangling.start()]

    thinking = "\n".join(piece.strip() for piece in pieces).strip()
    return thinking, remainder.strip()
42
+
43
+
44
+ # ---------- 1. Simple chat ----------
45
+
46
def chat(prompt: str, system: str | None = None) -> dict[str, Any]:
    """Send one non-streaming chat request to the native Ollama endpoint.

    Args:
        prompt: The user message.
        system: Optional system message; omitted from the request when falsy.

    Returns:
        The decoded JSON body of the ``/api/chat`` response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    user_turn: dict[str, Any] = {"role": "user", "content": prompt}
    conversation: list[dict[str, Any]] = (
        [{"role": "system", "content": system}, user_turn] if system else [user_turn]
    )
    payload = {"model": MODEL, "messages": conversation, "stream": False}
    response = requests.post(f"{HOST}/api/chat", json=payload, timeout=600)
    response.raise_for_status()
    return response.json()
58
+
59
+
60
+ # ---------- 2. Streaming ----------
61
+
62
def chat_stream(prompt: str) -> Iterator[str]:
    """Stream a reply from ``/api/chat``, yielding each content fragment.

    The response body is read line by line; every non-empty line is parsed as
    one JSON object. Iteration stops after the object whose ``done`` field is
    truthy.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
    }
    with requests.post(
        f"{HOST}/api/chat", json=payload, stream=True, timeout=600
    ) as response:
        response.raise_for_status()
        # filter(None, ...) drops the empty lines iter_lines() may produce.
        for raw_line in filter(None, response.iter_lines()):
            event = json.loads(raw_line)
            if "message" in event and "content" in event["message"]:
                yield event["message"]["content"]
            if event.get("done"):
                return
83
+
84
+
85
+ # ---------- 3. Tool calling ----------
86
+
87
# Tool schema advertised to the model in the `tools` field of a chat request.
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given city",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string", "description": "City name"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["city", "unit"],
        },
    },
}


def fake_weather(city: str, unit: str = "celsius") -> str:
    """Stand-in tool implementation.

    Returns a canned weather report as a JSON string. ``unit`` defaults to
    ``"celsius"`` so that a tool call which omits it (the schema marks it
    required, but model output is not guaranteed to comply) still produces a
    result instead of raising ``TypeError``.

    Args:
        city: City name, echoed back in the result.
        unit: Temperature unit label, echoed back in the result.

    Returns:
        A JSON object string with ``city``, ``temperature``, ``unit`` and
        ``conditions`` keys.
    """
    return json.dumps(
        {"city": city, "temperature": 14, "unit": unit, "conditions": "light rain"}
    )
109
+
110
+
111
def tool_round_trip(prompt: str) -> str:
    """Single-shot tool call: model -> tool -> model -> final answer.

    Sends the prompt with ``WEATHER_TOOL`` attached. If the reply carries no
    tool calls, its content is returned as-is. Otherwise every requested call
    is executed locally, the results are appended to the conversation as
    ``tool`` messages, and a second request produces the final content.

    Raises:
        requests.HTTPError: If either request gets an error status.
    """
    history: list[dict[str, Any]] = [{"role": "user", "content": prompt}]

    def _ask() -> dict[str, Any]:
        # Posts the conversation as it currently stands and returns the
        # assistant message from the response.
        response = requests.post(
            f"{HOST}/api/chat",
            json={
                "model": MODEL,
                "messages": history,
                "tools": [WEATHER_TOOL],
                "stream": False,
            },
            timeout=600,
        )
        response.raise_for_status()
        return response.json()["message"]

    first_reply = _ask()
    calls = first_reply.get("tool_calls")
    if not calls:
        return first_reply["content"]

    history.append({"role": "assistant", "tool_calls": calls})
    for call in calls:
        spec = call["function"]
        tool_name = spec["name"]
        if tool_name == "get_current_weather":
            output = fake_weather(**spec["arguments"])
        else:
            output = json.dumps({"error": f"unknown tool {tool_name}"})
        history.append({"role": "tool", "tool_name": tool_name, "content": output})

    return _ask()["content"]
151
+
152
+
153
+ # ---------- 4. OpenAI-compatible endpoint ----------
154
+
155
def openai_chat(prompt: str) -> str:
    """Send one user message through the OpenAI-compatible endpoint.

    Posts to ``/v1/chat/completions`` with temperature 0.6 and returns the
    content of the first choice.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    body = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.6,
    }
    response = requests.post(f"{HOST}/v1/chat/completions", json=body, timeout=600)
    response.raise_for_status()
    first_choice = response.json()["choices"][0]
    return first_choice["message"]["content"]
167
+
168
+
169
+ # ---------- demo ----------
170
+
171
def _demo() -> None:
    """Run the four examples in order and print each result to stdout."""
    print("=== 1. simple chat ===")
    reply = chat("What is 84 * 3 / 2?")
    reasoning, final = split_thinking(reply["message"]["content"])
    if reasoning:
        # Only the first 200 characters of the reasoning are shown.
        print(f"[thinking] {reasoning[:200]}...")
    print(f"[answer] {final}")

    print("\n=== 2. streaming ===")
    out = sys.stdout
    for fragment in chat_stream("Count from 1 to 5 in one line."):
        out.write(fragment)
        out.flush()  # show each fragment as soon as it arrives
    print()

    print("\n=== 3. tool round-trip ===")
    weather_answer = tool_round_trip("What is the weather in Paris in celsius?")
    print(weather_answer)

    print("\n=== 4. OpenAI-compat ===")
    compat_answer = openai_chat("Say 'OpenAI endpoint OK' and nothing else.")
    print(compat_answer)
190
+
191
+
192
+ if __name__ == "__main__":
193
+ _demo()
examples/transformers_quickstart.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Janus-27B — Hugging Face Transformers quickstart.
4
+
5
+ Loads the upstream Qwen 3.6 27B safetensors directly and runs a single
6
+ chat turn using its embedded chat template. Janus-27B is a *wrapper*
7
+ around that base, so for the transformers route there is nothing to
8
+ download from this repo — point at Qwen/Qwen3.6-27B and apply the same
9
+ system prompt the Modelfile uses.
10
+
11
+ Requirements:
12
+ pip install --upgrade "transformers>=4.45" accelerate sentencepiece bitsandbytes
13
+
14
+ Memory:
15
+ - bf16 full precision: ~54 GB VRAM (won't fit on a single 24 GB card).
16
+ - 4-bit (bitsandbytes nf4): ~16 GB VRAM, runs on a 3090/4090 24 GB.
17
+ - Fall back to device_map="auto" + bnb_4bit on consumer GPUs.
18
+
19
+ Usage:
20
+ python transformers_quickstart.py
21
+ python transformers_quickstart.py --no-4bit # bf16, needs ~54 GB VRAM
22
+ python transformers_quickstart.py --prompt "..." # custom prompt
23
+ """
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import sys
28
+
29
+ try:
30
+ import torch
31
+ from transformers import AutoModelForCausalLM, AutoTokenizer
32
+ except ImportError as e: # pragma: no cover
33
+ sys.exit(
34
+ f"Missing dependency: {e.name}. Install with:\n"
35
+ " pip install --upgrade 'transformers>=4.45' accelerate sentencepiece bitsandbytes"
36
+ )
37
+
38
+
39
# Upstream checkpoint loaded by the transformers route.
MODEL_ID = "Qwen/Qwen3.6-27B"

# Opening sentence of the system prompt.
_JANUS_INTRO = (
    "You are Janus, a precise and capable assistant for reasoning, writing, "
    "coding, and long-form dialogue."
)

# One entry per behavior rule; each is rendered as a "- " bullet below.
_JANUS_RULES = (
    "Answer the user's actual request directly.",
    "Be accurate, complete, and structured.",
    "Think before answering, but do not get stuck in repetitive loops or "
    "meta-commentary.",
    "If the request is ambiguous or incomplete, state what is missing and "
    "make the smallest reasonable assumption needed to continue.",
    "If the user wants creative writing, preserve tone, continuity, and "
    "character consistency.",
    "If the user wants analysis or technical help, prefer concrete steps, "
    "examples, and decisions over fluff.",
    "Finish with a usable answer, not just planning.",
)

# System prompt sent with every request: intro, blank line, then the rules
# as a bulleted list with no trailing newline.
JANUS_SYSTEM = (
    _JANUS_INTRO
    + "\n\nBehavior rules:\n"
    + "\n".join(f"- {rule}" for rule in _JANUS_RULES)
)
57
+
58
+
59
def load(use_4bit: bool):
    """Load the tokenizer and causal LM for ``MODEL_ID``.

    Both are loaded with ``trust_remote_code=True`` and the model always
    uses ``device_map="auto"``.

    Args:
        use_4bit: When true, load with a bitsandbytes NF4 4-bit config
            (bf16 compute dtype, double quantization) and pass no
            ``torch_dtype``. When false, load with ``torch_dtype`` set to
            bfloat16 and no quantization config.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    if use_4bit:
        # Imported lazily so the bf16 path does not touch the quantization config class.
        from transformers import BitsAndBytesConfig

        model_kwargs: dict = {
            "device_map": "auto",
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
            ),
        }
    else:
        model_kwargs = {"device_map": "auto", "torch_dtype": torch.bfloat16}

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    lm = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        **model_kwargs,
    )
    return tokenizer, lm
74
+
75
+
76
def generate(tok, model, prompt: str, max_new_tokens: int = 512) -> str:
    """Run one chat turn and return only the newly generated text.

    The conversation is the Janus system prompt followed by ``prompt`` as
    the user message, rendered with the tokenizer's chat template.

    Args:
        tok: Tokenizer providing ``apply_chat_template`` and ``decode``.
        model: Model providing ``device`` and ``generate``.
        prompt: User message content.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The decoded completion with the prompt tokens and special tokens
        stripped.
    """
    messages = [
        {"role": "system", "content": JANUS_SYSTEM},
        {"role": "user", "content": prompt},
    ]
    # Ask for a dict explicitly instead of relying on the default return
    # type of apply_chat_template: this keeps the code independent of that
    # default and yields the attention mask alongside the input ids.
    encoded = tok.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    prompt_len = encoded["input_ids"].shape[-1]

    out = model.generate(
        **encoded,  # input_ids plus attention_mask
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        top_k=20,
        repetition_penalty=1.05,
    )
    # The output sequence starts with the prompt; keep only what follows it.
    return tok.decode(out[0][prompt_len:], skip_special_tokens=True)
97
+
98
+
99
def main() -> None:
    """Command-line entry point: parse flags, load the model, print one reply.

    Flags:
        --prompt: Prompt text (a Burrows-Wheeler question by default).
        --no-4bit: Load without 4-bit quantization.
        --max-new-tokens: Generation cap, 512 by default.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--prompt",
        default="Explain the Burrows-Wheeler transform in 200 words.",
    )
    parser.add_argument(
        "--no-4bit",
        action="store_true",
        help="Disable 4-bit quantization (requires ~54 GB VRAM in bf16).",
    )
    parser.add_argument("--max-new-tokens", type=int, default=512)
    opts = parser.parse_args()

    print(f"[load] {MODEL_ID} (4bit={'no' if opts.no_4bit else 'yes'})")
    tokenizer, lm = load(use_4bit=not opts.no_4bit)

    print(f"[gen] prompt: {opts.prompt!r}")
    print()
    completion = generate(tokenizer, lm, opts.prompt, opts.max_new_tokens)
    print(completion)
116
+
117
+
118
+ if __name__ == "__main__":
119
+ main()
scripts/build.sh ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Janus-27B — fetch a Qwen 3.6 27B GGUF and build the Ollama model.
3
+ #
4
+ # Usage:
5
+ # ./scripts/build.sh # default: Q4_K_M, profile=default
6
+ # ./scripts/build.sh Q5_K_M # different quant
7
+ # ./scripts/build.sh Q3_K_S z13 # quant + Z13 profile (uses Modelfile.z13)
8
+ # QUANT=Q6_K PROFILE=default ./scripts/build.sh
9
+ #
10
+ # Requires: hf (or huggingface-cli), ollama, awk, mktemp.
11
set -euo pipefail

# Positional arguments win over environment variables, which win over defaults.
QUANT="${1:-${QUANT:-Q4_K_M}}"
PROFILE="${2:-${PROFILE:-default}}"

REPO_ID="${REPO_ID:-unsloth/Qwen3.6-27B-GGUF}"
# GGUF_NAME can be overridden from the environment when the upstream repo
# names its files differently.
# NOTE(review): confirm the default pattern against the repo's file list.
GGUF_NAME="${GGUF_NAME:-Qwen3.6-27B.${QUANT}.gguf}"
# Repository root = parent of the directory holding this script.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
GGUF_PATH="${ROOT}/${GGUF_NAME}"

case "${PROFILE}" in
  default)
    MODELFILE="${ROOT}/Modelfile"
    TAG="janus-27b"
    ;;
  z13)
    MODELFILE="${ROOT}/Modelfile.z13"
    TAG="janus-27b-z13"
    ;;
  *)
    echo "[!] Unknown profile: ${PROFILE} (expected: default | z13)" >&2
    exit 1
    ;;
esac

echo "[*] repo: ${REPO_ID}"
echo "[*] quant: ${QUANT}"
echo "[*] profile: ${PROFILE}"
echo "[*] tag: ${TAG}"
echo "[*] modelfile:${MODELFILE}"
echo "[*] gguf: ${GGUF_PATH}"

# ---- 1. Sanity ---------------------------------------------------------------

if ! command -v ollama >/dev/null 2>&1; then
  echo "[!] ollama not found in PATH" >&2
  exit 1
fi
if [[ ! -f "${MODELFILE}" ]]; then
  echo "[!] Missing ${MODELFILE}" >&2
  exit 1
fi

# ---- 2. Pick a HuggingFace CLI ----------------------------------------------

HF=""
if command -v hf >/dev/null 2>&1; then
  HF="hf"
elif command -v huggingface-cli >/dev/null 2>&1; then
  HF="huggingface-cli"
else
  echo "[!] Neither 'hf' nor 'huggingface-cli' found." >&2
  echo " pip install -U huggingface_hub" >&2
  exit 1
fi

# ---- 3. Download GGUF if missing --------------------------------------------

if [[ -f "${GGUF_PATH}" ]]; then
  echo "[=] GGUF already present, skipping download."
else
  echo "[*] Downloading ${GGUF_NAME} from ${REPO_ID} ..."
  # Both CLIs are invoked with the same 'download' arguments, so one call
  # covers either.
  "${HF}" download "${REPO_ID}" "${GGUF_NAME}" --local-dir "${ROOT}"
fi

if [[ ! -f "${GGUF_PATH}" ]]; then
  echo "[!] Download failed: ${GGUF_PATH} not present." >&2
  exit 1
fi

# ---- 4. Patch the Modelfile FROM line in a temp copy -------------------------

TMP_MODELFILE="$(mktemp -t janus27b-modelfile.XXXXXX)"
trap 'rm -f "${TMP_MODELFILE}"' EXIT

# Replace the first FROM line with the local GGUF path.
#  - The path travels via ENVIRON rather than 'awk -v', because -v values
#    undergo backslash-escape processing and could be altered.
#  - The match ignores case so a lowercase 'from' is patched too.
#  - awk exits 3 when no FROM line was found, so the build cannot silently
#    use whatever the original Modelfile pointed at.
patch_rc=0
GGUF_PATH="${GGUF_PATH}" awk '
  !patched && toupper($0) ~ /^FROM[[:space:]]/ {
    print "FROM " ENVIRON["GGUF_PATH"]
    patched = 1
    next
  }
  { print }
  END { if (!patched) exit 3 }
' "${MODELFILE}" > "${TMP_MODELFILE}" || patch_rc=$?

if (( patch_rc == 3 )); then
  echo "[!] No FROM line found in ${MODELFILE}; cannot point it at ${GGUF_PATH}" >&2
  exit 1
elif (( patch_rc != 0 )); then
  echo "[!] Failed to patch ${MODELFILE} (awk exit ${patch_rc})" >&2
  exit 1
fi

# ---- 5. Create the Ollama model ---------------------------------------------

echo "[*] ollama create ${TAG} -f <patched modelfile>"
ollama create "${TAG}" -f "${TMP_MODELFILE}"

echo
echo "[+] Done. Try it:"
echo " ollama run ${TAG}"
echo " python ${ROOT}/examples/ollama_chat.py # update MODEL constant if not 'janus-27b'"
scripts/smoke_test.sh ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Janus-27B — smoke test against a running Ollama daemon.
3
+ #
4
+ # Verifies:
5
+ # 1. The Ollama server is reachable.
6
+ # 2. The target model is loaded / loadable.
7
+ # 3. A single chat round-trip succeeds and produces non-empty output.
8
+ #
9
+ # Usage:
10
+ # ./scripts/smoke_test.sh # uses MODEL=janus-27b
11
+ # MODEL=janus-27b-z13 ./scripts/smoke_test.sh
12
+ # HOST=http://localhost:11434 ./scripts/smoke_test.sh
13
# Strict mode: stop on errors, on unset variables, and on a failure in any
# stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Settings; each keeps a value supplied by the environment and otherwise
# falls back to the default shown.
: "${MODEL:=janus-27b}"
: "${HOST:=http://localhost:11434}"
: "${PROMPT:=Reply with the single word: OK}"
18
+
19
# Print the remaining arguments (joined as "$*") wrapped in the ANSI colour
# code given as $1, followed by a reset and a newline.
_colorize() {
  local code=$1
  shift
  printf '\033[%sm%s\033[0m\n' "${code}" "$*"
}

# Coloured one-line message helpers: red = 31, green = 32, blue = 34.
red() { _colorize 31 "$@"; }
green() { _colorize 32 "$@"; }
blue() { _colorize 34 "$@"; }
22
+
23
# Exit the script with status 1 and a red message unless the command named
# by $1 can be found; return 0 when it is available.
require() {
  local tool=$1
  command -v "${tool}" >/dev/null 2>&1 && return 0
  red "[!] missing dependency: ${tool}"
  exit 1
}
28
require curl
require jq

blue "[*] host: ${HOST}"
blue "[*] model: ${MODEL}"

# 1. Server up? Fetch the model list once and keep it for step 2.
if ! TAGS_JSON="$(curl -fsS "${HOST}/api/tags")"; then
  red "[!] Ollama not reachable at ${HOST}. Is 'ollama serve' running?"
  exit 1
fi
green "[+] server reachable"

# 2. Model present?
# Accept either the exact name or "<MODEL>:<tag>". A plain prefix match would
# report 'janus-27b' as present when only 'janus-27b-z13' exists.
if ! jq -e --arg m "${MODEL}" '
    any(.models[]?; .name == $m or (.name | startswith($m + ":")))
  ' <<<"${TAGS_JSON}" >/dev/null; then
  red "[!] Model '${MODEL}' not found. Build it first:"
  red " ./scripts/build.sh # default profile"
  red " ./scripts/build.sh Q3_K_S z13 # Z13 profile"
  exit 1
fi
green "[+] model present"

# 3. Round-trip
blue "[*] sending test prompt..."
# jq builds the request body so the model name and prompt are JSON-escaped.
PAYLOAD="$(jq -n --arg m "${MODEL}" --arg p "${PROMPT}" '{
  model: $m,
  messages: [{role: "user", content: $p}],
  stream: false
}')"

# Report a failed request explicitly instead of letting 'set -e' end the
# script without a message of its own.
if ! RESP="$(curl -fsS "${HOST}/api/chat" \
    -H 'Content-Type: application/json' \
    -d "${PAYLOAD}" \
  | jq -r '.message.content // empty')"; then
  red "[!] chat request to ${HOST}/api/chat failed"
  exit 1
fi

if [[ -z "${RESP}" ]]; then
  red "[!] empty response from model"
  exit 1
fi

green "[+] round-trip OK"
echo "----- model said -----"
echo "${RESP}"
echo "----------------------"
+ echo "----------------------"