#!/usr/bin/env python3
r"""
Thanatos-27B — vision (image-text-to-text) via llama-cpp-python.

Why this script exists:
    Ollama's Go engine has the qwen35 / qwen35moe arch entries (text inference
    works on 0.24+), but the C++ llama.cpp fallback that Ollama switches to
    when an mmproj is attached still lacks them. Both `FROM mmproj.gguf` and
    `ADAPTER mmproj.gguf` fail at first inference with:

        unknown model architecture: 'qwen35moe'

    See ollama/ollama#15898 (still open). Until that lands, vision via Ollama
    is broken for Qwen 3.5 / 3.6 while text remains fine.

    Upstream ggml-org/llama.cpp **does** have the architecture across both
    code paths, so vision works fine via llama.cpp directly. This script uses
    the python binding.

Install:
    pip install llama-cpp-python pillow
    # GPU offload? rebuild with the matching backend:
    #   CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --no-binary :all:
    #   CMAKE_ARGS="-DGGML_METAL=on" pip install llama-cpp-python --no-binary :all:
    #   CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install llama-cpp-python --no-binary :all:

Files you need (both from unsloth/Qwen3.6-27B-GGUF):
    1. A text GGUF (any quant): e.g. Qwen3.6-27B-Q4_K_M.gguf (~17 GB)
    2. A vision projector: mmproj-F16.gguf (~927 MB)

Usage:
    python llama_cpp_vision.py \
        --gguf /path/to/Qwen3.6-27B-Q4_K_M.gguf \
        --mmproj /path/to/mmproj-F16.gguf \
        --image /path/to/photo.jpg \
        --prompt "What is in this image? Be specific."

    # CLI alternative without python binding (ships with llama.cpp):
    #   llama-mtmd-cli \
    #       -m Qwen3.6-27B-Q4_K_M.gguf \
    #       --mmproj mmproj-F16.gguf \
    #       --image photo.jpg \
    #       -p "Describe this image."
"""

from __future__ import annotations

import argparse
import base64
import sys
from pathlib import Path

try:
    from llama_cpp import Llama
    from llama_cpp.llama_chat_format import Qwen25VLChatHandler
except ImportError:  # pragma: no cover
    # Defer the failure to main(): exiting at import time would kill any
    # interpreter that merely imports this module (e.g. to reuse
    # encode_image_data_uri) and would also prevent `--help` from working.
    Llama = None  # type: ignore[assignment,misc]
    Qwen25VLChatHandler = None  # type: ignore[assignment,misc]

# Shown (and used as the exit status text) when llama-cpp-python is unavailable.
_MISSING_DEPS_MESSAGE = (
    "Missing llama-cpp-python (>=0.3 with VL handlers).\n"
    " pip install --upgrade llama-cpp-python pillow"
)

THANATOS_SYSTEM = (
    "You are Thanatos, a precise vision-language assistant. Describe images "
    "accurately, do not invent details, and ground every claim in the "
    "pixels you can actually see."
)

# File suffix (lower-case, no dot) -> image MIME subtype for the data URI.
_MIME_SUBTYPES = {
    "jpg": "jpeg",
    "jpeg": "jpeg",
    "png": "png",
    "webp": "webp",
    "gif": "gif",
}


def encode_image_data_uri(path: Path) -> str:
    """Return the file at *path* as a ``data:image/...;base64,...`` URI.

    The MIME subtype is chosen from the file suffix (case-insensitive);
    unrecognised or missing suffixes fall back to ``jpeg``.

    Raises:
        OSError: if *path* cannot be read.
    """
    suffix = path.suffix.lower().lstrip(".")
    mime = _MIME_SUBTYPES.get(suffix, "jpeg")
    payload = base64.b64encode(path.read_bytes()).decode()
    return f"data:image/{mime};base64,{payload}"


def _require_file(raw_path: str, label: str) -> Path:
    """Return *raw_path* as a Path, or exit with ``<label> not found: <path>``.

    ``is_file()`` is used rather than ``exists()`` so that a directory is
    rejected here with a clean message instead of failing later with a
    traceback when the file is read.
    """
    path = Path(raw_path)
    if not path.is_file():
        sys.exit(f"{label} not found: {path}")
    return path


def main() -> None:
    """Parse CLI arguments, run one image+prompt chat turn, print the reply.

    Exits with a message (non-zero status) when the image, the text GGUF or
    the vision projector is not a regular file, or when llama-cpp-python is
    not installed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--gguf", required=True, help="Text GGUF (e.g. Qwen3.6-27B-Q4_K_M.gguf).")
    ap.add_argument("--mmproj", required=True, help="Vision projector GGUF (mmproj-F16.gguf).")
    ap.add_argument("--image", required=True, help="Image to analyze.")
    ap.add_argument("--prompt", default="Describe this image in detail.")
    ap.add_argument("--ctx", type=int, default=8192)
    ap.add_argument(
        "--gpu-layers",
        type=int,
        default=0,
        help="Layers to offload to GPU (-1 or 99 = all).",
    )
    ap.add_argument("--max-tokens", type=int, default=512)
    args = ap.parse_args()

    # Cheap checks first: fail before loading a multi-gigabyte model.
    image_path = _require_file(args.image, "Image")
    _require_file(args.gguf, "Text GGUF")
    _require_file(args.mmproj, "Vision projector")

    if Llama is None or Qwen25VLChatHandler is None:
        sys.exit(_MISSING_DEPS_MESSAGE)

    # Qwen 2.5 VL chat handler is the closest match shipped with
    # llama-cpp-python; Qwen 3.5/3.6 vision uses the same projector layout.
    # If/when llama-cpp-python ships a Qwen3VLChatHandler, swap it in.
    handler = Qwen25VLChatHandler(clip_model_path=args.mmproj)
    llm = Llama(
        model_path=args.gguf,
        chat_handler=handler,
        n_ctx=args.ctx,
        n_gpu_layers=args.gpu_layers,
        verbose=False,
    )
    out = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": THANATOS_SYSTEM},
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": encode_image_data_uri(image_path)}},
                    {"type": "text", "text": args.prompt},
                ],
            },
        ],
        temperature=0.6,
        top_p=0.95,
        top_k=20,
        repeat_penalty=1.05,
        max_tokens=args.max_tokens,
    )
    print(out["choices"][0]["message"]["content"])


if __name__ == "__main__":
    main()