#!/usr/bin/env python3 """ Thanatos-27B — llama-cpp-python quickstart. Skip Ollama entirely and call the GGUF directly through llama-cpp-python. Useful for batch jobs, CI, or environments where you don't want a daemon. Install: pip install llama-cpp-python For GPU offload (CUDA / Metal / ROCm), install with the matching extras: CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --no-binary :all: CMAKE_ARGS="-DGGML_METAL=on" pip install llama-cpp-python --no-binary :all: CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install llama-cpp-python --no-binary :all: Usage: python llama_cpp_quickstart.py /path/to/Qwen3.6-27B-Q4_K_M.gguf python llama_cpp_quickstart.py /path/to/file.gguf --gpu-layers 99 python llama_cpp_quickstart.py /path/to/file.gguf --prompt "..." """ from __future__ import annotations import argparse import sys try: from llama_cpp import Llama except ImportError: # pragma: no cover sys.exit("Missing llama-cpp-python. Install with: pip install llama-cpp-python") THANATOS_SYSTEM = ( "You are Thanatos, a precise and capable assistant for reasoning, writing, " "coding, and long-form dialogue.\n\n" "Behavior rules:\n" "- Answer the user's actual request directly.\n" "- Be accurate, complete, and structured.\n" "- Think before answering, but do not get stuck in repetitive loops.\n" "- If the request is ambiguous, state what is missing and make the smallest " "reasonable assumption needed to continue.\n" "- Finish with a usable answer, not just planning." ) def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("gguf", help="Path to Qwen3.6-27B GGUF (e.g. Q4_K_M).") ap.add_argument( "--prompt", default="Explain the Burrows-Wheeler transform in 200 words.", ) ap.add_argument("--ctx", type=int, default=16384, help="Context window.") ap.add_argument( "--gpu-layers", type=int, default=0, help="Layers to offload to GPU (-1 or 99 = all).", ) ap.add_argument("--max-tokens", type=int, default=512) args = ap.parse_args() llm = Llama( model_path=args.gguf, n_ctx=args.ctx, n_gpu_layers=args.gpu_layers, verbose=False, ) out = llm.create_chat_completion( messages=[ {"role": "system", "content": THANATOS_SYSTEM}, {"role": "user", "content": args.prompt}, ], temperature=0.6, top_p=0.95, top_k=20, repeat_penalty=1.05, max_tokens=args.max_tokens, ) print(out["choices"][0]["message"]["content"]) if __name__ == "__main__": main()