Thanatos-27B / examples /llama_cpp_quickstart.py
FoolDev's picture
Rename back: Thanatos-27B-Heretic → Thanatos-27B (HF repo also renamed)
7197abd
#!/usr/bin/env python3
"""
Thanatos-27B — llama-cpp-python quickstart.
Skip Ollama entirely and call the GGUF directly through llama-cpp-python.
Useful for batch jobs, CI, or environments where you don't want a daemon.
Install:
pip install llama-cpp-python
For GPU offload (CUDA / Metal / ROCm), build from source with the matching CMake flag:
CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --no-binary :all:
CMAKE_ARGS="-DGGML_METAL=on" pip install llama-cpp-python --no-binary :all:
CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install llama-cpp-python --no-binary :all:
Usage:
python llama_cpp_quickstart.py /path/to/Qwen3.6-27B-Q4_K_M.gguf
python llama_cpp_quickstart.py /path/to/file.gguf --gpu-layers 99
python llama_cpp_quickstart.py /path/to/file.gguf --prompt "..."
"""
from __future__ import annotations
import argparse
import sys
# llama-cpp-python is the only third-party dependency; if it cannot be
# imported, stop immediately with an install hint instead of a traceback.
try:
    from llama_cpp import Llama
except ImportError:  # pragma: no cover
    sys.exit(
        "Missing llama-cpp-python. "
        "Install with: pip install llama-cpp-python"
    )
# System prompt used as the first message of the chat. Written one prompt
# line per list entry; the empty entry produces the blank line that
# separates the persona sentence from the rule list.
THANATOS_SYSTEM = "\n".join(
    [
        "You are Thanatos, a precise and capable assistant for reasoning, "
        "writing, coding, and long-form dialogue.",
        "",
        "Behavior rules:",
        "- Answer the user's actual request directly.",
        "- Be accurate, complete, and structured.",
        "- Think before answering, but do not get stuck in repetitive loops.",
        "- If the request is ambiguous, state what is missing and make the "
        "smallest reasonable assumption needed to continue.",
        "- Finish with a usable answer, not just planning.",
    ]
)
def main() -> None:
    """Run one chat completion against a local GGUF file and print the reply.

    Parses the command line, loads the model with ``Llama``, sends
    ``THANATOS_SYSTEM`` plus the user prompt as a two-message chat, and prints
    the message content of the first returned choice to stdout.

    Exits through ``argparse`` (usage message on stderr, status 2) when the
    GGUF path does not point to an existing file, so a mistyped path is
    reported before any model loading is attempted.
    """
    # Function-scope import: only this entry point needs pathlib.
    from pathlib import Path

    ap = argparse.ArgumentParser()
    ap.add_argument("gguf", help="Path to Qwen3.6-27B GGUF (e.g. Q4_K_M).")
    ap.add_argument(
        "--prompt",
        default="Explain the Burrows-Wheeler transform in 200 words.",
        help="User message to send to the model.",
    )
    ap.add_argument("--ctx", type=int, default=16384, help="Context window.")
    ap.add_argument(
        "--gpu-layers",
        type=int,
        default=0,
        help="Layers to offload to GPU (-1 or 99 = all).",
    )
    ap.add_argument(
        "--max-tokens",
        type=int,
        default=512,
        help="Maximum number of tokens to generate.",
    )
    args = ap.parse_args()

    # Fail fast with a readable CLI error rather than letting the loader
    # raise from deep inside the library on a bad path.
    if not Path(args.gguf).is_file():
        ap.error(f"GGUF file not found: {args.gguf}")

    llm = Llama(
        model_path=args.gguf,
        n_ctx=args.ctx,
        n_gpu_layers=args.gpu_layers,
        verbose=False,
    )

    # Sampling settings are fixed here; only max_tokens comes from the CLI.
    out = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": THANATOS_SYSTEM},
            {"role": "user", "content": args.prompt},
        ],
        temperature=0.6,
        top_p=0.95,
        top_k=20,
        repeat_penalty=1.05,
        max_tokens=args.max_tokens,
    )
    print(out["choices"][0]["message"]["content"])
# Run the quickstart only when this file is executed as a script, so that
# importing the module has no side effects beyond its definitions.
if __name__ == "__main__":
    main()