#!/usr/bin/env python3
"""
Thanatos-27B — llama-cpp-python quickstart.

Skip Ollama entirely and call the GGUF directly through llama-cpp-python.
Useful for batch jobs, CI, or environments where you don't want a daemon.

Install:
    pip install llama-cpp-python

For GPU offload (CUDA / Metal / ROCm), build from source with the matching CMake flag:
    CMAKE_ARGS="-DGGML_CUDA=on"  pip install llama-cpp-python --no-binary :all:
    CMAKE_ARGS="-DGGML_METAL=on" pip install llama-cpp-python --no-binary :all:
    CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install llama-cpp-python --no-binary :all:

Usage:
    python llama_cpp_quickstart.py /path/to/Qwen3.6-27B-Q4_K_M.gguf
    python llama_cpp_quickstart.py /path/to/file.gguf --gpu-layers 99
    python llama_cpp_quickstart.py /path/to/file.gguf --prompt "..."
"""
from __future__ import annotations

import argparse
import sys
from pathlib import Path

try:
    from llama_cpp import Llama
except ImportError:  # pragma: no cover
    sys.exit("Missing llama-cpp-python. Install with: pip install llama-cpp-python")


# System prompt passed as the "system" message of the chat completion.
# Layout: one persona sentence, a blank line, a "Behavior rules:" header, then
# one "- " bullet per rule. Lines are newline-separated with no trailing newline.
THANATOS_SYSTEM = "\n".join(
    [
        "You are Thanatos, a precise and capable assistant for reasoning, "
        "writing, coding, and long-form dialogue.",
        "",
        "Behavior rules:",
        "- Answer the user's actual request directly.",
        "- Be accurate, complete, and structured.",
        "- Think before answering, but do not get stuck in repetitive loops.",
        "- If the request is ambiguous, state what is missing and make the "
        "smallest reasonable assumption needed to continue.",
        "- Finish with a usable answer, not just planning.",
    ]
)


def main() -> None:
    ap = argparse.ArgumentParser()
    ap.add_argument("gguf", help="Path to Qwen3.6-27B GGUF (e.g. Q4_K_M).")
    ap.add_argument(
        "--prompt",
        default="Explain the Burrows-Wheeler transform in 200 words.",
    )
    ap.add_argument("--ctx", type=int, default=16384, help="Context window.")
    ap.add_argument(
        "--gpu-layers",
        type=int,
        default=0,
        help="Layers to offload to GPU (-1 or 99 = all).",
    )
    ap.add_argument("--max-tokens", type=int, default=512)
    args = ap.parse_args()

    llm = Llama(
        model_path=args.gguf,
        n_ctx=args.ctx,
        n_gpu_layers=args.gpu_layers,
        verbose=False,
    )

    out = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": THANATOS_SYSTEM},
            {"role": "user", "content": args.prompt},
        ],
        temperature=0.6,
        top_p=0.95,
        top_k=20,
        repeat_penalty=1.05,
        max_tokens=args.max_tokens,
    )
    print(out["choices"][0]["message"]["content"])


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()