#!/usr/bin/env python3
"""
Thanatos-27B — Ollama chat examples.
Prerequisites (pick one):
A. From the bundled GGUFs (default flow):
$ make build # uses Thanatos-27B.Q4_K_M.gguf
# or:
$ ollama create thanatos-27b -f ../Modelfile
B. Pull straight from HF (Q4_K_M is the only bundled quant):
$ ollama run hf.co/FoolDev/Thanatos-27B
# then set MODEL=hf.co/FoolDev/Thanatos-27B below
Then:
$ ollama serve # usually already running
$ python ollama_chat.py
The model emits `<think>...</think>` reasoning blocks before its answer.
Current Ollama (0.24, especially with `OLLAMA_NEW_ENGINE=1`) returns the
reasoning in a separate `message.thinking` field and keeps `content`
clean. Older builds put the whole `<think>...</think>` block inside
`content`. The demo below reads `message.thinking` first and falls
back to parsing `<think>` tags out of `content` so it works against
either path.
Endpoints used:
- Native Ollama: http://localhost:11434/api/chat
- OpenAI-compat: http://localhost:11434/v1/chat/completions
"""
from __future__ import annotations
import json
import os
import re
import sys
from typing import Any, Iterator
import requests
# Model tag and server base URL. Each is read from the environment
# (MODEL / HOST) and falls back to the local defaults shown here.
MODEL = os.getenv("MODEL", "thanatos-27b")
HOST = os.getenv("HOST", "http://localhost:11434")
# One complete <think>...</think> block: the reasoning text is captured in
# group 1 and any whitespace trailing the closing tag is consumed too, so
# substituting "" leaves the answer clean. DOTALL lets the reasoning span
# multiple lines.
_THINK_RE = re.compile(r"<think>(.*?)</think>\s*", re.DOTALL)


def split_thinking(content: str) -> tuple[str, str]:
    """Return ``(thinking, final_answer)`` from a content string.

    Args:
        content: Message text that may contain zero or more inline
            ``<think>...</think>`` blocks.

    Returns:
        A tuple ``(thinking, answer)``. ``thinking`` is the stripped body of
        every block joined with newlines (``""`` when there is none);
        ``answer`` is ``content`` with every block removed, stripped of
        leading and trailing whitespace.
    """
    blocks = _THINK_RE.findall(content)
    thinking = "\n".join(block.strip() for block in blocks).strip()
    # Text without a closed <think> block is left untouched apart from the
    # outer strip, so plain answers keep their internal whitespace.
    answer = _THINK_RE.sub("", content).strip()
    return thinking, answer
# ---------- 1. Simple chat ----------
def chat(prompt: str, system: str | None = None) -> dict[str, Any]:
    """Send one non-streaming request to the native ``/api/chat`` endpoint.

    Args:
        prompt: Text of the user turn.
        system: Optional system prompt; when truthy it is sent as a
            ``system`` message ahead of the user turn.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    user_turn = {"role": "user", "content": prompt}
    system_turns = [{"role": "system", "content": system}] if system else []
    payload = {
        "model": MODEL,
        "messages": [*system_turns, user_turn],
        "stream": False,
    }
    response = requests.post(f"{HOST}/api/chat", json=payload, timeout=600)
    response.raise_for_status()
    return response.json()
# ---------- 2. Streaming ----------
def chat_stream(prompt: str) -> Iterator[str]:
    """Stream a reply from ``/api/chat``, yielding each chunk's content.

    Args:
        prompt: Text of the single user turn.

    Yields:
        The ``message.content`` string of every streamed chunk that has one.
        Iteration stops after the chunk whose ``done`` flag is truthy.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
    }
    with requests.post(
        f"{HOST}/api/chat", json=payload, stream=True, timeout=600
    ) as response:
        response.raise_for_status()
        # Each non-empty line is one JSON document; empty lines are skipped.
        for raw_line in filter(None, response.iter_lines()):
            event = json.loads(raw_line)
            if "content" in event.get("message", {}):
                yield event["message"]["content"]
            if event.get("done"):
                return
# ---------- 3. Tool calling ----------
# JSON-schema description of the arguments accepted by the weather tool.
_WEATHER_PARAMETERS = {
    "type": "object",
    "properties": {
        "city": {"type": "string", "description": "City name"},
        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
    },
    "required": ["city", "unit"],
}

# Tool definition passed in the "tools" list of the chat requests.
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given city",
        "parameters": _WEATHER_PARAMETERS,
    },
}
def fake_weather(city: str, unit: str) -> str:
    """Stand-in tool implementation.

    Echoes ``city`` and ``unit`` back alongside a fixed temperature (14) and
    fixed conditions ("light rain"), serialized as a JSON string.
    """
    reading = {
        "city": city,
        "temperature": 14,
        "unit": unit,
        "conditions": "light rain",
    }
    return json.dumps(reading)
def _post_tool_chat(messages: list[dict[str, Any]]) -> dict[str, Any]:
    """POST *messages* with the weather tool attached; return the reply message."""
    response = requests.post(
        f"{HOST}/api/chat",
        json={
            "model": MODEL,
            "messages": messages,
            "tools": [WEATHER_TOOL],
            "stream": False,
        },
        timeout=600,
    )
    response.raise_for_status()
    return response.json()["message"]


def _run_tool(name: str, arguments: Any) -> str:
    """Run the named tool and return its JSON result, or a JSON error object."""
    if name != "get_current_weather":
        return json.dumps({"error": f"unknown tool {name}"})
    try:
        return fake_weather(**arguments)
    except TypeError as exc:
        # The arguments are generated by the model, so a missing or extra key
        # (or a non-mapping value) is possible. Report it back as a tool
        # error instead of crashing the round trip.
        return json.dumps({"error": f"bad arguments for {name}: {exc}"})


def tool_round_trip(prompt: str) -> str:
    """Single-shot tool call: model -> tool -> model -> final answer.

    Args:
        prompt: Text of the user turn.

    Returns:
        The ``content`` of the first reply when the model requests no tools;
        otherwise the ``content`` of the reply produced after every requested
        tool call has been answered.

    Raises:
        requests.HTTPError: If either request gets an error status.
    """
    history: list[dict[str, Any]] = [{"role": "user", "content": prompt}]
    msg = _post_tool_chat(history)
    tool_calls = msg.get("tool_calls")
    if not tool_calls:
        return msg["content"]

    # Replay the assistant's tool request, then one "tool" message per call,
    # so the second request carries the full exchange.
    history.append({"role": "assistant", "tool_calls": tool_calls})
    for call in tool_calls:
        fn = call["function"]
        result = _run_tool(fn["name"], fn.get("arguments"))
        history.append({"role": "tool", "tool_name": fn["name"], "content": result})

    return _post_tool_chat(history)["content"]
# ---------- 4. OpenAI-compatible endpoint ----------
def openai_chat(prompt: str) -> str:
    """Ask one question through the OpenAI-compatible endpoint.

    Args:
        prompt: Text of the single user turn.

    Returns:
        The ``message.content`` of the first entry in ``choices``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    body = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.6,
    }
    reply = requests.post(f"{HOST}/v1/chat/completions", json=body, timeout=600)
    reply.raise_for_status()
    first_choice = reply.json()["choices"][0]
    return first_choice["message"]["content"]
# ---------- demo ----------
def _demo() -> None:
    """Run the four examples in order, printing each result to stdout."""
    print("=== 1. simple chat ===")
    reply = chat("What is 84 * 3 / 2?")["message"]
    # The dedicated `thinking` field wins when the server fills it in.
    # Otherwise the reasoning may be inlined in `content` as
    # <think>...</think> blocks, so split it out from there.
    thinking = (reply.get("thinking") or "").strip()
    answer = reply.get("content", "")
    if not thinking:
        thinking, answer = split_thinking(answer)
    if thinking:
        print(f"[thinking] {thinking[:200]}...")
    print(f"[answer] {answer}")

    print("\n=== 2. streaming ===")
    out = sys.stdout
    for piece in chat_stream("Count from 1 to 5 in one line."):
        # Flush after every piece so the text appears as it is generated.
        out.write(piece)
        out.flush()
    print()

    print("\n=== 3. tool round-trip ===")
    weather_answer = tool_round_trip("What is the weather in Paris in celsius?")
    print(weather_answer)

    print("\n=== 4. OpenAI-compat ===")
    compat_answer = openai_chat("Say 'OpenAI endpoint OK' and nothing else.")
    print(compat_answer)


if __name__ == "__main__":
    _demo()