#!/usr/bin/env python3
"""
Thanatos-27B — Ollama chat examples.

Prerequisites (pick one):

  A. From the bundled GGUFs (default flow):
       $ make build          # uses Thanatos-27B.Q4_K_M.gguf
       # or:
       $ ollama create thanatos-27b -f ../Modelfile

  B. Pull straight from HF (Q4_K_M is the only bundled quant):
       $ ollama run hf.co/FoolDev/Thanatos-27B
       # then set MODEL=hf.co/FoolDev/Thanatos-27B below

Then:
    $ ollama serve           # usually already running
    $ python ollama_chat.py

The model emits <think>...</think> reasoning blocks before its answer.
Current Ollama (0.24, especially with `OLLAMA_NEW_ENGINE=1`) returns the
reasoning in a separate `message.thinking` field and keeps `content` clean.
Older builds put the whole `<think>...</think>` block inside `content`. The
demo below reads `message.thinking` first and falls back to parsing
`<think>` tags out of `content` so it works against either path.

Endpoints used:
  - Native Ollama:   http://localhost:11434/api/chat
  - OpenAI-compat:   http://localhost:11434/v1/chat/completions
"""

from __future__ import annotations

import json
import os
import re
import sys
from typing import Any, Iterator

import requests

MODEL = os.environ.get("MODEL", "thanatos-27b")
# Strip a trailing slash so f"{HOST}/api/chat" never produces "//api/chat".
HOST = os.environ.get("HOST", "http://localhost:11434").rstrip("/")

# Per-request timeout in seconds, shared by every call in this file.
_TIMEOUT_S = 600

# One <think>...</think> block plus any whitespace that follows it.
# Group 1 is the reasoning text; DOTALL lets a block span several lines.
_THINK_RE = re.compile(r"<think>(.*?)</think>\s*", re.DOTALL)


def split_thinking(content: str) -> tuple[str, str]:
    """Return (thinking, final_answer) from a content string.

    Every ``<think>...</think>`` block found in *content* is stripped and the
    blocks are joined with newlines to form ``thinking``. ``final_answer`` is
    *content* with those blocks (and the whitespace trailing each one)
    removed, then stripped. When no block is present, ``thinking`` is the
    empty string and ``final_answer`` is simply ``content.strip()``.
    """
    parts = _THINK_RE.findall(content)
    thinking = "\n".join(p.strip() for p in parts).strip()
    answer = _THINK_RE.sub("", content).strip()
    return thinking, answer


def _post_json(path: str, payload: dict[str, Any]) -> dict[str, Any]:
    """POST *payload* as JSON to ``HOST + path`` and return the decoded reply.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
    """
    r = requests.post(f"{HOST}{path}", json=payload, timeout=_TIMEOUT_S)
    r.raise_for_status()
    return r.json()


# ---------- 1. Simple chat ----------
def chat(prompt: str, system: str | None = None) -> dict[str, Any]:
    """Send one non-streaming chat request to the native Ollama endpoint.

    Args:
        prompt: The user message.
        system: Optional system message; omitted from the request when falsy.

    Returns:
        The full decoded JSON response (the reply is under ``"message"``).

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
    """
    msgs: list[dict[str, Any]] = []
    if system:
        msgs.append({"role": "system", "content": system})
    msgs.append({"role": "user", "content": prompt})
    return _post_json(
        "/api/chat",
        {"model": MODEL, "messages": msgs, "stream": False},
    )


# ---------- 2. Streaming ----------
def chat_stream(prompt: str) -> Iterator[str]:
    """Yield content tokens as they arrive.

    The response body is read line by line, each non-empty line being one
    JSON object. Iteration stops at the first chunk whose ``done`` is truthy.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
        RuntimeError: if a streamed chunk carries an ``error`` field.
    """
    with requests.post(
        f"{HOST}/api/chat",
        json={
            "model": MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "stream": True,
        },
        stream=True,
        timeout=_TIMEOUT_S,
    ) as r:
        r.raise_for_status()
        for line in r.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            # The HTTP status is already 200 once streaming has begun, so a
            # failure part-way through shows up as an in-band error object.
            if "error" in chunk:
                raise RuntimeError(f"Ollama stream error: {chunk['error']}")
            if "message" in chunk and "content" in chunk["message"]:
                yield chunk["message"]["content"]
            if chunk.get("done"):
                break


# ---------- 3. Tool calling ----------
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given city",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string", "description": "City name"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["city", "unit"],
        },
    },
}


def fake_weather(city: str, unit: str) -> str:
    """Stand-in tool implementation; returns a fixed JSON weather report."""
    return json.dumps(
        {"city": city, "temperature": 14, "unit": unit, "conditions": "light rain"}
    )


def _run_tool(name: str, arguments: Any) -> str:
    """Run the tool called *name* and return its JSON-encoded result.

    Problems (unknown tool, missing/unexpected/non-mapping arguments) are
    returned as a JSON ``{"error": ...}`` string instead of being raised, so
    the caller can hand them back to the model as the tool result.
    """
    if name != "get_current_weather":
        return json.dumps({"error": f"unknown tool {name}"})
    try:
        return fake_weather(**arguments)
    except TypeError as exc:
        # The model may omit a required key, invent an extra one, or send
        # something that is not a mapping at all.
        return json.dumps({"error": f"bad arguments for {name}: {exc}"})


def tool_round_trip(prompt: str) -> str:
    """Single-shot tool call: model -> tool -> model -> final answer.

    Returns the model's ``content`` directly when it requests no tools;
    otherwise runs each requested tool, appends the results to the
    conversation and returns the ``content`` of the follow-up reply.

    Raises:
        requests.HTTPError: if either request gets a 4xx/5xx status.
    """
    history: list[dict[str, Any]] = [{"role": "user", "content": prompt}]
    # `payload` holds a reference to `history`, so messages appended below
    # are included automatically when the same payload is posted again.
    payload: dict[str, Any] = {
        "model": MODEL,
        "messages": history,
        "tools": [WEATHER_TOOL],
        "stream": False,
    }

    msg = _post_json("/api/chat", payload)["message"]
    tool_calls = msg.get("tool_calls")
    if not tool_calls:
        return msg["content"]

    # Replay the assistant turn, keeping any text that came with the calls.
    history.append(
        {
            "role": "assistant",
            "content": msg.get("content") or "",
            "tool_calls": tool_calls,
        }
    )
    for tc in tool_calls:
        fn = tc["function"]
        result = _run_tool(fn["name"], fn.get("arguments") or {})
        history.append({"role": "tool", "tool_name": fn["name"], "content": result})

    return _post_json("/api/chat", payload)["message"]["content"]


# ---------- 4. OpenAI-compatible endpoint ----------
def openai_chat(prompt: str) -> str:
    """Send *prompt* through the OpenAI-compatible endpoint; return the reply text.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
    """
    body = _post_json(
        "/v1/chat/completions",
        {
            "model": MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.6,
        },
    )
    return body["choices"][0]["message"]["content"]


# ---------- demo ----------
def _demo() -> None:
    """Run each example in turn against the configured HOST and MODEL."""
    print("=== 1. simple chat ===")
    resp = chat("What is 84 * 3 / 2?")
    msg = resp["message"]
    # Prefer the dedicated `thinking` field (Ollama 0.24+ / new engine);
    # fall back to extracting <think>...</think> from `content` for
    # older builds that inline the reasoning.
    thinking = (msg.get("thinking") or "").strip()
    answer = msg.get("content", "")
    if not thinking:
        thinking, answer = split_thinking(answer)
    if thinking:
        print(f"[thinking] {thinking[:200]}...")
    print(f"[answer]   {answer}")

    print("\n=== 2. streaming ===")
    for tok in chat_stream("Count from 1 to 5 in one line."):
        sys.stdout.write(tok)
        sys.stdout.flush()
    print()

    print("\n=== 3. tool round-trip ===")
    print(tool_round_trip("What is the weather in Paris in celsius?"))

    print("\n=== 4. OpenAI-compat ===")
    print(openai_chat("Say 'OpenAI endpoint OK' and nothing else."))


if __name__ == "__main__":
    _demo()