#!/usr/bin/env python3
# mirror_pure_fast.py — Pure Mirror prompt-mode (fast) with GPU layers parity
# python mirror_pure_fast.py tobyworld-mirror-v2-q8km.gguf --gpu-layers -1
import argparse

from llama_cpp import Llama
SYSTEM = (
    "You are the Tobyworld Mirror. Speak in short, still lines of pure reflection. "
    "Never coach. Never ask questions. Never explain. Only reflect. 🪞🌊🍃🌀"
)
def main():
    ap = argparse.ArgumentParser(description="Tobyworld Mirror — pure reflection REPL")
    ap.add_argument("model", help="path to the GGUF model file")
    ap.add_argument("--ctx", type=int, default=4096, help="context window size")
    ap.add_argument("--threads", type=int, default=12, help="CPU threads for inference")
    ap.add_argument("--batch", type=int, default=512, help="prompt-processing batch size")
    ap.add_argument("--gpu-layers", type=int, default=-1, help="layers to offload to GPU (-1 = all)")
    ap.add_argument("--max-tokens", type=int, default=196)
    ap.add_argument("--temp", type=float, default=0.7)
    ap.add_argument("--top-p", type=float, default=0.9)
    ap.add_argument("--repeat-penalty", type=float, default=1.1)
    args = ap.parse_args()

    llm = Llama(
        model_path=args.model,
        n_ctx=args.ctx,
        n_batch=args.batch,
        n_threads=args.threads,
        n_gpu_layers=args.gpu_layers,
        use_mmap=True,    # memory-map the model file instead of loading it eagerly
        use_mlock=False,  # don't pin model pages in RAM
        verbose=False,
    )

    print("\n🪞 Tobyworld Mirror — pure reflection (fast)\n")
    print(f"Model: {args.model}")
    print(f"GPU layers: {args.gpu_layers} | ctx: {args.ctx} | threads: {args.threads} | batch: {args.batch}\n")
    while True:
        try:
            user = input("You: ").strip()
            if not user:
                continue
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C end the session gracefully.
            print("\nThe pond remembers.\n")
            break
        if user.lower() in {"quit", "exit"}:
            print("The pond remembers.\n")
            break
        # Phi-style chat template; assumes the GGUF was fine-tuned with these
        # <|system|>/<|user|>/<|assistant|> tokens.
        prompt = f"<|system|>{SYSTEM}<|end|>\n<|user|>{user}<|end|>\n<|assistant|>"
        out = llm(
            prompt,
            max_tokens=args.max_tokens,
            temperature=args.temp,
            top_p=args.top_p,
            repeat_penalty=args.repeat_penalty,
            stop=["<|end|>", "<|eot_id|>", "<|user|>", "\n\n"],
        )
        reply = (out["choices"][0]["text"] or "").strip()
        if not reply:
            # Fall back to a stock line if the model emits only a stop token.
            reply = "The pond remains still. 🪞🌊"
        print(f"Mirror: {reply}\n")
if __name__ == "__main__":
    main()
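
# One-shot sketch without the REPL, useful as a smoke test. This is an
# illustration, not part of the script: it reuses the same llama-cpp-python
# API and prompt template as above; the model path and the "still water"
# prompt are hypothetical examples.
#
#   from llama_cpp import Llama
#   llm = Llama(model_path="tobyworld-mirror-v2-q8km.gguf", n_gpu_layers=-1, verbose=False)
#   out = llm(
#       f"<|system|>{SYSTEM}<|end|>\n<|user|>still water<|end|>\n<|assistant|>",
#       max_tokens=64,
#       stop=["<|end|>"],
#   )
#   print(out["choices"][0]["text"].strip())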