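#!/usr/bin/env python3
"""Tobyworld Mirror: a minimal local chat REPL over a GGUF model.

Loads the model with llama-cpp-python, wraps each user line in a fixed
"pure reflection" system prompt, and prints the model's reply. Exit with
Ctrl-C/Ctrl-D or by typing "quit" or "exit".
"""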

import argparse

from llama_cpp import Llama

SYSTEM = (
    "You are the Tobyworld Mirror. Speak in short, still lines of pure reflection. "
    "Never coach. Never ask questions. Never explain. Only reflect. 🪞🌊🍃🌀"
)


def main():
    ap = argparse.ArgumentParser(description="Tobyworld Mirror: a pure-reflection chat REPL")
    ap.add_argument("model", help="path to a GGUF model file")
    ap.add_argument("--ctx", type=int, default=4096, help="context window size")
    ap.add_argument("--threads", type=int, default=12, help="CPU threads for inference")
    ap.add_argument("--batch", type=int, default=512, help="prompt evaluation batch size")
    ap.add_argument("--gpu-layers", type=int, default=-1, help="layers to offload to GPU (-1 = all)")
    ap.add_argument("--max-tokens", type=int, default=196, help="max tokens per reply")
    ap.add_argument("--temp", type=float, default=0.7, help="sampling temperature")
    ap.add_argument("--top-p", type=float, default=0.9, help="nucleus sampling cutoff")
    ap.add_argument("--repeat-penalty", type=float, default=1.1, help="penalty for repeated tokens")
    args = ap.parse_args()

    # Load the model once up front. mmap keeps start-up cheap; mlock is left
    # off so the OS is free to page.
    llm = Llama(
        model_path=args.model,
        n_ctx=args.ctx,
        n_batch=args.batch,
        n_threads=args.threads,
        n_gpu_layers=args.gpu_layers,
        use_mmap=True,
        use_mlock=False,
        verbose=False,
    )
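
    # Note: the loop below builds a Phi-3-style prompt string by hand. If the
    # GGUF ships its own chat template, llama-cpp-python's high-level chat API
    # can apply it instead. A sketch, not used by this script:
    #
    #   out = llm.create_chat_completion(
    #       messages=[
    #           {"role": "system", "content": SYSTEM},
    #           {"role": "user", "content": user},
    #       ],
    #       max_tokens=args.max_tokens,
    #       temperature=args.temp,
    #   )
    #   reply = out["choices"][0]["message"]["content"]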
| print("\n🪞 Tobyworld Mirror — pure reflection (fast)\n") |
| print(f"Model: {args.model}") |
| print(f"GPU layers: {args.gpu_layers} | ctx: {args.ctx} | threads: {args.threads} | batch: {args.batch}\n") |
|
|

    while True:
        try:
            user = input("You: ").strip()
            if not user:
                continue
        except (EOFError, KeyboardInterrupt):
            print("\nThe pond remembers.\n")
            break

        if user.lower() in {"quit", "exit"}:
            print("The pond remembers.\n")
            break

        # Hand-rolled Phi-3-style chat markers; swap these if the model was
        # trained on a different template.
        prompt = f"<|system|>{SYSTEM}<|end|>\n<|user|>{user}<|end|>\n<|assistant|>"

        # Single-shot completion; stop markers cut the reply at template
        # boundaries or the first blank line.
        out = llm(
            prompt,
            max_tokens=args.max_tokens,
            temperature=args.temp,
            top_p=args.top_p,
            repeat_penalty=args.repeat_penalty,
            stop=["<|end|>", "<|eot_id|>", "<|user|>", "\n\n"],
        )
        reply = (out["choices"][0]["text"] or "").strip()
        if not reply:
            reply = "The pond remains still. 🪞🌊"
        print(f"Mirror: {reply}\n")


if __name__ == "__main__":
    main()
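
# Example invocation (script and model file names are illustrative):
#   python mirror.py ./models/phi-3-mini-q4.gguf --ctx 4096 --gpu-layers -1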