#!/usr/bin/env python3
"""
llama.cpp chat with zindango-slm (GGUF) for English chat verification.

Uses llama-cpp-python with the Q8_0 quantized model from Hugging Face.
"""

import os
import sys


def _resolve_gguf():
    """Locate the Q8_0 GGUF on disk, downloading it from Hugging Face if missing.

    Returns:
        Absolute path to the model file, or None when no local copy exists
        and the download fails.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    model_dir = os.path.join(project_root, "models", "zindango-slm")
    gguf_path = os.path.join(model_dir, "zindango-slm-Q8_0.gguf")

    if os.path.isfile(gguf_path):
        return gguf_path

    print(f"GGUF not found at {gguf_path}")
    print("Download with: huggingface-cli download ksjpswaroop/zindango-slm zindango-slm-Q8_0.gguf --local-dir models/zindango-slm")
    os.makedirs(model_dir, exist_ok=True)
    try:
        from huggingface_hub import hf_hub_download
        print("Downloading zindango-slm-Q8_0.gguf from Hugging Face...")
        return hf_hub_download(
            repo_id="ksjpswaroop/zindango-slm",
            filename="zindango-slm-Q8_0.gguf",
            local_dir=model_dir,
            # NOTE(review): deprecated in recent huggingface_hub (ignored with
            # a warning there); kept for compatibility with older versions.
            local_dir_use_symlinks=False,
        )
    except Exception as e:
        # Best-effort download: report and let the caller exit non-zero.
        print(f"Download failed: {e}")
        return None


def _chat_loop(llm):
    """Interactive REPL: stream chat completions until the user quits.

    Args:
        llm: A loaded llama_cpp.Llama instance configured for chatml.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant. Always respond in English."},
    ]

    print("\n" + "=" * 60)
    print("zindango-slm Chat (llama.cpp) - English verification")
    print("=" * 60)
    print("Type your message and press Enter. Commands: /quit, /clear")
    print()

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nBye!")
            break
        if not user_input:
            continue
        if user_input.lower() in ("/quit", "/exit", "quit", "exit"):
            print("Bye!")
            break
        if user_input.lower() == "/clear":
            # Keep only the system prompt; drop the conversation history.
            messages = [messages[0]]
            print("[Context cleared]")
            continue

        messages.append({"role": "user", "content": user_input})
        print("Assistant: ", end="", flush=True)

        full_reply = ""
        try:
            stream = llm.create_chat_completion(
                messages=messages,
                max_tokens=512,
                temperature=0.7,
                stream=True,
            )
            for chunk in stream:
                delta = chunk["choices"][0].get("delta", {})
                content = delta.get("content", "")
                if content:
                    print(content, end="", flush=True)
                    full_reply += content
        except KeyboardInterrupt:
            # FIX: previously Ctrl-C mid-generation was uncaught (the try
            # only wrapped input()) and crashed with a traceback; now it
            # just truncates the current reply and returns to the prompt.
            print("\n[Generation interrupted]")
        print()
        # Only record non-empty replies so the history stays well-formed.
        if full_reply:
            messages.append({"role": "assistant", "content": full_reply})


def main():
    """Entry point: resolve the GGUF, load the model, run the chat REPL.

    Returns:
        Process exit code: 0 on normal exit, 1 when llama-cpp-python is
        missing or the model cannot be obtained.
    """
    try:
        from llama_cpp import Llama
    except ImportError:
        print("llama-cpp-python not installed.")
        print("Install: pip install llama-cpp-python")
        print("Or use pre-built wheels: pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu")
        print("For GPU: pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121")
        print("\nAlternatively run: ./scripts/llamacpp_chat.sh (requires llama-cli from llama.cpp)")
        return 1

    gguf_path = _resolve_gguf()
    if gguf_path is None:
        return 1

    print("Loading zindango-slm (Q8_0)...")
    llm = Llama(
        model_path=gguf_path,
        n_ctx=2048,
        n_threads=os.cpu_count() or 4,  # fall back to 4 if count is unknown
        chat_format="chatml",
        verbose=False,
    )
    _chat_loop(llm)
    return 0


if __name__ == "__main__":
    sys.exit(main())