| | |
| | """ |
| | llama.cpp chat with zindango-slm (GGUF) for English chat verification. |
| | Uses llama-cpp-python with the Q8_0 quantized model from Hugging Face. |
| | """ |
| | import os |
| | import sys |
| |
|
| |
|
def _resolve_gguf_path():
    """Locate the Q8_0 GGUF locally, downloading it from Hugging Face if missing.

    Returns the absolute path to the GGUF file, or None when the download
    fails (the error is printed for the user).
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    model_dir = os.path.join(project_root, "models", "zindango-slm")
    gguf_path = os.path.join(model_dir, "zindango-slm-Q8_0.gguf")

    if os.path.isfile(gguf_path):
        return gguf_path

    print(f"GGUF not found at {gguf_path}")
    print("Download with: huggingface-cli download ksjpswaroop/zindango-slm zindango-slm-Q8_0.gguf --local-dir models/zindango-slm")
    os.makedirs(model_dir, exist_ok=True)
    try:
        from huggingface_hub import hf_hub_download
        print("Downloading zindango-slm-Q8_0.gguf from Hugging Face...")
        # NOTE: local_dir_use_symlinks is deprecated and ignored by modern
        # huggingface_hub; passing local_dir alone already materializes a
        # real file in model_dir.
        return hf_hub_download(
            repo_id="ksjpswaroop/zindango-slm",
            filename="zindango-slm-Q8_0.gguf",
            local_dir=model_dir,
        )
    except Exception as e:
        # Best-effort download: report and let the caller abort cleanly.
        print(f"Download failed: {e}")
        return None


def _chat_loop(llm):
    """Run the interactive chat REPL against *llm* until the user quits.

    Blocks on stdin; supports /quit (and aliases) and /clear commands.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant. Always respond in English."},
    ]

    print("\n" + "=" * 60)
    print("zindango-slm Chat (llama.cpp) - English verification")
    print("=" * 60)
    print("Type your message and press Enter. Commands: /quit, /clear")
    print()

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C at the prompt ends the session gracefully.
            print("\nBye!")
            break

        if not user_input:
            continue
        if user_input.lower() in ("/quit", "/exit", "quit", "exit"):
            print("Bye!")
            break
        if user_input.lower() == "/clear":
            # Keep only the system prompt; drop the accumulated turns.
            messages = [messages[0]]
            print("[Context cleared]")
            continue

        messages.append({"role": "user", "content": user_input})

        print("Assistant: ", end="", flush=True)
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=512,
            temperature=0.7,
            stream=True,
        )
        full_reply = ""
        for chunk in stream:
            # Streamed chunks carry incremental deltas; the first chunk may
            # hold only {"role": ...}, so content is fetched defensively.
            delta = chunk["choices"][0].get("delta", {})
            content = delta.get("content", "")
            if content:
                print(content, end="", flush=True)
                full_reply += content
        print()

        # Only record non-empty replies so the chat history stays well-formed.
        if full_reply:
            messages.append({"role": "assistant", "content": full_reply})


def main():
    """Chat with zindango-slm (GGUF, Q8_0) via llama-cpp-python.

    Verifies English chat behavior interactively. Returns a process exit
    code: 0 on normal exit, 1 when the runtime or the model is unavailable.
    """
    try:
        from llama_cpp import Llama
    except ImportError:
        print("llama-cpp-python not installed.")
        print("Install: pip install llama-cpp-python")
        print("Or use pre-built wheels: pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu")
        print("For GPU: pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121")
        print("\nAlternatively run: ./scripts/llamacpp_chat.sh (requires llama-cli from llama.cpp)")
        return 1

    gguf_path = _resolve_gguf_path()
    if gguf_path is None:
        return 1

    print("Loading zindango-slm (Q8_0)...")
    llm = Llama(
        model_path=gguf_path,
        n_ctx=2048,
        n_threads=os.cpu_count() or 4,  # use all cores; fall back to 4 if undetectable
        chat_format="chatml",
        verbose=False,
    )

    _chat_loop(llm)
    return 0
| |
|
| |
|
| | if __name__ == "__main__": |
| | sys.exit(main()) |
| |
|