"""Thin convenience wrapper around a local llama-cpp-python chat model.

The model is loaded lazily on first use from ``dolphin.gguf`` located next to
this file, and is then cached for the lifetime of the process.
"""

import os
from pathlib import Path
from typing import Iterable, Iterator, List, Optional

try:
    from llama_cpp import Llama
except ImportError:
    # Keep the module importable without the optional dependency; get_llm()
    # reports the problem when a model is actually requested.
    Llama = None

MODEL_PATH = Path(__file__).parent / "dolphin.gguf"

# Roles accepted from caller-supplied history, and from raw message lists.
_HISTORY_ROLES = frozenset({"user", "assistant"})
_CHAT_ROLES = frozenset({"system", "user", "assistant"})

_llm = None


def get_llm():
    """Return the process-wide ``Llama`` instance, loading it on first call.

    Raises:
        RuntimeError: If llama-cpp-python could not be imported.
        FileNotFoundError: If the model file does not exist at ``MODEL_PATH``.
    """
    global _llm
    if Llama is None:
        raise RuntimeError(
            "llama-cpp-python is not installed. Install requirements.txt to enable local AI replies."
        )
    if _llm is None:
        if not MODEL_PATH.exists():
            raise FileNotFoundError(f"Model not found at {MODEL_PATH}")
        print(f"Loading model from {MODEL_PATH}...")
        _llm = Llama(
            model_path=str(MODEL_PATH),
            n_ctx=4096,
            # Use the available cores but cap at 8; fall back to 4 when the
            # core count cannot be determined.
            n_threads=min(os.cpu_count() or 4, 8),
            n_batch=512,
            verbose=False,
        )
    return _llm


def _filter_messages(items: Iterable[dict], allowed_roles: frozenset) -> List[dict]:
    """Return normalized copies of the items with an allowed role and non-blank text."""
    cleaned = []
    for item in items:
        role = item.get("role")
        raw = item.get("content", "")
        # An explicit None must count as "no content"; str(None) would
        # otherwise inject the literal text "None" into the conversation.
        text = "" if raw is None else str(raw).strip()
        if role in allowed_roles and text:
            cleaned.append({"role": role, "content": text})
    return cleaned


def _build_messages(
    prompt: str,
    system_prompt: Optional[str],
    history: Optional[Iterable[dict]],
) -> List[dict]:
    """Assemble the system message, filtered history and the new user prompt."""
    messages = [
        {"role": "system", "content": "" if system_prompt is None else system_prompt}
    ]
    if history:
        messages.extend(_filter_messages(history, _HISTORY_ROLES))
    messages.append({"role": "user", "content": prompt})
    return messages


def ask(
    prompt: str,
    system_prompt: Optional[str] = None,
    history: Optional[Iterable[dict]] = None,
    temperature: float = 0.7,
    max_tokens: int = 512,
) -> str:
    """Send one prompt to the model and return its complete reply.

    Args:
        prompt: The new user message.
        system_prompt: Optional system instruction; ``None`` is sent as an
            empty system message.
        history: Optional earlier turns as dicts with ``role`` and ``content``
            keys. Only non-blank ``user``/``assistant`` entries are kept.
        temperature: Sampling temperature forwarded to the model.
        max_tokens: Generation limit forwarded to the model.

    Returns:
        The reply text with surrounding whitespace removed, or ``""`` when
        the response carries no content.

    Raises:
        RuntimeError, FileNotFoundError: Propagated from ``get_llm``.
    """
    response = get_llm().create_chat_completion(
        messages=_build_messages(prompt, system_prompt, history),
        temperature=temperature,
        max_tokens=max_tokens,
    )
    content = response["choices"][0]["message"]["content"]
    # Guard against a None content so we return "" instead of raising
    # AttributeError on .strip().
    return (content or "").strip()


def stream_chat(
    messages: Iterable[dict],
    temperature: float = 0.7,
    max_tokens: int = 180,
) -> Iterator[str]:
    """Yield the model's reply to ``messages`` piece by piece.

    Messages whose role is not ``system``/``user``/``assistant`` or whose
    content is blank are dropped before the request is made.

    This is a generator: validation and model loading happen when the first
    item is requested, not when the function is called.

    Raises:
        ValueError: If no usable message remains after filtering.
        RuntimeError, FileNotFoundError: Propagated from ``get_llm``.
    """
    clean_messages = _filter_messages(messages, _CHAT_ROLES)
    if not clean_messages:
        raise ValueError("stream_chat requires at least one message.")

    stream = get_llm().create_chat_completion(
        messages=clean_messages,
        temperature=temperature,
        max_tokens=max_tokens,
        stream=True,
    )
    for chunk in stream:
        choices = chunk.get("choices") or []
        if not choices:
            continue
        delta = choices[0].get("delta") or {}
        content = delta.get("content")
        # Skip chunks whose delta has no (or empty) text.
        if content:
            yield content


def stream_ask(
    prompt: str,
    system_prompt: Optional[str] = None,
    history: Optional[Iterable[dict]] = None,
    temperature: float = 0.7,
    max_tokens: int = 180,
) -> Iterator[str]:
    """Streaming counterpart of ``ask``; returns the ``stream_chat`` iterator.

    Takes the same arguments as ``ask``. Because the messages pass through
    ``stream_chat``, a blank system prompt or blank ``prompt`` is dropped
    rather than sent.
    """
    return stream_chat(
        _build_messages(prompt, system_prompt, history),
        temperature=temperature,
        max_tokens=max_tokens,
    )


def main() -> None:
    """Run a minimal interactive chat loop on stdin/stdout."""
    while True:
        try:
            user_input = input("You: ")
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session cleanly instead of a traceback.
            print()
            break
        command = user_input.strip()
        if command.lower() in {"exit", "quit"}:
            break
        if not command:
            # Nothing to ask; prompt again rather than querying the model.
            continue
        reply = ask(user_input)
        print(f"Model: {reply}")


if __name__ == "__main__":
    main()