Spaces:
Sleeping
Sleeping
| import os | |
| import llama_cpp._internals | |
# Best-effort workaround for a LlamaModel destructor bug that shows up when
# initialization fails: guarantee that a `sampler` attribute exists on the
# instance before the original close() runs. If the patch cannot be installed
# for any reason, the failure is deliberately ignored.
try:
    _original_close = llama_cpp._internals.LlamaModel.close

    def _safe_close(self, *args, **kwargs):
        """Ensure `self.sampler` exists, then delegate to the original close()."""
        has_sampler = hasattr(self, "sampler")
        if not has_sampler:
            # Give the original close() an attribute to read on a
            # partially-initialized instance.
            self.sampler = None
        return _original_close(self, *args, **kwargs)

    llama_cpp._internals.LlamaModel.close = _safe_close
except Exception:
    pass
| from llama_cpp import Llama | |
| from config import MODEL_PATH | |
# Lazily-created singleton model, and a latch recording that constructing it
# has already failed once (so the failing construction is not retried).
_llm_instance = None
_llama_failed = False


def _resolve_thread_count() -> int:
    """Return the thread count to pass to Llama.

    Uses the LLAMA_THREADS environment variable when it is set to a non-blank
    value; otherwise falls back to the CPU count clamped to the range 1..4
    (or 2 when the CPU count cannot be determined).

    Raises:
        ValueError: If LLAMA_THREADS is set but is not an integer.
    """
    cpu_count = os.cpu_count()
    default_threads = max(1, min(4, cpu_count if cpu_count else 2))

    raw = os.getenv("LLAMA_THREADS")
    if raw is None or not raw.strip():
        # Unset or blank: behave as if the variable was never provided.
        return default_threads
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(
            f"LLAMA_THREADS must be an integer, got {raw!r}"
        ) from err


def get_llm() -> Llama:
    """Lazy-load the LLM so the server starts even without a model file.

    The model is constructed on the first successful call and the same
    instance is returned afterwards.

    Returns:
        The shared Llama instance.

    Raises:
        RuntimeError: If a previous attempt to construct the model failed.
        FileNotFoundError: If no file exists at MODEL_PATH.
        ValueError: If LLAMA_THREADS is set to a non-integer value.
        Exception: Whatever the Llama constructor raises is re-raised as-is
            (and the failure is latched for subsequent calls).
    """
    global _llm_instance, _llama_failed

    if _llama_failed:
        raise RuntimeError("llama-cpp-python previously failed to initialize on this hardware.")

    if _llm_instance is not None:
        return _llm_instance

    if not os.path.exists(MODEL_PATH):
        raise FileNotFoundError(
            f"\n\n Model file not found: {os.path.abspath(MODEL_PATH)}\n"
            f" Download a GGUF model and place it at: {MODEL_PATH}\n"
            f" Recommended: Gemma 3 1B Instruct (Q4_K_M)\n"
            f" URL: https://huggingface.co/bartowski/google_gemma-3-1b-it-GGUF\n"
        )

    # Resolve configuration BEFORE the guarded section: a malformed
    # LLAMA_THREADS value is a fixable configuration error and must not latch
    # `_llama_failed`, which would make every later call report a hardware
    # initialization failure.
    threads = _resolve_thread_count()

    try:
        _llm_instance = Llama(
            model_path=MODEL_PATH,
            n_ctx=2048,
            n_threads=threads,
            verbose=False,
        )
    except Exception:
        # Remember the failure so later calls fail fast instead of retrying.
        _llama_failed = True
        raise
    return _llm_instance
# Keep a module-level alias for backwards compatibility with existing imports
class _LazyLlm:
    """Proxy that loads the real model on first use.

    Calling the proxy, or reading a regular attribute that the proxy itself
    does not define, obtains the model via get_llm() and forwards to it.
    """

    def __call__(self, *args, **kwargs):
        """Obtain the model via get_llm() and call it with the given arguments."""
        return get_llm()(*args, **kwargs)

    def __getattr__(self, name):
        """Forward lookups that miss on the proxy to the real model.

        Dunder names are never forwarded: they are typically probed by
        introspection (hasattr checks, copy/pickle hooks), and answering them
        by loading the model would either trigger the load as a side effect or
        surface a load error where callers expect AttributeError.

        Raises:
            AttributeError: For dunder names not defined on the proxy.
        """
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return getattr(get_llm(), name)


llm = _LazyLlm()