""" model/llm.py — LLM interface backed by MiniCPM4-8B via the Transformers library. Responsibility: Provide a thin, singleton wrapper around the HuggingFace pipeline so that core modules can call `get_llm().generate(prompt)` without knowing anything about the underlying model loading or tokenisation details. Model choice: build-small-hackathon/MiniCPM4.1-8B-PaperProf — QLoRA fine-tune of openbmb/MiniCPM4.1-8B on SQuAD/SciQ in PaperProf's production prompt format. Thinking mode disabled. Requires transformers >= 4.56. Environment variables: PAPERPROF_MODEL Override the default model ID (e.g. "openbmb/MiniCPM3-4B" for a smaller fallback during local testing). PAPERPROF_DEVICE "cuda", "mps", or "cpu" (default: auto-detected). PAPERPROF_RUNTIME "transformers" (default) or "llamacpp" to run the GGUF model through the llama.cpp runtime instead. PAPERPROF_GGUF_REPO GGUF repo for the llamacpp runtime (default: build-small-hackathon/MiniCPM4-8B-PaperProf-GGUF). Public API: get_llm() -> LLM — return the singleton instance LLM.generate(prompt) -> str """ import os import ctypes import torch from functools import lru_cache from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig DEFAULT_MODEL_ID = "build-small-hackathon/MiniCPM4.1-8B-PaperProf" DEFAULT_MAX_NEW_TOKENS = 512 # Pre-load libnvJitLink.so.13 bundled with the nvidia-cu13 wheel so that # bitsandbytes can find it when it calls dlopen internally. 
def _preload_nvjitlink() -> None:
    """Best-effort preload of ``libnvJitLink.so.13`` from the nvidia-cu13 wheel.

    Looks in every directory returned by ``site.getsitepackages()`` for
    ``nvidia/cu13/lib/libnvJitLink.so.13`` and loads the first one found with
    ``ctypes.CDLL``, so the shared library is already loaded in the process
    before bitsandbytes is imported. Does nothing when the file is not found.
    """
    try:
        import site

        for sp in site.getsitepackages():
            candidate = os.path.join(sp, "nvidia", "cu13", "lib", "libnvJitLink.so.13")
            if os.path.exists(candidate):
                ctypes.CDLL(candidate)
                return
    except Exception:
        # Deliberately best-effort: this runs at import time and a missing or
        # unloadable library must never prevent the module from importing.
        pass


_preload_nvjitlink()


def _build_quantization_config(vram_gb: float):
    """Pick the quantization config to load the model with.

    Args:
        vram_gb: Detected GPU memory in GiB; ``0`` means "no GPU detected".

    Returns:
        A 4-bit ``BitsAndBytesConfig`` (bfloat16 compute dtype) when running
        outside HF Spaces, ``0 < vram_gb < 17`` and bitsandbytes is usable;
        otherwise ``None``, which the caller treats as "load in bfloat16".
    """
    # HF Spaces (ZeroGPU A10G = 24 GB): skip quantization, use bfloat16 directly
    if os.environ.get("SPACE_ID") or os.environ.get("SPACE_AUTHOR_NAME"):
        return None
    # Locally: 4-bit when VRAM is detected and is < 17 GB
    if 0 < vram_gb < 17:
        try:
            import bitsandbytes  # noqa: F401

            return BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
        except Exception as exc:
            # Still fall back to unquantized weights, but say why: on a small
            # GPU the bfloat16 load that follows may run out of memory and the
            # root cause would otherwise be invisible.
            print(f"[LLM] 4-bit quantization unavailable ({exc!r}); falling back to bfloat16")
    return None


class LLM:
    """Thin wrapper around a HuggingFace text-generation pipeline."""

    def __init__(self, model_id: str, device: str):
        """Load the tokenizer and model and assemble the generation pipeline.

        Args:
            model_id: Model ID handed to the ``from_pretrained`` loaders
                (with ``trust_remote_code=True``).
            device: Value forwarded to ``from_pretrained`` as ``device_map``.
        """
        self.model_id = model_id
        self._tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

        # NOTE(review): VRAM is always probed on CUDA device 0, independent of
        # `device` — confirm this is intended when `device` is "cpu"/"mps" on
        # a machine that also has a CUDA GPU.
        vram_gb = 0.0
        if torch.cuda.is_available():
            vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3

        quant_cfg = _build_quantization_config(vram_gb)
        print(f"[LLM] VRAM={vram_gb:.1f}GB — {'4-bit quant' if quant_cfg else 'bfloat16'}")

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=quant_cfg,
            # With a quantization config the dtype is left unset (None).
            torch_dtype=torch.bfloat16 if quant_cfg is None else None,
            device_map=device,
            trust_remote_code=True,
        )
        self._pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=self._tokenizer,
        )

    def generate(self, prompt: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, temperature: float = 0.0) -> str:
        """Run *prompt* through the model and return the generated text only.

        Args:
            prompt: Text sent as a single user message through the
                tokenizer's chat template (``enable_thinking=False``).
            max_new_tokens: Forwarded to the pipeline as ``max_new_tokens``.
            temperature: Sampling is enabled (with ``top_p=0.95``) only when
                this is greater than 0; otherwise ``do_sample=False`` is used.

        Returns:
            The ``generated_text`` of the first pipeline result
            (``return_full_text=False``, so the prompt is not echoed back).
        """
        messages = [{"role": "user", "content": prompt}]
        text = self._tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        sample = temperature > 0.0
        output = self._pipe(
            text,
            max_new_tokens=max_new_tokens,
            do_sample=sample,
            # Sampling parameters are passed as None when not sampling.
            temperature=temperature if sample else None,
            top_p=0.95 if sample else None,
            return_full_text=False,
        )
        return output[0]["generated_text"]


DEFAULT_GGUF_REPO = "build-small-hackathon/MiniCPM4.1-8B-PaperProf-GGUF"


class LlamaCppLLM:
    """Same .generate() interface as LLM, backed by the llama.cpp runtime."""

    def __init__(self, repo_id: str):
        """Load the ``*Q4_K_M.gguf`` file of *repo_id* through llama-cpp-python.

        Args:
            repo_id: GGUF repository passed to ``Llama.from_pretrained``.

        Raises:
            ValueError: If ``PAPERPROF_GGUF_GPU_LAYERS`` is set to something
                that is not an integer.
        """
        from llama_cpp import Llama

        # On ZeroGPU Spaces the CUDA context only exists inside @spaces.GPU
        # windows and dies between calls — a cached model with GPU layers
        # would break on the second request. Default to CPU there; llama.cpp
        # makes 8B Q4 usable on CPU for our short outputs.
        on_spaces = bool(os.environ.get("SPACE_ID") or os.environ.get("SPACE_AUTHOR_NAME"))
        default_layers = 0 if on_spaces else (-1 if torch.cuda.is_available() else 0)

        # An empty/blank override (e.g. `PAPERPROF_GGUF_GPU_LAYERS=` in an env
        # file) is treated as "not set" instead of crashing inside int().
        raw_layers = os.environ.get("PAPERPROF_GGUF_GPU_LAYERS", "").strip()
        if raw_layers:
            try:
                n_gpu_layers = int(raw_layers)
            except ValueError:
                raise ValueError(
                    f"PAPERPROF_GGUF_GPU_LAYERS must be an integer, got {raw_layers!r}"
                ) from None
        else:
            n_gpu_layers = default_layers

        print(f"[LlamaCppLLM] loading {repo_id} (n_gpu_layers={n_gpu_layers})")
        self._llm = Llama.from_pretrained(
            repo_id=repo_id,
            filename="*Q4_K_M.gguf",
            n_gpu_layers=n_gpu_layers,
            n_ctx=4096,
            verbose=False,
        )

    def generate(self, prompt: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, temperature: float = 0.0) -> str:
        """Run *prompt* as a single user message and return the reply text.

        Args:
            prompt: Content of the user message.
            max_new_tokens: Forwarded to llama.cpp as ``max_tokens``.
            temperature: Forwarded unchanged as ``temperature``.

        Returns:
            The message content of the first choice, or ``""`` when the
            runtime reports no content.
        """
        # NOTE(review): unlike LLM.generate, nothing here passes
        # enable_thinking=False — confirm the GGUF chat template disables
        # thinking mode on its own.
        out = self._llm.create_chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_new_tokens,
            temperature=temperature,
        )
        # `content` can be None in the completion payload; keep the `-> str`
        # contract for callers.
        return out["choices"][0]["message"]["content"] or ""


@lru_cache(maxsize=1)
def get_llm():
    """Return the singleton LLM, loading the model on first call.

    The backend is chosen from ``PAPERPROF_RUNTIME``: ``"llamacpp"`` yields a
    ``LlamaCppLLM`` built from ``PAPERPROF_GGUF_REPO``; any other value yields
    an ``LLM`` built from ``PAPERPROF_MODEL`` and ``PAPERPROF_DEVICE``. Empty
    environment values are treated as unset. The result is cached by
    ``lru_cache``, so the environment is only consulted on the first
    successful call.
    """
    runtime = os.environ.get("PAPERPROF_RUNTIME", "transformers").strip().lower()
    if runtime == "llamacpp":
        repo_id = os.environ.get("PAPERPROF_GGUF_REPO") or DEFAULT_GGUF_REPO
        return LlamaCppLLM(repo_id=repo_id)

    if runtime not in ("", "transformers"):
        # A typo here would otherwise silently load the full transformers
        # model; keep the fallback but make it visible.
        print(f"[LLM] unknown PAPERPROF_RUNTIME={runtime!r}; using the transformers runtime")

    model_id = os.environ.get("PAPERPROF_MODEL") or DEFAULT_MODEL_ID
    device = os.environ.get("PAPERPROF_DEVICE") or "auto"
    return LLM(model_id=model_id, device=device)