Spaces:
Running on Zero
Running on Zero
| """ | |
| model/llm.py — LLM interface backed by MiniCPM4.1-8B via the Transformers library. | |
| Responsibility: | |
| Provide a thin, singleton wrapper around the HuggingFace pipeline so that | |
| core modules can call `get_llm().generate(prompt)` without knowing anything | |
| about the underlying model loading or tokenisation details. | |
| Model choice: | |
| build-small-hackathon/MiniCPM4.1-8B-PaperProf — QLoRA fine-tune of | |
| openbmb/MiniCPM4.1-8B on SQuAD/SciQ in PaperProf's production prompt | |
| format. Thinking mode disabled. Requires transformers >= 4.56. | |
| Environment variables: | |
| PAPERPROF_MODEL Override the default model ID (e.g. "openbmb/MiniCPM3-4B" | |
| for a smaller fallback during local testing). | |
| PAPERPROF_DEVICE "cuda", "mps", or "cpu" (default: auto-detected). | |
| PAPERPROF_RUNTIME "transformers" (default) or "llamacpp" to run the GGUF | |
| model through the llama.cpp runtime instead. | |
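| PAPERPROF_GGUF_REPO GGUF repo for the llamacpp runtime | |
| (default: build-small-hackathon/MiniCPM4.1-8B-PaperProf-GGUF). | |
| PAPERPROF_GGUF_GPU_LAYERS Value passed to llama.cpp as n_gpu_layers | |
| (default: 0 on HF Spaces or without CUDA, otherwise -1). | |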
| Public API: | |
| get_llm() -> LLM — return the singleton instance | |
| LLM.generate(prompt) -> str | |
| """ | |
| import os | |
| import ctypes | |
| import torch | |
| from functools import lru_cache | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig | |
# Hugging Face model ID that get_llm() loads when PAPERPROF_MODEL is not set.
DEFAULT_MODEL_ID = "build-small-hackathon/MiniCPM4.1-8B-PaperProf"
# Default cap on newly generated tokens for the generate() methods in this module.
DEFAULT_MAX_NEW_TOKENS = 512
# Pre-load libnvJitLink.so.13 bundled with the nvidia-cu13 wheel so that
# bitsandbytes can find it when it calls dlopen internally.
def _preload_nvjitlink() -> None:
    """Best-effort preload of the CUDA 13 nvJitLink shared library.

    Looks for ``nvidia/cu13/lib/libnvJitLink.so.13`` under every directory
    reported by ``site.getsitepackages()`` and stops after the first copy
    that ``ctypes`` manages to load. A copy that exists but cannot be loaded
    is skipped so the remaining directories are still tried. The function
    never raises: when nothing can be loaded it simply returns.
    """
    import site

    # Some virtualenv layouts ship a ``site`` module without getsitepackages().
    list_site_packages = getattr(site, "getsitepackages", None)
    if list_site_packages is None:
        return

    for site_dir in list_site_packages():
        candidate = os.path.join(site_dir, "nvidia", "cu13", "lib", "libnvJitLink.so.13")
        if not os.path.exists(candidate):
            continue
        try:
            ctypes.CDLL(candidate)
        except OSError:
            # This copy is unloadable (wrong platform, missing dependency, ...);
            # keep looking instead of giving up on the whole search.
            continue
        return


# Runs at import time so the library is already mapped before bitsandbytes loads.
_preload_nvjitlink()
def _build_quantization_config(vram_gb: float):
    """Pick the quantization config for the detected amount of VRAM.

    Returns a 4-bit ``BitsAndBytesConfig`` (bfloat16 compute dtype) only when
    all of the following hold; otherwise returns ``None`` so the caller loads
    the model unquantized:

    * neither ``SPACE_ID`` nor ``SPACE_AUTHOR_NAME`` is set (not on HF Spaces),
    * ``vram_gb`` is strictly between 0 and 17,
    * ``bitsandbytes`` imports and the config can be constructed.
    """
    # HF Spaces (ZeroGPU A10G = 24 GB): skip quantization, use bfloat16 directly
    running_on_spaces = os.environ.get("SPACE_ID") or os.environ.get("SPACE_AUTHOR_NAME")
    if running_on_spaces:
        return None

    # Locally: only quantize when VRAM was detected and is below 17 GB
    if not (0 < vram_gb < 17):
        return None

    try:
        import bitsandbytes  # noqa: F401
        config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
    except Exception:
        # bitsandbytes missing or unusable: fall back to no quantization
        config = None
    return config
class LLM:
    """Text generator built on a HuggingFace ``text-generation`` pipeline."""

    def __init__(self, model_id: str, device: str):
        """Load the tokenizer and model for *model_id* and wrap them in a pipeline.

        *device* is forwarded to ``from_pretrained`` as ``device_map``. The
        total memory of CUDA device 0 (0.0 when CUDA is unavailable) is handed
        to ``_build_quantization_config`` to decide between a quantized load
        and a plain bfloat16 load.
        """
        self.model_id = model_id
        self._tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

        if torch.cuda.is_available():
            total_bytes = torch.cuda.get_device_properties(0).total_memory
            vram_gb = total_bytes / 1024**3
        else:
            vram_gb = 0.0

        quant_cfg = _build_quantization_config(vram_gb)
        precision = "4-bit quant" if quant_cfg else "bfloat16"
        print(f"[LLM] VRAM={vram_gb:.1f}GB — {precision}")

        load_kwargs = {
            "quantization_config": quant_cfg,
            # An explicit dtype is only requested for the unquantized load.
            "torch_dtype": torch.bfloat16 if quant_cfg is None else None,
            "device_map": device,
            "trust_remote_code": True,
        }
        model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
        self._pipe = pipeline("text-generation", model=model, tokenizer=self._tokenizer)

    def generate(self, prompt: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, temperature: float = 0.0) -> str:
        """Send *prompt* as a single user turn and return only the newly generated text.

        Sampling (with ``top_p=0.95``) is enabled only when *temperature* is
        above 0; otherwise ``do_sample`` is False and ``temperature``/``top_p``
        are passed as ``None``.
        """
        chat = [{"role": "user", "content": prompt}]
        rendered = self._tokenizer.apply_chat_template(
            chat,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )

        use_sampling = temperature > 0.0
        if use_sampling:
            sampling_kwargs = {"do_sample": use_sampling, "temperature": temperature, "top_p": 0.95}
        else:
            sampling_kwargs = {"do_sample": use_sampling, "temperature": None, "top_p": None}

        results = self._pipe(
            rendered,
            max_new_tokens=max_new_tokens,
            return_full_text=False,
            **sampling_kwargs,
        )
        return results[0]["generated_text"]
# GGUF repo that get_llm() loads for the llamacpp runtime when PAPERPROF_GGUF_REPO is not set.
DEFAULT_GGUF_REPO = "build-small-hackathon/MiniCPM4.1-8B-PaperProf-GGUF"
class LlamaCppLLM:
    """llama.cpp-backed generator exposing the same ``generate()`` call as ``LLM``."""

    def __init__(self, repo_id: str):
        """Load the ``*Q4_K_M.gguf`` file from *repo_id* with a 4096-token context.

        The number of GPU layers comes from ``PAPERPROF_GGUF_GPU_LAYERS`` when
        set; otherwise it is 0 on HF Spaces or without CUDA, and -1 when CUDA
        is available.
        """
        from llama_cpp import Llama

        env = os.environ
        # On ZeroGPU Spaces the CUDA context only exists inside @spaces.GPU
        # windows and dies between calls — a cached model with GPU layers
        # would break on the second request. Default to CPU there; llama.cpp
        # makes 8B Q4 usable on CPU for our short outputs.
        if env.get("SPACE_ID") or env.get("SPACE_AUTHOR_NAME"):
            fallback_layers = 0
        elif torch.cuda.is_available():
            fallback_layers = -1
        else:
            fallback_layers = 0
        n_gpu_layers = int(env.get("PAPERPROF_GGUF_GPU_LAYERS", fallback_layers))

        print(f"[LlamaCppLLM] loading {repo_id} (n_gpu_layers={n_gpu_layers})")
        load_options = {
            "repo_id": repo_id,
            "filename": "*Q4_K_M.gguf",
            "n_gpu_layers": n_gpu_layers,
            "n_ctx": 4096,
            "verbose": False,
        }
        self._llm = Llama.from_pretrained(**load_options)

    def generate(self, prompt: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, temperature: float = 0.0) -> str:
        """Send *prompt* as a single user turn and return the reply's message content."""
        chat = [{"role": "user", "content": prompt}]
        completion = self._llm.create_chat_completion(
            messages=chat,
            max_tokens=max_new_tokens,
            temperature=temperature,
        )
        first_choice = completion["choices"][0]
        return first_choice["message"]["content"]
@lru_cache(maxsize=1)
def _load_llm(runtime: str, model_ref: str, device: str):
    """Build the backend for one resolved configuration.

    Cached with ``maxsize=1`` so repeated requests for the same configuration
    reuse the already-loaded model, while a changed configuration replaces it.
    A constructor that raises is not cached, so the next call retries.
    """
    if runtime == "llamacpp":
        return LlamaCppLLM(repo_id=model_ref)
    return LLM(model_id=model_ref, device=device)


def get_llm():
    """Return the singleton LLM, loading the model on first call.

    The backend is selected by ``PAPERPROF_RUNTIME``: ``"llamacpp"`` (matched
    case-insensitively) loads ``LlamaCppLLM`` from ``PAPERPROF_GGUF_REPO``;
    any other value loads the Transformers-backed ``LLM`` from
    ``PAPERPROF_MODEL`` on ``PAPERPROF_DEVICE`` (default ``"auto"``).

    The environment is re-read on every call, but the model is only loaded
    again when the resolved configuration differs from the previous call.
    """
    runtime = os.environ.get("PAPERPROF_RUNTIME", "transformers").lower()
    if runtime == "llamacpp":
        repo_id = os.environ.get("PAPERPROF_GGUF_REPO", DEFAULT_GGUF_REPO)
        # The device argument is unused by the llama.cpp backend.
        return _load_llm("llamacpp", repo_id, "")

    model_id = os.environ.get("PAPERPROF_MODEL", DEFAULT_MODEL_ID)
    device = os.environ.get("PAPERPROF_DEVICE", "auto")
    # Every non-llamacpp runtime value maps to the same cache key.
    return _load_llm("transformers", model_id, device)