Spaces:

build-small-hackathon
/

PaperProf

Sleeping

File size: 6,164 Bytes

"""
model/llm.py — LLM interface backed by MiniCPM4.1-8B via the Transformers library (or, optionally, llama.cpp).

Responsibility:
    Provide a thin, singleton wrapper around the HuggingFace pipeline so that
    core modules can call `get_llm().generate(prompt)` without knowing anything
    about the underlying model loading or tokenisation details.

Model choice:
    build-small-hackathon/MiniCPM4.1-8B-PaperProf — QLoRA fine-tune of
    openbmb/MiniCPM4.1-8B on SQuAD/SciQ in PaperProf's production prompt
    format. Thinking mode disabled. Requires transformers >= 4.56.

Environment variables:
    PAPERPROF_MODEL      Override the default model ID (e.g. "openbmb/MiniCPM3-4B"
                         for a smaller fallback during local testing).
    PAPERPROF_DEVICE     "cuda", "mps", or "cpu" (default: auto-detected).
    PAPERPROF_RUNTIME    "transformers" (default) or "llamacpp" to run the GGUF
                         model through the llama.cpp runtime instead.
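    PAPERPROF_GGUF_REPO  GGUF repo for the llamacpp runtime
                         (default: build-small-hackathon/MiniCPM4.1-8B-PaperProf-GGUF).
    PAPERPROF_GGUF_GPU_LAYERS
                         Number of layers offloaded to the GPU by the llamacpp
                         runtime (default: 0 on HF Spaces; otherwise -1, i.e. all
                         layers, when CUDA is available, else 0).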

Public API:
    get_llm() -> LLM | LlamaCppLLM   — return the singleton instance
    LLM.generate(prompt) -> str      — LlamaCppLLM.generate has the same signature
"""

import os
import ctypes
import torch
from functools import lru_cache
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

# Model ID loaded by get_llm() when the PAPERPROF_MODEL environment variable is not set.
DEFAULT_MODEL_ID = "build-small-hackathon/MiniCPM4.1-8B-PaperProf"
# Default for the max_new_tokens argument of the generate() methods in this module.
DEFAULT_MAX_NEW_TOKENS = 512

def _preload_nvjitlink() -> None:
    """Best-effort preload of ``libnvJitLink.so.13`` from the nvidia-cu13 wheel.

    Every site-packages directory is checked for
    ``nvidia/cu13/lib/libnvJitLink.so.13``; the first copy found is loaded
    with ``ctypes.CDLL`` so that bitsandbytes can find it when it calls
    dlopen internally. Any failure is ignored on purpose: the preload is an
    optimisation, not a requirement.
    """
    try:
        import site

        lib_tail = ("nvidia", "cu13", "lib", "libnvJitLink.so.13")
        for root in site.getsitepackages():
            lib_path = os.path.join(root, *lib_tail)
            if not os.path.exists(lib_path):
                continue
            ctypes.CDLL(lib_path)
            break  # one successful load is enough
    except Exception:
        return


# Run at import time so the library is already loaded before bitsandbytes needs it.
_preload_nvjitlink()


def _build_quantization_config(vram_gb: float):
    """Choose the bitsandbytes quantization config used to load the model.

    Args:
        vram_gb: Detected GPU memory in GiB; ``0`` means no GPU was detected.

    Returns:
        A 4-bit ``BitsAndBytesConfig`` (bfloat16 compute dtype) when running
        outside HF Spaces with more than 0 and less than 17 GB of VRAM and
        bitsandbytes can be imported; ``None`` in every other case, meaning
        the caller should load the model unquantized.
    """
    # HF Spaces (ZeroGPU A10G = 24 GB): skip quantization, use bfloat16 directly
    if os.environ.get("SPACE_ID") or os.environ.get("SPACE_AUTHOR_NAME"):
        return None
    # Locally: 4-bit only when VRAM is detected and is < 17 GB
    if not 0 < vram_gb < 17:
        return None
    try:
        import bitsandbytes  # noqa: F401
        return BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
    except Exception as exc:
        # Still best-effort (bitsandbytes can fail in many ways at import
        # time), but report it: an unquantized load on a small GPU is likely
        # to run out of memory and the reason would otherwise be invisible.
        print(f"[LLM] 4-bit quantization unavailable ({exc!r}); loading without quantization")
        return None


class LLM:
    """Thin wrapper around a HuggingFace text-generation pipeline.

    The tokenizer and model are loaded once in ``__init__``; ``generate()``
    formats the prompt as a single user turn with the tokenizer's chat
    template and returns only the newly generated text.
    """

    def __init__(self, model_id: str, device: str):
        """Load the tokenizer and model and build the generation pipeline.

        Args:
            model_id: Model ID passed to ``from_pretrained``.
            device: Value forwarded as ``device_map`` to ``from_pretrained``
                (e.g. ``"auto"``, ``"cuda"``, ``"mps"`` or ``"cpu"``).
        """
        self.model_id = model_id
        self._tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

        vram_gb = self._detect_vram_gb(device)
        quant_cfg = _build_quantization_config(vram_gb)
        print(f"[LLM] VRAM={vram_gb:.1f}GB — {'4-bit quant' if quant_cfg else 'bfloat16'}")

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=quant_cfg,
            # With a quantization config the dtype is left to the quantizer.
            torch_dtype=torch.bfloat16 if quant_cfg is None else None,
            device_map=device,
            trust_remote_code=True,
        )
        self._pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=self._tokenizer,
        )

    @staticmethod
    def _detect_vram_gb(device: str) -> float:
        """Return the memory of CUDA device 0 in GiB, or 0.0 when not applicable.

        The result drives the 4-bit quantization decision, so it is 0.0 when
        the model is explicitly placed on a non-CUDA device: the size of a
        GPU that will not hold the model must not trigger bitsandbytes
        quantization for a CPU or MPS load.
        """
        if str(device).lower() in ("cpu", "mps"):
            return 0.0
        if not torch.cuda.is_available():
            return 0.0
        return torch.cuda.get_device_properties(0).total_memory / 1024**3

    def generate(self, prompt: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, temperature: float = 0.0) -> str:
        """Run *prompt* through the model and return the generated text only.

        Args:
            prompt: User message; wrapped in the chat template as a single
                user turn with thinking mode disabled.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Sampling temperature. Values ``<= 0`` disable
                sampling; positive values sample with ``top_p=0.95``.

        Returns:
            The ``generated_text`` of the first pipeline result, without the
            prompt (``return_full_text=False``).
        """
        messages = [{"role": "user", "content": prompt}]
        text = self._tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True,
            enable_thinking=False,
        )
        sample = temperature > 0.0
        output = self._pipe(
            text,
            max_new_tokens=max_new_tokens,
            do_sample=sample,
            # NOTE(review): presumably None is passed so the sampling knobs are
            # not applied to greedy decoding — confirm against the pipeline docs.
            temperature=temperature if sample else None,
            top_p=0.95 if sample else None,
            return_full_text=False,
        )
        return output[0]["generated_text"]


# GGUF repo used by get_llm() for the llamacpp runtime when PAPERPROF_GGUF_REPO is not set.
DEFAULT_GGUF_REPO = "build-small-hackathon/MiniCPM4.1-8B-PaperProf-GGUF"


class LlamaCppLLM:
    """llama.cpp-backed counterpart of LLM exposing the same .generate() method."""

    def __init__(self, repo_id: str):
        """Load the ``*Q4_K_M.gguf`` file of *repo_id* via ``Llama.from_pretrained``.

        The number of GPU-offloaded layers comes from
        ``PAPERPROF_GGUF_GPU_LAYERS`` when set; otherwise it is 0 on HF
        Spaces, -1 when CUDA is available, and 0 in all other cases.
        """
        from llama_cpp import Llama

        env = os.environ
        # On ZeroGPU Spaces the CUDA context only exists inside @spaces.GPU
        # windows and dies between calls, so a cached model with GPU layers
        # would break on the second request. CPU is the default there;
        # llama.cpp keeps 8B Q4 usable on CPU for our short outputs.
        if env.get("SPACE_ID") or env.get("SPACE_AUTHOR_NAME"):
            fallback_layers = 0
        elif torch.cuda.is_available():
            fallback_layers = -1
        else:
            fallback_layers = 0
        n_gpu_layers = int(env.get("PAPERPROF_GGUF_GPU_LAYERS", fallback_layers))
        print(f"[LlamaCppLLM] loading {repo_id} (n_gpu_layers={n_gpu_layers})")

        load_options = {
            "repo_id": repo_id,
            "filename": "*Q4_K_M.gguf",
            "n_gpu_layers": n_gpu_layers,
            "n_ctx": 4096,
            "verbose": False,
        }
        self._llm = Llama.from_pretrained(**load_options)

    def generate(self, prompt: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, temperature: float = 0.0) -> str:
        """Send *prompt* as a single user message and return the reply text."""
        chat = [{"role": "user", "content": prompt}]
        completion = self._llm.create_chat_completion(
            messages=chat,
            max_tokens=max_new_tokens,
            temperature=temperature,
        )
        first_choice = completion["choices"][0]
        return first_choice["message"]["content"]


@lru_cache(maxsize=1)
def get_llm():
    """Build the process-wide LLM on the first call and reuse it afterwards.

    ``PAPERPROF_RUNTIME`` (case-insensitive) selects the backend: the value
    ``"llamacpp"`` yields a ``LlamaCppLLM`` for ``PAPERPROF_GGUF_REPO``; any
    other value yields a Transformers-based ``LLM`` configured from
    ``PAPERPROF_MODEL`` and ``PAPERPROF_DEVICE``.
    """
    env = os.environ
    backend = env.get("PAPERPROF_RUNTIME", "transformers").lower()
    if backend != "llamacpp":
        return LLM(
            model_id=env.get("PAPERPROF_MODEL", DEFAULT_MODEL_ID),
            device=env.get("PAPERPROF_DEVICE", "auto"),
        )
    return LlamaCppLLM(repo_id=env.get("PAPERPROF_GGUF_REPO", DEFAULT_GGUF_REPO))