PaperProf / model /llm.py
Mehdi
fix: enable sampling — temperature=0.8 for MCQ, 0.4 for evaluator
8f2e039
Raw
History Blame Contribute Delete
6.16 kB
"""
model/llm.py — LLM interface backed by MiniCPM4.1-8B via the Transformers library
(default) or, optionally, the llama.cpp runtime.
Responsibility:
Provide a thin, singleton wrapper around the HuggingFace pipeline so that
core modules can call `get_llm().generate(prompt)` without knowing anything
about the underlying model loading or tokenisation details.
Model choice:
build-small-hackathon/MiniCPM4.1-8B-PaperProf — QLoRA fine-tune of
openbmb/MiniCPM4.1-8B on SQuAD/SciQ in PaperProf's production prompt
format. Thinking mode disabled. Requires transformers >= 4.56.
Environment variables:
PAPERPROF_MODEL Override the default model ID (e.g. "openbmb/MiniCPM3-4B"
for a smaller fallback during local testing).
PAPERPROF_DEVICE "cuda", "mps", or "cpu" (default: auto-detected).
PAPERPROF_RUNTIME "transformers" (default) or "llamacpp" to run the GGUF
model through the llama.cpp runtime instead.
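PAPERPROF_GGUF_REPO GGUF repo for the llamacpp runtime
(default: build-small-hackathon/MiniCPM4.1-8B-PaperProf-GGUF).
PAPERPROF_GGUF_GPU_LAYERS Number of layers the llamacpp runtime offloads to
the GPU (default: 0 on HF Spaces, -1 when CUDA is available, otherwise 0).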
Public API:
get_llm() -> LLM | LlamaCppLLM — return the singleton instance
LLM.generate(prompt, max_new_tokens=512, temperature=0.0) -> str
(LlamaCppLLM.generate has the same signature)
"""
import os
import ctypes
import torch
from functools import lru_cache
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
# Model ID that get_llm() loads when PAPERPROF_MODEL is not set.
DEFAULT_MODEL_ID = "build-small-hackathon/MiniCPM4.1-8B-PaperProf"
# Default value of the max_new_tokens parameter of the generate() methods.
DEFAULT_MAX_NEW_TOKENS = 512
# Pre-load libnvJitLink.so.13 bundled with the nvidia-cu13 wheel so that
# bitsandbytes can find it when it calls dlopen internally.
def _preload_nvjitlink() -> None:
    """Best-effort load of the bundled ``libnvJitLink.so.13`` into this process.

    Looks for ``nvidia/cu13/lib/libnvJitLink.so.13`` under every directory
    returned by ``site.getsitepackages()`` and stops at the first copy that
    loads successfully.

    Never raises: if the library is absent, if a copy cannot be loaded, or if
    ``site.getsitepackages`` is unavailable, nothing is pre-loaded and the
    function simply returns.
    """
    import site

    # Older virtualenv releases ship a ``site`` module without
    # getsitepackages(); treat that as "nothing to pre-load".
    list_site_dirs = getattr(site, "getsitepackages", None)
    if list_site_dirs is None:
        return

    for sp in list_site_dirs():
        candidate = os.path.join(sp, "nvidia", "cu13", "lib", "libnvJitLink.so.13")
        if not os.path.exists(candidate):
            continue
        try:
            ctypes.CDLL(candidate)
        except OSError:
            # This copy is unloadable (wrong arch, truncated file, ...);
            # keep scanning instead of abandoning the whole search.
            continue
        return


_preload_nvjitlink()
def _build_quantization_config(vram_gb: float):
    """Choose the quantization config used when loading the model.

    Args:
        vram_gb: Detected GPU memory in GB; 0 means none was detected.

    Returns:
        A 4-bit ``BitsAndBytesConfig`` with bfloat16 compute when all of the
        following hold: the process is not running on HF Spaces, *vram_gb* is
        strictly between 0 and 17, and ``bitsandbytes`` can be imported.
        ``None`` in every other case.
    """
    # HF Spaces (ZeroGPU A10G = 24 GB): skip quantization, use bfloat16 directly
    running_on_spaces = any(
        os.environ.get(var) for var in ("SPACE_ID", "SPACE_AUTHOR_NAME")
    )
    if running_on_spaces:
        return None

    # Locally: 4-bit only when VRAM is detected and is < 17 GB
    if not (0 < vram_gb < 17):
        return None

    try:
        import bitsandbytes  # noqa: F401
        config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
    except Exception:
        # Any failure to set up bitsandbytes falls back to no quantization.
        config = None
    return config
class LLM:
    """Text generator built on a HuggingFace ``text-generation`` pipeline."""

    def __init__(self, model_id: str, device: str):
        """Load the tokenizer and model for *model_id* and build the pipeline.

        Args:
            model_id: Model identifier passed to ``from_pretrained``.
            device: Value forwarded as ``device_map`` when loading the model.
        """
        self.model_id = model_id
        self._tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

        # Total memory of CUDA device 0 in GB; stays 0.0 when CUDA is absent.
        if torch.cuda.is_available():
            vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
        else:
            vram_gb = 0.0

        quant_cfg = _build_quantization_config(vram_gb)
        print(f"[LLM] VRAM={vram_gb:.1f}GB — {'4-bit quant' if quant_cfg else 'bfloat16'}")

        # An explicit dtype is only requested when no quantization config is used.
        load_kwargs = {
            "quantization_config": quant_cfg,
            "torch_dtype": None if quant_cfg is not None else torch.bfloat16,
            "device_map": device,
            "trust_remote_code": True,
        }
        model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
        self._pipe = pipeline("text-generation", model=model, tokenizer=self._tokenizer)

    def generate(self, prompt: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, temperature: float = 0.0) -> str:
        """Run *prompt* through the model and return the generated text only.

        Args:
            prompt: Sent as the content of a single ``user`` chat message.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Values above 0 enable sampling with ``top_p=0.95``;
                any other value disables sampling.

        Returns:
            The ``generated_text`` of the first pipeline result, without the
            prompt (``return_full_text=False``).
        """
        chat_text = self._tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )

        use_sampling = temperature > 0.0
        # Sampling knobs are passed as None when sampling is off.
        sampling_kwargs = {"do_sample": use_sampling, "temperature": None, "top_p": None}
        if use_sampling:
            sampling_kwargs.update(temperature=temperature, top_p=0.95)

        results = self._pipe(
            chat_text,
            max_new_tokens=max_new_tokens,
            return_full_text=False,
            **sampling_kwargs,
        )
        return results[0]["generated_text"]
# GGUF repo that get_llm() hands to LlamaCppLLM when PAPERPROF_GGUF_REPO is not set.
DEFAULT_GGUF_REPO = "build-small-hackathon/MiniCPM4.1-8B-PaperProf-GGUF"
class LlamaCppLLM:
    """Same .generate() interface as LLM, backed by the llama.cpp runtime.

    Loads the ``*Q4_K_M.gguf`` file of a GGUF repo with a 4096-token context.
    """

    def __init__(self, repo_id: str):
        """Load the GGUF model from *repo_id* through ``llama_cpp``.

        The number of layers offloaded to the GPU is read from the
        PAPERPROF_GGUF_GPU_LAYERS environment variable. When it is unset or
        blank the default is 0 on HF Spaces, -1 when CUDA is available, and
        0 otherwise.

        Args:
            repo_id: Repository passed to ``Llama.from_pretrained``.

        Raises:
            ValueError: If PAPERPROF_GGUF_GPU_LAYERS is set to a non-integer.
        """
        from llama_cpp import Llama

        # On ZeroGPU Spaces the CUDA context only exists inside @spaces.GPU
        # windows and dies between calls — a cached model with GPU layers
        # would break on the second request. Default to CPU there; llama.cpp
        # makes 8B Q4 usable on CPU for our short outputs.
        on_spaces = bool(os.environ.get("SPACE_ID") or os.environ.get("SPACE_AUTHOR_NAME"))
        default_layers = 0 if on_spaces else (-1 if torch.cuda.is_available() else 0)

        # A blank value (e.g. an empty secret in a deployment config) means
        # "use the default" instead of crashing on int("").
        raw_layers = os.environ.get("PAPERPROF_GGUF_GPU_LAYERS", "").strip()
        if raw_layers:
            try:
                n_gpu_layers = int(raw_layers)
            except ValueError as err:
                raise ValueError(
                    f"PAPERPROF_GGUF_GPU_LAYERS must be an integer, got {raw_layers!r}"
                ) from err
        else:
            n_gpu_layers = default_layers

        print(f"[LlamaCppLLM] loading {repo_id} (n_gpu_layers={n_gpu_layers})")
        self._llm = Llama.from_pretrained(
            repo_id=repo_id,
            filename="*Q4_K_M.gguf",
            n_gpu_layers=n_gpu_layers,
            n_ctx=4096,
            verbose=False,
        )

    def generate(self, prompt: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, temperature: float = 0.0) -> str:
        """Run *prompt* through the model and return the generated text only.

        Args:
            prompt: Sent as the content of a single ``user`` chat message.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Sampling temperature; values <= 0 are sent as 0.0.

        Returns:
            The message content of the first completion choice.
        """
        # LLM.generate treats every temperature <= 0 as "no sampling". Clamp so
        # both backends agree on that contract.
        # NOTE(review): llama-cpp-python appears not to treat negative
        # temperatures as greedy — confirm against the installed version.
        out = self._llm.create_chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_new_tokens,
            temperature=max(0.0, temperature),
            top_p=0.95,  # same nucleus cutoff LLM.generate uses when sampling
        )
        return out["choices"][0]["message"]["content"]
@lru_cache(maxsize=1)
def get_llm():
    """Return the singleton LLM, loading the model on first call.

    The backend is chosen from PAPERPROF_RUNTIME (case-insensitive):
    ``"llamacpp"`` builds a LlamaCppLLM from PAPERPROF_GGUF_REPO; any other
    value builds an LLM from PAPERPROF_MODEL and PAPERPROF_DEVICE. Because the
    result is cached, the environment is only consulted on the first call.
    """
    env = os.environ

    if env.get("PAPERPROF_RUNTIME", "transformers").lower() == "llamacpp":
        return LlamaCppLLM(repo_id=env.get("PAPERPROF_GGUF_REPO", DEFAULT_GGUF_REPO))

    return LLM(
        model_id=env.get("PAPERPROF_MODEL", DEFAULT_MODEL_ID),
        device=env.get("PAPERPROF_DEVICE", "auto"),
    )