Spaces:

build-small-hackathon
/

PaperProf

Running on Zero

Mehdi

fix: enable sampling — temperature=0.8 for MCQ, 0.4 for evaluator

8f2e039 14 days ago

6.16 kB

	"""
	model/llm.py — LLM interface backed by MiniCPM4.1-8B via the Transformers library.

	Responsibility:
	Provide a thin, singleton wrapper around the HuggingFace pipeline so that
	core modules can call `get_llm().generate(prompt)` without knowing anything
	about the underlying model loading or tokenisation details.

	Model choice:
	build-small-hackathon/MiniCPM4.1-8B-PaperProf — QLoRA fine-tune of
	openbmb/MiniCPM4.1-8B on SQuAD/SciQ in PaperProf's production prompt
	format. Thinking mode disabled. Requires transformers >= 4.56.

	Environment variables:
	PAPERPROF_MODEL Override the default model ID (e.g. "openbmb/MiniCPM3-4B"
	for a smaller fallback during local testing).
	PAPERPROF_DEVICE "cuda", "mps", or "cpu" (default: auto-detected).
	PAPERPROF_RUNTIME "transformers" (default) or "llamacpp" to run the GGUF
	model through the llama.cpp runtime instead.
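	PAPERPROF_GGUF_REPO GGUF repo for the llamacpp runtime
	(default: build-small-hackathon/MiniCPM4.1-8B-PaperProf-GGUF).
	PAPERPROF_GGUF_GPU_LAYERS Number of layers the llamacpp runtime offloads to
	the GPU (default: 0 on HF Spaces or when CUDA is unavailable, otherwise -1).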

	Public API:
	get_llm() -> LLM | LlamaCppLLM — return the singleton instance
	LLM.generate(prompt) -> str
	"""

	import os
	import ctypes
	import torch
	from functools import lru_cache
	from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

	# Hugging Face model ID that get_llm() loads when PAPERPROF_MODEL is not set.
	DEFAULT_MODEL_ID = "build-small-hackathon/MiniCPM4.1-8B-PaperProf"
	# Default value of the `max_new_tokens` argument of the generate() methods.
	DEFAULT_MAX_NEW_TOKENS = 512

	# libnvJitLink.so.13 ships inside the nvidia-cu13 wheel; loading it into the
	# process up front lets bitsandbytes find it when it calls dlopen internally.
	def _preload_nvjitlink() -> None:
	    """Best-effort preload of libnvJitLink.so.13 from site-packages.

	    Looks in every directory returned by ``site.getsitepackages()`` for
	    ``nvidia/cu13/lib/libnvJitLink.so.13`` and loads the first match with
	    ``ctypes.CDLL``. Any ``Exception`` raised along the way is swallowed.
	    """
	    relative_lib = ("nvidia", "cu13", "lib", "libnvJitLink.so.13")
	    try:
	        import site

	        for packages_dir in site.getsitepackages():
	            lib_path = os.path.join(packages_dir, *relative_lib)
	            if not os.path.exists(lib_path):
	                continue
	            # First hit wins; no need to look at the remaining directories.
	            ctypes.CDLL(lib_path)
	            break
	    except Exception:
	        # Deliberately silent: the preload is optional.
	        return

	_preload_nvjitlink()


	def _build_quantization_config(vram_gb: float):
	    """Choose the quantization config for loading the model.

	    Args:
	        vram_gb: Detected GPU memory in GiB; ``0`` means no GPU was detected.

	    Returns:
	        A 4-bit ``BitsAndBytesConfig`` (bfloat16 compute dtype) when running
	        outside HF Spaces with ``0 < vram_gb < 17`` and bitsandbytes can be
	        imported; ``None`` in every other case.
	    """
	    # HF Spaces is detected through SPACE_ID / SPACE_AUTHOR_NAME; there the
	    # model is loaded without quantization.
	    on_spaces = os.environ.get("SPACE_ID") or os.environ.get("SPACE_AUTHOR_NAME")
	    if on_spaces or not (0 < vram_gb < 17):
	        return None

	    try:
	        import bitsandbytes  # noqa: F401
	        config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
	    except Exception:
	        # bitsandbytes missing or unusable: fall back to no quantization.
	        config = None
	    return config


	class LLM:
	    """Text generation through a HuggingFace ``text-generation`` pipeline.

	    The tokenizer and model are loaded once in ``__init__``; ``generate``
	    formats a single-turn chat prompt and returns only the newly generated
	    text.
	    """

	    def __init__(self, model_id: str, device: str):
	        """Load tokenizer and model and build the pipeline.

	        Args:
	            model_id: Model identifier passed to ``from_pretrained``.
	            device: Value forwarded as ``device_map`` when loading the model.
	        """
	        self.model_id = model_id
	        self._tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

	        # NOTE(review): VRAM is always probed on CUDA device 0, independent of
	        # the `device` argument — confirm this is intended for device="cpu".
	        if torch.cuda.is_available():
	            total_bytes = torch.cuda.get_device_properties(0).total_memory
	            vram_gb = total_bytes / 1024**3
	        else:
	            vram_gb = 0.0
	        quant_cfg = _build_quantization_config(vram_gb)
	        print(f"[LLM] VRAM={vram_gb:.1f}GB — {'4-bit quant' if quant_cfg else 'bfloat16'}")

	        # An explicit dtype is only passed when no quantization config is used.
	        load_dtype = torch.bfloat16 if quant_cfg is None else None
	        loaded_model = AutoModelForCausalLM.from_pretrained(
	            model_id,
	            quantization_config=quant_cfg,
	            torch_dtype=load_dtype,
	            device_map=device,
	            trust_remote_code=True,
	        )
	        self._pipe = pipeline("text-generation", model=loaded_model, tokenizer=self._tokenizer)

	    def generate(self, prompt: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, temperature: float = 0.0) -> str:
	        """Generate a completion for ``prompt`` and return the new text only.

	        Args:
	            prompt: Content of the single user message sent to the model.
	            max_new_tokens: Upper bound on the number of generated tokens.
	            temperature: Values above 0 enable sampling (with ``top_p=0.95``);
	                otherwise sampling is turned off.

	        Returns:
	            The ``generated_text`` of the first pipeline result.
	        """
	        chat_text = self._tokenizer.apply_chat_template(
	            [{"role": "user", "content": prompt}],
	            tokenize=False,
	            add_generation_prompt=True,
	            enable_thinking=False,
	        )

	        use_sampling = temperature > 0.0
	        if use_sampling:
	            sampling_args = {"temperature": temperature, "top_p": 0.95}
	        else:
	            # Sampling knobs are explicitly passed as None when not sampling.
	            sampling_args = {"temperature": None, "top_p": None}

	        results = self._pipe(
	            chat_text,
	            max_new_tokens=max_new_tokens,
	            do_sample=use_sampling,
	            return_full_text=False,
	            **sampling_args,
	        )
	        first_result = results[0]
	        return first_result["generated_text"]


	# GGUF repo that get_llm() passes to LlamaCppLLM when PAPERPROF_GGUF_REPO is not set.
	DEFAULT_GGUF_REPO = "build-small-hackathon/MiniCPM4.1-8B-PaperProf-GGUF"


	class LlamaCppLLM:
	    """Same .generate() interface as LLM, backed by the llama.cpp runtime."""

	    def __init__(self, repo_id: str):
	        """Download/load the Q4_K_M GGUF file from ``repo_id``.

	        Args:
	            repo_id: Hugging Face repo that contains the ``*Q4_K_M.gguf`` file.

	        Raises:
	            ImportError: If ``llama_cpp`` is not installed.
	            ValueError: If ``PAPERPROF_GGUF_GPU_LAYERS`` is set to a
	                non-integer value.
	        """
	        from llama_cpp import Llama

	        n_gpu_layers = self._resolve_gpu_layers()
	        print(f"[LlamaCppLLM] loading {repo_id} (n_gpu_layers={n_gpu_layers})")
	        self._llm = Llama.from_pretrained(
	            repo_id=repo_id,
	            filename="*Q4_K_M.gguf",
	            n_gpu_layers=n_gpu_layers,
	            n_ctx=4096,
	            verbose=False,
	        )

	    @staticmethod
	    def _resolve_gpu_layers() -> int:
	        """Return the ``n_gpu_layers`` value to load the model with.

	        ``PAPERPROF_GGUF_GPU_LAYERS`` wins when it holds an integer. A missing
	        or blank variable falls back to the default: 0 on HF Spaces or when
	        CUDA is unavailable, otherwise -1.

	        Raises:
	            ValueError: If the variable is set to something that is not an
	                integer; the message names the variable and the bad value.
	        """
	        # On ZeroGPU Spaces the CUDA context only exists inside @spaces.GPU
	        # windows and dies between calls — a cached model with GPU layers
	        # would break on the second request. Default to CPU there; llama.cpp
	        # makes 8B Q4 usable on CPU for our short outputs.
	        on_spaces = bool(os.environ.get("SPACE_ID") or os.environ.get("SPACE_AUTHOR_NAME"))
	        default_layers = 0 if on_spaces else (-1 if torch.cuda.is_available() else 0)

	        raw_value = os.environ.get("PAPERPROF_GGUF_GPU_LAYERS", "").strip()
	        if not raw_value:
	            # Set-but-blank is treated like unset instead of crashing in int("").
	            return default_layers
	        try:
	            return int(raw_value)
	        except ValueError as err:
	            raise ValueError(
	                f"PAPERPROF_GGUF_GPU_LAYERS must be an integer, got {raw_value!r}"
	            ) from err

	    def generate(self, prompt: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, temperature: float = 0.0) -> str:
	        """Run ``prompt`` as a single user message and return the reply text.

	        Args:
	            prompt: Content of the user message.
	            max_new_tokens: Forwarded as ``max_tokens`` to the chat completion.
	            temperature: Forwarded unchanged to the chat completion.

	        Returns:
	            The message content of the first completion choice.
	        """
	        out = self._llm.create_chat_completion(
	            messages=[{"role": "user", "content": prompt}],
	            max_tokens=max_new_tokens,
	            temperature=temperature,
	        )
	        return out["choices"][0]["message"]["content"]


	@lru_cache(maxsize=1)
	def get_llm():
	    """Return the singleton LLM, loading the model on first call.

	    The backend is chosen from environment variables, read on the first call
	    only (the result is cached by ``lru_cache``):

	    * ``PAPERPROF_RUNTIME`` — ``"llamacpp"`` selects ``LlamaCppLLM``; anything
	      else selects the transformers-backed ``LLM``. Matching ignores case and
	      surrounding whitespace. An unrecognised value prints a warning and falls
	      back to transformers.
	    * ``PAPERPROF_GGUF_REPO`` — repo for the llamacpp runtime.
	    * ``PAPERPROF_MODEL`` / ``PAPERPROF_DEVICE`` — model ID and device for the
	      transformers runtime.

	    A variable that is set but blank is treated as unset, so the default is
	    used instead of an empty string being passed to the loader.

	    Returns:
	        An ``LLM`` or ``LlamaCppLLM`` instance; both expose ``generate()``.
	    """
	    runtime = os.environ.get("PAPERPROF_RUNTIME", "").strip().lower() or "transformers"
	    if runtime == "llamacpp":
	        repo_id = os.environ.get("PAPERPROF_GGUF_REPO", "").strip() or DEFAULT_GGUF_REPO
	        return LlamaCppLLM(repo_id=repo_id)
	    if runtime != "transformers":
	        # Keep the historical fallback, but make a typo visible in the logs.
	        print(f"[LLM] unknown PAPERPROF_RUNTIME={runtime!r}; falling back to 'transformers'")
	    model_id = os.environ.get("PAPERPROF_MODEL", "").strip() or DEFAULT_MODEL_ID
	    device = os.environ.get("PAPERPROF_DEVICE", "").strip() or "auto"
	    return LLM(model_id=model_id, device=device)