Spaces:
Sleeping
Sleeping
File size: 6,164 Bytes
e1c0b77 f5c39d2 e1c0b77 8087133 e1c0b77 12c4c0f e1c0b77 12c4c0f e1c0b77 f5c39d2 e1c0b77 12c4c0f 56cd204 12c4c0f e1c0b77 83ec3f5 12c4c0f e1c0b77 12c4c0f e1c0b77 83ec3f5 e1c0b77 8f2e039 e1c0b77 83ec3f5 5ad43b3 83ec3f5 8f2e039 e1c0b77 83ec3f5 e1c0b77 8f2e039 e1c0b77 f5c39d2 8087133 a474fe5 8087133 8f2e039 8087133 8f2e039 8087133 e1c0b77 8087133 e1c0b77 8087133 e1c0b77 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 | """
model/llm.py — LLM interface backed by MiniCPM4.1-8B via the Transformers library.
Responsibility:
Provide a thin, singleton wrapper around the HuggingFace pipeline so that
core modules can call `get_llm().generate(prompt)` without knowing anything
about the underlying model loading or tokenisation details.
Model choice:
build-small-hackathon/MiniCPM4.1-8B-PaperProf — QLoRA fine-tune of
openbmb/MiniCPM4.1-8B on SQuAD/SciQ in PaperProf's production prompt
format. Thinking mode disabled. Requires transformers >= 4.56.
Environment variables:
PAPERPROF_MODEL Override the default model ID (e.g. "openbmb/MiniCPM3-4B"
for a smaller fallback during local testing).
PAPERPROF_DEVICE "cuda", "mps", or "cpu" (default: auto-detected).
PAPERPROF_RUNTIME "transformers" (default) or "llamacpp" to run the GGUF
model through the llama.cpp runtime instead.
PAPERPROF_GGUF_REPO GGUF repo for the llamacpp runtime
(default: build-small-hackathon/MiniCPM4.1-8B-PaperProf-GGUF).
Public API:
get_llm() -> LLM | LlamaCppLLM — return the singleton instance
LLM.generate(prompt) -> str
"""
import os
import ctypes
import torch
from functools import lru_cache
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
# Model ID used by get_llm() when the PAPERPROF_MODEL environment variable is unset.
DEFAULT_MODEL_ID = "build-small-hackathon/MiniCPM4.1-8B-PaperProf"
# Default value of the `max_new_tokens` argument of the generate() methods.
DEFAULT_MAX_NEW_TOKENS = 512
# Pre-load libnvJitLink.so.13 bundled with the nvidia-cu13 wheel so that
# bitsandbytes can find it when it calls dlopen internally.
def _preload_nvjitlink() -> None:
    """Best-effort preload of ``libnvJitLink.so.13`` from site-packages.

    Looks for ``nvidia/cu13/lib/libnvJitLink.so.13`` under every directory
    reported by ``site.getsitepackages()`` and loads the first copy that
    ``ctypes.CDLL`` accepts, so the library is already mapped into the
    process before bitsandbytes is imported.

    This function never raises: if no copy exists, or no copy can be loaded,
    it returns silently and leaves library resolution to the dynamic loader.
    """
    import site

    try:
        site_dirs = site.getsitepackages()
    except AttributeError:
        # getsitepackages() may be missing in some environments (for example
        # a replaced ``site`` module); nothing to search in that case.
        return

    for sp in site_dirs:
        candidate = os.path.join(sp, "nvidia", "cu13", "lib", "libnvJitLink.so.13")
        if not os.path.exists(candidate):
            continue
        try:
            ctypes.CDLL(candidate)
        except OSError:
            # This copy is unloadable (corrupt, wrong architecture, ...).
            # Keep searching instead of giving up on the remaining
            # site-packages directories.
            continue
        return


# Run at import time so the library is loaded before any bitsandbytes import.
_preload_nvjitlink()
def _build_quantization_config(vram_gb: float):
    """Choose the quantization config for loading the model.

    Args:
        vram_gb: Detected GPU memory in GiB; ``0`` means no GPU was detected.

    Returns:
        A 4-bit ``BitsAndBytesConfig`` (bfloat16 compute dtype) when running
        outside HF Spaces with ``0 < vram_gb < 17`` and bitsandbytes is
        importable; otherwise ``None``, meaning "load unquantized".
    """
    # On HF Spaces the model is loaded in bfloat16 directly, never quantized.
    running_on_spaces = os.environ.get("SPACE_ID") or os.environ.get("SPACE_AUTHOR_NAME")
    if running_on_spaces:
        return None

    # Quantize only when a GPU was detected and it has less than 17 GiB.
    if not (0 < vram_gb < 17):
        return None

    try:
        import bitsandbytes  # noqa: F401
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    except Exception:
        # bitsandbytes missing or unusable: fall back to unquantized loading.
        return None
class LLM:
    """Thin wrapper around a HuggingFace text-generation pipeline.

    The tokenizer and model are loaded once in ``__init__``; ``generate``
    wraps a prompt in the tokenizer's chat template and returns only the newly
    generated text.
    """

    def __init__(self, model_id: str, device: str):
        """Load the tokenizer and model and build the generation pipeline.

        Args:
            model_id: Model identifier passed to ``from_pretrained``.
            device: Forwarded as ``device_map`` to the model loader
                (e.g. "auto", "cuda", "mps" or "cpu").
        """
        self.model_id = model_id
        self._tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

        # Probe VRAM only when the model may actually be placed on a CUDA
        # device. Without this guard an explicit device="cpu" on a machine
        # that also has a small GPU would select the 4-bit bitsandbytes
        # config for a CPU-resident model.
        vram_gb = 0.0
        targets_gpu = str(device).lower() not in ("cpu", "mps")
        if targets_gpu and torch.cuda.is_available():
            vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3

        quant_cfg = _build_quantization_config(vram_gb)
        print(f"[LLM] VRAM={vram_gb:.1f}GB -> {'4-bit quant' if quant_cfg else 'bfloat16'}")

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=quant_cfg,
            # The dtype is only set explicitly for unquantized loads.
            torch_dtype=torch.bfloat16 if quant_cfg is None else None,
            device_map=device,
            trust_remote_code=True,
        )
        self._pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=self._tokenizer,
        )

    def generate(self, prompt: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, temperature: float = 0.0) -> str:
        """Run *prompt* through the model and return the generated text only.

        Args:
            prompt: User message; sent as a single-turn chat.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Sampling temperature. Values ``<= 0`` disable
                sampling (``do_sample=False``); positive values sample with
                ``top_p=0.95``.

        Returns:
            The ``generated_text`` of the first pipeline result, without the
            prompt (``return_full_text=False``).
        """
        messages = [{"role": "user", "content": prompt}]
        text = self._tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        sample = temperature > 0.0
        output = self._pipe(
            text,
            max_new_tokens=max_new_tokens,
            do_sample=sample,
            # Sampling knobs are passed as None when sampling is off.
            temperature=temperature if sample else None,
            top_p=0.95 if sample else None,
            return_full_text=False,
        )
        return output[0]["generated_text"]
# GGUF repo used by get_llm() for the llamacpp runtime when PAPERPROF_GGUF_REPO is unset.
DEFAULT_GGUF_REPO = "build-small-hackathon/MiniCPM4.1-8B-PaperProf-GGUF"
class LlamaCppLLM:
    """llama.cpp-backed model exposing the same ``generate()`` call as ``LLM``."""

    def __init__(self, repo_id: str):
        """Fetch the Q4_K_M GGUF file from *repo_id* and load it with llama.cpp.

        The number of GPU-offloaded layers comes from the
        ``PAPERPROF_GGUF_GPU_LAYERS`` environment variable when set; otherwise
        it is 0 on HF Spaces, -1 when CUDA is available, and 0 elsewhere.
        """
        from llama_cpp import Llama

        # On ZeroGPU Spaces the CUDA context only lives inside @spaces.GPU
        # windows and is gone between calls, so a cached model holding GPU
        # layers would fail on the second request. Stay on CPU there;
        # llama.cpp keeps an 8B Q4 model usable on CPU for short outputs.
        if os.environ.get("SPACE_ID") or os.environ.get("SPACE_AUTHOR_NAME"):
            fallback_layers = 0
        elif torch.cuda.is_available():
            fallback_layers = -1
        else:
            fallback_layers = 0

        layer_setting = os.environ.get("PAPERPROF_GGUF_GPU_LAYERS", fallback_layers)
        n_gpu_layers = int(layer_setting)
        print(f"[LlamaCppLLM] loading {repo_id} (n_gpu_layers={n_gpu_layers})")

        load_options = {
            "repo_id": repo_id,
            "filename": "*Q4_K_M.gguf",
            "n_gpu_layers": n_gpu_layers,
            "n_ctx": 4096,
            "verbose": False,
        }
        self._llm = Llama.from_pretrained(**load_options)

    def generate(self, prompt: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, temperature: float = 0.0) -> str:
        """Send *prompt* as a single user message and return the reply text.

        Args:
            prompt: Content of the user message.
            max_new_tokens: Forwarded as ``max_tokens``.
            temperature: Forwarded unchanged to the chat completion call.

        Returns:
            The message content of the first completion choice.
        """
        chat = [{"role": "user", "content": prompt}]
        completion = self._llm.create_chat_completion(
            messages=chat,
            max_tokens=max_new_tokens,
            temperature=temperature,
        )
        first_choice = completion["choices"][0]
        return first_choice["message"]["content"]
@lru_cache(maxsize=1)
def get_llm():
    """Build the model wrapper on first call and return the cached one afterwards.

    ``PAPERPROF_RUNTIME`` (case-insensitive, default "transformers") selects
    the backend: "llamacpp" yields a ``LlamaCppLLM`` loaded from
    ``PAPERPROF_GGUF_REPO``; any other value yields an ``LLM`` built from
    ``PAPERPROF_MODEL`` and ``PAPERPROF_DEVICE`` (default "auto").
    """
    backend = os.environ.get("PAPERPROF_RUNTIME", "transformers").lower()

    if backend != "llamacpp":
        return LLM(
            model_id=os.environ.get("PAPERPROF_MODEL", DEFAULT_MODEL_ID),
            device=os.environ.get("PAPERPROF_DEVICE", "auto"),
        )

    return LlamaCppLLM(repo_id=os.environ.get("PAPERPROF_GGUF_REPO", DEFAULT_GGUF_REPO))
|