File size: 6,164 Bytes
e1c0b77
 
 
 
 
 
 
 
 
f5c39d2
 
 
e1c0b77
 
8087133
 
 
 
 
 
 
e1c0b77
 
 
 
 
 
 
12c4c0f
e1c0b77
 
12c4c0f
e1c0b77
f5c39d2
e1c0b77
 
12c4c0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56cd204
 
 
 
 
12c4c0f
 
 
 
 
 
 
e1c0b77
 
 
 
 
 
83ec3f5
12c4c0f
 
 
 
 
 
 
e1c0b77
 
12c4c0f
 
e1c0b77
 
 
 
 
 
83ec3f5
e1c0b77
 
8f2e039
e1c0b77
83ec3f5
 
5ad43b3
 
83ec3f5
8f2e039
e1c0b77
83ec3f5
e1c0b77
8f2e039
 
 
e1c0b77
 
 
 
 
f5c39d2
8087133
 
 
 
 
 
 
 
a474fe5
 
 
 
 
 
 
8087133
 
 
 
 
 
 
 
 
8f2e039
8087133
 
 
8f2e039
8087133
 
 
 
e1c0b77
8087133
e1c0b77
8087133
 
 
 
e1c0b77
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""
model/llm.py — LLM interface backed by MiniCPM4.1-8B via the Transformers library.

Responsibility:
    Provide a thin, singleton wrapper around the HuggingFace pipeline so that
    core modules can call `get_llm().generate(prompt)` without knowing anything
    about the underlying model loading or tokenisation details.

Model choice:
    build-small-hackathon/MiniCPM4.1-8B-PaperProf — QLoRA fine-tune of
    openbmb/MiniCPM4.1-8B on SQuAD/SciQ in PaperProf's production prompt
    format. Thinking mode disabled. Requires transformers >= 4.56.

Environment variables:
    PAPERPROF_MODEL      Override the default model ID (e.g. "openbmb/MiniCPM3-4B"
                         for a smaller fallback during local testing).
    PAPERPROF_DEVICE     "cuda", "mps", or "cpu" (default: auto-detected).
    PAPERPROF_RUNTIME    "transformers" (default) or "llamacpp" to run the GGUF
                         model through the llama.cpp runtime instead.
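    PAPERPROF_GGUF_REPO  GGUF repo for the llamacpp runtime
                         (default: build-small-hackathon/MiniCPM4.1-8B-PaperProf-GGUF).
    PAPERPROF_GGUF_GPU_LAYERS
                         Integer passed to llama.cpp as n_gpu_layers
                         (default: 0 on HF Spaces or without CUDA, otherwise -1).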

Public API:
    get_llm() -> LLM          — return the singleton instance
    LLM.generate(prompt) -> str
"""

import os
import ctypes
import torch
from functools import lru_cache
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

# Model ID loaded by get_llm() when PAPERPROF_MODEL is not set.
DEFAULT_MODEL_ID = "build-small-hackathon/MiniCPM4.1-8B-PaperProf"
# Default value of ``max_new_tokens`` for the generate() methods in this module.
DEFAULT_MAX_NEW_TOKENS = 512

# Pre-load libnvJitLink.so.13 bundled with the nvidia-cu13 wheel so that
# bitsandbytes can find it when it calls dlopen internally.
def _preload_nvjitlink(search_dirs=None) -> None:
    """Best-effort preload of the CUDA 13 nvJitLink shared library.

    Scans each directory for ``nvidia/cu13/lib/libnvJitLink.so.13`` and stops
    after the first copy that ``ctypes.CDLL`` loads successfully. A copy that
    exists but cannot be loaded is skipped so that later directories still get
    a chance. The function never raises for a missing or unloadable library;
    it simply returns.

    Args:
        search_dirs: Optional iterable of directory paths to scan. When
            omitted, ``site.getsitepackages()`` is used.
    """
    if search_dirs is None:
        import site

        try:
            search_dirs = site.getsitepackages()
        except AttributeError:
            # Some environments ship a ``site`` module without
            # getsitepackages(); there is nothing to scan in that case.
            return

    for root in search_dirs:
        candidate = os.path.join(root, "nvidia", "cu13", "lib", "libnvJitLink.so.13")
        if not os.path.exists(candidate):
            continue
        try:
            ctypes.CDLL(candidate)
        except OSError:
            # Present but not loadable (wrong platform, corrupt file, ...):
            # keep looking instead of aborting the whole search.
            continue
        return


_preload_nvjitlink()


def _build_quantization_config(vram_gb: float):
    """Pick the quantization settings to use when loading the model.

    Returns a 4-bit ``BitsAndBytesConfig`` (bfloat16 compute dtype) only when
    all of the following hold: the process is not running on a Hugging Face
    Space, *vram_gb* is strictly between 0 and 17, and ``bitsandbytes`` can be
    imported and configured without error. In every other case ``None`` is
    returned.
    """
    # SPACE_ID / SPACE_AUTHOR_NAME identify a HF Spaces deployment
    # (ZeroGPU A10G = 24 GB), where quantization is skipped in favour of
    # plain bfloat16.
    running_on_space = any(
        os.environ.get(var) for var in ("SPACE_ID", "SPACE_AUTHOR_NAME")
    )
    # Locally, quantize only when VRAM was detected and is below 17 GB.
    wants_4bit = not running_on_space and 0 < vram_gb < 17
    if not wants_4bit:
        return None

    config = None
    try:
        import bitsandbytes  # noqa: F401

        config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    except Exception:
        # bitsandbytes missing or unusable: fall back to no quantization.
        config = None
    return config


class LLM:
    """Thin wrapper around a HuggingFace text-generation pipeline.

    The tokenizer, model weights and pipeline are created once in
    ``__init__``; afterwards ``generate()`` turns a single user prompt into
    the model's reply text.
    """

    def __init__(self, model_id: str, device: str):
        """Load the tokenizer and model for *model_id* and build the pipeline.

        Args:
            model_id: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the model. ``trust_remote_code=True`` is set, so
                custom code shipped with the model repo may be executed.
            device: Value forwarded as ``device_map`` (e.g. "auto", "cuda",
                "mps" or "cpu").
        """
        self.model_id = model_id
        self._tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

        vram_gb = self._detect_vram_gb(device)
        quant_cfg = _build_quantization_config(vram_gb)
        print(f"[LLM] VRAM={vram_gb:.1f}GB — {'4-bit quant' if quant_cfg else 'bfloat16'}")

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=quant_cfg,
            # The dtype is only forced when no quantization config is in use.
            torch_dtype=torch.bfloat16 if quant_cfg is None else None,
            device_map=device,
            trust_remote_code=True,
        )
        self._pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=self._tokenizer,
        )

    @staticmethod
    def _detect_vram_gb(device: str) -> float:
        """Return the total memory of CUDA device 0 in GiB, or 0.0.

        0.0 is returned when CUDA is unavailable or when *device* explicitly
        selects a non-CUDA target ("cpu" or "mps"). In the latter case the GPU
        size must not influence the quantization decision, because the model
        will not be placed on that GPU.
        """
        if str(device).lower() in ("cpu", "mps"):
            return 0.0
        if not torch.cuda.is_available():
            return 0.0
        return torch.cuda.get_device_properties(0).total_memory / 1024**3

    def generate(self, prompt: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, temperature: float = 0.0) -> str:
        """Run *prompt* through the model and return the generated text only.

        Args:
            prompt: Text sent as a single "user" chat message.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Values above 0 enable sampling (with ``top_p=0.95``);
                otherwise sampling is turned off.

        Returns:
            The ``generated_text`` of the first pipeline result; the prompt is
            excluded because ``return_full_text=False`` is passed.
        """
        messages = [{"role": "user", "content": prompt}]
        text = self._tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True,
            enable_thinking=False,
        )
        sample = temperature > 0.0
        output = self._pipe(
            text,
            max_new_tokens=max_new_tokens,
            do_sample=sample,
            # Sampling knobs are passed as None when sampling is off.
            temperature=temperature if sample else None,
            top_p=0.95 if sample else None,
            return_full_text=False,
        )
        return output[0]["generated_text"]


# GGUF repo used by get_llm() for the llamacpp runtime when
# PAPERPROF_GGUF_REPO is not set.
DEFAULT_GGUF_REPO = "build-small-hackathon/MiniCPM4.1-8B-PaperProf-GGUF"


class LlamaCppLLM:
    """llama.cpp-backed model exposing the same ``generate()`` call as ``LLM``."""

    def __init__(self, repo_id: str):
        """Load the ``*Q4_K_M.gguf`` file from *repo_id* with a 4096-token context.

        The number of GPU-offloaded layers comes from the
        ``PAPERPROF_GGUF_GPU_LAYERS`` environment variable when set; otherwise
        it is 0 on HF Spaces or without CUDA, and -1 when CUDA is available.
        """
        from llama_cpp import Llama

        # On ZeroGPU Spaces the CUDA context only exists inside @spaces.GPU
        # windows and dies between calls — a cached model with GPU layers
        # would break on the second request. Default to CPU there; llama.cpp
        # makes 8B Q4 usable on CPU for our short outputs.
        if os.environ.get("SPACE_ID") or os.environ.get("SPACE_AUTHOR_NAME"):
            fallback_layers = 0
        elif torch.cuda.is_available():
            fallback_layers = -1
        else:
            fallback_layers = 0

        n_gpu_layers = int(os.environ.get("PAPERPROF_GGUF_GPU_LAYERS", fallback_layers))
        print(f"[LlamaCppLLM] loading {repo_id} (n_gpu_layers={n_gpu_layers})")

        load_options = {
            "repo_id": repo_id,
            "filename": "*Q4_K_M.gguf",
            "n_gpu_layers": n_gpu_layers,
            "n_ctx": 4096,
            "verbose": False,
        }
        self._llm = Llama.from_pretrained(**load_options)

    def generate(self, prompt: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, temperature: float = 0.0) -> str:
        """Send *prompt* as a single user message and return the reply content."""
        response = self._llm.create_chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_new_tokens,
            temperature=temperature,
        )
        first_choice = response["choices"][0]
        return first_choice["message"]["content"]


@lru_cache(maxsize=1)
def get_llm():
    """Build the model wrapper on first call and return the cached one afterwards.

    ``PAPERPROF_RUNTIME`` (case-insensitive, default "transformers") selects
    the backend: "llamacpp" yields a ``LlamaCppLLM`` for the repo named by
    ``PAPERPROF_GGUF_REPO``; any other value yields an ``LLM`` configured from
    ``PAPERPROF_MODEL`` and ``PAPERPROF_DEVICE`` (default "auto").
    """
    env = os.environ
    backend = env.get("PAPERPROF_RUNTIME", "transformers").lower()

    if backend != "llamacpp":
        return LLM(
            model_id=env.get("PAPERPROF_MODEL", DEFAULT_MODEL_ID),
            device=env.get("PAPERPROF_DEVICE", "auto"),
        )

    return LlamaCppLLM(repo_id=env.get("PAPERPROF_GGUF_REPO", DEFAULT_GGUF_REPO))