Spaces:

build-small-hackathon
/

Cook_with_a_LLM

Running on Zero

File size: 3,575 Bytes

75c5414

"""MiniCPM4.1-8B text-only planner — lazy singleton."""
from __future__ import annotations

import logging
import os
from typing import Any, Optional, Tuple

import torch

from src import config

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Lazy singleton cache for the planner. Both are None until get_planner()
# loads them, and are reset to None if loading fails.
_model: Any = None
_tokenizer: Any = None


def get_planner() -> Tuple[Optional[Any], Optional[Any]]:
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

    The model id is ``config.PLANNER_FINETUNED_REPO`` when that is truthy,
    otherwise ``config.PLANNER_REPO``. A successful load is cached in the
    module globals and reused by every later call. A failed load is NOT
    cached: the next call attempts the load again.

    Returns:
        ``(model, tokenizer)`` on success, ``(None, None)`` if loading raised.
        Load errors are logged with their traceback and never propagated.
    """
    global _model, _tokenizer
    if _model is not None:
        return _model, _tokenizer

    # Prefer fine-tuned repo when available
    model_id = config.PLANNER_FINETUNED_REPO or config.PLANNER_REPO
    try:
        # MiniCPM4.1 custom code imports is_torch_fx_available, which was
        # removed in transformers 5.x. Patch it back before loading.
        import transformers.utils.import_utils as _iutils
        if not hasattr(_iutils, "is_torch_fx_available"):
            def _is_torch_fx_available():
                try:
                    import torch.fx  # noqa: F401
                    return True
                except ImportError:
                    return False
            _iutils.is_torch_fx_available = _is_torch_fx_available

        from transformers import AutoModelForCausalLM, AutoTokenizer

        # "auto" placement when the SPACE_ID env var is set; otherwise a
        # single device: CUDA if available, else CPU.
        device_map = "auto" if os.environ.get("SPACE_ID") else (
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        log.info("Loading planner model %s (device_map=%s)...", model_id, device_map)

        # Load into locals first so the module globals never hold a
        # half-initialised pair (tokenizer set, model still missing).
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map=device_map,
        ).eval()
    except Exception as exc:
        # Keep the traceback: the message alone rarely identifies which step
        # (import, tokenizer, weights, device placement) actually failed.
        log.error(
            "Could not load planner model '%s': %s", model_id, exc, exc_info=True
        )
        _model = None
        _tokenizer = None
        return None, None

    # Publish the tokenizer first; `_model` is the flag the fast path checks.
    _tokenizer = tokenizer
    _model = model
    log.info("Planner model ready.")
    return _model, _tokenizer


def infer(prompt: str, max_new_tokens: int = 1024, temperature: float = 0.0) -> str:
    """Generate a completion for *prompt* with the planner model.

    The prompt is sent as a single ``user`` message rendered through the
    tokenizer's chat template. Only the tokens produced after the prompt are
    decoded and returned.

    Args:
        prompt: Text placed in the user message.
        max_new_tokens: Forwarded to ``model.generate``.
        temperature: When greater than 0, sampling is enabled with this
            temperature and ``top_p=0.95``; otherwise ``do_sample=False``.

    Returns:
        The decoded completion, or an empty string when the planner is
        unavailable or any exception is raised during inference (the
        exception is logged with its traceback).
    """
    model, tokenizer = get_planner()
    if model is None or tokenizer is None:
        return ""

    try:
        # With return_dict=True the result is a BatchEncoding: mapping-style
        # access only (it is not a tensor and has no .shape of its own).
        encoded = tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            add_generation_prompt=True,
            tokenize=True,
            return_tensors="pt",
            return_dict=True,
        )
        prompt_ids = encoded["input_ids"].to(model.device)
        prompt_len = prompt_ids.shape[1]

        generate_args: dict = {"input_ids": prompt_ids}
        mask = encoded.get("attention_mask")
        if mask is not None:
            generate_args["attention_mask"] = mask.to(model.device)

        generate_args["max_new_tokens"] = max_new_tokens
        if temperature > 0:
            generate_args.update(do_sample=True, temperature=temperature, top_p=0.95)
        else:
            generate_args["do_sample"] = False

        with torch.no_grad():
            sequences = model.generate(**generate_args)

        # Drop the echoed prompt; keep only the newly generated tokens.
        return tokenizer.decode(sequences[0][prompt_len:], skip_special_tokens=True)

    except Exception as exc:
        log.error("Planner inference error: %r", exc, exc_info=True)
        return ""