Spaces:
Running on Zero
Running on Zero
| """MiniCPM4.1-8B text-only planner — lazy singleton.""" | |
| from __future__ import annotations | |
| import logging | |
| import os | |
| from typing import Any, Optional, Tuple | |
| import torch | |
| from src import config | |
log = logging.getLogger(__name__)

# Process-wide cache for the lazily loaded planner. Both names stay None until
# a load succeeds, and they are only ever published together.
_model: Any = None
_tokenizer: Any = None


def _ensure_torch_fx_shim() -> None:
    """Put ``is_torch_fx_available`` back on transformers' import utils if absent.

    MiniCPM4.1 custom code imports is_torch_fx_available, which was removed in
    transformers 5.x, so it has to be patched back before the model is loaded.
    """
    import transformers.utils.import_utils as _iutils

    if hasattr(_iutils, "is_torch_fx_available"):
        return

    def _is_torch_fx_available():
        try:
            import torch.fx  # noqa: F401
            return True
        except ImportError:
            return False

    _iutils.is_torch_fx_available = _is_torch_fx_available


def get_planner() -> Tuple[Optional[Any], Optional[Any]]:
    """Return (model, tokenizer), loading them on first use.

    Returns:
        The cached pair once a load has succeeded, or ``(None, None)`` when
        loading fails. A failed load is logged with its traceback and is not
        cached, so the next call attempts the load again.
    """
    global _model, _tokenizer
    if _model is not None:
        return _model, _tokenizer

    # Prefer fine-tuned repo when available
    model_id = config.PLANNER_FINETUNED_REPO or config.PLANNER_REPO
    try:
        _ensure_torch_fx_shim()
        from transformers import AutoModelForCausalLM, AutoTokenizer

        # SPACE_ID set -> automatic placement; otherwise pin to CUDA when
        # available and fall back to CPU.
        device_map = "auto" if os.environ.get("SPACE_ID") else (
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        log.info("Loading planner model %s (device_map=%s)...", model_id, device_map)
        # NOTE: trust_remote_code=True runs Python shipped in the model repo,
        # so model_id must point at a trusted repository.
        # Load into locals first so the module-level cache never holds a
        # tokenizer without its model if the second step fails.
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map=device_map,
        ).eval()
    except Exception as exc:
        # Best-effort loader: report the failure (with traceback, matching
        # infer()'s error logging) and signal unavailability to the caller.
        log.error(
            "Could not load planner model '%s': %s", model_id, exc, exc_info=True
        )
        return None, None

    _model, _tokenizer = model, tokenizer
    log.info("Planner model ready.")
    return _model, _tokenizer
def infer(prompt: str, max_new_tokens: int = 1024, temperature: float = 0.0) -> str:
    """Generate a planner reply for a single user prompt.

    The prompt is wrapped as one user chat message, rendered through the
    tokenizer's chat template and passed to ``model.generate``. Only the newly
    generated tokens are decoded (special tokens skipped).

    Args:
        prompt: Text sent as the sole user message.
        max_new_tokens: Passed through to ``generate``.
        temperature: When greater than 0, sampling is enabled with this
            temperature and ``top_p=0.95``; otherwise sampling is disabled.

    Returns:
        The decoded completion, or an empty string when the planner is
        unavailable or any exception is raised during inference (the
        exception is logged with its traceback).
    """
    model, tokenizer = get_planner()
    if model is None or tokenizer is None:
        return ""

    try:
        # With return_dict=True the template call yields a BatchEncoding: a
        # mapping (not a plain dict, and not a tensor), so fields are read by
        # key rather than through tensor attributes such as .shape.
        encoded = tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            add_generation_prompt=True,
            tokenize=True,
            return_tensors="pt",
            return_dict=True,
        )

        prompt_ids = encoded["input_ids"].to(model.device)
        model_inputs = {"input_ids": prompt_ids}
        mask = encoded.get("attention_mask")
        if mask is not None:
            model_inputs["attention_mask"] = mask.to(model.device)

        if temperature > 0:
            decoding = {
                "max_new_tokens": max_new_tokens,
                "do_sample": True,
                "temperature": temperature,
                "top_p": 0.95,
            }
        else:
            decoding = {"max_new_tokens": max_new_tokens, "do_sample": False}

        with torch.no_grad():
            generated = model.generate(**model_inputs, **decoding)

        # generate() returns prompt + continuation; drop the prompt prefix.
        prompt_len = prompt_ids.shape[1]
        completion_ids = generated[0][prompt_len:]
        return tokenizer.decode(completion_ids, skip_special_tokens=True)
    except Exception as exc:
        log.error("Planner inference error: %r", exc, exc_info=True)
        return ""