Spaces:

build-small-hackathon
/

Cook_with_a_LLM

Running on Zero

App Files Files Community

Cook_with_a_LLM / src /models /planner.py

Fred1e4

Complete Cook App (#5)

75c5414 1 day ago

raw

history blame contribute delete

3.58 kB

	"""MiniCPM4.1-8B text-only planner — lazy singleton."""
	from __future__ import annotations

	import logging
	import os
	from typing import Any, Optional, Tuple

	import torch

	from src import config

	# Module-level logger, named after this module.
	log = logging.getLogger(__name__)

	# Process-wide cache populated by get_planner(). Both names stay None until
	# a load succeeds, and get_planner() resets both to None when a load fails.
	_model: Any = None
	_tokenizer: Any = None


	def get_planner() -> Tuple[Optional[Any], Optional[Any]]:
	    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

	    The repo id is ``config.PLANNER_FINETUNED_REPO`` when that value is
	    truthy, otherwise ``config.PLANNER_REPO``.

	    Returns:
	        ``(model, tokenizer)`` once a load has succeeded. ``(None, None)`` if
	        the load raised; a failure is not cached, so the next call tries to
	        load again.
	    """
	    global _model, _tokenizer
	    if _model is not None:
	        return _model, _tokenizer

	    # Prefer fine-tuned repo when available
	    model_id = config.PLANNER_FINETUNED_REPO or config.PLANNER_REPO
	    try:
	        # MiniCPM4.1 custom code imports is_torch_fx_available, which was
	        # removed in transformers 5.x. Patch it back before loading.
	        import transformers.utils.import_utils as _iutils
	        if not hasattr(_iutils, "is_torch_fx_available"):
	            def _is_torch_fx_available():
	                try:
	                    import torch.fx  # noqa: F401
	                    return True
	                except ImportError:
	                    return False
	            _iutils.is_torch_fx_available = _is_torch_fx_available

	        from transformers import AutoModelForCausalLM, AutoTokenizer

	        # "auto" whenever the SPACE_ID environment variable is set
	        # (presumably a Hugging Face Space -- confirm); otherwise pin to a
	        # single device, CUDA if available.
	        device_map = "auto" if os.environ.get("SPACE_ID") else (
	            "cuda" if torch.cuda.is_available() else "cpu"
	        )
	        log.info("Loading planner model %s (device_map=%s)...", model_id, device_map)
	        _tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
	        _model = AutoModelForCausalLM.from_pretrained(
	            model_id,
	            torch_dtype=torch.bfloat16,
	            trust_remote_code=True,
	            device_map=device_map,
	        ).eval()
	        log.info("Planner model ready.")
	    except Exception as exc:
	        # Deliberate catch-all: callers treat (None, None) as "planner
	        # unavailable". Log the traceback (as infer() does) so the real cause
	        # of a failed load is visible, not just the exception message.
	        log.error(
	            "Could not load planner model '%s': %s", model_id, exc, exc_info=True
	        )
	        # Reset both so a tokenizer that loaded before the model failed is
	        # never returned on its own.
	        _model = None
	        _tokenizer = None

	    return _model, _tokenizer


	def infer(prompt: str, max_new_tokens: int = 1024, temperature: float = 0.0) -> str:
	    """Run text inference with the planner model.

	    The prompt is wrapped as a single user message, rendered through the
	    tokenizer's chat template and passed to ``model.generate``.

	    Args:
	        prompt: Text of the user message.
	        max_new_tokens: Upper bound on the number of generated tokens.
	        temperature: Values ``<= 0`` select greedy decoding; a positive value
	            enables sampling with that temperature and ``top_p=0.95``.

	    Returns:
	        The decoded completion with the prompt tokens sliced off and special
	        tokens skipped, or an empty string if the model is unavailable or
	        inference raises.
	    """
	    model, tokenizer = get_planner()
	    if model is None or tokenizer is None:
	        return ""

	    try:
	        messages = [{"role": "user", "content": prompt}]

	        # return_dict=True yields a BatchEncoding (dict-like) with input_ids +
	        # attention_mask. NOTE: BatchEncoding is NOT a `dict` instance, so we
	        # must access it via mapping keys, never via tensor attrs like .shape.
	        enc = tokenizer.apply_chat_template(
	            messages,
	            add_generation_prompt=True,
	            tokenize=True,
	            return_tensors="pt",
	            return_dict=True,
	        )
	        input_ids = enc["input_ids"].to(model.device)
	        # Prompt length, used below to strip the prompt from the output.
	        input_len = input_ids.shape[1]

	        gen_inputs = {"input_ids": input_ids}
	        attn = enc.get("attention_mask")
	        if attn is not None:
	            gen_inputs["attention_mask"] = attn.to(model.device)

	        gen_kwargs: dict = dict(max_new_tokens=max_new_tokens, do_sample=False)
	        if temperature > 0:
	            gen_kwargs.update(do_sample=True, temperature=temperature, top_p=0.95)

	        with torch.no_grad():
	            # Both dicts must be unpacked into keyword arguments. Passed
	            # positionally they would bind to generate()'s `inputs` and
	            # `generation_config` parameters and the call would fail.
	            output = model.generate(**gen_inputs, **gen_kwargs)

	        # The output holds prompt + completion; keep only the new tokens.
	        token_ids = output[0][input_len:]
	        return tokenizer.decode(token_ids, skip_special_tokens=True)

	    except Exception as exc:
	        # Best-effort boundary: callers get "" instead of an exception, and
	        # the full traceback goes to the log.
	        log.error("Planner inference error: %r", exc, exc_info=True)
	        return ""