Spaces:

serenichron
/

opencode-zerogpu

Sleeping

App Files Files Community

opencode-zerogpu / models.py

serenichron

Fix device handling: check GPU availability before device_map

6d6c01e 9 days ago

raw

history blame contribute delete

10.6 kB

	"""Model loading, caching, and memory management for ZeroGPU inference."""

	import gc
	import logging
	from dataclasses import dataclass, field
	from typing import Optional, Generator, Any

	import torch
	from transformers import (
	AutoModelForCausalLM,
	AutoTokenizer,
	BitsAndBytesConfig,
	TextIteratorStreamer,
	)
	from threading import Thread

	from config import get_config, should_quantize

	logger = logging.getLogger(__name__)


	@dataclass
	class LoadedModel:
	    """Bundle a loaded model together with its tokenizer.

	    Attributes:
	        model_id: Identifier the model was loaded under.
	        model: The loaded model object.
	        tokenizer: The tokenizer loaded alongside the model.
	        quantization: Quantization label recorded for this load; defaults
	            to "none".
	    """

	    model_id: str
	    model: Any
	    tokenizer: Any
	    quantization: str = "none"


	# Global model cache (single model at a time due to memory constraints).
	# Holds the most recently loaded model, or None when nothing is loaded;
	# rebound by load_model() and unload_model() via `global`.
	_current_model: Optional[LoadedModel] = None


	def get_quantization_config(quantization: str) -> Optional[BitsAndBytesConfig]:
	    """Translate a quantization label into a BitsAndBytes configuration.

	    Args:
	        quantization: Quantization label; "int8" and "int4" are recognised.

	    Returns:
	        An 8-bit config for "int8", a 4-bit NF4 config (double quantization,
	        bfloat16 compute) for "int4", and None for any other label.
	    """
	    if quantization == "int4":
	        return BitsAndBytesConfig(
	            load_in_4bit=True,
	            bnb_4bit_compute_dtype=torch.bfloat16,
	            bnb_4bit_use_double_quant=True,
	            bnb_4bit_quant_type="nf4",
	        )

	    if quantization == "int8":
	        return BitsAndBytesConfig(load_in_8bit=True)

	    # Anything else (including "none") means: load without quantization.
	    return None


	def clear_gpu_memory() -> None:
	    """Run the garbage collector and, when CUDA is usable, flush its cache."""
	    gc.collect()

	    cuda_ready = torch.cuda.is_available()
	    if cuda_ready:
	        # Release cached allocator blocks, then wait for pending GPU work.
	        torch.cuda.empty_cache()
	        torch.cuda.synchronize()

	    logger.debug("GPU memory cleared")


	def unload_model() -> None:
	    """Drop the cached model and tokenizer, then reclaim their memory.

	    Returns immediately when no model is currently cached.
	    """
	    global _current_model

	    if _current_model is None:
	        return

	    logger.info(f"Unloading model: {_current_model.model_id}")

	    # Remove the references held by the container before discarding it so
	    # the subsequent collection pass can actually free the weights.
	    del _current_model.model
	    del _current_model.tokenizer
	    _current_model = None

	    clear_gpu_memory()


	def load_model(
	    model_id: str,
	    quantization: Optional[str] = None,
	    force_reload: bool = False,
	) -> LoadedModel:
	    """
	    Load a model from HuggingFace Hub, reusing the cached one when possible.

	    Only one model is kept in memory: any previously loaded model is
	    unloaded before the new one is fetched.

	    Args:
	        model_id: HuggingFace model ID (e.g., "meta-llama/Llama-3.1-8B-Instruct")
	        quantization: Force specific quantization ("none", "int8", "int4").
	            If None, auto-determine based on model size. When given
	            explicitly, a cached model is reused only if it was loaded with
	            the same quantization label.
	        force_reload: If True, reload even if already loaded

	    Returns:
	        LoadedModel with model and tokenizer ready for inference

	    Raises:
	        ValueError: If model_id is None or empty
	    """
	    global _current_model

	    if not model_id:
	        raise ValueError("model_id cannot be None or empty")

	    # Reuse the cached model when it matches the request. An explicit
	    # quantization request must match the cached label; otherwise the
	    # caller would silently get a differently-quantized model back.
	    if (
	        not force_reload
	        and _current_model is not None
	        and _current_model.model_id == model_id
	    ):
	        if quantization is None or quantization == _current_model.quantization:
	            logger.debug(f"Model already loaded: {model_id}")
	            return _current_model
	        logger.info(
	            f"Reloading {model_id}: requested quantization {quantization} "
	            f"differs from loaded {_current_model.quantization}"
	        )

	    # Determine quantization
	    if quantization is None:
	        quantization = should_quantize(model_id)

	    logger.info(f"Loading model: {model_id} (quantization: {quantization})")

	    # Unload current model first so both never occupy memory at once
	    unload_model()

	    config = get_config()

	    # NOTE(review): trust_remote_code=True executes Python shipped inside the
	    # model repository. Confirm that model_id only ever comes from a trusted
	    # allow-list before exposing this to user-supplied IDs.
	    tokenizer = AutoTokenizer.from_pretrained(
	        model_id,
	        token=config.hf_token,
	        trust_remote_code=True,
	    )

	    # Ensure tokenizer has pad token
	    if tokenizer.pad_token is None:
	        tokenizer.pad_token = tokenizer.eos_token

	    # Load model with appropriate configuration
	    quant_config = get_quantization_config(quantization)

	    model_kwargs = {
	        "token": config.hf_token,
	        "trust_remote_code": True,
	    }

	    # On ZeroGPU, use device_map only when GPU is available
	    # Otherwise load to CPU for local testing
	    if torch.cuda.is_available():
	        model_kwargs["device_map"] = "auto"
	        if quant_config is not None:
	            model_kwargs["quantization_config"] = quant_config
	        else:
	            model_kwargs["torch_dtype"] = torch.bfloat16
	    else:
	        # CPU mode - no quantization, float32
	        if quant_config is not None:
	            logger.warning(
	                f"No GPU available; ignoring {quantization} quantization "
	                f"for {model_id}"
	            )
	        model_kwargs["device_map"] = "cpu"
	        model_kwargs["torch_dtype"] = torch.float32

	    model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

	    # The requested label is recorded (even on CPU) so that repeated calls
	    # with the same arguments keep hitting the cache.
	    _current_model = LoadedModel(
	        model_id=model_id,
	        model=model,
	        tokenizer=tokenizer,
	        quantization=quantization,
	    )

	    logger.info(f"Model loaded successfully: {model_id}")
	    return _current_model


	def get_current_model() -> Optional[LoadedModel]:
	    """Return the cached LoadedModel, or None when nothing is loaded."""
	    return _current_model


	def generate_text(
	    model_id: str,
	    prompt: str,
	    max_new_tokens: int = 512,
	    temperature: float = 0.7,
	    top_p: float = 0.95,
	    top_k: int = 50,
	    repetition_penalty: float = 1.1,
	    stop_sequences: Optional[list[str]] = None,
	) -> str:
	    """
	    Generate text using the specified model.

	    Args:
	        model_id: HuggingFace model ID
	        prompt: Input prompt (already formatted with chat template)
	        max_new_tokens: Maximum tokens to generate
	        temperature: Sampling temperature; values <= 0 select greedy decoding
	        top_p: Nucleus sampling probability (used only when sampling)
	        top_k: Top-k sampling parameter (used only when sampling)
	        repetition_penalty: Penalty for repeating tokens
	        stop_sequences: Additional stop sequences; the response is cut at the
	            earliest occurrence of any of them. Empty strings are ignored.

	    Returns:
	        Generated text (without the input prompt)
	    """
	    loaded = load_model(model_id)
	    tokenizer = loaded.tokenizer

	    # Leave room for the generated tokens inside the context window. A
	    # tokenizer with no known limit is expected to report a huge placeholder
	    # value; in that case truncation is skipped instead of passing an
	    # enormous max_length down to the tokenizer.
	    tokenize_kwargs = {"return_tensors": "pt"}
	    context_limit = tokenizer.model_max_length
	    if context_limit <= 10**9:
	        tokenize_kwargs["truncation"] = True
	        # Clamp so an oversized max_new_tokens cannot yield max_length <= 0.
	        tokenize_kwargs["max_length"] = max(1, context_limit - max_new_tokens)

	    encoded = tokenizer(prompt, **tokenize_kwargs)

	    # Follow the model's own device instead of assuming CUDA: the model may
	    # have been loaded on CPU even if a GPU is visible now.
	    device = loaded.model.device
	    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

	    # Build generation config
	    do_sample = temperature > 0
	    gen_kwargs = {
	        "max_new_tokens": max_new_tokens,
	        "repetition_penalty": repetition_penalty,
	        "do_sample": do_sample,
	        "pad_token_id": tokenizer.pad_token_id,
	        "eos_token_id": tokenizer.eos_token_id,
	    }
	    if do_sample:
	        # Sampling knobs are meaningless (and warned about) in greedy mode.
	        gen_kwargs.update(temperature=temperature, top_p=top_p, top_k=top_k)

	    with torch.no_grad():
	        # Both dicts must be unpacked: generate() takes tensors and options
	        # as keyword arguments, not as positional dictionaries.
	        outputs = loaded.model.generate(**inputs, **gen_kwargs)

	    # Decode only the new tokens
	    input_length = inputs["input_ids"].shape[1]
	    generated_tokens = outputs[0][input_length:]
	    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

	    # Handle stop sequences: cut at the earliest match. Empty strings are
	    # skipped because they would match everywhere.
	    if stop_sequences:
	        positions = [response.find(seq) for seq in stop_sequences if seq]
	        hits = [pos for pos in positions if pos != -1]
	        if hits:
	            response = response[: min(hits)]

	    return response


	def generate_text_stream(
	    model_id: str,
	    prompt: str,
	    max_new_tokens: int = 512,
	    temperature: float = 0.7,
	    top_p: float = 0.95,
	    top_k: int = 50,
	    repetition_penalty: float = 1.1,
	    stop_sequences: Optional[list[str]] = None,
	) -> Generator[str, None, None]:
	    """
	    Generate text using streaming output.

	    Yields text chunks as they are generated. Text that could still turn out
	    to be the beginning of a stop sequence is held back until it is known
	    not to be one, so no part of a stop sequence is ever yielded.

	    Args:
	        model_id: HuggingFace model ID
	        prompt: Input prompt (already formatted with chat template)
	        max_new_tokens: Maximum tokens to generate
	        temperature: Sampling temperature; values <= 0 select greedy decoding
	        top_p: Nucleus sampling probability (used only when sampling)
	        top_k: Top-k sampling parameter (used only when sampling)
	        repetition_penalty: Penalty for repeating tokens
	        stop_sequences: Additional stop sequences; streaming ends at the
	            earliest occurrence of any of them. Empty strings are ignored.

	    Yields:
	        Non-empty chunks of generated text, excluding the prompt.

	    Raises:
	        Exception: Whatever the background generation call raised, re-raised
	            once the stream has been drained.
	    """
	    # Local imports: only this function needs these names.
	    from threading import Event
	    from transformers import StoppingCriteria, StoppingCriteriaList

	    loaded = load_model(model_id)
	    tokenizer = loaded.tokenizer

	    # Leave room for the generated tokens inside the context window. A
	    # tokenizer with no known limit is expected to report a huge placeholder
	    # value; in that case truncation is skipped.
	    tokenize_kwargs = {"return_tensors": "pt"}
	    context_limit = tokenizer.model_max_length
	    if context_limit <= 10**9:
	        tokenize_kwargs["truncation"] = True
	        # Clamp so an oversized max_new_tokens cannot yield max_length <= 0.
	        tokenize_kwargs["max_length"] = max(1, context_limit - max_new_tokens)

	    encoded = tokenizer(prompt, **tokenize_kwargs)

	    # Follow the model's own device instead of assuming CUDA.
	    device = loaded.model.device
	    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

	    # Create streamer
	    streamer = TextIteratorStreamer(
	        tokenizer,
	        skip_prompt=True,
	        skip_special_tokens=True,
	    )

	    # Set once the consumer no longer wants tokens; checked by the model
	    # after every generation step so the worker stops promptly.
	    halt = Event()

	    class _HaltOnEvent(StoppingCriteria):
	        """Stop generation as soon as the shared event is set."""

	        def __call__(self, input_ids, scores, **kwargs):
	            return halt.is_set()

	    # Build generation config
	    do_sample = temperature > 0
	    gen_kwargs = {
	        **inputs,
	        "max_new_tokens": max_new_tokens,
	        "repetition_penalty": repetition_penalty,
	        "do_sample": do_sample,
	        "pad_token_id": tokenizer.pad_token_id,
	        "eos_token_id": tokenizer.eos_token_id,
	        "streamer": streamer,
	        "stopping_criteria": StoppingCriteriaList([_HaltOnEvent()]),
	    }
	    if do_sample:
	        # Sampling knobs are meaningless (and warned about) in greedy mode.
	        gen_kwargs.update(temperature=temperature, top_p=top_p, top_k=top_k)

	    failures = []

	    def _run_generation() -> None:
	        """Run generate(); on failure, record it and unblock the consumer."""
	        try:
	            loaded.model.generate(**gen_kwargs)
	        except Exception as exc:  # re-raised in the consuming thread below
	            failures.append(exc)
	            # generate() did not finish, so close the stream ourselves;
	            # otherwise iterating the streamer would block forever.
	            streamer.end()

	    stops = [seq for seq in (stop_sequences or []) if seq]

	    def _held_back_len(text: str) -> int:
	        """Length of the longest tail of text that starts a stop sequence."""
	        longest = 0
	        for seq in stops:
	            for size in range(min(len(seq) - 1, len(text)), longest, -1):
	                if text.endswith(seq[:size]):
	                    longest = size
	                    break
	        return longest

	    # Run generation in separate thread
	    thread = Thread(target=_run_generation)
	    thread.start()

	    accumulated = ""
	    emitted = 0  # number of characters of `accumulated` already yielded
	    try:
	        for piece in streamer:
	            accumulated += piece

	            # Held-back text is never emitted, so a completed stop sequence
	            # can only start at or after `emitted`.
	            hits = [
	                pos
	                for pos in (accumulated.find(seq, emitted) for seq in stops)
	                if pos != -1
	            ]
	            if hits:
	                cut = min(hits)
	                if cut > emitted:
	                    yield accumulated[emitted:cut]
	                break

	            safe_end = len(accumulated) - _held_back_len(accumulated)
	            if safe_end > emitted:
	                yield accumulated[emitted:safe_end]
	                emitted = safe_end
	        else:
	            # Generation ended without a stop sequence: flush what was held.
	            if emitted < len(accumulated):
	                yield accumulated[emitted:]
	    finally:
	        # Runs on normal completion, on a stop sequence, and when the
	        # consumer closes the generator early.
	        halt.set()
	        thread.join()

	    if failures:
	        raise failures[0]


	# Tokenizer cache (separate from model cache, for ZeroGPU compatibility).
	# Maps a model ID to the tokenizer loaded for it; filled by get_tokenizer().
	_tokenizer_cache: dict = {}


	def get_tokenizer(model_id: str):
	    """
	    Return the tokenizer for a model, loading and caching it on first use.

	    Kept apart from model loading for ZeroGPU compatibility: tokenizers can
	    be loaded outside GPU context.

	    Args:
	        model_id: HuggingFace model ID

	    Returns:
	        The cached or freshly loaded tokenizer, with a pad token set.
	    """
	    try:
	        return _tokenizer_cache[model_id]
	    except KeyError:
	        pass  # first request for this model: load it below

	    hf_token = get_config().hf_token
	    loaded_tokenizer = AutoTokenizer.from_pretrained(
	        model_id,
	        token=hf_token,
	        trust_remote_code=True,
	    )

	    # Fall back to the EOS token when the tokenizer defines no pad token.
	    if loaded_tokenizer.pad_token is None:
	        loaded_tokenizer.pad_token = loaded_tokenizer.eos_token

	    _tokenizer_cache[model_id] = loaded_tokenizer
	    return loaded_tokenizer


	def apply_chat_template(
	    model_id: str,
	    messages: list[dict[str, str]],
	    add_generation_prompt: bool = True,
	) -> str:
	    """
	    Apply the model's chat template to format messages.

	    When the tokenizer cannot render the conversation because it has no chat
	    template configured, a plain "Role: content" transcript is produced
	    instead.

	    Args:
	        model_id: HuggingFace model ID
	        messages: List of message dicts with "role" and "content"
	        add_generation_prompt: Whether to add the generation prompt

	    Returns:
	        Formatted prompt string

	    Raises:
	        ValueError: If the tokenizer has a chat template but rejects the
	            messages.
	    """
	    tokenizer = get_tokenizer(model_id)

	    # The method exists on tokenizers regardless of whether a template is
	    # configured, so its presence alone does not tell us it will work.
	    if hasattr(tokenizer, "apply_chat_template"):
	        try:
	            return tokenizer.apply_chat_template(
	                messages,
	                tokenize=False,
	                add_generation_prompt=add_generation_prompt,
	            )
	        except ValueError:
	            # A configured template that fails is a real error; only a
	            # missing template justifies the plain-text fallback.
	            if getattr(tokenizer, "chat_template", None):
	                raise

	    # Fallback: simple formatting. Messages with other roles are skipped.
	    labels = {"system": "System", "user": "User", "assistant": "Assistant"}
	    prompt_parts = []
	    for msg in messages:
	        role = msg["role"]
	        content = msg["content"]
	        label = labels.get(role)
	        if label is not None:
	            prompt_parts.append(f"{label}: {content}\n")

	    if add_generation_prompt:
	        prompt_parts.append("Assistant:")

	    return "".join(prompt_parts)