Spaces:

NathMen12
/

Chat-With-AI

Runtime error

App Files Files Community

Chat-With-AI / model_runner.py

NathMen12

Upload 6 files

7840eb9 verified about 1 month ago

Raw

History Blame Contribute Delete

4.86 kB

	"""
	model_runner.py — Model loading + ZeroGPU inference
	Only the inference entry point is wrapped with @spaces.GPU, so a GPU is
	allocated for the duration of each inference call rather than at startup.
	"""

	import os
	import gc
	import torch
	import spaces
	from threading import Lock
	from transformers import (
	AutoTokenizer,
	AutoModelForCausalLM,
	TextIteratorStreamer,
	BitsAndBytesConfig,
	)
	from huggingface_hub import snapshot_download
	import threading

	# ── Global model cache (one model at a time) ──────────────────
	# Held by load_model() for the whole time it swaps the cached model.
	_lock = Lock()
	# Hub id of the cached model; None while nothing is loaded.
	_current_model_id = None
	# Tokenizer that belongs to the cached model; None while nothing is loaded.
	_tokenizer = None
	# The cached model instance itself; None while nothing is loaded.
	_model = None


	def get_device():
	    """Return ``"cuda"`` when torch reports CUDA as available, otherwise ``"cpu"``."""
	    return "cuda" if torch.cuda.is_available() else "cpu"


	def load_model(
	    model_id: str,
	    use_4bit: bool = True,
	    use_cpu: bool = False,
	):
	    """
	    Load a model from HuggingFace Hub into the module-level cache.

	    The previously cached model is unloaded first so its memory can be
	    reused.  The new tokenizer and model are built in local variables and
	    only published to the cache globals once every step has succeeded, so a
	    failed load leaves the cache cleanly empty instead of half-initialised.

	    Args:
	        model_id: Hub repository id passed to ``from_pretrained``.
	        use_4bit: Quantize to 4-bit NF4 with bitsandbytes.  Only honoured
	            when the model is placed on CUDA.
	        use_cpu: Force CPU placement even when CUDA is available.

	    Raises:
	        Exception: whatever ``from_pretrained`` (or the device move) raises
	            is propagated unchanged after partial objects are released.
	    """
	    global _model, _tokenizer, _current_model_id

	    with _lock:
	        # NOTE(review): the cache is keyed on model_id only, so asking for
	        # the same model with different use_4bit / use_cpu is a no-op.
	        if _current_model_id == model_id:
	            return  # Already loaded

	        # Free the old model *before* loading the new one; both may not
	        # fit in memory at the same time.
	        _unload()

	        device = "cpu" if use_cpu else get_device()
	        on_gpu = device == "cuda"

	        # NOTE(security): trust_remote_code=True lets the Hub repository run
	        # its own Python code during loading; only load repositories you trust.
	        model_kwargs = dict(
	            trust_remote_code=True,
	            torch_dtype=torch.float16 if device != "cpu" else torch.float32,
	            device_map="auto" if on_gpu else None,
	        )
	        if on_gpu and use_4bit:
	            model_kwargs["quantization_config"] = BitsAndBytesConfig(
	                load_in_4bit=True,
	                bnb_4bit_quant_type="nf4",
	                bnb_4bit_compute_dtype=torch.float16,
	                bnb_4bit_use_double_quant=True,
	            )

	        tokenizer = None
	        model = None
	        try:
	            tokenizer = AutoTokenizer.from_pretrained(
	                model_id,
	                trust_remote_code=True,
	                use_fast=True,
	            )
	            # Generation needs a pad token; reuse EOS when none is defined.
	            if tokenizer.pad_token is None:
	                tokenizer.pad_token = tokenizer.eos_token

	            model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

	            # With device_map=None the placement has to be done explicitly.
	            if device == "cpu":
	                model = model.to(device)

	            model.eval()
	        except Exception:
	            # Drop whatever was partially built so its memory is reclaimed
	            # now rather than whenever the traceback is discarded.
	            tokenizer = None
	            model = None
	            gc.collect()
	            if torch.cuda.is_available():
	                torch.cuda.empty_cache()
	            raise

	        # Publish everything together only after the load fully succeeded.
	        _tokenizer = tokenizer
	        _model = model
	        _current_model_id = model_id


	def _unload():
	    """
	    Drop the cached model and tokenizer and release their memory.

	    Resets the three cache globals to ``None``, runs a garbage-collection
	    pass and, when CUDA is available, asks torch to release its cached GPU
	    allocations.  Calling it while nothing is loaded is harmless.
	    """
	    global _model, _tokenizer, _current_model_id
	    # Rebind instead of ``del``: deleting a global leaves the name unbound
	    # until it is reassigned, so a reader on another thread could hit
	    # NameError instead of seeing None.
	    _model = None
	    _tokenizer = None
	    _current_model_id = None
	    gc.collect()
	    if torch.cuda.is_available():
	        torch.cuda.empty_cache()


	def is_loaded() -> bool:
	    """Tell whether the module-level cache currently holds a model."""
	    if _model is None:
	        return False
	    return True


	def current_model() -> str \| None:
	return _current_model_id


	# ── Inference ─────────────────────────────────────────────────

	def _encode_prompt(tokenizer, messages, system_prompt):
	    """Turn the conversation (plus optional system prompt) into prompt token ids."""
	    chat_messages = []
	    if system_prompt:
	        chat_messages.append({"role": "system", "content": system_prompt})
	    chat_messages.extend(messages)

	    try:
	        encoded = tokenizer.apply_chat_template(
	            chat_messages,
	            add_generation_prompt=True,
	            return_tensors="pt",
	        )
	    except Exception:
	        # Deliberate best-effort fallback when the chat template cannot be
	        # applied: plain "Human:/Assistant:" transcript.
	        parts = []
	        if system_prompt:
	            parts.append(f"System: {system_prompt}\n\n")
	        for message in messages:
	            role = "Human" if message["role"] == "user" else "Assistant"
	            parts.append(f"{role}: {message['content']}\n")
	        parts.append("Assistant:")
	        encoded = tokenizer("".join(parts), return_tensors="pt").input_ids

	    # NOTE(review): some transformers releases appear to return a dict-like
	    # encoding from apply_chat_template instead of a bare tensor; normalise
	    # so the caller always gets the id tensor. Confirm against the pinned version.
	    if not torch.is_tensor(encoded):
	        encoded = encoded["input_ids"]
	    return encoded


	@spaces.GPU(duration=120)
	def generate_stream(
	    messages: list[dict],
	    max_new_tokens: int = 512,
	    temperature: float = 0.7,
	    top_p: float = 0.9,
	    repetition_penalty: float = 1.1,
	    system_prompt: str = "",
	):
	    """
	    Stream the model's reply to a conversation, chunk by chunk.

	    Decorated with @spaces.GPU so the GPU is requested only for this call.
	    Generation runs on a background thread while this generator relays the
	    decoded text from a ``TextIteratorStreamer``.

	    Args:
	        messages: Conversation as ``{"role": ..., "content": ...}`` dicts.
	        max_new_tokens: Upper bound on generated tokens.
	        temperature: Sampling temperature; sampling is disabled when it is
	            not greater than zero.
	        top_p: Nucleus-sampling threshold.
	        repetition_penalty: Penalty forwarded to ``generate``.
	        system_prompt: Optional system message placed before ``messages``.

	    Yields:
	        Text chunks as they are produced.  When no model is loaded, a single
	        warning message is yielded instead.

	    Raises:
	        Exception: an error raised by ``model.generate`` on the background
	            thread is re-raised here once the stream has been drained.
	    """
	    # Snapshot the cache once: the globals can be swapped by load_model()
	    # on another thread while this generator is still running.
	    model, tokenizer = _model, _tokenizer
	    if model is None or tokenizer is None:
	        yield "⚠️ Aucun modèle chargé. Veuillez d'abord sélectionner et charger un modèle."
	        return

	    input_ids = _encode_prompt(tokenizer, messages, system_prompt)
	    device = next(model.parameters()).device
	    input_ids = input_ids.to(device)

	    streamer = TextIteratorStreamer(
	        tokenizer,
	        skip_prompt=True,
	        skip_special_tokens=True,
	    )

	    gen_kwargs = dict(
	        input_ids=input_ids,
	        # The prompt is a single unpadded sequence, so every position is
	        # real; passing the mask explicitly avoids it being guessed from
	        # a pad id that is the same as the EOS id.
	        attention_mask=torch.ones_like(input_ids),
	        max_new_tokens=max_new_tokens,
	        temperature=temperature,
	        top_p=top_p,
	        repetition_penalty=repetition_penalty,
	        do_sample=temperature > 0,
	        streamer=streamer,
	        pad_token_id=tokenizer.eos_token_id,
	    )

	    errors = []

	    def _run_generation():
	        try:
	            model.generate(**gen_kwargs)
	        except Exception as exc:
	            # Without this the consumer loop below would wait forever:
	            # a failed generate() never sends the end-of-stream signal.
	            errors.append(exc)
	            streamer.end()

	    worker = threading.Thread(target=_run_generation)
	    worker.start()

	    for chunk in streamer:
	        yield chunk

	    worker.join()
	    if errors:
	        raise errors[0]