Spaces:

Ibou17
/

radiology-agent-langgraph

Sleeping

App Files Files Community

radiology-agent-langgraph / models /loader.py

Ibou17

Initial deployment — Radiology Agent

3a08315 about 2 months ago

raw

history blame contribute delete

3.82 kB

	"""
	Chargement des modèles HuggingFace.
	- GPU disponible → quantification 4-bit (BitsAndBytes)
	- CPU uniquement → float32, TinyLlama ou Mistral léger
	"""
	from __future__ import annotations
	import logging
	import os
	from typing import Optional, Tuple

	import torch
	from transformers import (
	AutoModelForCausalLM,
	AutoTokenizer,
	)

	from config.config import CFG

	# Module-level logger, named after this module.
	LOGGER = logging.getLogger(__name__)

	# Short aliases accepted as ``model_key`` mapped to Hugging Face Hub model ids.
	MODEL_REGISTRY = dict(
	    biomistral="BioMistral/BioMistral-7B",
	    mistral="mistralai/Mistral-7B-Instruct-v0.2",
	    tiny="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
	)

	# In-process cache: model_key -> (model, tokenizer), filled by load_model().
	_model_cache: dict = {}


	def _get_bnb_config():
	    """Build the 4-bit quantization config used for GPU loading (e.g. T4 / A100).

	    Returns:
	        A ``BitsAndBytesConfig`` requesting 4-bit NF4 weights with double
	        quantization and float16 as the compute dtype.
	    """
	    # Function-scope import: the name is only resolved when this helper runs.
	    from transformers import BitsAndBytesConfig

	    quant_options = {
	        "load_in_4bit": True,
	        "bnb_4bit_compute_dtype": torch.float16,
	        "bnb_4bit_use_double_quant": True,
	        "bnb_4bit_quant_type": "nf4",
	    }
	    return BitsAndBytesConfig(**quant_options)


	def load_model(
	    model_key: str = "biomistral",
	    quantize: bool = True,
	    cache_dir: Optional[str] = None,
	) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
	    """Load a Hugging Face causal LM and its tokenizer, or return them from cache.

	    The loading strategy depends on the hardware reported by
	    ``torch.cuda.is_available()``:

	    * GPU and ``quantize=True``: 4-bit quantized load, ``device_map="auto"``.
	    * GPU and ``quantize=False``: float16 load, ``device_map="auto"``.
	    * no GPU: float32 load on CPU with ``low_cpu_mem_usage=True``
	      (``quantize`` is ignored).

	    Args:
	        model_key: Alias looked up in ``MODEL_REGISTRY``. A key that is not
	            registered is used verbatim as the model id.
	        quantize: Request 4-bit quantization when a GPU is available.
	        cache_dir: Directory for downloaded files. Defaults to
	            ``CFG.models_cache`` when ``None``.

	    Returns:
	        A ``(model, tokenizer)`` tuple. The model is switched to eval mode.

	    Note:
	        The in-memory cache is keyed on ``model_key`` only. On a cache hit
	        the stored pair is returned as-is, so ``quantize`` and ``cache_dir``
	        have no effect for a key that was already loaded.
	    """
	    # The cache dict is mutated in place, never rebound, so no ``global``
	    # statement is needed.
	    if model_key in _model_cache:
	        LOGGER.info("Modèle '%s' servi depuis cache mémoire.", model_key)
	        return _model_cache[model_key]

	    if cache_dir is None:
	        cache_dir = str(CFG.models_cache)

	    model_id = MODEL_REGISTRY.get(model_key, model_key)
	    has_gpu = torch.cuda.is_available()
	    # Plain "|" separators: the previous "\|" was an invalid escape sequence
	    # and put literal backslashes in the log output.
	    LOGGER.info(
	        "Chargement modèle: %s | GPU=%s | quantize=%s", model_id, has_gpu, quantize
	    )

	    # Tokenizer
	    tokenizer = AutoTokenizer.from_pretrained(
	        model_id,
	        cache_dir=cache_dir,
	        use_fast=True,
	    )
	    if tokenizer.pad_token is None:
	        # Fall back to EOS as the padding token when none is defined.
	        tokenizer.pad_token = tokenizer.eos_token

	    # Model: options shared by every branch, then branch-specific ones.
	    # NOTE(review): trust_remote_code=True is also applied to unregistered
	    # model ids passed straight through model_key; confirm that callers
	    # never forward untrusted ids here.
	    load_kwargs = {"cache_dir": cache_dir, "trust_remote_code": True}
	    if has_gpu and quantize:
	        load_kwargs.update(
	            quantization_config=_get_bnb_config(),
	            device_map="auto",
	        )
	    elif has_gpu:
	        load_kwargs.update(torch_dtype=torch.float16, device_map="auto")
	    else:
	        # CPU: float32, no quantization.
	        LOGGER.warning("Pas de GPU — chargement CPU (lent, recommandé : model_key='tiny')")
	        load_kwargs.update(
	            torch_dtype=torch.float32,
	            device_map="cpu",
	            low_cpu_mem_usage=True,
	        )

	    model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)

	    model.eval()
	    _model_cache[model_key] = (model, tokenizer)
	    LOGGER.info("✅ Modèle '%s' chargé.", model_key)
	    return model, tokenizer


	def generate(
	    model,
	    tokenizer,
	    prompt: str,
	    max_new_tokens: int = 200,
	    temperature: float = 0.1,
	) -> str:
	    """Generate a text completion for ``prompt``.

	    Args:
	        model: Object exposing ``device`` and ``generate(**inputs, ...)``.
	        tokenizer: Callable tokenizer exposing ``eos_token_id`` and ``decode``.
	        prompt: Input text; tokenized with truncation at 2048 tokens.
	        max_new_tokens: Upper bound on the number of generated tokens.
	        temperature: Sampling temperature. A value ``<= 0`` disables sampling
	            (``do_sample=False``) and forwards ``1.0`` as the temperature.

	    Returns:
	        The decoded newly generated tokens only (the prompt is sliced off),
	        with special tokens skipped and surrounding whitespace stripped.
	    """
	    encoded = tokenizer(
	        prompt,
	        return_tensors="pt",
	        truncation=True,
	        max_length=2048,
	    )
	    # Move every encoded tensor onto the device the model reports.
	    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

	    sampling = temperature > 0
	    effective_temperature = temperature if sampling else 1.0
	    prompt_length = inputs["input_ids"].shape[-1]

	    with torch.no_grad():
	        output_ids = model.generate(
	            **inputs,
	            max_new_tokens=max_new_tokens,
	            temperature=effective_temperature,
	            do_sample=sampling,
	            pad_token_id=tokenizer.eos_token_id,
	        )

	    # The first returned sequence starts with the prompt; keep only what follows.
	    first_sequence = output_ids[0]
	    completion_ids = first_sequence[prompt_length:]
	    decoded = tokenizer.decode(completion_ids, skip_special_tokens=True)
	    return decoded.strip()