""" Chargement des modèles HuggingFace. - GPU disponible → quantification 4-bit (BitsAndBytes) - CPU uniquement → float32, TinyLlama ou Mistral léger """ from __future__ import annotations import logging import os from typing import Optional, Tuple import torch from transformers import ( AutoModelForCausalLM, AutoTokenizer, ) from config.config import CFG LOGGER = logging.getLogger(__name__) MODEL_REGISTRY = { "biomistral": "BioMistral/BioMistral-7B", "mistral": "mistralai/Mistral-7B-Instruct-v0.2", "tiny": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", } _model_cache: dict = {} def _get_bnb_config(): """Config 4-bit pour GPU (T4 / A100).""" from transformers import BitsAndBytesConfig return BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", ) def load_model( model_key: str = "biomistral", quantize: bool = True, cache_dir: Optional[str] = None, ) -> Tuple[AutoModelForCausalLM, AutoTokenizer]: """ Charge (ou retourne depuis cache) un modèle HuggingFace. Gère automatiquement CPU (pas de quantification) et GPU (4-bit). """ global _model_cache if model_key in _model_cache: LOGGER.info("Modèle '%s' servi depuis cache mémoire.", model_key) return _model_cache[model_key] if cache_dir is None: cache_dir = str(CFG.models_cache) model_id = MODEL_REGISTRY.get(model_key, model_key) has_gpu = torch.cuda.is_available() LOGGER.info("Chargement modèle: %s | GPU=%s | quantize=%s", model_id, has_gpu, quantize) # Tokenizer tokenizer = AutoTokenizer.from_pretrained( model_id, cache_dir=cache_dir, use_fast=True, ) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token # Modèle if has_gpu and quantize: bnb_cfg = _get_bnb_config() model = AutoModelForCausalLM.from_pretrained( model_id, quantization_config=bnb_cfg, device_map="auto", cache_dir=cache_dir, trust_remote_code=True, ) elif has_gpu: model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.float16, device_map="auto", cache_dir=cache_dir, trust_remote_code=True, ) else: # CPU : float32, pas de quantification LOGGER.warning("Pas de GPU — chargement CPU (lent, recommandé : model_key='tiny')") model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.float32, device_map="cpu", cache_dir=cache_dir, trust_remote_code=True, low_cpu_mem_usage=True, ) model.eval() _model_cache[model_key] = (model, tokenizer) LOGGER.info("✅ Modèle '%s' chargé.", model_key) return model, tokenizer def generate( model, tokenizer, prompt: str, max_new_tokens: int = 200, temperature: float = 0.1, ) -> str: """Génère une réponse textuelle depuis un prompt.""" inputs = tokenizer( prompt, return_tensors="pt", truncation=True, max_length=2048, ) inputs = {k: v.to(model.device) for k, v in inputs.items()} with torch.no_grad(): output_ids = model.generate( **inputs, max_new_tokens=max_new_tokens, temperature=temperature if temperature > 0 else 1.0, do_sample=(temperature > 0), pad_token_id=tokenizer.eos_token_id, ) new_tokens = output_ids[0][inputs["input_ids"].shape[-1]:] return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()