Spaces:
Sleeping
Sleeping
| """ | |
| Chargement des modèles HuggingFace. | |
| - GPU disponible → quantification 4-bit (BitsAndBytes) | |
| - CPU uniquement → float32, TinyLlama ou Mistral léger | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import os | |
| from typing import Optional, Tuple | |
| import torch | |
| from transformers import ( | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| ) | |
| from config.config import CFG | |
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# Short aliases accepted by load_model(), mapped to Hugging Face Hub repo ids.
# Keys that are not listed here are passed through unchanged as repo ids.
MODEL_REGISTRY: dict = {
    "biomistral": "BioMistral/BioMistral-7B",
    "mistral": "mistralai/Mistral-7B-Instruct-v0.2",
    "tiny": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
}

# In-process cache filled by load_model(): model_key -> (model, tokenizer).
_model_cache: dict = {}
def _get_bnb_config():
    """Build the 4-bit BitsAndBytes quantization config used for GPU loading.

    The settings are: 4-bit loading, NF4 quantization type, double
    quantization enabled, and float16 as the compute dtype. The original note
    targets T4 / A100 class GPUs.

    Returns:
        A ``transformers.BitsAndBytesConfig`` instance.
    """
    # Imported lazily, only when a quantized load is actually requested
    # (presumably so CPU-only environments never touch it — TODO confirm).
    from transformers import BitsAndBytesConfig

    quant_settings = {
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_compute_dtype": torch.float16,
    }
    return BitsAndBytesConfig(**quant_settings)
def load_model(
    model_key: str = "biomistral",
    quantize: bool = True,
    cache_dir: Optional[str] = None,
) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load a Hugging Face causal LM and its tokenizer, with in-memory caching.

    Device handling:
      * GPU available and ``quantize`` true  -> 4-bit BitsAndBytes load.
      * GPU available and ``quantize`` false -> float16 load.
      * no GPU                               -> float32 load on CPU
        (``quantize`` is ignored).

    Args:
        model_key: Alias from ``MODEL_REGISTRY``; any other value is used
            directly as the model identifier.
        quantize: Request 4-bit quantization when a GPU is present.
        cache_dir: Download/cache directory; defaults to
            ``str(CFG.models_cache)`` when ``None``.

    Returns:
        The ``(model, tokenizer)`` pair; the model is switched to eval mode.

    Note:
        The memory cache is keyed by ``model_key`` only, so on a cache hit the
        ``quantize`` and ``cache_dir`` arguments have no effect.
    """
    # Fast path: this key was already loaded in this process.
    if model_key in _model_cache:
        LOGGER.info("Modèle '%s' servi depuis cache mémoire.", model_key)
        return _model_cache[model_key]

    weights_dir = str(CFG.models_cache) if cache_dir is None else cache_dir
    repo_id = MODEL_REGISTRY.get(model_key, model_key)
    cuda_ok = torch.cuda.is_available()
    LOGGER.info("Chargement modèle: %s | GPU=%s | quantize=%s", repo_id, cuda_ok, quantize)

    # Tokenizer first; fall back to EOS as padding token when none is defined.
    tok = AutoTokenizer.from_pretrained(repo_id, cache_dir=weights_dir, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    # Options shared by every loading mode.
    # NOTE(review): trust_remote_code=True lets the model repository execute
    # its own Python code; confirm this is intended for arbitrary model_key.
    load_kwargs = {"cache_dir": weights_dir, "trust_remote_code": True}
    if not cuda_ok:
        # CPU: full precision, never quantized.
        LOGGER.warning("Pas de GPU — chargement CPU (lent, recommandé : model_key='tiny')")
        load_kwargs.update(
            torch_dtype=torch.float32,
            device_map="cpu",
            low_cpu_mem_usage=True,
        )
    elif quantize:
        load_kwargs.update(quantization_config=_get_bnb_config(), device_map="auto")
    else:
        load_kwargs.update(torch_dtype=torch.float16, device_map="auto")

    net = AutoModelForCausalLM.from_pretrained(repo_id, **load_kwargs)
    net.eval()

    _model_cache[model_key] = (net, tok)
    LOGGER.info("✅ Modèle '%s' chargé.", model_key)
    return net, tok
def generate(
    model,
    tokenizer,
    prompt: str,
    max_new_tokens: int = 200,
    temperature: float = 0.1,
    max_input_tokens: int = 2048,
) -> str:
    """Generate a text completion for ``prompt`` and return only the new text.

    Args:
        model: Causal LM exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable tokenizer exposing ``eos_token_id`` and ``decode``.
        prompt: Input text fed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value > 0 enables sampling;
            a value <= 0 selects greedy decoding, in which case 1.0 is
            forwarded as the (unused) temperature.
        max_input_tokens: Maximum prompt length in tokens; longer prompts are
            truncated by the tokenizer. Defaults to 2048, the value that was
            previously hard-coded.

    Returns:
        The decoded continuation with special tokens skipped and surrounding
        whitespace stripped. The prompt itself is not included.
    """
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )
    # Move every input tensor to the device the model lives on.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    sample = temperature > 0
    with torch.no_grad():
        output_ids = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            temperature=temperature if sample else 1.0,
            do_sample=sample,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence begins with the prompt tokens; keep only the tail.
    prompt_len = encoded["input_ids"].shape[-1]
    continuation = output_ids[0][prompt_len:]
    return tokenizer.decode(continuation, skip_special_tokens=True).strip()