from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoConfig
from peft import PeftModel
import torch
import logging
from pathlib import Path
import os
import platform
from .config import settings, apply_hf_space_optimizations
from .hf_api import HuggingFaceWrapper
from functools import lru_cache

logger = logging.getLogger(__name__)

def optimize_for_hf_space():
    """Apply optimizations specific to Hugging Face Spaces."""
    # Apply HF Space optimizations from config (includes cache dirs and other settings)
    apply_hf_space_optimizations()

    # Create cache directories
    cache_dirs = ["/tmp/transformers_cache", "/tmp/huggingface", "/tmp/torch"]
    for cache_dir in cache_dirs:
        Path(cache_dir).mkdir(parents=True, exist_ok=True)

    logger.info("🚀 Optimized cache directories for HF Space")

# A module-level cache variable for the pipeline is no longer needed:
# _cached_generator_pipeline = None  # Removed, since we cache via lru_cache instead

def load_model_and_tokenizer():
    """
    Optimized model loader with LoRA support.

    Loads the base model and tokenizer, can download LoRA adapters from the
    Hugging Face Hub, and configures itself automatically based on the
    available resources.
    """
    # Apply HF Space optimizations
    optimize_for_hf_space()
    # Check if we're on macOS and disable 4-bit quantization if needed
    is_macos = platform.system() == "Darwin"
    if is_macos and settings.MODEL_LOAD_IN_4BIT:
        logger.warning("4-bit quantization is not recommended on macOS. Disabling 4-bit loading.")
        use_4bit = False
    else:
        use_4bit = settings.MODEL_LOAD_IN_4BIT

    # Only try to import bitsandbytes if we actually need 4-bit quantization
    if use_4bit:
        try:
            from transformers import BitsAndBytesConfig
            import bitsandbytes
            logger.info(f"Successfully imported bitsandbytes version: {bitsandbytes.__version__}")
            bitsandbytes_available = True
        except ImportError as e:
            logger.warning(f"Failed to import bitsandbytes: {e}. Disabling 4-bit quantization.")
            bitsandbytes_available = False
            use_4bit = False
        except Exception as e:
            # Catch other bitsandbytes-related errors (like missing .dylib files)
            logger.warning(f"Bitsandbytes import failed with error: {e}. Disabling 4-bit quantization.")
            bitsandbytes_available = False
            use_4bit = False
    else:
        bitsandbytes_available = False
        if is_macos:
            logger.info("Running on macOS - using standard model loading without 4-bit quantization.")
        else:
            logger.info("4-bit quantization is disabled in settings.")
    base_model_id = settings.DEFAULT_MODEL_ID
    hf_token = os.getenv("HF_API_KEY")

    logger.info(f"Loading base model and tokenizer: {base_model_id}")
    try:
        # Try loading with the fast tokenizer first
        logger.info("Trying to load the fast tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            base_model_id,
            token=hf_token,
            cache_dir="/tmp/transformers_cache"
        )
    except Exception as e:
        logger.warning(f"Loading the fast tokenizer failed: {e}")
        logger.info("Falling back to the slow tokenizer...")
        try:
            # Fall back to the slow tokenizer
            tokenizer = AutoTokenizer.from_pretrained(
                base_model_id,
                token=hf_token,
                use_fast=False,
                cache_dir="/tmp/transformers_cache"
            )
            logger.info("✅ Slow tokenizer loaded successfully.")
        except Exception as e2:
            logger.error(f"Slow tokenizer failed as well: {e2}")
            logger.error(f"Original fast tokenizer error: {e}")
            raise e2

    if tokenizer.pad_token is None:
        logger.info("Tokenizer has no pad_token. Setting pad_token = eos_token.")
        tokenizer.pad_token = tokenizer.eos_token
    model_kwargs = {
        "device_map": "auto",
        "trust_remote_code": True,
        "token": hf_token,
        "cache_dir": "/tmp/transformers_cache",  # Use optimized cache directory
        "low_cpu_mem_usage": True,  # Reduce CPU memory usage during loading
    }

    logger.info(f"DEBUG: use_4bit={use_4bit}, bitsandbytes_available={bitsandbytes_available}")
    if use_4bit and bitsandbytes_available:
        try:
            logger.info("Trying to load the model with 4-bit quantization.")
            from transformers import BitsAndBytesConfig
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=False,
                bnb_4bit_compute_dtype=torch.float16
            )
            four_bit_model_kwargs = model_kwargs.copy()
            four_bit_model_kwargs["quantization_config"] = quantization_config
            model = AutoModelForCausalLM.from_pretrained(base_model_id, **four_bit_model_kwargs)
            logger.info("Model loaded successfully with 4-bit quantization.")
        except Exception as e:
            logger.warning(f"4-bit loading failed: {e}. Falling back to standard loading (FP16).")
            fallback_kwargs = model_kwargs.copy()
            fallback_kwargs["torch_dtype"] = torch.float16
            model = AutoModelForCausalLM.from_pretrained(base_model_id, **fallback_kwargs)
    else:
        logger.info("4-bit quantization is disabled. Loading the model in FP16.")
        # Prepare kwargs for AutoConfig and AutoModelForCausalLM
        shared_load_kwargs = {
            "token": hf_token,
            "trust_remote_code": True,
            "cache_dir": "/tmp/transformers_cache"
        }

        # Load config first
        try:
            config = AutoConfig.from_pretrained(base_model_id, **shared_load_kwargs)
            logger.info(f"Initial loaded config.parallelize_strategies: {getattr(config, 'parallelize_strategies', 'Not set')}")

            # More comprehensive approach to handle parallelize_strategies:
            # set it to an empty list if it's None or not set, as this seems to be safer.
            if not hasattr(config, 'parallelize_strategies') or config.parallelize_strategies is None:
                config.parallelize_strategies = []
                logger.info("Set config.parallelize_strategies to empty list []")
            elif isinstance(config.parallelize_strategies, list):
                # Clean any None values from the list
                cleaned_strategies = [s for s in config.parallelize_strategies if s is not None]
                if len(cleaned_strategies) != len(config.parallelize_strategies):
                    config.parallelize_strategies = cleaned_strategies
                    logger.info(f"Cleaned config.parallelize_strategies to: {config.parallelize_strategies}")
            else:
                logger.warning(f"config.parallelize_strategies is not a list: {config.parallelize_strategies}. Setting to empty list.")
                config.parallelize_strategies = []
        except Exception as e:
            logger.error(f"Error loading or processing AutoConfig: {e}")
            # If config loading fails, proceed without a modified config, which might
            # lead to the original error, but at least we tried.
            config = None  # Ensure model loading below doesn't fail on 'config' not defined

        # Prepare kwargs for AutoModelForCausalLM.from_pretrained
        final_fp16_model_kwargs = model_kwargs.copy()  # Starts with device_map, trust_remote_code, token
        final_fp16_model_kwargs["torch_dtype"] = torch.float16
        if config:  # Only add config if it was successfully loaded and processed
            final_fp16_model_kwargs["config"] = config
        model = AutoModelForCausalLM.from_pretrained(base_model_id, **final_fp16_model_kwargs)
    # Load LoRA weights
    lora_path_to_load = None
    if settings.LORA_MODEL_REPO_ID:
        logger.info(f"LoRA adapter will be loaded from the Hugging Face Hub: {settings.LORA_MODEL_REPO_ID}")
        hf_wrapper = HuggingFaceWrapper(token=hf_token)  # The wrapper resolves the token itself if none is passed explicitly

        # Target directory for downloaded LoRA adapters, derived from MODEL_PATH in settings
        # for consistency, e.g. cardserver/models/lora-checkpoint/downloaded_adapters/your-lora-model-repo
        local_lora_download_dir_base = settings.resolved_model_path.parent / "downloaded_adapters"
        lora_adapter_name = settings.LORA_MODEL_REPO_ID.split("/")[-1]  # e.g. "your-lora-model-repo"
        local_lora_dir = local_lora_download_dir_base / lora_adapter_name

        # Check whether the adapter has already been downloaded (simple check).
        # A more robust check could use revision hashes or modification times.
        adapter_config_file = local_lora_dir / "adapter_config.json"

        if not adapter_config_file.exists() or getattr(settings, "LORA_FORCE_DOWNLOAD", False):
            if adapter_config_file.exists():
                logger.info(f"LORA_FORCE_DOWNLOAD is active. Re-downloading LoRA adapter: {settings.LORA_MODEL_REPO_ID}")
            else:
                logger.info(f"LoRA adapter not found locally at {local_lora_dir}. Downloading...")

            local_lora_dir.mkdir(parents=True, exist_ok=True)  # Make sure the directory exists
            try:
                downloaded_path_str = hf_wrapper.download_model(
                    repo_name=settings.LORA_MODEL_REPO_ID,
                    local_dir=str(local_lora_dir),  # Must be a string
                    # revision=settings.LORA_MODEL_REVISION  # If a specific revision is needed
                )
                lora_path_to_load = Path(downloaded_path_str)  # The return value is the download path
                logger.info(f"LoRA adapter successfully downloaded from {settings.LORA_MODEL_REPO_ID} to {lora_path_to_load}.")
            except Exception as e:
                logger.error(f"Error downloading the LoRA adapter from {settings.LORA_MODEL_REPO_ID}: {e}")
                logger.info("Trying to fall back to a local path (if configured) or using the base model.")
                # Fall back to settings.resolved_model_path if LORA_MODEL_REPO_ID fails
                if settings.resolved_model_path.exists() and (settings.resolved_model_path / "adapter_config.json").exists():
                    lora_path_to_load = settings.resolved_model_path
                    logger.info(f"Falling back to local LoRA path: {lora_path_to_load}")
                else:
                    lora_path_to_load = None  # Do not use LoRA
        else:
            lora_path_to_load = local_lora_dir
            logger.info(f"LoRA adapter {settings.LORA_MODEL_REPO_ID} already available locally at: {lora_path_to_load}")
    elif settings.resolved_model_path.exists() and (settings.resolved_model_path / "adapter_config.json").exists():
        # Fallback: LORA_MODEL_REPO_ID is not set, but a local path exists
        lora_path_to_load = settings.resolved_model_path
        logger.info(f"Using local LoRA path: {lora_path_to_load} (LORA_MODEL_REPO_ID is not set).")
    else:
        logger.info("No LORA_MODEL_REPO_ID in settings and no valid local LoRA path found.")
        lora_path_to_load = None

    if lora_path_to_load:
        try:
            logger.info(f"Trying to load LoRA weights from: {lora_path_to_load}")
            model = PeftModel.from_pretrained(model, str(lora_path_to_load))
            logger.info("✅ LoRA model successfully applied to the base model.")
        except Exception as e:
            logger.error(f"❌ Loading LoRA from {lora_path_to_load} failed: {e}")
            logger.info("Using the base model without a LoRA adapter.")
    else:
        logger.info("No LoRA weights specified or found. Using the base model.")

    return model, tokenizer

@lru_cache(maxsize=1)
def get_generator():
    """
    Loads the model and tokenizer (on first call) and builds a text-generation
    pipeline. The pipeline is cached via lru_cache.
    """
    logger.info("Initializing text-generation pipeline...")
    model, tokenizer = load_model_and_tokenizer()

    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        logger.info(f"pad_token_id not set on the tokenizer. Setting pad_token_id to eos_token_id ({tokenizer.eos_token_id}).")
        tokenizer.pad_token_id = tokenizer.eos_token_id
        # The model config may also need updating when pad_token_id is changed at runtime,
        # but this is usually unnecessary if the model was already trained with an eos_token.
        # model.config.pad_token_id = tokenizer.pad_token_id

    # When using device_map="auto" with accelerate, don't specify a device for the pipeline;
    # it will automatically use the same device mapping as the model.
    generator_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer
        # No device parameter when the model uses device_map="auto"
    )

    logger.info(f"Text-generation pipeline initialized successfully. Model device mapping: {getattr(model, 'hf_device_map', 'No device map found')}")
    return generator_pipeline
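
# Example usage of the cached generator (an illustrative sketch; the prompt and
# generation parameters such as max_new_tokens/do_sample are assumptions and
# should be tuned to the configured base model):
#
#   generator = get_generator()
#   outputs = generator("Your prompt here", max_new_tokens=64, do_sample=True)
#   print(outputs[0]["generated_text"])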

def get_model_info():
    """Information about the loaded model."""
    lora_path = settings.resolved_model_path
    return {
        "base_model": settings.DEFAULT_MODEL_ID,
        "lora_enabled": lora_path.exists(),
        "lora_path": str(lora_path) if lora_path.exists() else None,
        "gpu_available": torch.cuda.is_available(),
        "gpu_count": torch.cuda.device_count() if torch.cuda.is_available() else 0
    }

# Optional: pre-load the model at startup if desired (in main.py or similar)
# def preload_model():
#     logger.info("Starting model pre-loading...")
#     get_generator()
#     logger.info("Model pre-loaded successfully.")
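
# A minimal startup sketch (hypothetical: it assumes main.py builds a FastAPI app
# and that this module is importable as .model -- adapt the names to the actual project):
#
#   from fastapi import FastAPI
#   from .model import get_generator
#
#   app = FastAPI()
#
#   @app.on_event("startup")
#   def preload_model() -> None:
#       get_generator()  # warms the lru_cache so the first request is not slowed by model loading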