"""Lazy loader for a Hugging Face text-generation pipeline.

The model and quantization mode are configured through environment
variables (optionally supplied by a ``.env`` file). The pipeline is built
on first use and cached for the lifetime of the process.
"""
import os
from pathlib import Path

import torch
from dotenv import load_dotenv
from transformers import BitsAndBytesConfig, pipeline

# The .env file is looked up in the parent of this module's directory.
env_path = Path(__file__).resolve().parent.parent / ".env"
load_dotenv(dotenv_path=env_path)

MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-0.5B-Instruct")
# "auto" picks a mode from MODEL_ID and hardware; "none" disables
# quantization; any other value is passed through unchanged.
QUANTIZATION = os.getenv("QUANTIZATION", "auto")
USE_DOUBLE_QUANT = os.getenv("USE_DOUBLE_QUANT", "true").lower() == "true"

# Process-wide cache filled by get_pipe().
_pipe = None
_current_model = None


def _log(msg: str):
    """Print *msg* with the ``[ModelLoader]`` prefix."""
    print(f"[ModelLoader] {msg}")


def _has_gpu() -> bool:
    """Return True when torch reports an available CUDA device."""
    return torch.cuda.is_available()


def _gpu_name() -> str:
    """Return the name of CUDA device 0, or ``"None"`` without a GPU."""
    if _has_gpu():
        return torch.cuda.get_device_name(0)
    return "None"


def _gpu_memory_gb() -> float:
    """Return the total memory of CUDA device 0 in GB (1e9 bytes).

    Returns 0.0 when no GPU is available or the device cannot be queried.
    """
    if not _has_gpu():
        return 0.0
    try:
        # The attribute is ``total_memory``; the previous ``total_mem``
        # raised AttributeError, which the bare ``except`` swallowed, so
        # this function always reported 0 and 4-bit was never selected.
        return torch.cuda.get_device_properties(0).total_memory / 1e9
    except Exception as exc:
        # Best-effort probe: report "no usable VRAM" instead of crashing,
        # but leave a trace so the failure is not silent.
        _log(f"[WARN] Could not query GPU memory: {exc}")
        return 0.0


def _select_quantization() -> str:
    """Auto-select quantization tier based on MODEL_ID and hardware.

    Returns:
        ``"none"``, ``"4bit"``, ``"8bit"``, ``"cpu_fallback_8bit"``, or the
        lower-cased QUANTIZATION value when it is neither ``"auto"`` nor
        ``"none"``.
    """
    user_mode = QUANTIZATION.lower()
    if user_mode == "none":
        return "none"
    if user_mode != "auto":
        # An explicit user choice is honoured as-is (not validated here).
        return user_mode

    # Auto-detect: GPU with enough VRAM for requested model
    if "7B" in MODEL_ID:
        vram_gb = _gpu_memory_gb()
        if _has_gpu() and vram_gb >= 5.5:
            _log(f"7B model detected, GPU {_gpu_name()} ({vram_gb:.1f}GB) — using 4-bit")
            return "4bit"
        _log("7B model requested but no GPU with 5.5GB+ VRAM — falling back to 1.5B 8-bit")
        return "cpu_fallback_8bit"

    if "1.5B" in MODEL_ID:
        if _has_gpu():
            _log("1.5B model detected, GPU available — using 8-bit")
            return "8bit"
        _log("1.5B model detected, CPU only — using bfloat16")
        return "none"

    return "none"


def _build_model_kwargs(quant_mode: str) -> dict:
    """Build pipeline kwargs based on quantization mode.

    Args:
        quant_mode: A value returned by ``_select_quantization``. Anything
            other than ``"4bit"``, ``"8bit"`` or ``"cpu_fallback_8bit"``
            selects an unquantized bfloat16 load.

    Returns:
        Keyword arguments to pass to ``transformers.pipeline``.
    """
    kwargs = {
        "trust_remote_code": True,
        "device_map": "auto",
    }

    if quant_mode == "4bit":
        kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=USE_DOUBLE_QUANT,
            bnb_4bit_quant_type="nf4",
        )
        _log("[OK] 4-bit quantization enabled (NF4, double quant)")
    elif quant_mode in ("8bit", "cpu_fallback_8bit"):
        # Both modes use the same 8-bit config; only the log line differs.
        # NOTE(review): "cpu_fallback_8bit" is also chosen when there is no
        # GPU at all — confirm the installed bitsandbytes supports that.
        kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_8bit=True,
        )
        if quant_mode == "8bit":
            _log("[OK] 8-bit quantization enabled")
        else:
            _log("[OK] CPU fallback 8-bit for 1.5B model")
    else:
        kwargs["torch_dtype"] = torch.bfloat16
        _log(f"[OK] Loading {MODEL_ID} in bfloat16 (CPU-friendly)")

    return kwargs


def get_pipe():
    """Return the cached text-generation pipeline, loading it on first use.

    Returns:
        The pipeline object, or ``None`` when loading failed. A failed load
        is not cached, so the next call tries again.
    """
    global _pipe, _current_model

    if _pipe is not None:
        return _pipe

    actual_model_id = MODEL_ID
    quant_mode = _select_quantization()

    # Handle CPU fallback for 7B → 1.5B
    if quant_mode == "cpu_fallback_8bit":
        actual_model_id = "Qwen/Qwen2.5-1.5B-Instruct"
        _log(f"[FALLBACK] loading {actual_model_id} instead of {MODEL_ID}")

    _log(f"Loading {actual_model_id} (quantization: {quant_mode})")
    _log(f" Hardware: GPU={_gpu_name()}, VRAM={_gpu_memory_gb():.1f}GB, CUDA={_has_gpu()}")

    try:
        kwargs = _build_model_kwargs(quant_mode)
        _pipe = pipeline(
            "text-generation",
            model=actual_model_id,
            **kwargs
        )
        _current_model = actual_model_id
        _log("[DONE] Model loaded successfully!")
    except ImportError as e:
        if "bitsandbytes" not in str(e):
            _log(f"[ERROR] Model load failed: {e}")
            _pipe = None
        else:
            _log("[ERROR] bitsandbytes not installed. Falling back to CPU bfloat16.")
            # Guard the retry as well: previously a failure here escaped to
            # the caller, unlike every other failure path, which returns None.
            try:
                _pipe = pipeline(
                    "text-generation",
                    model=actual_model_id,
                    torch_dtype=torch.bfloat16,
                    device_map="auto",
                    trust_remote_code=True,
                )
            except Exception as fallback_error:
                _log(f"[ERROR] Model load failed: {fallback_error}")
                _pipe = None
            else:
                _current_model = actual_model_id
                _log("[DONE] Model loaded with CPU fallback")
    except Exception as e:
        _log(f"❌ Model load failed: {e}")
        _pipe = None

    return _pipe


def generate_text(messages, temperature=0.3, max_new_tokens=2000):
    """Generate a reply for *messages* with the cached pipeline.

    Args:
        messages: Input passed straight to the pipeline; presumably a
            chat-format list of role/content dicts, since the reply is read
            from the ``"content"`` of the last generated message.
        temperature: Sampling temperature. A value of ``None`` or ``<= 0``
            switches to greedy decoding, because sampling requires a
            strictly positive temperature.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated text, or ``None`` when the model could not be loaded.
    """
    pipe = get_pipe()
    if pipe is None:
        return None

    generation_kwargs = {"max_new_tokens": max_new_tokens}
    if temperature is not None and temperature > 0:
        generation_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=0.9,
        )
    else:
        generation_kwargs["do_sample"] = False

    outputs = pipe(messages, **generation_kwargs)
    return outputs[0]["generated_text"][-1]["content"]