Spaces:
Sleeping
Sleeping
| import os | |
| import torch | |
| from transformers import pipeline, BitsAndBytesConfig | |
| from dotenv import load_dotenv | |
| from pathlib import Path | |
| env_path = Path(__file__).resolve().parent.parent / ".env" | |
| load_dotenv(dotenv_path=env_path) | |
| MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-0.5B-Instruct") | |
| QUANTIZATION = os.getenv("QUANTIZATION", "auto") | |
| USE_DOUBLE_QUANT = os.getenv("USE_DOUBLE_QUANT", "true").lower() == "true" | |
| _pipe = None | |
| _current_model = None | |
| def _log(msg: str): | |
| print(f"[ModelLoader] {msg}") | |
| def _has_gpu() -> bool: | |
| return torch.cuda.is_available() | |
| def _gpu_name() -> str: | |
| if _has_gpu(): | |
| return torch.cuda.get_device_name(0) | |
| return "None" | |
| def _gpu_memory_gb() -> float: | |
| if _has_gpu(): | |
| try: | |
| return torch.cuda.get_device_properties(0).total_mem / 1e9 | |
| except: | |
| return 0 | |
| return 0 | |
| def _select_quantization() -> str: | |
| """Auto-select quantization tier based on MODEL_ID and hardware.""" | |
| user_mode = QUANTIZATION.lower() | |
| if user_mode == "none": | |
| return "none" | |
| if user_mode != "auto": | |
| return user_mode | |
| # Auto-detect: GPU with enough VRAM for requested model | |
| if "7B" in MODEL_ID: | |
| if _has_gpu() and _gpu_memory_gb() >= 5.5: | |
| _log(f"7B model detected, GPU {_gpu_name()} ({_gpu_memory_gb():.1f}GB) — using 4-bit") | |
| return "4bit" | |
| _log("7B model requested but no GPU with 5.5GB+ VRAM — falling back to 1.5B 8-bit") | |
| return "cpu_fallback_8bit" | |
| if "1.5B" in MODEL_ID: | |
| if _has_gpu(): | |
| _log(f"1.5B model detected, GPU available — using 8-bit") | |
| return "8bit" | |
| _log("1.5B model detected, CPU only — using bfloat16") | |
| return "none" | |
| return "none" | |
| def _build_model_kwargs(quant_mode: str) -> dict: | |
| """Build pipeline kwargs based on quantization mode.""" | |
| kwargs = { | |
| "trust_remote_code": True, | |
| } | |
| if quant_mode == "4bit": | |
| kwargs["device_map"] = "auto" | |
| kwargs["quantization_config"] = BitsAndBytesConfig( | |
| load_in_4bit=True, | |
| bnb_4bit_compute_dtype=torch.bfloat16, | |
| bnb_4bit_use_double_quant=USE_DOUBLE_QUANT, | |
| bnb_4bit_quant_type="nf4", | |
| ) | |
| _log("[OK] 4-bit quantization enabled (NF4, double quant)") | |
| elif quant_mode == "8bit": | |
| kwargs["device_map"] = "auto" | |
| kwargs["quantization_config"] = BitsAndBytesConfig( | |
| load_in_8bit=True, | |
| ) | |
| _log("[OK] 8-bit quantization enabled") | |
| elif quant_mode == "cpu_fallback_8bit": | |
| kwargs["device_map"] = "auto" | |
| kwargs["quantization_config"] = BitsAndBytesConfig( | |
| load_in_8bit=True, | |
| ) | |
| _log("[OK] CPU fallback 8-bit for 1.5B model") | |
| else: | |
| kwargs["torch_dtype"] = torch.bfloat16 | |
| kwargs["device_map"] = "auto" | |
| _log(f"[OK] Loading {MODEL_ID} in bfloat16 (CPU-friendly)") | |
| return kwargs | |
| def get_pipe(): | |
| global _pipe, _current_model | |
| if _pipe is not None: | |
| return _pipe | |
| actual_model_id = MODEL_ID | |
| quant_mode = _select_quantization() | |
| # Handle CPU fallback for 7B → 1.5B | |
| if quant_mode == "cpu_fallback_8bit": | |
| actual_model_id = "Qwen/Qwen2.5-1.5B-Instruct" | |
| _log(f"[FALLBACK] loading {actual_model_id} instead of {MODEL_ID}") | |
| _log(f"Loading {actual_model_id} (quantization: {quant_mode})") | |
| _log(f" Hardware: GPU={_gpu_name()}, VRAM={_gpu_memory_gb():.1f}GB, CUDA={_has_gpu()}") | |
| try: | |
| kwargs = _build_model_kwargs(quant_mode) | |
| _pipe = pipeline( | |
| "text-generation", | |
| model=actual_model_id, | |
| **kwargs | |
| ) | |
| _current_model = actual_model_id | |
| _log("[DONE] Model loaded successfully!") | |
| except ImportError as e: | |
| if "bitsandbytes" in str(e): | |
| _log("[ERROR] bitsandbytes not installed. Falling back to CPU bfloat16.") | |
| _pipe = pipeline( | |
| "text-generation", | |
| model=actual_model_id, | |
| torch_dtype=torch.bfloat16, | |
| device_map="auto", | |
| trust_remote_code=True, | |
| ) | |
| _current_model = actual_model_id | |
| _log("[DONE] Model loaded with CPU fallback") | |
| else: | |
| _log(f"[ERROR] Model load failed: {e}") | |
| _pipe = None | |
| except Exception as e: | |
| _log(f"❌ Model load failed: {e}") | |
| _pipe = None | |
| return _pipe | |
| def generate_text(messages, temperature=0.3, max_new_tokens=2000): | |
| pipe = get_pipe() | |
| if pipe is None: | |
| return None | |
| outputs = pipe( | |
| messages, | |
| max_new_tokens=max_new_tokens, | |
| do_sample=True, | |
| temperature=temperature, | |
| top_p=0.9 | |
| ) | |
| return outputs[0]["generated_text"][-1]["content"] | |