"""Cached loaders for the embedding and reranking models."""

from functools import lru_cache

import torch
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoProcessor,
)


@lru_cache(maxsize=2)
def load_embed_model(model_path: str = "nvidia/llama-nemotron-embed-vl-1b-v2"):
    """Load the embedding model, place it on the available device, and cache it.

    Args:
        model_path: Hub id or local path handed to ``from_pretrained``.

    Returns:
        A ``(model, device)`` tuple. ``model`` is in eval mode and already
        moved to ``device``; ``device`` is the string ``"cuda"`` or ``"cpu"``.

    Note:
        Results are memoized by ``lru_cache(maxsize=2)``. The cache key is the
        argument pattern as passed, so ``load_embed_model()`` and
        ``load_embed_model("<same default path>")`` are separate entries and
        would each load their own copy of the model. Call it one way.
    """
    # Evaluate CUDA availability once so device and dtype cannot disagree.
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    print(f"🔄 Loading embedding model on {device}...")

    # Request SDPA attention on the top-level config and, when the config
    # carries a nested ``llm_config``, on that one as well.
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    config._attn_implementation = "sdpa"
    if hasattr(config, "llm_config"):
        config.llm_config._attn_implementation = "sdpa"

    # The model is moved explicitly with ``.to(device)`` instead of relying on
    # ``device_map="auto"``.
    model = AutoModel.from_pretrained(
        model_path,
        config=config,
        torch_dtype=torch.bfloat16 if use_cuda else torch.float32,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    ).to(device).eval()

    print(f"✅ Embedding model loaded on {device}")
    return model, device


@lru_cache(maxsize=2)
def load_rerank_model(model_path: str = "nvidia/llama-nemotron-rerank-vl-1b-v2"):
    """Load the reranking model and its processor, and cache the result.

    Args:
        model_path: Hub id or local path handed to ``from_pretrained`` for
            both the model and the processor.

    Returns:
        A ``(model, processor, device)`` tuple. ``model`` is in eval mode and
        already moved to ``device``; ``device`` is the string ``"cuda"`` or
        ``"cpu"``.

    Note:
        Results are memoized by ``lru_cache(maxsize=2)``. The cache key is the
        argument pattern as passed, so calling with no argument and calling
        with the default path spelled out are separate cache entries.
    """
    # Evaluate CUDA availability once so device and dtype cannot disagree.
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    print(f"🔄 Loading reranking model on {device}...")

    # ``trust_remote_code`` matches the processor call below. The processor
    # already executes the repository's custom code, so refusing it for the
    # model adds no protection and only blocks checkpoints that ship their own
    # modelling code.
    # NOTE(review): confirm against the model card for the checkpoint in use.
    #
    # The model is moved explicitly with ``.to(device)`` instead of relying on
    # ``device_map="auto"``.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16 if use_cuda else torch.float32,
        trust_remote_code=True,
        attn_implementation="eager",
    ).to(device).eval()

    processor = AutoProcessor.from_pretrained(
        model_path,
        trust_remote_code=True,
        max_input_tiles=6,
        use_thumbnail=True,
        rerank_max_length=2048,
    )

    print(f"✅ Reranking model loaded on {device}")
    return model, processor, device