Spaces:
Runtime error
Runtime error
| import torch | |
| from transformers import AutoModel, AutoModelForSequenceClassification, AutoProcessor, AutoConfig | |
| from functools import lru_cache | |
@lru_cache(maxsize=1)
def load_embed_model(model_path: str = "nvidia/llama-nemotron-embed-vl-1b-v2"):
    """Load the embedding model, reusing the result on repeated calls.

    The result is memoised with ``lru_cache(maxsize=1)``: calling again with
    the same argument returns the already-loaded model instead of reading the
    checkpoint a second time, and asking for a different ``model_path``
    replaces the cached entry so at most one model is kept alive by the cache.

    Args:
        model_path: Hub id or local path handed to ``from_pretrained``.

    Returns:
        A ``(model, device)`` tuple: the model moved to ``device`` and put in
        eval mode, and ``device`` as the string ``"cuda"`` or ``"cpu"``.
    """
    on_gpu = torch.cuda.is_available()
    device = "cuda" if on_gpu else "cpu"
    print(f"🔄 Loading embedding model on {device}...")

    # Force SDPA attention on the top-level config and, when the config
    # carries a nested ``llm_config``, on that one as well.
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    config._attn_implementation = "sdpa"
    if hasattr(config, "llm_config"):
        config.llm_config._attn_implementation = "sdpa"

    # The model is placed with an explicit ``.to(device)`` rather than
    # ``device_map="auto"``. bfloat16 is only used when CUDA is available;
    # on CPU the weights stay in float32.
    model = AutoModel.from_pretrained(
        model_path,
        config=config,
        torch_dtype=torch.bfloat16 if on_gpu else torch.float32,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    model = model.to(device).eval()

    print(f"✅ Embedding model loaded on {device}")
    return model, device
@lru_cache(maxsize=1)
def load_rerank_model(model_path: str = "nvidia/llama-nemotron-rerank-vl-1b-v2"):
    """Load the reranking model and its processor, reusing them on repeated calls.

    The result is memoised with ``lru_cache(maxsize=1)``: calling again with
    the same argument returns the already-loaded objects, and asking for a
    different ``model_path`` replaces the cached entry.

    Args:
        model_path: Hub id or local path handed to ``from_pretrained``.

    Returns:
        A ``(model, processor, device)`` tuple: the model moved to ``device``
        and put in eval mode, the matching processor, and ``device`` as the
        string ``"cuda"`` or ``"cpu"``.
    """
    on_gpu = torch.cuda.is_available()
    device = "cuda" if on_gpu else "cpu"
    print(f"🔄 Loading reranking model on {device}...")

    # The model is placed with an explicit ``.to(device)`` rather than
    # ``device_map="auto"``. bfloat16 is only used when CUDA is available.
    #
    # ``trust_remote_code`` must agree with the processor below, which is
    # loaded from the same repository with remote code enabled. Passing
    # ``False`` here makes ``from_pretrained`` refuse any checkpoint whose
    # architecture is defined by code shipped in the repo.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16 if on_gpu else torch.float32,
        trust_remote_code=True,
        attn_implementation="eager",
    )
    model = model.to(device).eval()

    # Processor options are forwarded unchanged to the repo's processor.
    processor = AutoProcessor.from_pretrained(
        model_path,
        trust_remote_code=True,
        max_input_tiles=6,
        use_thumbnail=True,
        rerank_max_length=2048,
    )

    print(f"✅ Reranking model loaded on {device}")
    return model, processor, device