Spaces:

TurkishCodeMan
/

multimodal-recipe-rag

Runtime error

File size: 1,942 Bytes

3f8c153
 
 
 
 
 
 
 
 
7c0c1c8
 
3f8c153
7104a49
 
 
3f8c153
7104a49
 
3f8c153
 
7c0c1c8
b684560
7104a49
7c0c1c8
3f8c153
7c0c1c8
3f8c153
 
 
 
 
 
 
 
7c0c1c8
 
 
7104a49
3f8c153
7c0c1c8
4db9aa3
3f8c153
7c0c1c8
3f8c153
 
 
b684560
3f8c153
70ba15f
3f8c153
 
 
7c0c1c8
3f8c153

import torch
from transformers import AutoModel, AutoModelForSequenceClassification, AutoProcessor, AutoConfig
from functools import lru_cache

@lru_cache(maxsize=2)
def load_embed_model(model_path: str = "nvidia/llama-nemotron-embed-vl-1b-v2"):
    """Load the embedding model for ``model_path`` and memoise the result.

    The function is wrapped in ``lru_cache(maxsize=2)``, so repeated calls
    with the same argument reuse the already-loaded model instead of loading
    it again.

    Args:
        model_path: Model identifier or path handed to ``from_pretrained``.

    Returns:
        A ``(model, device)`` tuple: the model moved to ``device`` and put in
        eval mode, and ``device`` itself (the string ``"cuda"`` or ``"cpu"``).
    """
    has_cuda = torch.cuda.is_available()
    device = "cuda" if has_cuda else "cpu"
    # bfloat16 only when a GPU is present; otherwise fall back to float32.
    dtype = torch.bfloat16 if has_cuda else torch.float32

    print(f"🔄 Loading embedding model on {device}...")

    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    # Request SDPA attention on the top-level config and, when the config
    # carries a nested ``llm_config``, on that one as well.
    config._attn_implementation = "sdpa"
    if hasattr(config, 'llm_config'):
        nested_config = config.llm_config
        nested_config._attn_implementation = "sdpa"

    # The model is placed explicitly with ``.to(device)`` rather than via
    # ``device_map="auto"``.
    load_options = {
        "config": config,
        "torch_dtype": dtype,
        "trust_remote_code": True,
        "low_cpu_mem_usage": True,
    }
    model = AutoModel.from_pretrained(model_path, **load_options)
    model = model.to(device)
    model = model.eval()

    print(f"✅ Embedding model loaded on {device}")
    return model, device


@lru_cache(maxsize=2)
def load_rerank_model(model_path: str = "nvidia/llama-nemotron-rerank-vl-1b-v2"):
    """Load the reranking model and its processor, memoising the result.

    The function is wrapped in ``lru_cache(maxsize=2)``, so repeated calls
    with the same argument reuse the already-loaded objects.

    Args:
        model_path: Model identifier or path handed to ``from_pretrained``
            for both the model and the processor.

    Returns:
        A ``(model, processor, device)`` tuple: the sequence-classification
        model moved to ``device`` and put in eval mode, the matching
        processor, and ``device`` (the string ``"cuda"`` or ``"cpu"``).
    """
    has_cuda = torch.cuda.is_available()
    device = "cuda" if has_cuda else "cpu"
    # bfloat16 only when a GPU is present; otherwise fall back to float32.
    dtype = torch.bfloat16 if has_cuda else torch.float32

    print(f"🔄 Loading reranking model on {device}...")

    # The model is placed explicitly with ``.to(device)`` rather than via
    # ``device_map="auto"``.
    # NOTE(review): the model is loaded with trust_remote_code=False while the
    # processor below uses trust_remote_code=True — confirm this is intended.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        torch_dtype=dtype,
        trust_remote_code=False,
        attn_implementation="eager",
    )
    model = model.to(device)
    model = model.eval()

    # Extra keyword options forwarded to the processor's ``from_pretrained``.
    processor_options = {
        "trust_remote_code": True,
        "max_input_tiles": 6,
        "use_thumbnail": True,
        "rerank_max_length": 2048,
    }
    processor = AutoProcessor.from_pretrained(model_path, **processor_options)

    print(f"✅ Reranking model loaded on {device}")
    return model, processor, device