"""Ultra-fast ModernBERT-like embedder (CPU-compatible).

100% local — runs on x86 / Linux / Docker.
"""
from typing import List

import torch
from transformers import AutoTokenizer, AutoModel

print("Loading ModernBERT Embed (CPU version)...")

# You can use any small, fast embedding model here
# e.g. nomic-ai/nomic-embed-text-v1.5 or sentence-transformers/all-MiniLM-L6-v2
MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
model.eval()

# A freshly loaded model already lives on CPU, so only the CUDA branch
# needs an explicit transfer (the old `else: model.to("cpu")` was a no-op).
if torch.cuda.is_available():
    model = model.to("cuda")


class LocalEmbedder:
    """Minimal local embedder exposing the embed_documents / embed_query
    interface expected by llama_index-style callers.

    Embeddings are L2-normalized, so dot products equal cosine similarity.
    """

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Generate normalized embeddings for a batch of texts.

        Args:
            texts: Batch of input strings (truncated to 512 tokens each).

        Returns:
            One unit-norm embedding (list of floats) per input text.
        """
        with torch.no_grad():
            inputs = tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            ).to(model.device)
            outputs = model(**inputs)
            # Attention-mask-weighted mean pooling. A plain
            # last_hidden_state.mean(dim=1) also averages over [PAD]
            # positions, which skews embeddings for short texts whenever
            # the batch is padded to a longer sequence.
            hidden = outputs.last_hidden_state
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            summed = (hidden * mask).sum(dim=1)
            counts = mask.sum(dim=1).clamp(min=1e-9)  # guard divide-by-zero
            embeddings = summed / counts
            # Normalize for cosine similarity
            embeddings = embeddings / embeddings.norm(p=2, dim=1, keepdim=True)
            return embeddings.cpu().tolist()

    def embed_query(self, text: str) -> List[float]:
        """Generate a normalized embedding for a single query string.

        NOTE(review): the nomic-embed-text-v1.5 model card documents task
        prefixes ("search_query: " / "search_document: "); none are added
        here — confirm against the model card if retrieval quality matters.
        """
        return self.embed_documents([text])[0]


# Global instance (for llama_index)
embedder = LocalEmbedder()
print("ModernBERT Embed ready! (CPU-compatible)")