import os
from typing import List

from dotenv import load_dotenv

# Load .env first so an HF_HOME value from the environment file is visible
# when we configure the cache directory below.
load_dotenv()

# NOTE(review): HF_HOME must be set *before* importing transformers /
# huggingface_hub — the hub reads it at import time. The original code set it
# after the transformers import, so the cache-directory override had no effect.
os.environ["HF_HOME"] = os.getenv("HF_HOME", "/tmp/huggingface_cache")

import numpy as np  # noqa: E402  (kept: may be used by other chunks of this file)
import torch  # noqa: E402
from transformers import AutoModel, AutoTokenizer  # noqa: E402


class EmbeddingGenerator:
    """Generate multilingual sentence embeddings on CPU in float32.

    Uses mean pooling over the model's last hidden state; pooling is masked
    by the attention mask so padding tokens never dilute the average.
    """

    def __init__(self, model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        """Load tokenizer and model from the local HF cache (no network: local_files_only=True)."""
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
        self.model = AutoModel.from_pretrained(
            model_name,
            # NOTE(review): recent transformers accept `dtype`; older releases
            # require `torch_dtype` — confirm against the pinned version.
            dtype=torch.float32,
            device_map="cpu",
            low_cpu_mem_usage=False,
            local_files_only=True,
        )
        self.model = self.model.to("cpu")
        self.model.eval()  # disable dropout so embeddings are deterministic

    def _encode(self, texts: List[str]) -> torch.Tensor:
        """Tokenize *texts* and return a (len(texts), hidden_size) tensor.

        Mean pooling is weighted by the attention mask: with a single text the
        mask is all ones, so this is identical to a plain mean; with a padded
        batch it correctly excludes padding positions.
        """
        inputs = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
            add_special_tokens=True,
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        hidden = outputs.last_hidden_state                     # (B, T, H)
        mask = inputs["attention_mask"].unsqueeze(-1).float()  # (B, T, 1)
        summed = (hidden * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid div-by-zero on all-pad rows
        return summed / counts

    def get_embedding(self, text: str) -> List[float]:
        """Generate an embedding vector for a single *text*.

        Returns a plain list of floats (hidden_size entries).
        """
        return self._encode([text])[0].cpu().numpy().tolist()

    def get_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for *texts* in one batched forward pass.

        The original implementation ran one model call per text; batching is
        equivalent because pooling is attention-mask-weighted. An empty input
        returns an empty list (a bare tokenizer call on [] would raise).
        """
        if not texts:
            return []
        return self._encode(texts).cpu().numpy().tolist()


def get_embedding_function():
    """Return a callable mapping text -> embedding list (vector-store style API)."""
    embedder = EmbeddingGenerator()
    return embedder.get_embedding


if __name__ == "__main__":
    embed_gen = EmbeddingGenerator()
    test_text = "Hello world, नमस्ते दुनिया"
    embedding = embed_gen.get_embedding(test_text)
    print(f"Embedding length: {len(embedding)}")
    print(f"First 10 values: {embedding[:10]}")