# indicRAG/backend/src/embedding_generator.py
# "Pre-download models in Dockerfile, use cache at runtime"
# (commit d69e53e, author: hardkpentium101)
import os

from dotenv import load_dotenv

# Load .env first so an HF_HOME defined there is visible to the lookup below.
load_dotenv()

# The cache directory MUST be configured before `transformers` is imported:
# huggingface_hub resolves its HF_HOME-derived cache paths at import time,
# so setting the variable after the import (as the original code did) has
# no effect on where models are looked up at runtime.
os.environ["HF_HOME"] = os.getenv("HF_HOME", "/tmp/huggingface_cache")

from typing import List

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
class EmbeddingGenerator:
    """Mean-pooled multilingual sentence embeddings (CPU-only).

    Wraps a Hugging Face encoder (default:
    sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) loaded
    strictly from the local cache, and exposes single-text and batched
    embedding helpers. Pooling is attention-mask aware so padded batches
    produce the same vectors as single-text calls.
    """

    def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        """Load tokenizer and model from the local HF cache (no downloads).

        Args:
            model_name: Hugging Face model id; must already be present in
                the cache (models are pre-downloaded in the Dockerfile).
        """
        self.model_name = model_name
        # local_files_only=True: runtime must never hit the network.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
        self.model = AutoModel.from_pretrained(
            model_name,
            # NOTE(review): `dtype=` requires a recent transformers release;
            # older versions spell this `torch_dtype=` — confirm the pinned
            # transformers version supports it.
            dtype=torch.float32,
            device_map="cpu",
            low_cpu_mem_usage=False,
            local_files_only=True,
        )
        self.model = self.model.to("cpu")
        self.model.eval()  # disable dropout etc. for deterministic inference

    def _encode(self, texts: List[str]) -> "torch.Tensor":
        """Tokenize, run the encoder once, and mask-aware mean-pool.

        Returns a (len(texts), hidden_size) float32 tensor.
        """
        inputs = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
            add_special_tokens=True,
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        hidden = outputs.last_hidden_state
        # Mean-pool over real tokens only: padding positions must not dilute
        # the average when sequences in a batch have different lengths.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against div-by-zero
        return summed / counts

    def get_embedding(self, text: str) -> List[float]:
        """Generate the embedding vector for a single text.

        Identical to the original behavior: a single sequence gets no
        padding, so the masked mean equals the plain token mean.
        """
        return self._encode([text])[0].cpu().numpy().tolist()

    def get_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for a batch of texts in ONE forward pass.

        The previous implementation ran one model call per text; a single
        padded batch is much faster, and the masked pooling keeps results
        consistent with get_embedding(). Returns [] for an empty input.
        """
        if not texts:
            return []
        return self._encode(texts).cpu().numpy().tolist()
def get_embedding_function():
    """Return a text -> embedding callable, reusing one model instance.

    The original constructed a fresh EmbeddingGenerator — reloading the
    full model weights — on every call. Cache a single instance on the
    function object instead; callers receive the same bound method.
    """
    embedder = getattr(get_embedding_function, "_embedder", None)
    if embedder is None:
        embedder = EmbeddingGenerator()
        get_embedding_function._embedder = embedder
    return embedder.get_embedding
if __name__ == "__main__":
    # Manual smoke test: embed a mixed English/Hindi sentence and show
    # the vector size plus a sample of its leading values.
    generator = EmbeddingGenerator()
    sample = "Hello world, नमस्ते दुनिया"
    vector = generator.get_embedding(sample)
    print(f"Embedding length: {len(vector)}")
    print(f"First 10 values: {vector[:10]}")