"""Multilingual sentence-embedding utilities.

Wraps a Hugging Face MiniLM model (paraphrase-multilingual-MiniLM-L12-v2)
to produce fixed-size embedding vectors for single texts or batches,
running on CPU with a configurable HF cache directory.
"""
import os
from typing import List

import numpy as np
import torch
from dotenv import load_dotenv
from transformers import AutoModel, AutoTokenizer

# Load .env first so HF_HOME can be supplied via the environment file;
# fall back to a world-writable tmp cache when it is not set.
load_dotenv()
os.environ["HF_HOME"] = os.getenv("HF_HOME", "/tmp/huggingface_cache")
class EmbeddingGenerator:
    """Generate sentence embeddings for multilingual text.

    Loads the tokenizer and encoder from the local HF cache only
    (``local_files_only=True``) and runs inference on CPU in float32.
    """

    def __init__(self, model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        """Initialize embedding generator for multilingual text.

        Args:
            model_name: Hugging Face model id; must already be present in
                the local cache (no network download is attempted).
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
        # NOTE(review): older transformers versions spell this kwarg
        # `torch_dtype`; `dtype` is accepted on recent releases — confirm
        # against the pinned transformers version.
        self.model = AutoModel.from_pretrained(
            model_name,
            dtype=torch.float32,
            device_map="cpu",
            low_cpu_mem_usage=False,
            local_files_only=True,
        )
        self.model = self.model.to("cpu")
        self.model.eval()

    @staticmethod
    def _mean_pool(last_hidden_state, attention_mask):
        """Mask-aware mean pooling over the token dimension.

        Averages only real (attended) token positions so padding added for
        batching does not distort the embedding. For a single unpadded
        sequence the mask is all ones, making this identical to a plain
        ``mean(dim=1)``.
        """
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
        summed = (last_hidden_state * mask).sum(dim=1)
        # clamp avoids division by zero for a (degenerate) all-padding row
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return summed / counts

    def get_embedding(self, text: str) -> List[float]:
        """Generate the embedding vector for a single text."""
        return self.get_embeddings_batch([text])[0]

    def get_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for a batch of texts.

        Runs ONE forward pass over the padded batch (instead of one model
        call per text) and pools with the attention mask so padding tokens
        are excluded from the mean. Returns one vector per input text, in
        the same order.
        """
        if not texts:
            return []
        inputs = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
            add_special_tokens=True,
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        pooled = self._mean_pool(outputs.last_hidden_state, inputs["attention_mask"])
        return pooled.cpu().numpy().tolist()
def get_embedding_function():
    """Return a callable mapping a text string to its embedding vector.

    A fresh EmbeddingGenerator is constructed on each call; the returned
    bound method closes over that instance.
    """
    generator = EmbeddingGenerator()
    return generator.get_embedding
if __name__ == "__main__":
    # Smoke test: embed a mixed English/Hindi sentence and print the result.
    generator = EmbeddingGenerator()
    sample_text = "Hello world, नमस्ते दुनिया"
    vector = generator.get_embedding(sample_text)
    print(f"Embedding length: {len(vector)}")
    print(f"First 10 values: {vector[:10]}")