import os
from typing import List

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()


class HindiEmbeddingGenerator:
    def __init__(self, model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        """
        Initialize an embedding generator for Hindi text,
        using a multilingual model that supports Hindi.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Load the model with explicit CPU placement to avoid meta tensor issues
        self.model = AutoModel.from_pretrained(
            model_name,
            torch_dtype=torch.float32,  # keep a consistent dtype
            device_map="cpu",           # explicit CPU placement avoids the meta device
            low_cpu_mem_usage=False,    # avoid lazy (meta) tensor initialization
        )
        # If the model still landed on the meta device, move it properly.
        # Meta tensors carry no data, so to_empty() must be used to allocate
        # real storage; a plain to() call raises a RuntimeError.
        try:
            if next(self.model.parameters()).device.type == "meta":
                self.model = self.model.to_empty(device="cpu")
            else:
                self.model = self.model.to("cpu")
        except RuntimeError as e:
            if "Cannot copy out of meta tensor" in str(e):
                self.model = self.model.to_empty(device="cpu")
            else:
                raise
        self.model.eval()  # inference only: disable dropout
        # Initialize the OpenAI client for generation
        self.openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    def get_embedding(self, text: str) -> List[float]:
        """
        Generate an embedding for Hindi text using the multilingual model.
        """
        inputs = self.tokenizer(
            text, return_tensors="pt", padding=True, truncation=True, max_length=512
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Mean-pool the token embeddings into a sentence embedding; move the
        # tensor to CPU before converting to numpy
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
        return embeddings.tolist()

    def get_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for a batch of texts (embedded one at a time).
        """
        return [self.get_embedding(text) for text in texts]

    def generate_response(self, prompt: str, context: str = "") -> str:
        """
        Generate a response with OpenAI using the provided context.
        """
        full_prompt = (
            f"Context: {context}\n\nQuestion: {prompt}\n\n"
            "Please provide a helpful response in Hindi if possible, or in English."
        )
        response = self.openai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a helpful assistant for Hindi literature. "
                        "Respond appropriately based on the context provided."
                    ),
                },
                {"role": "user", "content": full_prompt},
            ],
            max_tokens=500,
            temperature=0.7,
        )
        return response.choices[0].message.content


# Alternative implementation using OpenAI embeddings directly
class OpenAIEmbeddingGenerator:
    def __init__(self):
        """
        Initialize the OpenAI embedding generator.
        """
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.model = "text-embedding-ada-002"

    def get_embedding(self, text: str) -> List[float]:
        """
        Get an embedding from OpenAI.
        """
        response = self.client.embeddings.create(input=text, model=self.model)
        return response.data[0].embedding

    def get_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
        """
        Get embeddings for a batch of texts.
        """
        # The OpenAI API limits batch size, so process the texts in chunks
        embeddings = []
        chunk_size = 20  # conservative batch size
        for i in range(0, len(texts), chunk_size):
            chunk = texts[i:i + chunk_size]
            response = self.client.embeddings.create(input=chunk, model=self.model)
            embeddings.extend(item.embedding for item in response.data)
        return embeddings


# Choose which embedding generator to use.
# For Hindi text, the multilingual transformer model is recommended.
def get_embedding_function():
    """
    Return the appropriate embedding function.
    """
    # The multilingual model works better for Hindi than the OpenAI embeddings
    embedder = HindiEmbeddingGenerator()
    return embedder.get_embedding


if __name__ == "__main__":
    # Example usage
    embed_gen = HindiEmbeddingGenerator()

    # Test with Hindi text
    hindi_text = "हिंदी साहित्य भारत की समृद्ध साहित्यिक परंपरा का प्रतिनिधित्व करता है।"
    embedding = embed_gen.get_embedding(hindi_text)
    print(f"Embedding length: {len(embedding)}")
    print(f"First 10 values: {embedding[:10]}")
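
    # A minimal similarity sketch (not part of the original module): the second
    # sentence ("The tradition of Indian literature is very ancient.") and the
    # cosine_similarity helper below are illustrative assumptions, added to show
    # how two embeddings from this generator can be compared.
    second_text = "भारतीय साहित्य की परंपरा बहुत प्राचीन है।"
    second_embedding = embed_gen.get_embedding(second_text)

    def cosine_similarity(a: List[float], b: List[float]) -> float:
        """Cosine similarity between two embedding vectors."""
        a_arr, b_arr = np.asarray(a), np.asarray(b)
        return float(np.dot(a_arr, b_arr) / (np.linalg.norm(a_arr) * np.linalg.norm(b_arr)))

    print(f"Cosine similarity: {cosine_similarity(embedding, second_embedding):.4f}")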