File size: 5,688 Bytes
2d51ea8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51dbf39
2d51ea8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
from typing import List
import numpy as np
from transformers import AutoTokenizer, AutoModel
from openai import OpenAI
from dotenv import load_dotenv
import torch

# Load environment variables (e.g. OPENAI_API_KEY read below) from a local .env file.
load_dotenv()

class HindiEmbeddingGenerator:
    """Embed Hindi text with a multilingual transformer and generate chat
    responses through the OpenAI API.

    Embeddings come from mask-aware mean pooling over the transformer's last
    hidden state; generation is delegated to ``gpt-3.5-turbo``.
    """

    def __init__(self, model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        """
        Initialize embedding generator for Hindi text.

        Args:
            model_name: Hugging Face model id; default is a multilingual
                sentence-transformer that supports Hindi.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Load the model pinned to CPU with a consistent dtype.
        # low_cpu_mem_usage=False avoids the lazy "meta tensor" loading path.
        self.model = AutoModel.from_pretrained(
            model_name,
            dtype=torch.float32,
            device_map="cpu",
            low_cpu_mem_usage=False,
        )

        # Defensive handling in case the model still landed on the meta
        # device: meta tensors carry no data and must be materialized with
        # to_empty() rather than a plain .to() copy.
        try:
            if next(self.model.parameters()).device.type == 'meta':
                self.model = self.model.to_empty(device='cpu')
            else:
                self.model = self.model.to('cpu')
        except RuntimeError as e:
            if "Cannot copy out of meta tensor" in str(e):
                self.model = self.model.to_empty(device='cpu')
            else:
                raise
        # Explicitly switch to inference mode so dropout is disabled and
        # repeated calls on the same text yield identical embeddings.
        self.model.eval()

        # Client used only by generate_response(); key comes from the env.
        self.openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    def get_embedding(self, text: str) -> List[float]:
        """
        Generate a sentence embedding for ``text``.

        Uses attention-mask-aware mean pooling so padding tokens (if any)
        do not dilute the average. For a single unpadded text this equals a
        plain mean over all tokens.

        Returns:
            The embedding as a plain list of floats.
        """
        inputs = self.tokenizer(
            text, return_tensors="pt", padding=True, truncation=True, max_length=512
        )

        with torch.no_grad():
            outputs = self.model(**inputs)

        hidden = outputs.last_hidden_state  # (1, seq_len, hidden_dim)
        # Zero out padding positions, then average over real tokens only.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against divide-by-zero
        embedding = (summed / counts).squeeze(0)

        # Move to CPU before numpy conversion to avoid device issues.
        return embedding.cpu().numpy().tolist()

    def get_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for a batch of texts (one model call per text,
        keeping results identical to repeated get_embedding calls).
        """
        return [self.get_embedding(text) for text in texts]

    def generate_response(self, prompt: str, context: str = "") -> str:
        """
        Generate an answer to ``prompt`` grounded in ``context`` via OpenAI.

        Returns:
            The assistant's reply text.
        """
        full_prompt = f"Context: {context}\n\nQuestion: {prompt}\n\nPlease provide a helpful response in Hindi if possible, or in English."

        response = self.openai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant for Hindi literature. Respond appropriately based on the context provided."},
                {"role": "user", "content": full_prompt}
            ],
            max_tokens=500,
            temperature=0.7
        )

        return response.choices[0].message.content

# Alternative implementation using OpenAI embeddings directly
class OpenAIEmbeddingGenerator:
    """Thin wrapper around the OpenAI embeddings endpoint.

    Alternative to the local transformer model: every vector is produced by
    a remote API call using the ``text-embedding-ada-002`` model.
    """

    def __init__(self):
        """Create the OpenAI client; the API key is read from the environment."""
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.model = "text-embedding-ada-002"

    def get_embedding(self, text: str) -> List[float]:
        """Embed a single string and return its vector."""
        result = self.client.embeddings.create(
            input=text,
            model=self.model,
        )
        return result.data[0].embedding

    def get_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
        """Embed many strings at once.

        The API limits request size, so inputs are sent in fixed-size
        chunks and the resulting vectors are concatenated in order.
        """
        chunk_size = 20  # Conservative batch size
        vectors: List[List[float]] = []

        for start in range(0, len(texts), chunk_size):
            batch = texts[start:start + chunk_size]
            result = self.client.embeddings.create(
                input=batch,
                model=self.model,
            )
            vectors.extend(item.embedding for item in result.data)

        return vectors

# Choose which embedding generator to use
# For Hindi text, the multilingual transformer model is recommended
def get_embedding_function():
    """Return a callable mapping a string to its embedding vector.

    Backed by the multilingual transformer model, which handles Hindi
    better than the OpenAI alternative.
    """
    generator = HindiEmbeddingGenerator()
    return generator.get_embedding

if __name__ == "__main__":
    # Smoke test: embed one Hindi sentence and show the vector's shape.
    generator = HindiEmbeddingGenerator()

    sample = "हिंदी साहित्य भारत के समृद्ध साहित्यिक परंपरा का प्रतिनिधित्व करता है।"
    vector = generator.get_embedding(sample)
    print(f"Embedding length: {len(vector)}")
    print(f"First 10 values: {vector[:10]}")