| | import os |
| | import torch |
| | import json |
| | import argparse |
| | import numpy as np |
| | import re |
| | from torch import nn |
| | from torch.nn import functional as F |
| | import sentencepiece as spm |
| | import math |
| | from safetensors.torch import save_file, load_file |
| | from tqdm import tqdm |
| | import faiss |
| | from langchain.text_splitter import RecursiveCharacterTextSplitter |
| | from langchain.vectorstores import FAISS as LangchainFAISS |
| | from langchain.docstore.document import Document |
| | from langchain.embeddings.base import Embeddings |
| | from typing import List, Dict, Any, Optional, Callable |
| | from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig |
| | import gc |
| | import warnings |
| |
|
| | |
# Suppress UserWarnings whose message mentions missing tied token embeddings.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    message=".*The model doesn't have tied token embeddings.*",
)
| |
|
| | |
class SentencePieceTokenizerWrapper:
    """Callable wrapper that gives a SentencePiece model a tokenizer-like API.

    Calling an instance encodes a string or an iterable of strings and
    returns a dict with ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, sp_model_path):
        """Load the SentencePiece model stored at ``sp_model_path``."""
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(sp_model_path)
        self.vocab_size = self.sp_model.GetPieceSize()

        # Special-token ids are fixed here, not read from the loaded model.
        # NOTE(review): assumes the model was trained with these ids - confirm.
        self.pad_token_id = 0
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.unk_token_id = 3

        self.pad_token = "<pad>"
        self.bos_token = "<s>"
        self.eos_token = "</s>"
        self.unk_token = "<unk>"
        self.mask_token = "<mask>"

    def __call__(self, text, padding=False, truncation=False, max_length=None, return_tensors=None):
        """Encode ``text`` into token ids plus an attention mask.

        Args:
            text: A single string or an iterable of strings.
            padding: Any truthy value enables right-padding with
                ``pad_token_id``. The target length is ``max_length`` when
                given, otherwise the longest sequence in the batch.
            truncation: When truthy and ``max_length`` is set, sequences are
                cut to ``max_length`` ids.
            max_length: Length used for truncation and/or padding.
            return_tensors: ``'pt'`` returns ``torch`` tensors with a leading
                batch dimension; any other value returns Python lists.

        Returns:
            Dict with ``input_ids`` and ``attention_mask``. For a single
            string without ``return_tensors='pt'`` the values are flat lists;
            otherwise they are batched.
        """
        is_single = isinstance(text, str)
        batch = [text] if is_single else list(text)

        encoded = [self.sp_model.EncodeAsIds(item) for item in batch]
        if truncation and max_length:
            encoded = [ids[:max_length] for ids in encoded]

        masks = [[1] * len(ids) for ids in encoded]

        if padding:
            # default=0 keeps an empty batch from crashing max().
            target_len = max_length if max_length else max((len(ids) for ids in encoded), default=0)
            # A negative pad count multiplies to an empty list, so sequences
            # already longer than target_len are left untouched.
            masks = [mask + [0] * (target_len - len(mask)) for mask in masks]
            encoded = [ids + [self.pad_token_id] * (target_len - len(ids)) for ids in encoded]

        if return_tensors == 'pt':
            return {
                'input_ids': torch.tensor(encoded, dtype=torch.long),
                'attention_mask': torch.tensor(masks, dtype=torch.long),
            }

        if is_single:
            return {'input_ids': encoded[0], 'attention_mask': masks[0]}
        return {'input_ids': encoded, 'attention_mask': masks}
| |
|
| | |
| |
|
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with a learned relative-position bias.

    Config keys read: ``num_attention_heads``, ``hidden_size``,
    ``attention_probs_dropout_prob`` and ``max_position_embeddings``.
    """

    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config["num_attention_heads"]
        self.attention_head_size = config["hidden_size"] // config["num_attention_heads"]
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Per-head projections, packed into one linear layer each.
        self.query = nn.Linear(config["hidden_size"], self.all_head_size)
        self.key = nn.Linear(config["hidden_size"], self.all_head_size)
        self.value = nn.Linear(config["hidden_size"], self.all_head_size)

        self.output = nn.Sequential(
            nn.Linear(self.all_head_size, config["hidden_size"]),
            nn.Dropout(config["attention_probs_dropout_prob"]),
        )

        # Stored as a plain float (not a module) so state_dict keys are
        # unchanged; it replaces a previously hard-coded p=0.1.
        self.attention_dropout_prob = config["attention_probs_dropout_prob"]

        # One bias per head for each relative offset in
        # [-(max_position_embeddings - 1), max_position_embeddings - 1].
        self.max_position_embeddings = config["max_position_embeddings"]
        self.relative_attention_bias = nn.Embedding(
            2 * config["max_position_embeddings"] - 1,
            config["num_attention_heads"],
        )

    def transpose_for_scores(self, x):
        """Split the last dim into (heads, head_size) and move heads to dim 1."""
        new_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        return x.view(*new_shape).permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None):
        """Apply self-attention to ``hidden_states``.

        Args:
            hidden_states: Tensor whose first two dims are (batch, seq) and
                whose last dim is ``hidden_size``.
            attention_mask: Optional additive mask; it is added to the raw
                scores before the softmax, so it must broadcast against
                (batch, heads, seq, seq).

        Returns:
            Tensor with the same leading dims and last dim ``hidden_size``.
        """
        seq_length = hidden_states.size(1)

        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        # Pairwise offsets i - j, shifted to be non-negative embedding indices.
        position_ids = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device)
        relative_position = position_ids.unsqueeze(1) - position_ids.unsqueeze(0)
        relative_position = relative_position + self.max_position_embeddings - 1
        relative_position = torch.clamp(relative_position, 0, 2 * self.max_position_embeddings - 2)

        # (seq, seq, heads) -> (1, heads, seq, seq) to broadcast over batch.
        rel_attn_bias = self.relative_attention_bias(relative_position)
        rel_attn_bias = rel_attn_bias.permute(2, 0, 1).unsqueeze(0)

        # The bias is added BEFORE the 1/sqrt(d) scaling, so it is scaled too.
        # Keep this order: changing it would change the function computed by
        # any existing weights.
        attention_scores = attention_scores + rel_attn_bias
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        attention_probs = F.softmax(attention_scores, dim=-1)
        attention_probs = F.dropout(
            attention_probs, p=self.attention_dropout_prob, training=self.training
        )

        context_layer = torch.matmul(attention_probs, value_layer)

        # (batch, heads, seq, head_size) -> (batch, seq, all_head_size)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_shape)

        return self.output(context_layer)
| |
|
class EnhancedTransformerLayer(nn.Module):
    """Pre-LayerNorm transformer block: attention then feed-forward, each residual."""

    def __init__(self, config):
        super().__init__()
        hidden = config["hidden_size"]
        inner = config["intermediate_size"]
        eps = config["layer_norm_eps"]
        drop = config["hidden_dropout_prob"]

        # Attention sub-layer: normalise first, then attend.
        self.attention_pre_norm = nn.LayerNorm(hidden, eps=eps)
        self.attention = MultiHeadAttention(config)

        # Feed-forward sub-layer: normalise, expand, GELU, project back.
        self.ffn_pre_norm = nn.LayerNorm(hidden, eps=eps)
        self.ffn = nn.Sequential(
            nn.Linear(hidden, inner),
            nn.GELU(),
            nn.Dropout(drop),
            nn.Linear(inner, hidden),
            nn.Dropout(drop),
        )

    def forward(self, hidden_states, attention_mask=None):
        """Return ``hidden_states`` after both residual sub-layers.

        ``attention_mask`` is passed unchanged to the attention module.
        """
        attended = self.attention(self.attention_pre_norm(hidden_states), attention_mask)
        after_attention = hidden_states + attended
        return after_attention + self.ffn(self.ffn_pre_norm(after_attention))
| |
|
class AdvancedTransformerModel(nn.Module):
    """Token + absolute-position embeddings followed by a stack of transformer layers."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        hidden = config["hidden_size"]

        self.word_embeddings = nn.Embedding(
            config["vocab_size"],
            hidden,
            padding_idx=config["pad_token_id"],
        )
        self.position_embeddings = nn.Embedding(config["max_position_embeddings"], hidden)
        self.embedding_dropout = nn.Dropout(config["hidden_dropout_prob"])

        self.layers = nn.ModuleList(
            [EnhancedTransformerLayer(config) for _ in range(config["num_hidden_layers"])]
        )
        self.final_layer_norm = nn.LayerNorm(hidden, eps=config["layer_norm_eps"])

    def forward(self, input_ids, attention_mask=None):
        """Encode ``input_ids`` of shape (batch, seq) into per-token hidden states.

        Args:
            input_ids: 2-D tensor of token ids.
            attention_mask: Optional tensor with 1 for tokens to attend to and
                0 for tokens to ignore; defaults to all ones.

        Returns:
            Layer-normalised hidden states from the last layer.
        """
        batch_size, seq_length = input_ids.size()
        device = input_ids.device

        positions = torch.arange(seq_length, dtype=torch.long, device=device)
        positions = positions.unsqueeze(0).expand(batch_size, -1)

        hidden_states = self.embedding_dropout(
            self.word_embeddings(input_ids) + self.position_embeddings(positions)
        )

        if attention_mask is None:
            attention_mask = torch.ones(input_ids.size(), device=device)

        # Convert the 1/0 mask to additive form: 0 where attended, -10000 where
        # masked, with singleton dims inserted to broadcast over heads/queries.
        additive_mask = (1.0 - attention_mask.unsqueeze(1).unsqueeze(2)) * -10000.0

        for block in self.layers:
            hidden_states = block(hidden_states, additive_mask)

        return self.final_layer_norm(hidden_states)
| |
|
class AdvancedPooling(nn.Module):
    """Reduce per-token embeddings to one L2-normalised vector per sequence.

    ``config["pooling_mode"]`` selects ``'cls'``, ``'max'``, ``'attention'``
    or ``'weighted'``; any other value falls back to masked mean pooling.
    """

    def __init__(self, config):
        super().__init__()
        self.pooling_mode = config["pooling_mode"]
        self.hidden_size = config["hidden_size"]

        # Only the learned modes own parameters.
        if self.pooling_mode == 'attention':
            self.attention_weights = nn.Linear(config["hidden_size"], 1)
        elif self.pooling_mode == 'weighted':
            self.weight_layer = nn.Linear(config["hidden_size"], 1)

    def forward(self, token_embeddings, attention_mask=None):
        """Pool ``token_embeddings`` (batch, seq, hidden) along the seq dim.

        ``attention_mask`` (batch, seq) marks real tokens with non-zero
        values; when omitted every token is treated as real.
        """
        if attention_mask is None:
            attention_mask = torch.ones_like(token_embeddings[:, :, 0])

        mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        mode = self.pooling_mode

        if mode == 'cls':
            # First token of every sequence.
            pooled = token_embeddings[:, 0]
        elif mode == 'max':
            # Push masked positions far below any real activation.
            filled = token_embeddings.masked_fill(mask_expanded == 0, -1e9)
            pooled = filled.max(dim=1).values
        elif mode == 'attention':
            scores = self.attention_weights(token_embeddings).squeeze(-1)
            scores = scores.masked_fill(attention_mask == 0, -1e9)
            probs = F.softmax(scores, dim=1).unsqueeze(-1)
            pooled = (token_embeddings * probs).sum(dim=1)
        elif mode == 'weighted':
            gates = torch.sigmoid(self.weight_layer(token_embeddings)).squeeze(-1)
            gates = gates * attention_mask
            # Renormalise so the gates of each sequence sum to one.
            gates = gates / gates.sum(dim=1, keepdim=True).clamp(min=1e-9)
            pooled = (token_embeddings * gates.unsqueeze(-1)).sum(dim=1)
        else:
            # Masked mean over real tokens.
            total = (token_embeddings * mask_expanded).sum(dim=1)
            count = mask_expanded.sum(1).clamp(min=1e-9)
            pooled = total / count

        return F.normalize(pooled, p=2, dim=1)
| |
|
class SentenceEmbeddingModel(nn.Module):
    """Transformer encoder + pooling (+ optional projection head) producing sentence vectors."""

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = AdvancedTransformerModel(config)
        self.pooling = AdvancedPooling(config)

        # The projection head exists only when a positive projection_dim is configured.
        self.use_projection = "projection_dim" in config and config["projection_dim"] > 0
        if self.use_projection:
            hidden = config["hidden_size"]
            out_dim = config["projection_dim"]
            self.projection = nn.Sequential(
                nn.Linear(hidden, hidden),
                nn.GELU(),
                nn.Linear(hidden, out_dim),
                nn.LayerNorm(out_dim, eps=config["layer_norm_eps"]),
            )

    def forward(self, input_ids, attention_mask=None):
        """Return one embedding per input sequence.

        The pooled vector is passed through the projection head and
        re-normalised when the head is enabled.
        """
        pooled = self.pooling(self.transformer(input_ids, attention_mask), attention_mask)
        if not self.use_projection:
            return pooled
        return F.normalize(self.projection(pooled), p=2, dim=1)
| |
|
def convert_to_safetensors(model_path, output_path):
    """Re-save a PyTorch checkpoint at ``model_path`` as safetensors at ``output_path``.

    If the checkpoint contains a ``model_state_dict`` entry, that entry is
    saved; otherwise the whole loaded object is treated as the state dict.

    NOTE(review): ``weights_only=False`` lets ``torch.load`` unpickle arbitrary
    objects, so only run this on checkpoints from a trusted source.
    """
    print(f"Converting model from {model_path} to safetensors format...")

    try:
        checkpoint = torch.load(model_path, map_location="cpu", weights_only=False)
    except TypeError:
        # torch.load rejected the weights_only keyword; retry without it.
        print("Falling back to default torch.load behavior for older PyTorch versions")
        checkpoint = torch.load(model_path, map_location="cpu")
    else:
        print("Successfully loaded checkpoint with weights_only=False")

    is_wrapped = "model_state_dict" in checkpoint
    state_dict = checkpoint["model_state_dict"] if is_wrapped else checkpoint
    print(
        "Extracted model_state_dict from checkpoint"
        if is_wrapped
        else "Using entire checkpoint as state_dict"
    )

    save_file(state_dict, output_path)
    print(f"Model converted and saved to {output_path}")
| |
|
def load_model_and_tokenizer(model_dir, tokenizer_dir="/home/ubuntu/hindi_tokenizer"):
    """Build the embedding model and its tokenizer from files on disk.

    Reads ``config.json`` from ``model_dir``, looks for ``tokenizer.model``
    first in ``tokenizer_dir`` and then in ``model_dir``, and loads weights
    from ``embedding_model.safetensors`` (converting ``embedding_model.pt``
    first when the safetensors file is absent).

    Returns:
        ``(model, tokenizer, config)`` with the model switched to eval mode.

    Raises:
        FileNotFoundError: If no tokenizer model or no weight file is found.
    """

    def _preview(keys):
        # Show at most ten keys, with an ellipsis when more exist.
        return f"{keys[:10]}{'...' if len(keys) > 10 else ''}"

    with open(os.path.join(model_dir, "config.json"), "r") as f:
        config = json.load(f)

    candidates = [os.path.join(folder, "tokenizer.model") for folder in (tokenizer_dir, model_dir)]
    tokenizer_path = next((path for path in candidates if os.path.exists(path)), None)
    if tokenizer_path is None:
        raise FileNotFoundError(f"Could not find tokenizer model at {candidates[-1]}")

    tokenizer = SentencePieceTokenizerWrapper(tokenizer_path)
    print(f"Loaded tokenizer from {tokenizer_path} with vocabulary size: {tokenizer.vocab_size}")

    safetensors_path = os.path.join(model_dir, "embedding_model.safetensors")
    if not os.path.exists(safetensors_path):
        print(f"Safetensors model not found at {safetensors_path}, converting from PyTorch checkpoint...")
        pytorch_path = os.path.join(model_dir, "embedding_model.pt")
        if not os.path.exists(pytorch_path):
            raise FileNotFoundError(f"Could not find PyTorch model at {pytorch_path}")
        convert_to_safetensors(pytorch_path, safetensors_path)

    state_dict = load_file(safetensors_path)
    model = SentenceEmbeddingModel(config)

    # Best effort: a failed load is reported and the model keeps its random
    # initial weights instead of aborting.
    try:
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
        print(f"Loaded model with missing keys: {_preview(missing_keys)}")
        print(f"Unexpected keys: {_preview(unexpected_keys)}")
    except Exception as e:
        print(f"Error loading state dict: {e}")
        print("Model will be initialized with random weights")

    model.eval()
    return model, tokenizer, config
| |
|
| | |
class HindiSentenceEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around the custom sentence embedding model."""

    def __init__(self, model, tokenizer, device="cuda", batch_size=32, max_length=128):
        """Store the model, tokenizer and inference settings.

        Args:
            model: Callable as ``model(input_ids, attention_mask)`` returning
                one vector per input row.
            tokenizer: Callable returning ``input_ids`` / ``attention_mask``
                tensors when invoked with ``return_tensors="pt"``.
            device: Device the tokenized inputs are moved to. The model itself
                is not moved here; the caller must already have placed it.
            batch_size: Number of texts embedded per forward pass.
            max_length: Token length every text is truncated/padded to.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.batch_size = batch_size
        self.max_length = max_length

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` in batches and return one float list per text.

        An empty ``texts`` yields an empty list.
        """
        embeddings: List[List[float]] = []

        with torch.no_grad():
            for start in range(0, len(texts), self.batch_size):
                batch = texts[start:start + self.batch_size]

                inputs = self.tokenizer(
                    batch,
                    padding="max_length",
                    truncation=True,
                    max_length=self.max_length,
                    return_tensors="pt",
                )
                input_ids = inputs["input_ids"].to(self.device)
                attention_mask = inputs["attention_mask"].to(self.device)

                batch_embeddings = self.model(input_ids, attention_mask)

                # Extending a list (rather than stacking arrays at the end)
                # makes the zero-batch case return [] instead of raising.
                embeddings.extend(batch_embeddings.cpu().tolist())

        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string."""
        return self.embed_documents([text])[0]
| |
|
# Punctuation stripped from the query before keyword matching.
_QUERY_PUNCT_RE = re.compile(r'[?।॥!,.:]')
# Sentence terminators; the capturing group keeps each delimiter in the split.
_SENTENCE_SPLIT_RE = re.compile(r'([।॥!?.])')
# Common Hindi function words ignored when picking query keywords.
_HINDI_STOP_WORDS = frozenset([
    'और', 'का', 'के', 'को', 'में', 'से', 'है', 'हैं', 'था', 'थे',
    'की', 'कि', 'पर', 'एक', 'यह', 'वह', 'जो', 'ने', 'हो', 'कर',
])


def extract_relevant_sentences(text, query, window_size=2):
    """
    Extract the most relevant sentences from text based on query keywords

    Args:
        text: The full text content
        query: The user's query
        window_size: Number of sentences to include before and after matched sentence

    Returns:
        String containing the most relevant portion of the text. The whole
        text is returned when the query has no keywords after stop-word
        removal, when the text is shorter than 1000 characters, or when the
        extracted window is shorter than 50 characters. When no sentence
        contains a keyword, the text is returned truncated to 1000 characters
        (plus "...") if it is longer than that.
    """
    normalized_query = _QUERY_PUNCT_RE.sub('', query.strip().lower())
    query_terms = [word for word in normalized_query.split() if word not in _HINDI_STOP_WORDS]
    if not query_terms:
        return text

    # Splitting with a capturing group yields
    # [body0, delim0, body1, delim1, ..., tail]; re-attach each delimiter.
    parts = _SENTENCE_SPLIT_RE.split(text)
    complete_sentences = [body + delim for body, delim in zip(parts[0::2], parts[1::2])]

    # Keep text after the last delimiter (a final sentence with no terminal
    # punctuation); it used to be silently dropped.
    tail = parts[-1]
    if tail.strip():
        complete_sentences.append(tail)

    if len(complete_sentences) <= 1:
        # Fall back to the bare sentence bodies, without delimiters.
        complete_sentences = [body.strip() for body in parts[0::2] if body.strip()]

    if not complete_sentences:
        return text[:500] + "..."

    # Score = number of distinct query terms occurring as substrings.
    scores = [
        sum(1 for term in query_terms if term in sentence.lower())
        for sentence in complete_sentences
    ]
    best_match_idx = max(range(len(scores)), key=scores.__getitem__)

    if scores[best_match_idx] == 0:
        # No sentence mentions any term. A per-word "partial" pass used to
        # follow here, but a term contained in a word is also contained in
        # the sentence, so it could never find anything new.
        if len(text) > 1000:
            return text[:1000] + "..."
        return text

    start_idx = max(0, best_match_idx - window_size)
    end_idx = min(len(complete_sentences), best_match_idx + window_size + 1)
    relevant_text = ' '.join(complete_sentences[start_idx:end_idx])

    # Widen a very short excerpt by up to two sentences on each side.
    if len(relevant_text) < 100 and len(text) > len(relevant_text):
        if end_idx < len(complete_sentences):
            relevant_text += ' ' + ' '.join(complete_sentences[end_idx:end_idx + 2])
        if start_idx > 0:
            relevant_text = ' '.join(complete_sentences[max(0, start_idx - 2):start_idx]) + ' ' + relevant_text

    # Short documents (or a still-tiny excerpt) are returned whole.
    if len(relevant_text) < 50 or len(text) < 1000:
        return text

    return relevant_text
| |
|
| | |
def load_and_process_text_file(file_path, chunk_size=500, chunk_overlap=100):
    """
    Read a UTF-8 text file and pack its paragraphs into chunk Documents.

    Files no longer than ``2 * chunk_size`` characters become one Document.
    Otherwise the text is split on blank lines and paragraphs are greedily
    joined (with a blank line between them) until adding the next one would
    exceed ``chunk_size``. A single paragraph is never split, so a chunk may
    be longer than ``chunk_size``.

    NOTE: ``chunk_overlap`` is accepted but not used by this implementation.

    Returns a list of Documents whose metadata holds ``source`` and ``chunk_id``.
    """
    print(f"Loading and processing text file: {file_path}")

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    if len(content) <= chunk_size * 2:
        print("File content is small, keeping as a single chunk")
        return [Document(page_content=content, metadata={"source": file_path, "chunk_id": 0})]

    chunks = []
    pending = []       # paragraphs collected for the chunk being built
    pending_len = 0    # length of "\n\n".join(pending)

    for para in re.split(r'\n\s*\n', content):
        if not para.strip():
            continue

        if pending and pending_len + len(para) > chunk_size:
            # The next paragraph would overflow: close the chunk, start anew.
            chunks.append("\n\n".join(pending))
            pending = [para]
            pending_len = len(para)
        else:
            # The two-character separator only counts once a chunk is non-empty.
            pending_len += len(para) + (2 if pending else 0)
            pending.append(para)

    if pending:
        chunks.append("\n\n".join(pending))

    print(f"Split text into {len(chunks)} chunks")

    documents = []
    for chunk_id, chunk in enumerate(chunks):
        metadata = {"source": file_path, "chunk_id": chunk_id}
        documents.append(Document(page_content=chunk, metadata=metadata))
    return documents
| |
|
def create_vector_store(documents, embeddings, store_path=None):
    """
    Build a FAISS vector store from ``documents`` with ``embeddings``.

    When ``store_path`` is truthy the store is also saved there. The
    in-memory store is returned either way.
    """
    print("Creating FAISS vector store...")
    store = LangchainFAISS.from_documents(documents, embeddings)

    if not store_path:
        return store

    print(f"Saving vector store to {store_path}")
    store.save_local(store_path)
    return store
| |
|
def load_vector_store(store_path, embeddings):
    """
    Load a previously saved FAISS vector store from ``store_path``.

    NOTE(review): ``allow_dangerous_deserialization=True`` is passed to the
    loader, so only load indexes this tool created itself.
    """
    print(f"Loading vector store from {store_path}")
    store = LangchainFAISS.load_local(
        store_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    return store
| |
|
def perform_similarity_search(vector_store, query, k=6):
    """
    Return the ``k`` nearest ``(Document, score)`` pairs for ``query``.
    """
    print(f"Searching for: {query}")
    scored_hits = vector_store.similarity_search_with_score(query, k=k)
    return scored_hits
| |
|
| | |
def load_llama_model(model_name="unsloth/Llama-3.2-1B-Instruct", device="cuda"):
    """
    Load a causal LM and its tokenizer for answer generation.

    On CUDA the model is loaded 4-bit quantised (NF4, double quantisation,
    float16 compute) with ``device_map="auto"``. If CUDA is requested but not
    available, or another device is given, the unquantised model is loaded
    and moved to that device.

    Returns ``(model, tokenizer)``.
    """
    print(f"Loading LLM: {model_name}")

    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA not available, falling back to CPU")
        device = "cpu"

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if device != "cuda":
        model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    else:
        quantization = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quantization,
        )

    print("Successfully loaded model")
    return model, tokenizer
| |
|
| | |
| |
|
def combine_top_results(results, query, max_results=4, higher_is_better=False):
    """
    Combine the top search results into a single coherent context

    Args:
        results: List of (Document, score) tuples from retrieval
        query: Original user query
        max_results: Maximum number of results to combine
        higher_is_better: Set to True only when the scores are similarities.
            The default (False) treats scores as distances, which is what the
            FAISS store built in this file returns: its default metric is L2
            distance, where a lower score means a closer match.

    Returns:
        String containing combined context from top results
    """
    # Sort best-first. Sorting distances in descending order would keep the
    # LEAST similar hits, so the direction follows the score semantics.
    ranked = sorted(results, key=lambda pair: pair[1], reverse=higher_is_better)[:max_results]

    combined_texts = []
    seen_content = set()

    for doc, _score in ranked:
        relevant_text = extract_relevant_sentences(doc.page_content, query, window_size=3)

        # Different chunks can collapse to the same excerpt; keep it once.
        if relevant_text in seen_content:
            continue
        seen_content.add(relevant_text)

        source_name = os.path.basename(doc.metadata.get("source", "unknown"))
        combined_texts.append(f"{relevant_text} [Source: {source_name}]")

    return "\n\n".join(combined_texts)
| |
|
def setup_enhanced_qa_system(model, tokenizer, vector_store, k=6, max_results=4):
    """
    Set up an enhanced QA system using the model and retriever with result combination

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        vector_store: Store providing ``similarity_search_with_score``.
        k: Number of chunks retrieved per query.
        max_results: Number of retrieved chunks merged into the prompt context.

    Returns:
        A function ``generate_enhanced_answer(query)`` returning
        ``(answer, combined_context)``. If generation raises, it returns
        ``(error_message, None)`` instead.
    """

    def generate_enhanced_answer(query):
        docs = vector_store.similarity_search_with_score(query, k=k)
        combined_context = combine_top_results(docs, query, max_results=max_results)

        prompt = (
            "\n"
            "आपको निम्नलिखित संदर्भ से जानकारी के आधार पर एक प्रश्न का उत्तर देना है।\n"
            "यदि आप उत्तर नहीं जानते हैं, तो बस \"मुझे नहीं पता\" कहें। अपने उत्तर में सभी प्रासंगिक जानकारी का उपयोग करें।\n"
            "\n"
            "संदर्भ:\n"
            f"{combined_context}\n"
            "\n"
            f"प्रश्न: {query}\n"
            "\n"
            "उत्तर:\n"
        )

        # Move every tensor produced by the tokenizer onto the model's device.
        encoded = tokenizer(prompt, return_tensors="pt")
        inputs = {
            name: value.to(model.device) if hasattr(value, "to") and callable(value.to) else value
            for name, value in encoded.items()
        }

        with torch.no_grad():
            try:
                outputs = model.generate(
                    input_ids=inputs["input_ids"],
                    # Pass the mask explicitly instead of leaving generate()
                    # to infer it from the input ids.
                    attention_mask=inputs.get("attention_mask"),
                    max_new_tokens=512,
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=True,
                )
            except Exception as e:
                return f"Error generating response: {str(e)}", None

        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # The decoded text includes the prompt; the answer is whatever follows
        # the last answer marker.
        answer = full_response.split("उत्तर:")[-1].strip()

        return answer, combined_context

    return generate_enhanced_answer
| |
|
| | |
def index_text_files(model, tokenizer, data_dir, output_dir, device="cuda", chunk_size=500):
    """
    Index text files from a directory and create a FAISS vector store

    Every ``.txt`` file directly inside ``data_dir`` is chunked. If fewer than
    10 chunks result and ``chunk_size`` is still above 50, the chunk size is
    halved and chunking is repeated. The store is saved under
    ``output_dir/faiss_index``.

    Returns:
        ``(vector_store, embeddings)``

    Raises:
        FileNotFoundError: If ``data_dir`` contains no ``.txt`` files. Without
            this check an empty corpus only fails later, inside vector-store
            creation.
    """
    embeddings = HindiSentenceEmbeddings(model, tokenizer, device=device)
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so chunk order does not depend on directory listing order.
    text_files = sorted(
        os.path.join(data_dir, name) for name in os.listdir(data_dir) if name.endswith('.txt')
    )
    if not text_files:
        raise FileNotFoundError(f"No .txt files found in {data_dir}")

    while True:
        print(f"Indexing text files from {data_dir} with chunk size ({chunk_size}) for fine-grained retrieval")
        print(f"Found {len(text_files)} text files")

        all_documents = []
        for file_path in text_files:
            all_documents.extend(load_and_process_text_file(file_path, chunk_size=chunk_size))

        print(f"Total documents: {len(all_documents)}")

        if len(all_documents) >= 10 or chunk_size <= 50:
            break

        print("Not enough chunks created. Reducing chunk size and trying again...")
        chunk_size //= 2

    vector_store_path = os.path.join(output_dir, "faiss_index")
    vector_store = create_vector_store(all_documents, embeddings, vector_store_path)

    return vector_store, embeddings
| |
|
def query_text_corpus(model, tokenizer, vector_store_path, query, k=6, device="cuda"):
    """
    Load the saved vector store and run a similarity search for ``query``.

    Returns ``(results, vector_store)`` where ``results`` is the list of
    ``(Document, score)`` pairs for the ``k`` nearest chunks.
    """
    embedder = HindiSentenceEmbeddings(model, tokenizer, device=device)
    store = load_vector_store(vector_store_path, embedder)
    hits = perform_similarity_search(store, query, k=k)
    return hits, store
| |
|
def _print_raw_results(results):
    """Print source, chunk id and a 200-character preview for each (doc, score) hit."""
    print("\nRaw Search Results:")
    for i, (doc, score) in enumerate(results):
        print(f"\nResult {i+1} (Score: {score:.4f}):")
        print(f"Source: {doc.metadata['source']}, Chunk: {doc.metadata['chunk_id']}")
        print(f"Content: {doc.page_content[:200]}...")


def _answer_query(query, args, qa_generator, vector_store):
    """Run one query through retrieval + LLM and print what the CLI flags request."""
    if args.show_raw_results:
        # Search the store that is already in memory; reloading the index
        # from disk for every query is unnecessary.
        _print_raw_results(perform_similarity_search(vector_store, query, k=args.top_k))

    print(f"\nProcessing query: {query}")
    answer, context = qa_generator(query)

    if args.show_context:
        print("\nCombined Context:")
        print(context)

    print("\nEnhanced LLM Answer:")
    print(answer)


def main():
    """Command-line entry point: optionally index a corpus, then answer queries.

    Loads the embedding model and the LLM, builds the FAISS index (``--index``
    or ``--reindex``) or loads an existing one, then answers ``--query`` and/or
    runs an interactive prompt loop (``--interactive``).
    """
    parser = argparse.ArgumentParser(description="Hindi RAG System with Combined Results")
    parser.add_argument("--model_dir", type=str, default="/home/ubuntu/output/hindi-embeddings-custom-tokenizer/final",
                        help="Directory containing the model and tokenizer")
    parser.add_argument("--tokenizer_dir", type=str, default="/home/ubuntu/hindi_tokenizer",
                        help="Directory containing the tokenizer")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device to run inference on ('cuda' or 'cpu')")
    parser.add_argument("--index", action="store_true",
                        help="Index text files from data directory")
    parser.add_argument("--query", type=str, default=None,
                        help="Query to search in the indexed corpus")
    parser.add_argument("--data_dir", type=str, default="./data",
                        help="Directory containing text files for indexing")
    parser.add_argument("--output_dir", type=str, default="./output",
                        help="Directory to save the indexed vector store")
    parser.add_argument("--top_k", type=int, default=6,
                        help="Number of top results to return")
    parser.add_argument("--chunk_size", type=int, default=500,
                        help="Size of text chunks for indexing")
    parser.add_argument("--interactive", action="store_true",
                        help="Run in interactive mode for querying")
    parser.add_argument("--reindex", action="store_true",
                        help="Force reindexing even if index exists")
    parser.add_argument("--llm_name", type=str, default="unsloth/Llama-3.2-1B-Instruct",
                        help="HuggingFace model name for the LLM")
    parser.add_argument("--show_context", action="store_true",
                        help="Show the combined context sent to the LLM")
    parser.add_argument("--show_raw_results", action="store_true",
                        help="Show the raw search results before combination")
    args = parser.parse_args()

    embed_model, embed_tokenizer, _config = load_model_and_tokenizer(args.model_dir, args.tokenizer_dir)
    embed_model = embed_model.to(args.device)

    vector_store_path = os.path.join(args.output_dir, "faiss_index")

    try:
        llm_model, llm_tokenizer = load_llama_model(args.llm_name, args.device)
        print("LLM loaded successfully for QA")
    except Exception as e:
        print(f"Error loading LLM: {e}")
        print("Cannot proceed without LLM for this combined results approach")
        return

    if args.index or args.reindex:
        # Reuse the store that was just built instead of reloading it from disk.
        vector_store, _ = index_text_files(
            embed_model, embed_tokenizer, args.data_dir, args.output_dir, args.device, args.chunk_size
        )
        print(f"Indexing complete. Vector store saved to {vector_store_path}")
    else:
        if not os.path.isdir(vector_store_path):
            print(f"No vector store found at {vector_store_path}. Run with --index first.")
            return
        embeddings = HindiSentenceEmbeddings(embed_model, embed_tokenizer, device=args.device)
        vector_store = load_vector_store(vector_store_path, embeddings)

    qa_generator = setup_enhanced_qa_system(llm_model, llm_tokenizer, vector_store)

    if args.query:
        _answer_query(args.query, args, qa_generator, vector_store)

    if args.interactive:
        print("\nInteractive mode. Enter queries (or type 'quit' to exit).")

        while True:
            print("\nEnter query:")
            try:
                query = input()
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or Ctrl-C ends the session instead of crashing.
                break

            if not query.strip():
                continue

            if query.strip().lower() == 'quit':
                break

            _answer_query(query, args, qa_generator, vector_store)

            # Release cached GPU memory between queries.
            if args.device == "cuda":
                gc.collect()
                torch.cuda.empty_cache()
| |
|
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()