# # ====================================================================================================== # # CLAUDE ORIGINAL VERSION # # ====================================================================================================== # """ # Build FAISS Index from Scratch - FIXED VERSION # Creates faiss_index.pkl with proper serialization # Run this ONCE before starting the backend: # python build_faiss_index.py # Author: Banking RAG Chatbot # Date: November 2025 # """ # # Suppress warnings # import os # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' # import warnings # warnings.filterwarnings('ignore') # import pickle # import json # import torch # import torch.nn as nn # import torch.nn.functional as F # import faiss # import numpy as np # from pathlib import Path # from transformers import AutoTokenizer, AutoModel # from typing import List # # ============================================================================ # # CONFIGURATION - UPDATE THESE PATHS! # # ============================================================================ # # Where is your knowledge base JSONL file? # KB_JSONL_FILE = "data/final_knowledge_base.jsonl" # # Where is your trained retriever model? # RETRIEVER_MODEL_PATH = "app/models/best_retriever_model.pth" # # Where to save the output FAISS pickle? # OUTPUT_PKL_FILE = "app/models/faiss_index.pkl" # # Device (auto-detect GPU/CPU) # DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # # Batch size for encoding (reduce if you get OOM errors) # BATCH_SIZE = 32 # # ============================================================================ # # CUSTOM SENTENCE TRANSFORMER (Same as retriever.py) # # ============================================================================ # class CustomSentenceTransformer(nn.Module): # """ # Custom SentenceTransformer - exact copy from retriever.py # Uses e5-base-v2 with mean pooling and L2 normalization # """ # def __init__(self, model_name: str = "intfloat/e5-base-v2"): # super().__init__() # print(f" Loading base model: {model_name}...") # self.encoder = AutoModel.from_pretrained(model_name) # self.tokenizer = AutoTokenizer.from_pretrained(model_name) # self.config = self.encoder.config # print(f" āœ… Base model loaded") # def forward(self, input_ids, attention_mask): # """Forward pass through BERT encoder""" # outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask) # # Mean pooling # token_embeddings = outputs.last_hidden_state # input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() # embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp( # input_mask_expanded.sum(1), min=1e-9 # ) # # L2 normalize # embeddings = F.normalize(embeddings, p=2, dim=1) # return embeddings # def encode(self, sentences: List[str], batch_size: int = 32) -> np.ndarray: # """Encode sentences - same as training""" # self.eval() # if isinstance(sentences, str): # sentences = [sentences] # # Add 'query: ' prefix for e5-base-v2 # processed_sentences = [f"query: {s.strip()}" for s in sentences] # all_embeddings = [] # with torch.no_grad(): # for i in range(0, len(processed_sentences), batch_size): # batch_sentences = processed_sentences[i:i + batch_size] # # Tokenize # tokens = self.tokenizer( # batch_sentences, # truncation=True, # padding=True, # max_length=128, # return_tensors='pt' # ).to(self.encoder.device) # # Get embeddings # embeddings = self.forward(tokens['input_ids'], tokens['attention_mask']) # all_embeddings.append(embeddings.cpu().numpy()) # return np.vstack(all_embeddings) # # ============================================================================ # # RETRIEVER MODEL (Wrapper) # # ============================================================================ # class RetrieverModel: # """Wrapper for trained retriever model""" # def __init__(self, model_path: str, device: str = "cpu"): # print(f"\nšŸ¤– Loading retriever model...") # print(f" Device: {device}") # self.device = device # self.model = CustomSentenceTransformer("intfloat/e5-base-v2").to(device) # # Load trained weights # print(f" Loading weights from: {model_path}") # try: # state_dict = torch.load(model_path, map_location=device) # self.model.load_state_dict(state_dict) # print(f" āœ… Trained weights loaded") # except Exception as e: # print(f" āš ļø Warning: Could not load trained weights: {e}") # print(f" Using base e5-base-v2 model instead") # self.model.eval() # def encode_documents(self, documents: List[str], batch_size: int = 32) -> np.ndarray: # """Encode documents""" # return self.model.encode(documents, batch_size=batch_size) # # ============================================================================ # # MAIN: BUILD FAISS INDEX # # ============================================================================ # def build_faiss_index(): # """Main function to build FAISS index from scratch""" # print("=" * 80) # print("šŸ—ļø BUILDING FAISS INDEX FROM SCRATCH") # print("=" * 80) # # ======================================================================== # # STEP 1: LOAD KNOWLEDGE BASE # # ======================================================================== # print(f"\nšŸ“– STEP 1: Loading knowledge base...") # print(f" File: {KB_JSONL_FILE}") # if not os.path.exists(KB_JSONL_FILE): # print(f" āŒ ERROR: File not found!") # print(f" Please copy your knowledge base to: {KB_JSONL_FILE}") # return False # kb_data = [] # with open(KB_JSONL_FILE, 'r', encoding='utf-8') as f: # for line_num, line in enumerate(f, 1): # try: # kb_data.append(json.loads(line)) # except json.JSONDecodeError as e: # print(f" āš ļø Warning: Skipping invalid JSON on line {line_num}: {e}") # print(f" āœ… Loaded {len(kb_data)} documents") # if len(kb_data) == 0: # print(f" āŒ ERROR: Knowledge base is empty!") # return False # # ======================================================================== # # STEP 2: PREPARE DOCUMENTS FOR ENCODING # # ======================================================================== # print(f"\nšŸ“ STEP 2: Preparing documents for encoding...") # documents = [] # for i, item in enumerate(kb_data): # # Combine instruction + response for embedding (same as training) # instruction = item.get('instruction', '') # response = item.get('response', '') # # Create combined text # if instruction and response: # text = f"{instruction} {response}" # elif instruction: # text = instruction # elif response: # text = response # else: # print(f" āš ļø Warning: Document {i} has no content, using placeholder") # text = "empty document" # documents.append(text) # print(f" āœ… Prepared {len(documents)} documents for encoding") # print(f" Average length: {sum(len(d) for d in documents) / len(documents):.1f} chars") # # ======================================================================== # # STEP 3: LOAD RETRIEVER AND ENCODE DOCUMENTS # # ======================================================================== # print(f"\nšŸ”® STEP 3: Encoding documents with trained retriever...") # if not os.path.exists(RETRIEVER_MODEL_PATH): # print(f" āŒ ERROR: Retriever model not found!") # print(f" Please copy your trained model to: {RETRIEVER_MODEL_PATH}") # return False # # Load retriever # retriever = RetrieverModel(RETRIEVER_MODEL_PATH, device=DEVICE) # # Encode all documents # print(f" Encoding {len(documents)} documents...") # print(f" Batch size: {BATCH_SIZE}") # print(f" This may take a few minutes... ā˜•") # try: # embeddings = retriever.encode_documents(documents, batch_size=BATCH_SIZE) # print(f" āœ… Encoded {embeddings.shape[0]} documents") # print(f" Embedding dimension: {embeddings.shape[1]}") # except Exception as e: # print(f" āŒ ERROR during encoding: {e}") # import traceback # traceback.print_exc() # return False # # ======================================================================== # # STEP 4: BUILD FAISS INDEX # # ======================================================================== # print(f"\nšŸ” STEP 4: Building FAISS index...") # dimension = embeddings.shape[1] # print(f" Dimension: {dimension}") # # Create FAISS index (Inner Product = Cosine similarity after normalization) # print(f" Creating IndexFlatIP...") # index = faiss.IndexFlatIP(dimension) # # Normalize embeddings for cosine similarity # print(f" Normalizing embeddings...") # faiss.normalize_L2(embeddings) # # Add to index # print(f" Adding {embeddings.shape[0]} vectors to FAISS index...") # index.add(embeddings.astype('float32')) # print(f" āœ… FAISS index built successfully") # print(f" Total vectors: {index.ntotal}") # # ======================================================================== # # STEP 5: SAVE WITH COMPATIBLE FORMAT # # ======================================================================== # print(f"\nšŸ’¾ STEP 5: Saving with compatible format...") # # Create models directory if it doesn't exist # os.makedirs(os.path.dirname(OUTPUT_PKL_FILE), exist_ok=True) # try: # # āœ… NEW FORMAT: Save as dictionary with version info # print(f" Creating compatible save format...") # save_data = { # 'format_version': 'v2', # Mark as new format # 'index_bytes': faiss.serialize_index(index), # FAISS bytes # 'kb_data': kb_data, # Knowledge base # 'dimension': dimension, # 'num_vectors': index.ntotal, # 'faiss_version': faiss.__version__ # } # print(f" Pickling dictionary...") # with open(OUTPUT_PKL_FILE, 'wb') as f: # pickle.dump(save_data, f, protocol=pickle.HIGHEST_PROTOCOL) # file_size_mb = Path(OUTPUT_PKL_FILE).stat().st_size / (1024 * 1024) # print(f" āœ… Saved: {OUTPUT_PKL_FILE}") # print(f" File size: {file_size_mb:.2f} MB") # print(f" Format: v2 (compatible)") # print(f" FAISS version used: {faiss.__version__}") # except Exception as e: # print(f" āŒ ERROR saving pickle: {e}") # import traceback # traceback.print_exc() # return False # # ======================================================================== # # STEP 6: VERIFY SAVED FILE # # ======================================================================== # print(f"\nāœ… STEP 6: Verifying saved file...") # try: # with open(OUTPUT_PKL_FILE, 'rb') as f: # loaded_data = pickle.load(f) # # Check format # if isinstance(loaded_data, dict) and loaded_data.get('format_version') == 'v2': # print(f" āœ… Format: v2 (compatible)") # # Deserialize FAISS index # loaded_index = faiss.deserialize_index(loaded_data['index_bytes']) # loaded_kb = loaded_data['kb_data'] # print(f" āœ… Verification successful") # print(f" Index vectors: {loaded_index.ntotal}") # print(f" KB documents: {len(loaded_kb)}") # print(f" Dimension: {loaded_data['dimension']}") # if loaded_index.ntotal != len(loaded_kb): # print(f" āš ļø WARNING: Size mismatch detected!") # else: # print(f" āš ļø WARNING: Unexpected format detected") # except Exception as e: # print(f" āŒ ERROR verifying file: {e}") # import traceback # traceback.print_exc() # return False # # ======================================================================== # # SUCCESS! # # ======================================================================== # print("\n" + "=" * 80) # print("šŸŽ‰ SUCCESS! FAISS INDEX BUILT AND SAVED") # print("=" * 80) # print(f"\nšŸ“Š Summary:") # print(f" Documents: {len(kb_data)}") # print(f" Vectors: {index.ntotal}") # print(f" Dimension: {dimension}") # print(f" File: {OUTPUT_PKL_FILE} ({file_size_mb:.2f} MB)") # print(f" Format: v2 (compatible with all FAISS versions)") # print(f" FAISS version: {faiss.__version__}") # print(f"\nšŸš€ Next steps:") # print(f" 1. Upload {OUTPUT_PKL_FILE} to HuggingFace Hub") # print(f" 2. Update your retriever loading code (see below)") # print(f" 3. Restart your backend") # print(f" 4. Test retrieval - should work now!") # print("\n" + "=" * 80) # print("šŸ“ IMPORTANT: Update your loading code to handle v2 format:") # print("=" * 80) # print(""" # # In your retriever.py or wherever you load the index: # with open('faiss_index.pkl', 'rb') as f: # data = pickle.load(f) # # Check format # if isinstance(data, dict) and data.get('format_version') == 'v2': # # NEW FORMAT (v2) # index = faiss.deserialize_index(data['index_bytes']) # kb_data = data['kb_data'] # print(f"āœ… Loaded v2 format: {index.ntotal} vectors") # else: # # OLD FORMAT (try to handle gracefully) # print("āš ļø Old format detected, please rebuild") # raise ValueError("Please rebuild FAISS index with new script") # """) # print("=" * 80 + "\n") # return True # # ============================================================================ # # RUN SCRIPT # # ============================================================================ # if __name__ == "__main__": # success = build_faiss_index() # if not success: # print("\n" + "=" * 80) # print("āŒ FAILED TO BUILD FAISS INDEX") # print("=" * 80) # print("\nPlease check:") # print("1. Knowledge base file exists: data/final_knowledge_base.jsonl") # print("2. Retriever model exists: app/models/best_retriever_model.pth") # print("3. You have enough RAM (embeddings need ~1GB for 10k docs)") # print("=" * 80 + "\n") # exit(1) # ====================================================================================================== # CLAUDE OPTIMIZED VERSION FOR FASTER ENCODING PROCESS # ====================================================================================================== """ Build FAISS Index from Scratch - OPTIMIZED VERSION ⚔ FASTER with GPU auto-detection and larger batches Run this ONCE before starting the backend: python build_faiss_index.py Optimizations: - Auto-detects GPU (10-50x faster than CPU) - Larger batch sizes (256 instead of 32) - Progress bars - Memory efficient processing Author: Banking RAG Chatbot Date: November 2025 """ # Suppress warnings import os os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' import warnings warnings.filterwarnings('ignore') import pickle import json import torch import torch.nn as nn import torch.nn.functional as F import faiss import numpy as np from pathlib import Path from transformers import AutoTokenizer, AutoModel from typing import List from tqdm import tqdm # Progress bars # ============================================================================ # CONFIGURATION - UPDATE THESE PATHS! # ============================================================================ KB_JSONL_FILE = "data/final_knowledge_base.jsonl" RETRIEVER_MODEL_PATH = "app/models/best_retriever_model.pth" OUTPUT_PKL_FILE = "app/models/faiss_index.pkl" # ⚔ AUTO-DETECT GPU (will use GPU if available, CPU otherwise) DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # ⚔ OPTIMIZED BATCH SIZE # - GPU: Use 256 (or even 512 if you have 16GB+ VRAM) # - CPU: Use 32 (to avoid RAM issues) BATCH_SIZE = 256 if DEVICE == "cuda" else 32 print(f"šŸ”§ Configuration:") print(f" Device: {DEVICE}") print(f" Batch size: {BATCH_SIZE}") if DEVICE == "cuda": print(f" GPU: {torch.cuda.get_device_name(0)}") print(f" VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB") # ============================================================================ # CUSTOM SENTENCE TRANSFORMER # ============================================================================ class CustomSentenceTransformer(nn.Module): def __init__(self, model_name: str = "intfloat/e5-base-v2"): super().__init__() print(f" Loading base model: {model_name}...") self.encoder = AutoModel.from_pretrained(model_name) self.tokenizer = AutoTokenizer.from_pretrained(model_name) self.config = self.encoder.config print(f" āœ… Base model loaded") def forward(self, input_ids, attention_mask): outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask) token_embeddings = outputs.last_hidden_state input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp( input_mask_expanded.sum(1), min=1e-9 ) embeddings = F.normalize(embeddings, p=2, dim=1) return embeddings def encode(self, sentences: List[str], batch_size: int = 32) -> np.ndarray: """⚔ Optimized encoding with progress bar""" self.eval() if isinstance(sentences, str): sentences = [sentences] processed_sentences = [f"query: {s.strip()}" for s in sentences] all_embeddings = [] # ⚔ Progress bar num_batches = (len(processed_sentences) + batch_size - 1) // batch_size pbar = tqdm(total=len(processed_sentences), desc=" Encoding", unit="docs") with torch.no_grad(): for i in range(0, len(processed_sentences), batch_size): batch_sentences = processed_sentences[i:i + batch_size] tokens = self.tokenizer( batch_sentences, truncation=True, padding=True, max_length=128, return_tensors='pt' ).to(self.encoder.device) embeddings = self.forward(tokens['input_ids'], tokens['attention_mask']) all_embeddings.append(embeddings.cpu().numpy()) pbar.update(len(batch_sentences)) pbar.close() return np.vstack(all_embeddings) # ============================================================================ # RETRIEVER MODEL # ============================================================================ class RetrieverModel: def __init__(self, model_path: str, device: str = "cpu"): print(f"\nšŸ¤– Loading retriever model...") print(f" Device: {device}") self.device = device self.model = CustomSentenceTransformer("intfloat/e5-base-v2").to(device) print(f" Loading weights from: {model_path}") try: state_dict = torch.load(model_path, map_location=device) self.model.load_state_dict(state_dict) print(f" āœ… Trained weights loaded") except Exception as e: print(f" āš ļø Warning: Could not load trained weights: {e}") print(f" Using base e5-base-v2 model instead") self.model.eval() # ⚔ Mixed precision for faster inference on GPU if device == "cuda": self.model = self.model.half() # FP16 print(f" ⚔ Using FP16 (mixed precision) for faster encoding") def encode_documents(self, documents: List[str], batch_size: int = 32) -> np.ndarray: return self.model.encode(documents, batch_size=batch_size) # ============================================================================ # MAIN: BUILD FAISS INDEX # ============================================================================ def build_faiss_index(): print("=" * 80) print("šŸ—ļø BUILDING FAISS INDEX FROM SCRATCH (OPTIMIZED)") print("=" * 80) # ======================================================================== # STEP 1: LOAD KNOWLEDGE BASE # ======================================================================== print(f"\nšŸ“– STEP 1: Loading knowledge base...") print(f" File: {KB_JSONL_FILE}") if not os.path.exists(KB_JSONL_FILE): print(f" āŒ ERROR: File not found!") return False kb_data = [] with open(KB_JSONL_FILE, 'r', encoding='utf-8') as f: for line_num, line in enumerate(f, 1): try: kb_data.append(json.loads(line)) except json.JSONDecodeError as e: print(f" āš ļø Warning: Skipping invalid JSON on line {line_num}") print(f" āœ… Loaded {len(kb_data)} documents") if len(kb_data) == 0: print(f" āŒ ERROR: Knowledge base is empty!") return False # ======================================================================== # STEP 2: PREPARE DOCUMENTS # ======================================================================== print(f"\nšŸ“ STEP 2: Preparing documents for encoding...") documents = [] for i, item in enumerate(kb_data): instruction = item.get('instruction', '') response = item.get('response', '') if instruction and response: text = f"{instruction} {response}" elif instruction: text = instruction elif response: text = response else: text = "empty document" documents.append(text) print(f" āœ… Prepared {len(documents)} documents") avg_len = sum(len(d) for d in documents) / len(documents) print(f" Average length: {avg_len:.1f} chars") # ⚔ Estimate encoding time if DEVICE == "cuda": est_time = len(documents) / (BATCH_SIZE * 100) # ~100 docs/sec on GPU print(f" ⚔ Estimated time on GPU: {est_time:.1f} seconds") else: est_time = len(documents) / (BATCH_SIZE * 2) # ~2 docs/sec on CPU print(f" ā³ Estimated time on CPU: {est_time/60:.1f} minutes") # ======================================================================== # STEP 3: ENCODE DOCUMENTS # ======================================================================== print(f"\nšŸ”® STEP 3: Encoding documents with trained retriever...") if not os.path.exists(RETRIEVER_MODEL_PATH): print(f" āŒ ERROR: Retriever model not found!") return False import time start_time = time.time() retriever = RetrieverModel(RETRIEVER_MODEL_PATH, device=DEVICE) print(f" Encoding {len(documents)} documents...") print(f" Batch size: {BATCH_SIZE}") try: embeddings = retriever.encode_documents(documents, batch_size=BATCH_SIZE) elapsed = time.time() - start_time print(f" āœ… Encoded {embeddings.shape[0]} documents in {elapsed:.1f}s") print(f" Speed: {len(documents)/elapsed:.1f} docs/sec") print(f" Embedding dimension: {embeddings.shape[1]}") except Exception as e: print(f" āŒ ERROR during encoding: {e}") import traceback traceback.print_exc() return False # ======================================================================== # STEP 4: BUILD FAISS INDEX # ======================================================================== print(f"\nšŸ” STEP 4: Building FAISS index...") dimension = embeddings.shape[1] print(f" Dimension: {dimension}") print(f" Creating IndexFlatIP...") index = faiss.IndexFlatIP(dimension) print(f" Normalizing embeddings...") faiss.normalize_L2(embeddings) print(f" Adding {embeddings.shape[0]} vectors to FAISS index...") index.add(embeddings.astype('float32')) print(f" āœ… FAISS index built successfully") print(f" Total vectors: {index.ntotal}") # ======================================================================== # STEP 5: SAVE WITH V2 FORMAT # ======================================================================== print(f"\nšŸ’¾ STEP 5: Saving with v2 format (compatible)...") os.makedirs(os.path.dirname(OUTPUT_PKL_FILE), exist_ok=True) try: print(f" Creating v2 save format...") save_data = { 'format_version': 'v2', 'index_bytes': faiss.serialize_index(index), 'kb_data': kb_data, 'dimension': dimension, 'num_vectors': index.ntotal, 'faiss_version': faiss.__version__, 'device_used': DEVICE, 'build_time': elapsed } print(f" Pickling...") with open(OUTPUT_PKL_FILE, 'wb') as f: pickle.dump(save_data, f, protocol=pickle.HIGHEST_PROTOCOL) file_size_mb = Path(OUTPUT_PKL_FILE).stat().st_size / (1024 * 1024) print(f" āœ… Saved: {OUTPUT_PKL_FILE}") print(f" File size: {file_size_mb:.2f} MB") except Exception as e: print(f" āŒ ERROR saving: {e}") return False # ======================================================================== # STEP 6: VERIFY # ======================================================================== print(f"\nāœ… STEP 6: Verifying...") try: with open(OUTPUT_PKL_FILE, 'rb') as f: loaded = pickle.load(f) loaded_index = faiss.deserialize_index(loaded['index_bytes']) loaded_kb = loaded['kb_data'] print(f" āœ… Verification successful") print(f" Index vectors: {loaded_index.ntotal}") print(f" KB documents: {len(loaded_kb)}") except Exception as e: print(f" āŒ ERROR verifying: {e}") return False # ======================================================================== # SUCCESS! # ======================================================================== total_time = time.time() - start_time print("\n" + "=" * 80) print("šŸŽ‰ SUCCESS! FAISS INDEX BUILT AND SAVED") print("=" * 80) print(f"\nšŸ“Š Summary:") print(f" Documents: {len(kb_data)}") print(f" Vectors: {index.ntotal}") print(f" Dimension: {dimension}") print(f" File: {OUTPUT_PKL_FILE} ({file_size_mb:.2f} MB)") print(f" Format: v2 (compatible)") print(f" Device: {DEVICE}") print(f" Total time: {total_time:.1f}s ({total_time/60:.1f} min)") print(f" Encoding speed: {len(documents)/elapsed:.1f} docs/sec") print(f"\nšŸš€ Next steps:") print(f" 1. Upload {OUTPUT_PKL_FILE} to HuggingFace Hub") print(f" 2. Restart your backend") print(f" 3. Test - should work now!") print("=" * 80 + "\n") return True if __name__ == "__main__": success = build_faiss_index() if not success: print("\nāŒ FAILED TO BUILD FAISS INDEX") print("\nPlease check:") print("1. data/final_knowledge_base.jsonl exists") print("2. app/models/best_retriever_model.pth exists") print("3. You have enough RAM/VRAM") exit(1) # """ # Build FAISS Index from Scratch - COMPATIBLE VERSION # Creates faiss_index.pkl with proper serialization for version compatibility # Run this ONCE before starting the backend: # python build_faiss_index.py # Author: Banking RAG Chatbot # Date: November 2025 # """ # # Suppress warnings # import os # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' # import warnings # warnings.filterwarnings('ignore') # import pickle # import json # import torch # import torch.nn as nn # import torch.nn.functional as F # import faiss # import numpy as np # from pathlib import Path # from transformers import AutoTokenizer, AutoModel # from typing import List # # ============================================================================ # # CONFIGURATION - UPDATE THESE PATHS! # # ============================================================================ # # Where is your knowledge base JSONL file? # KB_JSONL_FILE = "data/final_knowledge_base.jsonl" # # Where is your trained retriever model? # RETRIEVER_MODEL_PATH = "app/models/best_retriever_model.pth" # # Where to save the output FAISS pickle? # OUTPUT_PKL_FILE = "app/models/faiss_index.pkl" # # Device (auto-detect GPU/CPU) # DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # # Batch size for encoding (reduce if you get OOM errors) # BATCH_SIZE = 32 # # ============================================================================ # # CUSTOM SENTENCE TRANSFORMER (Same as retriever.py) # # ============================================================================ # class CustomSentenceTransformer(nn.Module): # """ # Custom SentenceTransformer - exact copy from retriever.py # Uses e5-base-v2 with mean pooling and L2 normalization # """ # def __init__(self, model_name: str = "intfloat/e5-base-v2"): # super().__init__() # print(f" Loading base model: {model_name}...") # self.encoder = AutoModel.from_pretrained(model_name) # self.tokenizer = AutoTokenizer.from_pretrained(model_name) # self.config = self.encoder.config # print(f" āœ… Base model loaded") # def forward(self, input_ids, attention_mask): # """Forward pass through BERT encoder""" # outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask) # # Mean pooling # token_embeddings = outputs.last_hidden_state # input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() # embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp( # input_mask_expanded.sum(1), min=1e-9 # ) # # L2 normalize # embeddings = F.normalize(embeddings, p=2, dim=1) # return embeddings # def encode(self, sentences: List[str], batch_size: int = 32) -> np.ndarray: # """Encode sentences - same as training""" # self.eval() # if isinstance(sentences, str): # sentences = [sentences] # # Add 'query: ' prefix for e5-base-v2 # processed_sentences = [f"query: {s.strip()}" for s in sentences] # all_embeddings = [] # with torch.no_grad(): # for i in range(0, len(processed_sentences), batch_size): # batch_sentences = processed_sentences[i:i + batch_size] # # Tokenize # tokens = self.tokenizer( # batch_sentences, # truncation=True, # padding=True, # max_length=128, # return_tensors='pt' # ).to(self.encoder.device) # # Get embeddings # embeddings = self.forward(tokens['input_ids'], tokens['attention_mask']) # all_embeddings.append(embeddings.cpu().numpy()) # return np.vstack(all_embeddings) # # ============================================================================ # # RETRIEVER MODEL (Wrapper) # # ============================================================================ # class RetrieverModel: # """Wrapper for trained retriever model""" # def __init__(self, model_path: str, device: str = "cpu"): # print(f"\nšŸ¤– Loading retriever model...") # print(f" Device: {device}") # self.device = device # self.model = CustomSentenceTransformer("intfloat/e5-base-v2").to(device) # # Load trained weights # print(f" Loading weights from: {model_path}") # try: # state_dict = torch.load(model_path, map_location=device) # self.model.load_state_dict(state_dict) # print(f" āœ… Trained weights loaded") # except Exception as e: # print(f" āš ļø Warning: Could not load trained weights: {e}") # print(f" Using base e5-base-v2 model instead") # self.model.eval() # def encode_documents(self, documents: List[str], batch_size: int = 32) -> np.ndarray: # """Encode documents""" # return self.model.encode(documents, batch_size=batch_size) # # ============================================================================ # # MAIN: BUILD FAISS INDEX # # ============================================================================ # def build_faiss_index(): # """Main function to build FAISS index from scratch""" # print("=" * 80) # print("šŸ—ļø BUILDING FAISS INDEX FROM SCRATCH") # print("=" * 80) # # ======================================================================== # # STEP 1: LOAD KNOWLEDGE BASE # # ======================================================================== # print(f"\nšŸ“– STEP 1: Loading knowledge base...") # print(f" File: {KB_JSONL_FILE}") # if not os.path.exists(KB_JSONL_FILE): # print(f" āŒ ERROR: File not found!") # print(f" Please copy your knowledge base to: {KB_JSONL_FILE}") # return False # kb_data = [] # with open(KB_JSONL_FILE, 'r', encoding='utf-8') as f: # for line_num, line in enumerate(f, 1): # try: # kb_data.append(json.loads(line)) # except json.JSONDecodeError as e: # print(f" āš ļø Warning: Skipping invalid JSON on line {line_num}: {e}") # print(f" āœ… Loaded {len(kb_data)} documents") # if len(kb_data) == 0: # print(f" āŒ ERROR: Knowledge base is empty!") # return False # # ======================================================================== # # STEP 2: PREPARE DOCUMENTS FOR ENCODING # # ======================================================================== # print(f"\nšŸ“ STEP 2: Preparing documents for encoding...") # documents = [] # for i, item in enumerate(kb_data): # # Combine instruction + response for embedding (same as training) # instruction = item.get('instruction', '') # response = item.get('response', '') # # Create combined text # if instruction and response: # text = f"{instruction} {response}" # elif instruction: # text = instruction # elif response: # text = response # else: # print(f" āš ļø Warning: Document {i} has no content, using placeholder") # text = "empty document" # documents.append(text) # print(f" āœ… Prepared {len(documents)} documents for encoding") # print(f" Average length: {sum(len(d) for d in documents) / len(documents):.1f} chars") # # ======================================================================== # # STEP 3: LOAD RETRIEVER AND ENCODE DOCUMENTS # # ======================================================================== # print(f"\nšŸ”® STEP 3: Encoding documents with trained retriever...") # if not os.path.exists(RETRIEVER_MODEL_PATH): # print(f" āŒ ERROR: Retriever model not found!") # print(f" Please copy your trained model to: {RETRIEVER_MODEL_PATH}") # return False # # Load retriever # retriever = RetrieverModel(RETRIEVER_MODEL_PATH, device=DEVICE) # # Encode all documents # print(f" Encoding {len(documents)} documents...") # print(f" Batch size: {BATCH_SIZE}") # print(f" This may take a few minutes... ā˜•") # try: # embeddings = retriever.encode_documents(documents, batch_size=BATCH_SIZE) # print(f" āœ… Encoded {embeddings.shape[0]} documents") # print(f" Embedding dimension: {embeddings.shape[1]}") # except Exception as e: # print(f" āŒ ERROR during encoding: {e}") # import traceback # traceback.print_exc() # return False # # ======================================================================== # # STEP 4: BUILD FAISS INDEX WITH PROPER SERIALIZATION # # ======================================================================== # print(f"\nšŸ” STEP 4: Building FAISS index...") # dimension = embeddings.shape[1] # print(f" Dimension: {dimension}") # # Create FAISS index (Inner Product = Cosine similarity after normalization) # print(f" Creating IndexFlatIP...") # index = faiss.IndexFlatIP(dimension) # # Normalize embeddings for cosine similarity # print(f" Normalizing embeddings...") # faiss.normalize_L2(embeddings) # # Add to index # print(f" Adding {embeddings.shape[0]} vectors to FAISS index...") # index.add(embeddings.astype('float32')) # print(f" āœ… FAISS index built successfully") # print(f" Total vectors: {index.ntotal}") # # ======================================================================== # # STEP 5: SAVE WITH PROPER FAISS SERIALIZATION (VERSION COMPATIBLE!) # # ======================================================================== # print(f"\nšŸ’¾ STEP 5: Saving with FAISS serialization (version-compatible)...") # # Create models directory if it doesn't exist # os.makedirs(os.path.dirname(OUTPUT_PKL_FILE), exist_ok=True) # # āœ… PROPER WAY: Serialize FAISS index to bytes first # print(f" Serializing FAISS index to bytes...") # try: # # Write FAISS index to bytes (works across FAISS versions!) # index_bytes = faiss.serialize_index(index) # # Now pickle the bytes + kb_data # print(f" Pickling (index_bytes, kb_data) tuple...") # with open(OUTPUT_PKL_FILE, 'wb') as f: # pickle.dump((index_bytes, kb_data), f, protocol=pickle.HIGHEST_PROTOCOL) # file_size_mb = Path(OUTPUT_PKL_FILE).stat().st_size / (1024 * 1024) # print(f" āœ… Saved: {OUTPUT_PKL_FILE}") # print(f" File size: {file_size_mb:.2f} MB") # except Exception as e: # print(f" āŒ ERROR saving pickle: {e}") # import traceback # traceback.print_exc() # return False # # ======================================================================== # # STEP 6: VERIFY SAVED FILE # # ======================================================================== # print(f"\nāœ… STEP 6: Verifying saved file...") # try: # with open(OUTPUT_PKL_FILE, 'rb') as f: # loaded_index_bytes, loaded_kb = pickle.load(f) # # Deserialize FAISS index from bytes # loaded_index = faiss.deserialize_index(loaded_index_bytes) # print(f" āœ… Verification successful") # print(f" Index vectors: {loaded_index.ntotal}") # print(f" KB documents: {len(loaded_kb)}") # if loaded_index.ntotal != len(loaded_kb): # print(f" āš ļø WARNING: Size mismatch detected!") # except Exception as e: # print(f" āŒ ERROR verifying file: {e}") # import traceback # traceback.print_exc() # return False # # ======================================================================== # # SUCCESS! # # ======================================================================== # print("\n" + "=" * 80) # print("šŸŽ‰ SUCCESS! FAISS INDEX BUILT AND SAVED") # print("=" * 80) # print(f"\nšŸ“Š Summary:") # print(f" Documents: {len(kb_data)}") # print(f" Vectors: {index.ntotal}") # print(f" Dimension: {dimension}") # print(f" File: {OUTPUT_PKL_FILE} ({file_size_mb:.2f} MB)") # print(f"\nšŸš€ Next steps:") # print(f" 1. Upload {OUTPUT_PKL_FILE} to HuggingFace Hub") # print(f" 2. Restart your backend") # print(f" 3. Test retrieval - should work now!") # print("=" * 80 + "\n") # return True # # ============================================================================ # # RUN SCRIPT # # ============================================================================ # if __name__ == "__main__": # success = build_faiss_index() # if not success: # print("\n" + "=" * 80) # print("āŒ FAILED TO BUILD FAISS INDEX") # print("=" * 80) # print("\nPlease check:") # print("1. Knowledge base file exists: data/final_knowledge_base.jsonl") # print("2. Retriever model exists: app/models/best_retriever_model.pth") # print("3. You have enough RAM (embeddings need ~1GB for 10k docs)") # print("=" * 80 + "\n") # exit(1)