"""Build a FAISS similarity-search index from the ``content`` fields of a JSON file.

Pipeline: load JSON -> extract per-entry ``content`` strings -> embed with
SentenceTransformers -> build an L2 flat index -> persist it to disk.
"""

import json

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer


def load_json_data(file_path: str) -> list:
    """Load and return the parsed JSON document at *file_path*.

    Uses an explicit UTF-8 encoding so decoding does not depend on the
    platform's locale default.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def extract_content(data: list) -> list:
    """Return the ``content`` string of every entry in *data*.

    Raises:
        KeyError: if an entry lacks a ``content`` key (surfaced rather than
            silently skipped, so malformed input is noticed early).
    """
    return [entry['content'] for entry in data]


def generate_embeddings(contents: list) -> np.ndarray:
    """Embed *contents* with a lightweight SentenceTransformer model.

    Returns a 2-D float array of shape (len(contents), dim).
    """
    # all-MiniLM-L6-v2: small/fast general-purpose embedding model.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(contents, show_progress_bar=True)
    # FAISS requires a C-contiguous float32 matrix; normalize the dtype here
    # instead of relying on the encoder's default output format.
    return np.ascontiguousarray(np.asarray(embeddings, dtype=np.float32))


def build_faiss_index(embeddings: np.ndarray) -> faiss.Index:
    """Build and return an exact L2-distance FAISS index over *embeddings*.

    Raises:
        ValueError: if *embeddings* is empty — an index with zero vectors is
            almost certainly a pipeline error (e.g. empty input JSON).
    """
    if embeddings.size == 0:
        raise ValueError("No embeddings to index: input contained no documents")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)  # exact (brute-force) L2 search
    index.add(embeddings)
    return index


def save_faiss_index(index: faiss.Index, file_path: str) -> None:
    """Serialize *index* to *file_path* on disk."""
    faiss.write_index(index, file_path)


def main(json_file_path: str, index_file_path: str) -> None:
    """Run the full pipeline: JSON in, persisted FAISS index out."""
    data = load_json_data(json_file_path)
    contents = extract_content(data)

    embeddings = generate_embeddings(contents)

    index = build_faiss_index(embeddings)
    save_faiss_index(index, index_file_path)

    print(f"FAISS index built and saved to {index_file_path}")
    print(f"Number of documents embedded: {len(contents)}")


if __name__ == "__main__":
    json_file_path = "input.json"  # Path to your input JSON file
    index_file_path = "faiss_index.bin"  # Path to save the FAISS index
    main(json_file_path, index_file_path)