| import json | |
| import faiss | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
def load_json_data(file_path):
    """Load and parse a JSON file.

    Args:
        file_path: Path to the JSON file on disk.

    Returns:
        The deserialized JSON value (for this pipeline, a list of dicts
        each expected to carry a 'content' key — see extract_content).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification (RFC 8259); pin the encoding instead
    # of relying on the platform default, which differs e.g. on Windows.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data
def extract_content(data):
    """Pull the text to be embedded out of each record.

    Args:
        data: Iterable of dict-like records, each with a 'content' key.

    Returns:
        List of the 'content' values, in input order.

    Raises:
        KeyError: If a record lacks the 'content' key.
    """
    texts = []
    for record in data:
        texts.append(record['content'])
    return texts
def _get_embedding_model(model_name):
    """Return a cached SentenceTransformer, loading it on first use.

    Loading the model reads weights from disk (or the network on a cold
    cache), so we memoize per model name on a function attribute to avoid
    paying that cost on every generate_embeddings() call.
    """
    cache = _get_embedding_model._cache
    if model_name not in cache:
        cache[model_name] = SentenceTransformer(model_name)
    return cache[model_name]


_get_embedding_model._cache = {}


def generate_embeddings(contents):
    """Embed a list of texts with a Sentence Transformers model.

    Args:
        contents: List of strings to embed.

    Returns:
        A 2-D numpy array of shape (len(contents), embedding_dim) —
        presumably float32, the sentence-transformers default; confirm
        if a different dtype matters downstream.
    """
    # 'all-MiniLM-L6-v2' is a lightweight general-purpose embedding model.
    model = _get_embedding_model('all-MiniLM-L6-v2')
    embeddings = model.encode(contents, show_progress_bar=True)
    return embeddings
def build_faiss_index(embeddings):
    """Build an exact (brute-force) L2 FAISS index over the embeddings.

    Args:
        embeddings: Array-like of shape (n_vectors, dim).

    Returns:
        A populated faiss.IndexFlatL2 containing all input vectors.
    """
    # FAISS only accepts C-contiguous float32 matrices; coerce defensively
    # rather than assuming the caller's array already has that layout.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dimension = vectors.shape[1]  # embedding dimensionality
    index = faiss.IndexFlatL2(dimension)  # exact L2 (Euclidean) search
    index.add(vectors)
    return index
def save_faiss_index(index, file_path):
    """Serialize a FAISS index to disk.

    Args:
        index: The FAISS index to persist.
        file_path: Destination path for the serialized index.
    """
    faiss.write_index(index, file_path)
def main(json_file_path, index_file_path):
    """End-to-end pipeline: JSON documents -> embeddings -> FAISS index.

    Args:
        json_file_path: Path to the input JSON file (list of records with
            a 'content' key).
        index_file_path: Path where the FAISS index will be written.
    """
    # Load records and pull out the text fields to embed.
    contents = extract_content(load_json_data(json_file_path))

    # Embed the texts, then index and persist the vectors.
    index = build_faiss_index(generate_embeddings(contents))
    save_faiss_index(index, index_file_path)

    print(f"FAISS index built and saved to {index_file_path}")
    print(f"Number of documents embedded: {len(contents)}")
if __name__ == "__main__":
    # Script-level configuration: input documents and index destination.
    input_json = "input.json"        # Path to your input JSON file
    output_index = "faiss_index.bin"  # Path to save the FAISS index
    main(input_json, output_index)