# Build a FAISS vector index from a JSON document collection using
# sentence-transformer embeddings, then persist the index to disk.
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
def load_json_data(file_path):
    """Load and return the parsed JSON content of *file_path*.

    Args:
        file_path: Path to a JSON file (expected to be a list of records).

    Returns:
        The deserialized JSON value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by spec; make the encoding explicit instead of relying
    # on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
def extract_content(data):
    """Collect the 'content' field of every record in *data*.

    Args:
        data: Iterable of dict-like records, each with a 'content' key.

    Returns:
        A list of the 'content' values, in input order.

    Raises:
        KeyError: If any record lacks a 'content' key.
    """
    texts = []
    for record in data:
        texts.append(record['content'])
    return texts
def generate_embeddings(contents):
    """Encode *contents* into dense vectors with a sentence-transformer.

    The model is loaded lazily on first use and cached on the function
    object, so repeated calls do not reload the weights from disk.

    Args:
        contents: List of strings to embed.

    Returns:
        A float32 numpy array of shape (len(contents), embedding_dim).
    """
    model = getattr(generate_embeddings, "_model", None)
    if model is None:
        # A lightweight general-purpose embedding model.
        model = SentenceTransformer('all-MiniLM-L6-v2')
        generate_embeddings._model = model
    embeddings = model.encode(contents, show_progress_bar=True)
    # Downstream FAISS code requires float32; normalize the dtype here
    # rather than trusting the encoder's default output type.
    return np.asarray(embeddings, dtype=np.float32)
def build_faiss_index(embeddings):
    """Build a flat (exhaustive) L2-distance FAISS index over *embeddings*.

    Args:
        embeddings: Array-like of shape (n_vectors, dim).

    Returns:
        A populated faiss.IndexFlatL2 instance containing all vectors.
    """
    # faiss.Index.add requires a C-contiguous float32 matrix; coerce
    # defensively instead of assuming the caller prepared it correctly.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)  # exact L2 nearest-neighbor search
    index.add(vectors)
    return index
def save_faiss_index(index, file_path):
    """Serialize *index* to *file_path* using FAISS's native writer."""
    faiss.write_index(index, file_path)
def main(json_file_path, index_file_path):
    """Run the full pipeline: load JSON, embed contents, build and save the index.

    Args:
        json_file_path: Path to the input JSON file (list of records with 'content').
        index_file_path: Destination path for the serialized FAISS index.
    """
    # Load the corpus and pull out the text to be embedded.
    contents = extract_content(load_json_data(json_file_path))
    # Embed, index, and persist.
    embeddings = generate_embeddings(contents)
    save_faiss_index(build_faiss_index(embeddings), index_file_path)
    print(f"FAISS index built and saved to {index_file_path}")
    print(f"Number of documents embedded: {len(contents)}")
if __name__ == "__main__":
    # Default paths; adjust as needed. (A stray trailing '|' artifact on the
    # final line — a syntax error — has been removed.)
    json_file_path = "input.json"  # Path to your input JSON file
    index_file_path = "faiss_index.bin"  # Path to save the FAISS index
    main(json_file_path, index_file_path)