File size: 1,815 Bytes
29fc451
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the JSON data
# Load the JSON data
def load_json_data(file_path):
    """Load and deserialize a JSON file.

    Args:
        file_path: Path to the JSON file on disk.

    Returns:
        The deserialized JSON value (this script expects a list of dicts,
        each carrying a 'content' key).

    Raises:
        FileNotFoundError: If *file_path* does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Explicit UTF-8 avoids locale-dependent decode failures on non-ASCII text
    # (the platform default encoding is not UTF-8 everywhere, e.g. Windows).
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# Extract content from JSON for embedding
# Extract content from JSON for embedding
def extract_content(data):
    """Collect the 'content' field of every entry, preserving order.

    Args:
        data: Iterable of dicts, each expected to carry a 'content' key.

    Returns:
        A list with each entry's 'content' value, in input order.

    Raises:
        KeyError: If any entry lacks a 'content' key.
    """
    texts = []
    for entry in data:
        texts.append(entry['content'])
    return texts

# Generate embeddings using Sentence Transformers
# Generate embeddings using Sentence Transformers
def generate_embeddings(contents, model_name='all-MiniLM-L6-v2'):
    """Embed a list of texts with a Sentence Transformers model.

    Args:
        contents: List of strings to embed.
        model_name: Hugging Face model id to load. Defaults to
            'all-MiniLM-L6-v2', a lightweight general-purpose model
            (kept as the default for backward compatibility).

    Returns:
        The array of embeddings returned by ``model.encode`` — one row
        per input string.
    """
    # NOTE: the model is (re)loaded on every call; acceptable for a one-shot
    # script, but hoist the model if this ever runs in a loop.
    model = SentenceTransformer(model_name)
    embeddings = model.encode(contents, show_progress_bar=True)
    return embeddings

# Build FAISS index
# Build FAISS index
def build_faiss_index(embeddings):
    """Build an exact (brute-force) L2 FAISS index over the embeddings.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dim).

    Returns:
        A populated ``faiss.IndexFlatL2`` containing all input vectors.
    """
    # FAISS only accepts C-contiguous float32 arrays; coerce defensively so
    # float64 or non-contiguous input from upstream doesn't raise at add().
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dimension = vectors.shape[1]  # Embedding dimension
    index = faiss.IndexFlatL2(dimension)  # L2 distance for similarity search
    index.add(vectors)  # Add embeddings to the index
    return index

# Save the FAISS index to disk
# Save the FAISS index to disk
def save_faiss_index(index, file_path):
    """Serialize *index* to *file_path* using ``faiss.write_index``."""
    faiss.write_index(index, file_path)

# Main function to process the JSON and build the RAG system
# Main function to process the JSON and build the RAG system
def main(json_file_path, index_file_path):
    """Run the full pipeline: JSON file -> embeddings -> FAISS index on disk.

    Args:
        json_file_path: Path to the input JSON file (list of dicts with
            a 'content' key).
        index_file_path: Destination path for the serialized FAISS index.
    """
    # Step 1: read the corpus and pull out the text to embed.
    records = load_json_data(json_file_path)
    texts = extract_content(records)

    # Step 2: turn the texts into dense vectors.
    vectors = generate_embeddings(texts)

    # Step 3: index the vectors and persist the result.
    faiss_index = build_faiss_index(vectors)
    save_faiss_index(faiss_index, index_file_path)

    print(f"FAISS index built and saved to {index_file_path}")
    print(f"Number of documents embedded: {len(texts)}")

if __name__ == "__main__":
    json_file_path = "input.json"  # Path to your input JSON file
    index_file_path = "faiss_index.bin"  # Path to save the FAISS index
    main(json_file_path, index_file_path)