# table_test / embedding.txt
# Kushalguptaiitb's picture
# Upload embedding.txt
# 29fc451 verified
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
# Load the JSON data
def load_json_data(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON value, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's
    # default text encoding, which would corrupt or reject non-ASCII text
    # on systems whose default is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
# Extract content from JSON for embedding
def extract_content(data):
    """Collect the ``'content'`` value of every entry, preserving order.

    Args:
        data: Iterable of mappings, each of which has a ``'content'`` key.

    Returns:
        A list with one ``'content'`` value per entry, in input order.

    Raises:
        KeyError: If an entry has no ``'content'`` key.
    """
    return [entry['content'] for entry in data]
# Generate embeddings using Sentence Transformers
def generate_embeddings(contents, model_name='all-MiniLM-L6-v2'):
    """Encode the given texts with a Sentence Transformers model.

    Args:
        contents: The texts to embed, passed unchanged to ``model.encode``.
        model_name: Name or path of the model to load. Defaults to
            ``'all-MiniLM-L6-v2'``, a lightweight embedding model.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``contents``
        (presumably a 2-D NumPy array with one row per text, which is the
        library's default output -- confirm against the installed version).
    """
    # The model is loaded on every call; callers embedding many batches
    # may prefer to cache it themselves.
    model = SentenceTransformer(model_name)
    return model.encode(contents, show_progress_bar=True)
# Build FAISS index
def build_faiss_index(embeddings):
    """Build a flat (exact-search) L2 FAISS index over the embeddings.

    Args:
        embeddings: Array-like of shape ``(n_vectors, dimension)``.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional, for example
            when it is the result of encoding an empty list of texts.
    """
    # FAISS works on C-contiguous float32 matrices; normalise here so that
    # float64 or non-contiguous input does not fail inside index.add().
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dimension), "
            f"got shape {vectors.shape}"
        )

    dimension = vectors.shape[1]  # Embedding dimension
    index = faiss.IndexFlatL2(dimension)  # L2 distance for similarity search
    index.add(vectors)  # Add embeddings to the index
    return index
# Save the FAISS index to disk
def save_faiss_index(index, file_path):
    """Write a FAISS index to disk.

    Args:
        index: The FAISS index to serialize.
        file_path: Destination path; a ``str`` or any ``os.PathLike``
            object such as ``pathlib.Path``.
    """
    import os  # local import: only needed to normalise the path argument

    # os.fspath() returns str input unchanged and unwraps PathLike objects,
    # so existing str callers are unaffected while pathlib.Path now works.
    faiss.write_index(index, os.fspath(file_path))
# Main function to process the JSON and build the FAISS index for the RAG system
def main(json_file_path, index_file_path):
    """Embed the contents of a JSON file and save them as a FAISS index.

    Runs the pipeline end to end: load the JSON, extract each entry's
    content, generate embeddings, build the index, write it to disk and
    print a short summary.

    Args:
        json_file_path: Path of the input JSON file.
        index_file_path: Path where the FAISS index is written.
    """
    # Load and process data
    data = load_json_data(json_file_path)
    contents = extract_content(data)

    # Generate embeddings
    embeddings = generate_embeddings(contents)

    # Build and save FAISS index
    index = build_faiss_index(embeddings)
    save_faiss_index(index, index_file_path)

    print(f"FAISS index built and saved to {index_file_path}")
    print(f"Number of documents embedded: {len(contents)}")
# Script entry point: run the pipeline with the default file locations.
if __name__ == "__main__":
    json_file_path = "input.json"  # Path to your input JSON file
    index_file_path = "faiss_index.bin"  # Path to save the FAISS index
    main(json_file_path, index_file_path)