Feature: Additive Ingestion - Allow adding new docs without wiping old ones. Added Clear button.
d3a38ee
import argparse
import glob
import json
import os
import pickle
from typing import Dict, List

import faiss
import numpy as np

from .embed import get_embedder
def load_processed_data(processed_dir: str) -> List[Dict]:
    chunks = []
    # Always glob for all JSONs to support additive ingestion: every
    # previously processed document is re-indexed alongside any new ones.
    json_files = glob.glob(os.path.join(processed_dir, "*.json"))
    print(f"Found {len(json_files)} existing documents to index.")
    for f_path in json_files:
        if f_path.endswith("manifest.json"):
            continue
        try:
            with open(f_path, 'r') as f:
                doc_data = json.load(f)
                if 'chunks' in doc_data:
                    chunks.extend(doc_data['chunks'])
        except Exception as e:
            print(f"Error loading {f_path}: {e}")
    return chunks
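
# For reference, each processed-document JSON is expected to look roughly
# like this (schema inferred from the reads above; the "source" and "page"
# metadata keys are illustrative, not required by this module):
#
# {
#   "chunks": [
#     {"content": "chunk text...", "metadata": {"source": "doc.pdf", "page": 1}}
#   ]
# }
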
def build_index(processed_dir: str, output_dir: str):
    os.makedirs(output_dir, exist_ok=True)

    print("Loading chunks...")
    chunks = load_processed_data(processed_dir)
    print(f"Loaded {len(chunks)} chunks.")
    if not chunks:
        print("No chunks found. Exiting.")
        return

    texts = [c['content'] for c in chunks]

    print("Generating embeddings...")
    embedder = get_embedder()
    embeddings = embedder.embed(texts)
    # FAISS requires contiguous float32 arrays.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")
    dimension = embeddings.shape[1]
    print(f"Embedding dimension: {dimension}")

    print("Building FAISS index...")
    # A flat (exhaustive) index is exact and safest for a local, lightweight
    # setup; switch to IVF only if the dataset grows large. Embeddings are
    # L2-normalized so inner product (IndexFlatIP) equals cosine similarity,
    # which SentenceTransformers models are typically tuned for.
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)
    print(f"Index built with {index.ntotal} vectors.")

    # Save the index.
    faiss.write_index(index, os.path.join(output_dir, "vector.index"))

    # Save the chunk store: FAISS row i corresponds to chunks[i], so this
    # pickle maps index IDs back to chunk content + metadata.
    with open(os.path.join(output_dir, "doc_store.pkl"), 'wb') as f:
        pickle.dump(chunks, f)
    print(f"Index saved to {output_dir}")


if __name__ == "__main__":
    # Run as a module (e.g. `python -m <package>.index ...`) so the relative
    # import of .embed resolves.
    parser = argparse.ArgumentParser()
    parser.add_argument("--processed", required=True, help="Path to processed data directory")
    parser.add_argument("--out", required=True, help="Output directory for index")
    args = parser.parse_args()
    build_index(args.processed, args.out)
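
For context, here is a minimal query-side sketch (not part of this commit) showing how the saved vector.index and doc_store.pkl could be used. It assumes the same get_embedder() interface as above; the query must be L2-normalized exactly like the indexed vectors so the inner-product scores are cosine similarities.

import os
import pickle

import faiss
import numpy as np

from .embed import get_embedder


def search(index_dir: str, query: str, k: int = 5):
    # Load the FAISS index and the pickled chunk store written by build_index.
    index = faiss.read_index(os.path.join(index_dir, "vector.index"))
    with open(os.path.join(index_dir, "doc_store.pkl"), 'rb') as f:
        chunks = pickle.load(f)

    # Embed and normalize the query, mirroring the build-time normalization.
    q = np.ascontiguousarray(get_embedder().embed([query]), dtype="float32")
    faiss.normalize_L2(q)

    scores, ids = index.search(q, k)
    # ids can contain -1 when fewer than k vectors are indexed.
    return [(float(s), chunks[i]) for s, i in zip(scores[0], ids[0]) if i != -1]

The index itself is rebuilt from the CLI, e.g. python -m <package>.index --processed <processed_dir> --out <index_dir> (module path and directories illustrative).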