# Feature: Additive Ingestion - Allow adding new docs without wiping old ones. Added Clear button.
# Commit: d3a38ee
import os
import json
import faiss
import numpy as np
import argparse
import pickle
from typing import List, Dict
from .embed import get_embedder
def load_processed_data(processed_dir: str) -> List[Dict]:
    """Load and concatenate the chunk lists of every processed document.

    Globs all ``*.json`` files under *processed_dir* (this is what makes
    ingestion additive: previously processed documents are re-indexed too),
    skipping the ingestion manifest. A file that fails to open or parse is
    reported and skipped so one corrupt document does not abort indexing.

    Args:
        processed_dir: Directory of per-document JSON files, each expected
            to contain a top-level ``"chunks"`` list.

    Returns:
        A flat list of chunk dicts from all readable documents.
    """
    chunks: List[Dict] = []
    # Always glob for all JSONs to support additive ingestion
    import glob
    json_files = glob.glob(os.path.join(processed_dir, "*.json"))
    print(f"Found {len(json_files)} existing documents to index.")
    for f_path in json_files:
        # Compare the basename exactly: the previous endswith() check also
        # skipped legitimate documents such as "report_manifest.json".
        if os.path.basename(f_path) == "manifest.json":
            continue
        try:
            # Explicit encoding: the platform default is not UTF-8 everywhere
            # (notably Windows), and these JSON files are written as UTF-8.
            with open(f_path, 'r', encoding='utf-8') as f:
                doc_data = json.load(f)
            if 'chunks' in doc_data:
                chunks.extend(doc_data['chunks'])
        except Exception as e:
            # Best-effort: report and continue with the remaining documents.
            print(f"Error loading {f_path}: {e}")
    return chunks
def build_index(processed_dir: str, output_dir: str):
    """Build a FAISS cosine-similarity index over all processed chunks.

    Loads every processed document's chunks, embeds their text, and writes
    two artifacts to *output_dir*:

    - ``vector.index``: the FAISS index (inner product over L2-normalized
      vectors, i.e. cosine similarity).
    - ``doc_store.pkl``: the pickled chunk list; a chunk's list position is
      its FAISS vector id, so this doubles as the id -> content/metadata map.

    Args:
        processed_dir: Directory of per-document chunk JSON files.
        output_dir: Destination directory (created if missing).
    """
    os.makedirs(output_dir, exist_ok=True)

    print("Loading chunks...")
    chunks = load_processed_data(processed_dir)
    print(f"Loaded {len(chunks)} chunks.")
    if not chunks:
        print("No chunks found. Exiting.")
        return

    texts = [c['content'] for c in chunks]
    # NOTE: the previous version also built a `metadatas` list here that was
    # never used — metadata travels inside `chunks` via the pickle below.

    print("Generating embeddings...")
    embedder = get_embedder()
    embeddings = embedder.embed(texts)
    dimension = embeddings.shape[1]
    print(f"Embedding dimension: {dimension}")

    print("Building FAISS index...")
    # Exact (flat) search keeps this local, lightweight setup simple and
    # precise. L2-normalizing first makes inner product equal cosine
    # similarity, which is what SentenceTransformer-style embedders expect.
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)
    print(f"Index built with {index.ntotal} vectors.")

    # Save index
    faiss.write_index(index, os.path.join(output_dir, "vector.index"))

    # Save the chunk list (index ID -> chunk content + metadata).
    with open(os.path.join(output_dir, "doc_store.pkl"), 'wb') as f:
        pickle.dump(chunks, f)
    print(f"Index saved to {output_dir}")
if __name__ == "__main__":
    # CLI entry point: embed all processed documents and write the index.
    cli = argparse.ArgumentParser()
    cli.add_argument("--processed", required=True, help="Path to processed data directory")
    cli.add_argument("--out", required=True, help="Output directory for index")
    opts = cli.parse_args()
    build_index(opts.processed, opts.out)