Feature: Additive Ingestion - Allow adding new docs without wiping old ones. Added Clear button.
d3a38ee
import argparse
import glob
import json
import os
import pickle
from typing import Dict, List

import faiss
import numpy as np

from .embed import get_embedder
def load_processed_data(processed_dir: str) -> List[Dict]:
    chunks = []
    # Always glob for all JSONs to support additive ingestion: every
    # previously processed document is re-indexed alongside any new ones.
    json_files = glob.glob(os.path.join(processed_dir, "*.json"))
    print(f"Found {len(json_files)} existing documents to index.")
    for f_path in json_files:
        if f_path.endswith("manifest.json"):
            continue
        try:
            with open(f_path, 'r') as f:
                doc_data = json.load(f)
                if 'chunks' in doc_data:
                    chunks.extend(doc_data['chunks'])
        except Exception as e:
            print(f"Error loading {f_path}: {e}")
    return chunks
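
# For reference, each processed-document JSON is expected to look roughly
# like this (schema inferred from the reads above; the "source" and "page"
# metadata keys are illustrative, not required by this module):
#
# {
#   "chunks": [
#     {"content": "chunk text...", "metadata": {"source": "doc.pdf", "page": 1}}
#   ]
# }
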
def build_index(processed_dir: str, output_dir: str):
    os.makedirs(output_dir, exist_ok=True)

    print("Loading chunks...")
    chunks = load_processed_data(processed_dir)
    print(f"Loaded {len(chunks)} chunks.")
    if not chunks:
        print("No chunks found. Exiting.")
        return

    texts = [c['content'] for c in chunks]

    print("Generating embeddings...")
    embedder = get_embedder()
    embeddings = embedder.embed(texts)
    # FAISS requires contiguous float32 arrays.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")
    dimension = embeddings.shape[1]
    print(f"Embedding dimension: {dimension}")

    print("Building FAISS index...")
    # A flat (exhaustive) index is exact and safest for a local, lightweight
    # setup; switch to IVF only if the dataset grows large. Embeddings are
    # L2-normalized so inner product (IndexFlatIP) equals cosine similarity,
    # which SentenceTransformers models are typically tuned for.
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)
    print(f"Index built with {index.ntotal} vectors.")

    # Save the index.
    faiss.write_index(index, os.path.join(output_dir, "vector.index"))

    # Save the chunk store: FAISS row i corresponds to chunks[i], so this
    # pickle maps index IDs back to chunk content + metadata.
    with open(os.path.join(output_dir, "doc_store.pkl"), 'wb') as f:
        pickle.dump(chunks, f)
    print(f"Index saved to {output_dir}")


if __name__ == "__main__":
    # Run as a module (e.g. `python -m <package>.index ...`) so the relative
    # import of .embed resolves.
    parser = argparse.ArgumentParser()
    parser.add_argument("--processed", required=True, help="Path to processed data directory")
    parser.add_argument("--out", required=True, help="Output directory for index")
    args = parser.parse_args()
    build_index(args.processed, args.out)
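
For context, here is a minimal query-side sketch (not part of this commit) showing how the saved vector.index and doc_store.pkl could be used. It assumes the same get_embedder() interface as above; the query must be L2-normalized exactly like the indexed vectors so the inner-product scores are cosine similarities.

import os
import pickle

import faiss
import numpy as np

from .embed import get_embedder


def search(index_dir: str, query: str, k: int = 5):
    # Load the FAISS index and the pickled chunk store written by build_index.
    index = faiss.read_index(os.path.join(index_dir, "vector.index"))
    with open(os.path.join(index_dir, "doc_store.pkl"), 'rb') as f:
        chunks = pickle.load(f)

    # Embed and normalize the query, mirroring the build-time normalization.
    q = np.ascontiguousarray(get_embedder().embed([query]), dtype="float32")
    faiss.normalize_L2(q)

    scores, ids = index.search(q, k)
    # ids can contain -1 when fewer than k vectors are indexed.
    return [(float(s), chunks[i]) for s, i in zip(scores[0], ids[0]) if i != -1]

The index itself is rebuilt from the CLI, e.g. python -m <package>.index --processed <processed_dir> --out <index_dir> (module path and directories illustrative).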