Spaces:

JackSparrow89
/

Semantic_File

Sleeping

App Files Files Community

Semantic_File / indexer /store.py

JackSparrow89

Update indexer/store.py

608a156 verified about 1 month ago

raw

history blame contribute delete

8.29 kB

	# indexer/store.py

	import os
	import sqlite3
	import numpy as np
	import faiss
	import yaml


	class Store:
	    """
	    Persist chunk embeddings and their metadata in two side-by-side stores.

	    1. FAISS — stores dense vectors for fast similarity search
	       Uses IndexHNSWFlat instead of IndexFlatL2
	       HNSW = Hierarchical Navigable Small World graph
	         - IndexFlatL2  : exact, scans every vector (slow at scale)
	         - IndexHNSWFlat: approximate, graph-based navigation (fast; recall
	                          is tuned with efSearch / efConstruction)
	       The HNSW index is wrapped in IndexIDMap2 so every vector carries the
	       same integer ID as its row in the SQLite `chunks` table.

	    2. SQLite — stores metadata about each chunk (`chunks` table) and about
	       each indexed file (`files` table)

	    Writes touch both stores. The SQLite transaction is committed only after
	    the matching FAISS change has been written to disk; on failure the
	    transaction is rolled back and the in-memory index is reloaded from the
	    last persisted file, so the two stores do not drift apart.
	    """

	    # HNSW parameter — higher = more accurate but more memory
	    # 32 is the standard default, good balance for this use case
	    HNSW_M = 32

	    def __init__(self, config_path="config.yaml"):
	        """
	        Load config, set up file paths, initialize SQLite and the FAISS index.

	        Args:
	            config_path — path to a YAML file containing a `data_dir` key.
	                          A relative `data_dir` is resolved against the
	                          directory of the config file, not the current
	                          working directory.

	        Raises:
	            OSError  — the config file cannot be opened
	            KeyError — the config has no `data_dir` entry
	        """
	        config_path = os.path.abspath(config_path)
	        with open(config_path, "r", encoding="utf-8") as f:
	            # an empty YAML file loads as None; treat it as an empty mapping
	            # so the missing key is reported as a KeyError below
	            config = yaml.safe_load(f) or {}

	        config_dir = os.path.dirname(config_path)
	        data_dir = config["data_dir"]
	        if os.path.isabs(data_dir):
	            self.data_dir = data_dir
	        else:
	            self.data_dir = os.path.normpath(os.path.join(config_dir, data_dir))
	        os.makedirs(self.data_dir, exist_ok=True)

	        self.faiss_path = os.path.join(self.data_dir, "index.faiss")
	        self.db_path = os.path.join(self.data_dir, "metadata.db")

	        self._init_db()
	        self._load_or_create_index()

	    def _init_db(self):
	        """
	        Create SQLite tables if they don't already exist.

	        Safe to call repeatedly: both statements use IF NOT EXISTS.
	        """
	        conn = sqlite3.connect(self.db_path)
	        try:
	            cursor = conn.cursor()

	            cursor.execute('''
	                CREATE TABLE IF NOT EXISTS chunks (
	                    id INTEGER PRIMARY KEY,
	                    filepath TEXT NOT NULL,
	                    chunk_text TEXT NOT NULL,
	                    chunk_index INTEGER,
	                    FOREIGN KEY (filepath) REFERENCES files(filepath)
	                )
	            ''')

	            cursor.execute('''
	                CREATE TABLE IF NOT EXISTS files (
	                    filepath TEXT PRIMARY KEY,
	                    file_hash TEXT NOT NULL,
	                    total_chunks INTEGER
	                )
	            ''')

	            conn.commit()
	        finally:
	            # close even when a statement raises, so the handle never leaks
	            conn.close()

	    def _load_or_create_index(self):
	        """
	        Load an existing FAISS index from disk, or set to None.
	        The actual index is created on first add_chunks() call
	        so we know the embedding dimension at that point.
	        """
	        if os.path.exists(self.faiss_path):
	            self.index = faiss.read_index(self.faiss_path)
	            print(f"[Store] Loaded FAISS index — {self.index.ntotal} vectors")
	        else:
	            self.index = None
	            print("[Store] No existing index found — will create on first insert")

	    def _new_hnsw_index(self, dimension):
	        """
	        Build and return an empty ID-mapped HNSW index (does not touch self.index).

	        Args:
	            dimension — embedding size of the vectors the index will hold
	        """
	        hnsw_index = faiss.IndexHNSWFlat(dimension, self.HNSW_M)
	        hnsw_index.hnsw.efSearch = 64  # search quality — higher = better recall
	        hnsw_index.hnsw.efConstruction = 64  # build quality — higher = better graph
	        return faiss.IndexIDMap2(hnsw_index)

	    def _create_hnsw_index(self, dimension: int):
	        """
	        Create a new HNSW-based FAISS index and store it on self.index.

	        Why HNSW over FlatL2:
	            FlatL2   — exact search, scans every vector, slow at scale
	            HNSWFlat — approximate search over a graph, much faster on
	                       large collections at the cost of a small recall loss

	        IndexIDMap2 wraps HNSW to support custom integer IDs. It does NOT
	        add deletion: HNSW cannot remove vectors in place, which is why
	        remove_file_chunks() may have to rebuild the index.

	        Args:
	            dimension — embedding size (384 for MiniLM and BGE-small)
	        """
	        self.index = self._new_hnsw_index(dimension)
	        print(f"[Store] Created HNSW index — dim={dimension}, M={self.HNSW_M}")

	    def _persist_index(self):
	        """
	        Write self.index to self.faiss_path atomically.

	        The index is written to a sibling temp file first and then moved
	        into place, so an interrupted write never leaves a truncated
	        index.faiss behind.
	        """
	        tmp_path = self.faiss_path + ".tmp"
	        faiss.write_index(self.index, tmp_path)
	        os.replace(tmp_path, self.faiss_path)

	    def _restore_index_from_disk(self):
	        """
	        Discard in-memory index changes by reloading the last persisted state.

	        Used on failure paths so self.index matches what SQLite holds after
	        a rollback. Sets self.index to None when no index file exists yet.
	        """
	        if os.path.exists(self.faiss_path):
	            self.index = faiss.read_index(self.faiss_path)
	        else:
	            self.index = None

	    def _remove_vectors(self, id_array):
	        """
	        Remove the given vector IDs from self.index (in memory only).

	        In-place removal is tried first. Index types that cannot delete
	        (HNSW) raise RuntimeError from FAISS; in that case the surviving
	        vectors are read back out of the index and a fresh HNSW index is
	        built from them using this class's current HNSW settings.

	        Args:
	            id_array (numpy.ndarray) — int64 IDs to remove
	        """
	        try:
	            self.index.remove_ids(id_array)
	            return
	        except RuntimeError:
	            # without an id_map we cannot rebuild — surface the FAISS error
	            if not hasattr(self.index, "id_map"):
	                raise

	        # id_map[i] is the external ID of the i-th vector in the wrapped index
	        stored_ids = faiss.vector_to_array(self.index.id_map)
	        keep = ~np.isin(stored_ids, id_array)
	        vectors = self.index.index.reconstruct_n(0, int(self.index.ntotal))

	        rebuilt = self._new_hnsw_index(self.index.d)
	        if keep.any():
	            rebuilt.add_with_ids(
	                np.ascontiguousarray(vectors[keep], dtype=np.float32),
	                np.ascontiguousarray(stored_ids[keep], dtype=np.int64),
	            )
	        self.index = rebuilt

	    def get_next_id(self):
	        """
	        Get the next available chunk ID from SQLite.

	        Returns:
	            int — 0 when the chunks table is empty, otherwise MAX(id) + 1
	        """
	        conn = sqlite3.connect(self.db_path)
	        try:
	            result = conn.execute("SELECT MAX(id) FROM chunks").fetchone()[0]
	        finally:
	            conn.close()
	        return 0 if result is None else result + 1

	    def add_chunks(self, chunks_with_metadata, embeddings):
	        """
	        Add new chunks and their embeddings to both FAISS and SQLite.

	        Metadata rows are inserted inside a transaction that is committed
	        only after the vectors have been added and the index written to
	        disk. If anything fails, the rows are rolled back and the in-memory
	        index is reloaded, so no vector is left without metadata.

	        Args:
	            chunks_with_metadata (list[dict]) — from chunker.chunk_file()
	                Each dict has: text, filepath, chunk_index
	            embeddings (numpy.ndarray) — shape (num_chunks, embedding_dim)
	                From embedder.embed_chunks()

	        Raises:
	            ValueError — embeddings is not 2-D, its row count differs from
	                         the number of chunks, or its dimension differs
	                         from the existing index
	            KeyError   — a chunk dict lacks text, filepath or chunk_index
	        """
	        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
	        count = len(chunks_with_metadata)

	        # nothing to add — do not create an index or touch either store
	        if count == 0 and embeddings.size == 0:
	            return

	        if embeddings.ndim != 2 or embeddings.shape[0] != count:
	            raise ValueError(
	                f"expected embeddings of shape ({count}, dim), "
	                f"got {embeddings.shape}"
	            )

	        dimension = embeddings.shape[1]
	        if self.index is not None and self.index.d != dimension:
	            raise ValueError(
	                f"embedding dimension {dimension} does not match "
	                f"index dimension {self.index.d}"
	            )

	        # build every row up front so a malformed chunk fails before
	        # either store has been modified
	        start_id = self.get_next_id()
	        rows = [
	            (start_id + offset, chunk["filepath"], chunk["text"], chunk["chunk_index"])
	            for offset, chunk in enumerate(chunks_with_metadata)
	        ]
	        ids = np.arange(start_id, start_id + count, dtype=np.int64)

	        conn = sqlite3.connect(self.db_path)
	        index_touched = False
	        try:
	            conn.executemany(
	                "INSERT INTO chunks (id, filepath, chunk_text, chunk_index) "
	                "VALUES (?, ?, ?, ?)",
	                rows,
	            )

	            index_touched = True
	            # create index on first insert — dimension comes from embeddings
	            if self.index is None:
	                self._create_hnsw_index(dimension)
	            self.index.add_with_ids(embeddings, ids)
	            self._persist_index()

	            # commit last: metadata becomes visible only once vectors are on disk
	            conn.commit()
	        except Exception:
	            conn.rollback()
	            if index_touched:
	                self._restore_index_from_disk()
	            raise
	        finally:
	            conn.close()

	    def save_file_info(self, filepath, file_hash, total_chunks):
	        """
	        Save or update file info in SQLite.

	        Args:
	            filepath — file path or fake path e.g. "scifact://12345"
	            file_hash — SHA256 hash or doc_id string
	            total_chunks — number of chunks this file was split into
	        """
	        conn = sqlite3.connect(self.db_path)
	        try:
	            conn.execute(
	                "INSERT OR REPLACE INTO files (filepath, file_hash, total_chunks) "
	                "VALUES (?, ?, ?)",
	                (filepath, file_hash, total_chunks),
	            )
	            conn.commit()
	        finally:
	            conn.close()

	    def load_hashes(self):
	        """
	        Load all stored file hashes from SQLite.

	        Returns:
	            dict — {filepath: hash_string}
	        """
	        conn = sqlite3.connect(self.db_path)
	        try:
	            rows = conn.execute("SELECT filepath, file_hash FROM files").fetchall()
	        finally:
	            conn.close()
	        return {row[0]: row[1] for row in rows}

	    def remove_file_chunks(self, filepath):
	        """
	        Delete all chunks for a file from both SQLite and FAISS.

	        The SQLite deletes are committed only after the vectors have been
	        removed from the index and the index written to disk. HNSW indexes
	        cannot delete in place, so removal may rebuild the index from the
	        remaining vectors (cost grows with the size of the index).

	        Args:
	            filepath — the filepath to remove
	        """
	        conn = sqlite3.connect(self.db_path)
	        index_touched = False
	        try:
	            rows = conn.execute(
	                "SELECT id FROM chunks WHERE filepath = ?", (filepath,)
	            ).fetchall()

	            conn.execute("DELETE FROM chunks WHERE filepath = ?", (filepath,))
	            conn.execute("DELETE FROM files WHERE filepath = ?", (filepath,))

	            if rows and self.index is not None:
	                id_array = np.array([row[0] for row in rows], dtype=np.int64)
	                index_touched = True
	                self._remove_vectors(id_array)
	                self._persist_index()

	            conn.commit()
	        except Exception:
	            conn.rollback()
	            if index_touched:
	                self._restore_index_from_disk()
	            raise
	        finally:
	            conn.close()

	    def get_total_vectors(self):
	        """
	        Return how many vectors are in the FAISS index.

	        Returns:
	            int — number of vectors, or 0 if index is empty
	        """
	        if self.index is None:
	            return 0
	        return self.index.ntotal


	if __name__ == "__main__":
	    # Manual smoke test: open the store described by ./config.yaml, push
	    # three synthetic chunks with random 384-dim vectors through
	    # add_chunks(), and print the vector count before and after.
	    store = Store()

	    # (text, filepath, chunk_index) for each synthetic chunk
	    sample_rows = (
	        ("quarterly budget report summary", "/docs/report.pdf", 0),
	        ("revenue increased by fifteen percent", "/docs/report.pdf", 1),
	        ("python machine learning tutorial", "/docs/tutorial.txt", 0),
	    )
	    fake_chunks = [
	        {"text": body, "filepath": source, "chunk_index": position}
	        for body, source, position in sample_rows
	    ]

	    # one random float32 vector per chunk
	    random_vectors = np.random.rand(3, 384)
	    fake_embeddings = random_vectors.astype("float32")

	    count_before = store.get_total_vectors()
	    print(f"Vectors before: {count_before}")
	    store.add_chunks(fake_chunks, fake_embeddings)
	    count_after = store.get_total_vectors()
	    print(f"Vectors after: {count_after}")