# build_vector_store.py
import json
from pathlib import Path

import faiss
import numpy as np
import pdfplumber
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
# --------- CONFIG ----------
DOCS_DIR = Path("docs")
DATA_DIR = Path("data")
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
CHUNK_CHAR_SIZE = 1000  # ~200-250 tokens of English text at ~4 chars/token (tweak if you want)
CHUNK_OVERLAP = 200     # characters of context shared between adjacent chunks
EMBED_DIM = 384         # embedding dim of all-MiniLM-L6-v2 (main() re-derives it from the model)
BATCH_SIZE = 32
TOP_K = 5               # default number of neighbors for retrieval (see the query sketch at the bottom)
# ---------------------------
DATA_DIR.mkdir(exist_ok=True)
def extract_text_from_pdf(pdf_path: Path):
    """Extract text page by page; returns a list of {page_number, text} dicts."""
    pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            # extract_text() returns None for image-only pages; fall back to ""
            text = page.extract_text() or ""
            pages.append({"page_number": i + 1, "text": text})
    return pages
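
# Illustrative shape of the return value ("docs/sample.pdf" is a hypothetical
# filename, not one shipped with this repo):
#   extract_text_from_pdf(Path("docs/sample.pdf"))
#   -> [{"page_number": 1, "text": "..."}, {"page_number": 2, "text": "..."}]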

def split_text_into_chunks(text, chunk_size=CHUNK_CHAR_SIZE, overlap=CHUNK_OVERLAP):
    """Split text into overlapping chunks, preferring sentence/line boundaries."""
    text = text.strip()
    if not text:
        return []
    chunks = []
    start = 0
    text_len = len(text)
    while start < text_len:
        end = start + chunk_size
        # Try to avoid breaking mid-sentence: cut at the last newline or
        # sentence-ending punctuation inside the window, but only if the cut
        # lands in the second half of the window, which keeps chunks reasonably
        # sized and guarantees the loop makes forward progress.
        if end < text_len:
            snippet = text[start:end]
            cut = max(snippet.rfind('\n'), snippet.rfind('. '),
                      snippet.rfind('? '), snippet.rfind('! '))
            if cut != -1 and cut > int(chunk_size * 0.5):
                end = start + cut + 1
        chunk_text = text[start:end].strip()
        if chunk_text:
            chunks.append(chunk_text)
        # Step forward, keeping `overlap` characters of context between chunks.
        start = max(end - overlap, 0)
        if end >= text_len:
            break
    return chunks
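
# Rough illustration (assumed behavior, not tested here): with the defaults
# above, a 2,400-character page becomes three to four overlapping chunks of
# <=1,000 characters, each new chunk re-reading the last ~200 characters of the
# previous one so sentences straddling a boundary appear in both.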

def build_embeddings(model, texts):
    """Encode texts in batches; returns an (n, dim) float32 array."""
    embeddings = []
    for i in range(0, len(texts), BATCH_SIZE):
        batch = texts[i:i + BATCH_SIZE]
        embs = model.encode(batch, show_progress_bar=False, convert_to_numpy=True)
        embeddings.append(embs)
    if embeddings:
        return np.vstack(embeddings).astype("float32")
    return np.empty((0, model.get_sentence_embedding_dimension()), dtype="float32")

def normalize_embeddings(embeddings: np.ndarray):
    # Normalize in place to unit vectors so inner-product search == cosine
    # similarity. faiss.normalize_L2 requires a contiguous float32 array.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")
    faiss.normalize_L2(embeddings)
    return embeddings

def main():
    model = SentenceTransformer(EMBED_MODEL)
    embed_dim = model.get_sentence_embedding_dimension()
    print(f"Loaded embed model '{EMBED_MODEL}' with dim={embed_dim}")
    all_text_chunks = []
    metadata = []
    chunk_id = 0
    pdf_files = list(DOCS_DIR.glob("*.pdf"))
    if not pdf_files:
        print("No PDF files found in docs/ -- put your PDFs there and re-run.")
        return
    for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
        pages = extract_text_from_pdf(pdf_path)
        for page in pages:
            page_text = page["text"]
            if not page_text:
                continue
            chunks = split_text_into_chunks(page_text)
            for i, c in enumerate(chunks):
                doc_meta = {
                    "chunk_id": chunk_id,
                    "source_file": pdf_path.name,
                    "page": page["page_number"],
                    "chunk_index_in_page": i,
                    "text": c[:1000],  # store a preview (or the full text if you prefer)
                }
                metadata.append(doc_meta)
                all_text_chunks.append(c)
                chunk_id += 1
    if not all_text_chunks:
        print("No text extracted from PDFs.")
        return
    print(f"Total chunks: {len(all_text_chunks)}")
    # Compute and normalize embeddings.
    embeddings = build_embeddings(model, all_text_chunks)
    print("Embeddings shape:", embeddings.shape)
    embeddings = normalize_embeddings(embeddings)
    # Build the FAISS index (inner product on normalized vectors == cosine similarity).
    index = faiss.IndexFlatIP(embed_dim)
    index.add(embeddings)
    print("FAISS index built. ntotal:", index.ntotal)
    # Save the index and the chunk metadata side by side.
    index_path = DATA_DIR / "vector_store.index"
    faiss.write_index(index, str(index_path))
    meta_path = DATA_DIR / "metadata.json"
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)
    print(f"Saved FAISS index -> {index_path}")
    print(f"Saved metadata -> {meta_path}")


if __name__ == "__main__":
    main()
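
# ---------------------------------------------------------------------------
# Example: querying the saved index. A minimal sketch of the separate retrieval
# step, kept commented out so running or importing this build script is
# unaffected. The query string and print format below are illustrative
# assumptions, not part of this repo's interface.
#
#   index = faiss.read_index("data/vector_store.index")
#   with open("data/metadata.json", encoding="utf-8") as f:
#       metadata = json.load(f)
#   model = SentenceTransformer(EMBED_MODEL)
#
#   query = "What is photosynthesis?"  # hypothetical query
#   q = model.encode([query], convert_to_numpy=True).astype("float32")
#   faiss.normalize_L2(q)  # normalize the query the same way as the index
#   scores, ids = index.search(q, TOP_K)  # inner product == cosine similarity
#   for score, idx in zip(scores[0], ids[0]):
#       m = metadata[idx]
#       print(f"{score:.3f}  {m['source_file']} p.{m['page']}  {m['text'][:80]}")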