# src/indexer.py
"""
Index the cleaned corpus into FAISS using sentence-level/small-chunk passages.

Outputs:
  - faiss_index.bin  (FAISS index)
  - docs_meta.jsonl  (one JSON line per vector with fields: id, url, title, text)
"""
import re
from pathlib import Path

import faiss
import numpy as np
import ujson as json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

CORPUS_PATH = Path(__file__).parent.parent / "docs_corpus.jsonl"
META_PATH = Path(__file__).parent.parent / "docs_meta.jsonl"
INDEX_PATH = Path(__file__).parent.parent / "faiss_index.bin"

EMBED_MODEL = "all-MiniLM-L6-v2"

# Chunk by sentences, grouping up to N sentences per chunk (1-3 recommended).
MAX_SENTENCES_PER_CHUNK = 2

SENT_SPLIT_RE = re.compile(r"([.!?])\s+")

def split_into_sentences(text: str):
    """Naive splitter: break on sentence-final punctuation, keeping the punctuation."""
    if not text:
        return []
    # re.split with a capturing group alternates text and delimiter:
    # even indices hold sentence bodies, odd indices hold the punctuation.
    parts = SENT_SPLIT_RE.split(text)
    sents = []
    for i in range(0, len(parts), 2):
        chunk = parts[i].strip()
        punct = parts[i + 1] if (i + 1) < len(parts) else ""
        sent = (chunk + punct).strip()
        if sent:
            sents.append(sent)
    return sents
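
# Illustrative behaviour of the splitter (example strings, not project data):
#   split_into_sentences("Hello. World!") -> ["Hello.", "World!"]
# Caveat: the pattern also fires after abbreviations, so "e.g. foo" splits
# into two "sentences"; an accepted trade-off for a dependency-free splitter.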

def build_index():
    if not CORPUS_PATH.exists():
        raise FileNotFoundError(f"Corpus not found at {CORPUS_PATH}. Run src/scrape_docs.py first.")
    model = SentenceTransformer(EMBED_MODEL)
    meta = []
    idx = 0
    # Read corpus and chunk into sentence groups.
    with CORPUS_PATH.open("r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Reading corpus"):
            doc = json.loads(line)
            url = doc.get("url")
            title = doc.get("title", "")
            text = doc.get("text", "")
            sents = split_into_sentences(text)
            if not sents:
                continue
            # Group sentences into small chunks (1..MAX_SENTENCES_PER_CHUNK).
            i = 0
            while i < len(sents):
                chunk_sents = sents[i:i + MAX_SENTENCES_PER_CHUNK]
                chunk_text = " ".join(chunk_sents).strip()
                if chunk_text:
                    meta.append({"id": idx, "url": url, "title": title, "text": chunk_text})
                    idx += 1
                i += MAX_SENTENCES_PER_CHUNK
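    # Note: chunks do not overlap. A sliding window (stride smaller than
    # MAX_SENTENCES_PER_CHUNK) could improve recall at the cost of a larger
    # index; non-overlapping chunks keep the index compact.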
    if not meta:
        raise RuntimeError("No chunks created from corpus (empty corpus?)")
    # Encode in batches for memory efficiency.
    texts = [m["text"] for m in meta]
    batch_size = 64
    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        batch = texts[i:i + batch_size]
        embs = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
        all_embs.append(embs)
    embeddings = np.vstack(all_embs).astype("float32")
    # Normalize to use inner product as cosine.
    faiss.normalize_L2(embeddings)
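    # For unit vectors u and v, <u, v> == cos(angle(u, v)), so the IndexFlatIP
    # below ranks results by cosine similarity.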
    d = embeddings.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(embeddings)
    faiss.write_index(index, str(INDEX_PATH))
    with META_PATH.open("w", encoding="utf-8") as f:
        for m in meta:
            f.write(json.dumps(m, ensure_ascii=False) + "\n")
    print(f"Built index with {index.ntotal} vectors. Saved to {INDEX_PATH}, meta to {META_PATH}")

if __name__ == "__main__":
    build_index()