# src/indexer.py
"""
Index the cleaned corpus into FAISS using sentence-level/small-chunk passages.
Outputs:
- faiss_index.bin (FAISS index)
- docs_meta.jsonl (one JSON line per vector with fields: id, url, title, text)
"""
from sentence_transformers import SentenceTransformer
import faiss
import ujson as json
from pathlib import Path
from tqdm import tqdm
import numpy as np
import re

CORPUS_PATH = Path(__file__).parent.parent.joinpath("docs_corpus.jsonl")
META_PATH = Path(__file__).parent.parent.joinpath("docs_meta.jsonl")
INDEX_PATH = Path(__file__).parent.parent.joinpath("faiss_index.bin")

EMBED_MODEL = "all-MiniLM-L6-v2"

# chunking by sentences, group up to N sentences per chunk (1-3 recommended)
MAX_SENTENCES_PER_CHUNK = 2

SENT_SPLIT_RE = re.compile(r'([.!?])\s+')


def split_into_sentences(text: str):
    """Split text into sentences, keeping the terminal punctuation attached."""
    if not text:
        return []
    parts = SENT_SPLIT_RE.split(text)
    sents = []
    for i in range(0, len(parts), 2):
        chunk = parts[i].strip()
        punct = parts[i + 1] if (i + 1) < len(parts) else ""
        sent = (chunk + punct).strip()
        if sent:
            sents.append(sent)
    return sents
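
# Example (illustrative input): the splitter only breaks on punctuation followed by
# whitespace, so trailing punctuation without a following space stays in the last sentence:
#   split_into_sentences("FAISS stores vectors. Queries are fast!")
#   -> ["FAISS stores vectors.", "Queries are fast!"]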

def build_index():
    """Chunk the scraped corpus, embed the chunks, and write the FAISS index plus metadata."""
    if not CORPUS_PATH.exists():
        raise FileNotFoundError(f"Corpus not found at {CORPUS_PATH}. Run src/scrape_docs.py first.")
    model = SentenceTransformer(EMBED_MODEL)
    meta = []
    idx = 0
    # read corpus and chunk each document into sentence groups
    with CORPUS_PATH.open("r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Reading corpus"):
            doc = json.loads(line)
            url = doc.get("url")
            title = doc.get("title", "")
            text = doc.get("text", "")
            sents = split_into_sentences(text)
            if not sents:
                continue
            # group sentences into small chunks (1..MAX_SENTENCES_PER_CHUNK)
            i = 0
            while i < len(sents):
                chunk_sents = sents[i:i + MAX_SENTENCES_PER_CHUNK]
                chunk_text = " ".join(chunk_sents).strip()
                if chunk_text:
                    meta.append({"id": idx, "url": url, "title": title, "text": chunk_text})
                    idx += 1
                i += MAX_SENTENCES_PER_CHUNK
    if not meta:
        raise RuntimeError("No chunks created from corpus (empty corpus?)")

    # encode in batches for memory efficiency
    texts = [m["text"] for m in meta]
    batch_size = 64
    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        batch = texts[i:i + batch_size]
        embs = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
        all_embs.append(embs)
    embeddings = np.vstack(all_embs).astype("float32")

    # normalize so inner-product search behaves as cosine similarity
    faiss.normalize_L2(embeddings)
    d = embeddings.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(embeddings)
    faiss.write_index(index, str(INDEX_PATH))

    with META_PATH.open("w", encoding="utf-8") as f:
        for m in meta:
            f.write(json.dumps(m, ensure_ascii=False) + "\n")
    print(f"Built index with {index.ntotal} vectors. Saved to {INDEX_PATH}, meta to {META_PATH}")


if __name__ == "__main__":
    build_index()