# src/indexer.py """ Index the cleaned corpus into FAISS using sentence-level/small-chunk passages. Outputs: - faiss_index.bin (FAISS index) - docs_meta.jsonl (one JSON line per vector with fields: id, url, title, text) """ from sentence_transformers import SentenceTransformer import faiss import ujson as json from pathlib import Path from tqdm import tqdm import numpy as np import re CORPUS_PATH = Path(__file__).parent.parent.joinpath("docs_corpus.jsonl") META_PATH = Path(__file__).parent.parent.joinpath("docs_meta.jsonl") INDEX_PATH = Path(__file__).parent.parent.joinpath("faiss_index.bin") EMBED_MODEL = "all-MiniLM-L6-v2" # chunking by sentences, group up to N sentences per chunk (1-3 recommended) MAX_SENTENCES_PER_CHUNK = 2 SENT_SPLIT_RE = re.compile(r'([.!?])\s+') def split_into_sentences(text: str): if not text: return [] parts = SENT_SPLIT_RE.split(text) sents = [] for i in range(0, len(parts), 2): chunk = parts[i].strip() punct = parts[i+1] if (i+1)