# Substack-Search / src/build_index.py
# Builds a FAISS index over sentence chunks extracted from exported Substack post HTML.
import os
import re
import pickle
import faiss
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
# Project root = parent of the directory containing this script.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Input: folder of exported post HTML files.
POST_DIR = os.path.join(ROOT, "posts")
# Outputs: serialized FAISS index and the parallel (texts, ids, meta) pickle.
INDEX_PATH = os.path.join(ROOT, "faiss_index.bin")
META_PATH = os.path.join(ROOT, "faiss_meta.pkl")
def split_into_chunks(text, chunk_size=4):
    """Break *text* into chunks of up to ``chunk_size`` sentences each.

    Sentence boundaries are detected by splitting on whitespace that
    follows ``.``, ``!`` or ``?``. Chunks that are empty after stripping
    are dropped.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)
    # Walk the sentence list in strides of chunk_size, joining each window.
    windows = (
        " ".join(sentences[start:start + chunk_size]).strip()
        for start in range(0, len(sentences), chunk_size)
    )
    return [piece for piece in windows if piece]
def load_html_posts(folder):
    """Read every ``.html`` file in *folder* and return its chunked plain text.

    Args:
        folder: path to a directory containing exported post HTML files.

    Returns:
        Three parallel lists:
        - texts: sentence chunks across all posts,
        - ids:   ``"<filename>_<chunk index>"`` identifiers,
        - meta:  ``{"source": filename, "chunk": i}`` dicts.
    """
    texts = []
    ids = []
    meta = []
    # Sort for a deterministic chunk ordering across runs / filesystems.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".html"):
            continue
        path = os.path.join(folder, filename)
        with open(path, "r", encoding="utf-8") as f:
            raw_html = f.read()
        # Strip markup; a newline separator keeps block elements apart.
        soup = BeautifulSoup(raw_html, "html.parser")
        cleaned = soup.get_text(separator="\n")
        chunks = split_into_chunks(cleaned, chunk_size=4)
        for i, chunk in enumerate(chunks):
            texts.append(chunk)
            # BUG FIX: ids previously used a literal "(unknown)" placeholder,
            # so chunk ids collided across files. Key them by source filename.
            ids.append(f"{filename}_{i}")
            meta.append({"source": filename, "chunk": i})
    return texts, ids, meta
def main():
    """Build the FAISS index from local posts and persist it to disk."""
    print("Loading posts...")
    texts, ids, meta = load_html_posts(POST_DIR)
    if not texts:
        print("No data found.")
        return
    print(f"Loaded {len(texts)} chunks. Embedding now...")
    # Small general-purpose sentence-embedding model.
    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    vectors = encoder.encode(texts)
    # Exact (brute-force) L2 search over all chunk embeddings.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors.astype("float32"))
    print("Saving FAISS index and metadata...")
    faiss.write_index(index, INDEX_PATH)
    with open(META_PATH, "wb") as f:
        pickle.dump((texts, ids, meta), f)
    print("Done.")
# Allow running directly as a script: python src/build_index.py
if __name__ == "__main__":
    main()