"""Build a FAISS similarity index over sentence chunks extracted from HTML posts."""

import os
import pickle
import re

import faiss
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

# Project root is the parent of the directory containing this script.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
POST_DIR = os.path.join(ROOT, "posts")
INDEX_PATH = os.path.join(ROOT, "faiss_index.bin")
META_PATH = os.path.join(ROOT, "faiss_meta.pkl")


def split_into_chunks(text, chunk_size=4):
    """Split *text* into chunks of at most *chunk_size* sentences.

    Sentence boundaries are '.', '!' or '?' followed by whitespace.
    Chunks that are empty after stripping are dropped, so the result
    may be shorter than ceil(n_sentences / chunk_size).
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    for i in range(0, len(sentences), chunk_size):
        chunk = " ".join(sentences[i:i + chunk_size]).strip()
        if chunk:
            chunks.append(chunk)
    return chunks


def load_html_posts(folder):
    """Read every .html file in *folder* and return (texts, ids, meta).

    texts: plain-text sentence chunks (tags stripped via BeautifulSoup)
    ids:   "<filename>_<chunk index>" identifiers, parallel to texts
    meta:  {"source": filename, "chunk": index} dicts, parallel to texts
    """
    texts = []
    ids = []
    meta = []
    # Sort for a deterministic chunk ordering across runs/platforms
    # (os.listdir order is arbitrary).
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".html"):
            continue
        path = os.path.join(folder, filename)
        with open(path, "r", encoding="utf-8") as f:
            raw_html = f.read()
        soup = BeautifulSoup(raw_html, "html.parser")
        cleaned = soup.get_text(separator="\n")
        for i, chunk in enumerate(split_into_chunks(cleaned, chunk_size=4)):
            texts.append(chunk)
            # BUG FIX: ids previously embedded the literal placeholder
            # "(unknown)" instead of the source filename, so ids collided
            # across files and carried no provenance.
            ids.append(f"{filename}_{i}")
            meta.append({"source": filename, "chunk": i})
    return texts, ids, meta


def main():
    """Embed all post chunks and persist a FAISS L2 index plus metadata."""
    print("Loading posts...")
    texts, ids, meta = load_html_posts(POST_DIR)
    if not texts:
        print("No data found.")
        return
    print(f"Loaded {len(texts)} chunks. Embedding now...")
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    embeddings = model.encode(texts)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    # FAISS only accepts float32 matrices.
    index.add(embeddings.astype("float32"))
    print("Saving FAISS index and metadata...")
    faiss.write_index(index, INDEX_PATH)
    with open(META_PATH, "wb") as f:
        pickle.dump((texts, ids, meta), f)
    print("Done.")


if __name__ == "__main__":
    main()