Spaces:
Runtime error
Runtime error
| import os | |
| import re | |
| import pickle | |
| import faiss | |
| from bs4 import BeautifulSoup | |
| from sentence_transformers import SentenceTransformer | |
# Base directory: the parent of the directory that contains this script.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT = os.path.dirname(_SCRIPT_DIR)

# Folder scanned for ``.html`` posts.
POST_DIR = os.path.join(ROOT, "posts")

# Output locations for the FAISS index and the pickled chunk metadata.
INDEX_PATH = os.path.join(ROOT, "faiss_index.bin")
META_PATH = os.path.join(ROOT, "faiss_meta.pkl")
def split_into_chunks(text, chunk_size=4):
    """Group the sentences of *text* into chunks of ``chunk_size`` sentences.

    Sentences are delimited by whitespace that follows ``.``, ``!`` or ``?``.
    Each chunk is its sentences joined with a single space and stripped of
    surrounding whitespace; chunks that end up empty are dropped.

    Args:
        text: The string to split.
        chunk_size: Number of consecutive sentences per chunk.

    Returns:
        A list of non-empty chunk strings, in document order.
    """
    parts = re.split(r'(?<=[.!?])\s+', text)
    grouped = (
        " ".join(parts[start:start + chunk_size]).strip()
        for start in range(0, len(parts), chunk_size)
    )
    return [piece for piece in grouped if piece]
def load_html_posts(folder):
    """Read every ``.html`` file in *folder* and split its text into chunks.

    Each file is parsed with BeautifulSoup, its text is extracted with
    newline separators, and the result is passed to ``split_into_chunks``
    with ``chunk_size=4``. The scan is not recursive.

    Args:
        folder: Directory containing the ``.html`` files.

    Returns:
        A tuple ``(texts, ids, meta)`` of three parallel lists:
        ``texts`` holds the chunk strings, ``ids`` holds
        ``"<filename>_<chunk index>"`` strings, and ``meta`` holds dicts
        with the keys ``"source"`` (filename) and ``"chunk"`` (index).
    """
    texts = []
    ids = []
    meta = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # chunk order (and therefore the row order of any index built from it)
    # reproducible across runs and platforms.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".html"):
            continue

        path = os.path.join(folder, filename)
        with open(path, "r", encoding="utf-8") as f:
            raw_html = f.read()

        soup = BeautifulSoup(raw_html, "html.parser")
        cleaned = soup.get_text(separator="\n")

        chunks = split_into_chunks(cleaned, chunk_size=4)
        for i, chunk in enumerate(chunks):
            texts.append(chunk)
            ids.append(f"{filename}_{i}")
            meta.append({"source": filename, "chunk": i})

    return texts, ids, meta
def main():
    """Embed the chunks from ``POST_DIR`` and write a FAISS index to disk.

    Loads the chunks with ``load_html_posts``, encodes them with the
    ``all-MiniLM-L6-v2`` sentence-transformers model, adds the float32
    vectors to a flat L2 FAISS index, writes the index to ``INDEX_PATH``
    and pickles the ``(texts, ids, meta)`` tuple to ``META_PATH``.
    Prints a message and returns early when no chunks were loaded.
    """
    print("Loading posts...")
    texts, ids, meta = load_html_posts(POST_DIR)
    if not texts:
        print("No data found.")
        return

    print(f"Loaded {len(texts)} chunks. Embedding now...")
    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    vectors = encoder.encode(texts).astype("float32")

    # The index dimensionality is the width of the embedding matrix.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    print("Saving FAISS index and metadata...")
    faiss.write_index(index, INDEX_PATH)
    payload = (texts, ids, meta)
    with open(META_PATH, "wb") as out:
        pickle.dump(payload, out)

    print("Done.")


if __name__ == "__main__":
    main()